diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 9194d2ce..1abf329f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -16,8 +16,8 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.9", "3.10", "3.11", "3.12"] - os: ["ubuntu-latest"] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + os: ["ubuntu-latest", "macos-latest"] #,"windows-latest" runs-on: ${{ matrix.os }} timeout-minutes: 30 @@ -53,5 +53,5 @@ jobs: - name: Run tests run: python -m pytest - #- name: Test building the doc - # run: mkdocs build + - name: Test building the doc + run: mkdocs build diff --git a/.gitignore b/.gitignore index 7a6dd93f..ffd7edf6 100644 --- a/.gitignore +++ b/.gitignore @@ -149,3 +149,6 @@ cookie.txt *.txt *.sh .DS_Store +*.zarr/ +scripts/ +notebooks/ diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..94a3b7e2 --- /dev/null +++ b/LICENSE @@ -0,0 +1,352 @@ +Creative Commons Attribution-NonCommercial 4.0 International + +Creative Commons Corporation ("Creative Commons") is not a law firm and +does not provide legal services or legal advice. Distribution of +Creative Commons public licenses does not create a lawyer-client or +other relationship. Creative Commons makes its licenses and related +information available on an "as-is" basis. Creative Commons gives no +warranties regarding its licenses, any material licensed under their +terms and conditions, or any related information. Creative Commons +disclaims all liability for damages resulting from their use to the +fullest extent possible. + +Using Creative Commons Public Licenses + +Creative Commons public licenses provide a standard set of terms and +conditions that creators and other rights holders may use to share +original works of authorship and other material subject to copyright and +certain other rights specified in the public license below. The +following considerations are for informational purposes only, are not +exhaustive, and do not form part of our licenses. + +- Considerations for licensors: Our public licenses are intended for + use by those authorized to give the public permission to use + material in ways otherwise restricted by copyright and certain other + rights. Our licenses are irrevocable. Licensors should read and + understand the terms and conditions of the license they choose + before applying it. Licensors should also secure all rights + necessary before applying our licenses so that the public can reuse + the material as expected. Licensors should clearly mark any material + not subject to the license. This includes other CC-licensed + material, or material used under an exception or limitation to + copyright. More considerations for licensors : + wiki.creativecommons.org/Considerations\_for\_licensors + +- Considerations for the public: By using one of our public licenses, + a licensor grants the public permission to use the licensed material + under specified terms and conditions. If the licensor's permission + is not necessary for any reason–for example, because of any + applicable exception or limitation to copyright–then that use is not + regulated by the license. Our licenses grant only permissions under + copyright and certain other rights that a licensor has authority to + grant. Use of the licensed material may still be restricted for + other reasons, including because others have copyright or other + rights in the material. A licensor may make special requests, such + as asking that all changes be marked or described. 
Although not + required by our licenses, you are encouraged to respect those + requests where reasonable. More considerations for the public : + wiki.creativecommons.org/Considerations\_for\_licensees + +Creative Commons Attribution-NonCommercial 4.0 International Public +License + +By exercising the Licensed Rights (defined below), You accept and agree +to be bound by the terms and conditions of this Creative Commons +Attribution-NonCommercial 4.0 International Public License ("Public +License"). To the extent this Public License may be interpreted as a +contract, You are granted the Licensed Rights in consideration of Your +acceptance of these terms and conditions, and the Licensor grants You +such rights in consideration of benefits the Licensor receives from +making the Licensed Material available under these terms and conditions. + +- Section 1 – Definitions. + + - a. Adapted Material means material subject to Copyright and + Similar Rights that is derived from or based upon the Licensed + Material and in which the Licensed Material is translated, + altered, arranged, transformed, or otherwise modified in a + manner requiring permission under the Copyright and Similar + Rights held by the Licensor. For purposes of this Public + License, where the Licensed Material is a musical work, + performance, or sound recording, Adapted Material is always + produced where the Licensed Material is synched in timed + relation with a moving image. + - b. Adapter's License means the license You apply to Your + Copyright and Similar Rights in Your contributions to Adapted + Material in accordance with the terms and conditions of this + Public License. + - c. Copyright and Similar Rights means copyright and/or similar + rights closely related to copyright including, without + limitation, performance, broadcast, sound recording, and Sui + Generis Database Rights, without regard to how the rights are + labeled or categorized. For purposes of this Public License, the + rights specified in Section 2(b)(1)-(2) are not Copyright and + Similar Rights. + - d. Effective Technological Measures means those measures that, + in the absence of proper authority, may not be circumvented + under laws fulfilling obligations under Article 11 of the WIPO + Copyright Treaty adopted on December 20, 1996, and/or similar + international agreements. + - e. Exceptions and Limitations means fair use, fair dealing, + and/or any other exception or limitation to Copyright and + Similar Rights that applies to Your use of the Licensed + Material. + - f. Licensed Material means the artistic or literary work, + database, or other material to which the Licensor applied this + Public License. + - g. Licensed Rights means the rights granted to You subject to + the terms and conditions of this Public License, which are + limited to all Copyright and Similar Rights that apply to Your + use of the Licensed Material and that the Licensor has authority + to license. + - h. Licensor means the individual(s) or entity(ies) granting + rights under this Public License. + - i. NonCommercial means not primarily intended for or directed + towards commercial advantage or monetary compensation. For + purposes of this Public License, the exchange of the Licensed + Material for other material subject to Copyright and Similar + Rights by digital file-sharing or similar means is NonCommercial + provided there is no payment of monetary compensation in + connection with the exchange. + - j. 
Share means to provide material to the public by any means or + process that requires permission under the Licensed Rights, such + as reproduction, public display, public performance, + distribution, dissemination, communication, or importation, and + to make material available to the public including in ways that + members of the public may access the material from a place and + at a time individually chosen by them. + - k. Sui Generis Database Rights means rights other than copyright + resulting from Directive 96/9/EC of the European Parliament and + of the Council of 11 March 1996 on the legal protection of + databases, as amended and/or succeeded, as well as other + essentially equivalent rights anywhere in the world. + - l. You means the individual or entity exercising the Licensed + Rights under this Public License. Your has a corresponding + meaning. + +- Section 2 – Scope. + + - a. License grant. + - 1. Subject to the terms and conditions of this Public + License, the Licensor hereby grants You a worldwide, + royalty-free, non-sublicensable, non-exclusive, irrevocable + license to exercise the Licensed Rights in the Licensed + Material to: + - A. reproduce and Share the Licensed Material, in whole + or in part, for NonCommercial purposes only; and + - B. produce, reproduce, and Share Adapted Material for + NonCommercial purposes only. + - 2. Exceptions and Limitations. For the avoidance of doubt, + where Exceptions and Limitations apply to Your use, this + Public License does not apply, and You do not need to comply + with its terms and conditions. + - 3. Term. The term of this Public License is specified in + Section 6(a). + - 4. Media and formats; technical modifications allowed. The + Licensor authorizes You to exercise the Licensed Rights in + all media and formats whether now known or hereafter + created, and to make technical modifications necessary to do + so. The Licensor waives and/or agrees not to assert any + right or authority to forbid You from making technical + modifications necessary to exercise the Licensed Rights, + including technical modifications necessary to circumvent + Effective Technological Measures. For purposes of this + Public License, simply making modifications authorized by + this Section 2(a)(4) never produces Adapted Material. + - 5. Downstream recipients. + - A. Offer from the Licensor – Licensed Material. Every + recipient of the Licensed Material automatically + receives an offer from the Licensor to exercise the + Licensed Rights under the terms and conditions of this + Public License. + - B. No downstream restrictions. You may not offer or + impose any additional or different terms or conditions + on, or apply any Effective Technological Measures to, + the Licensed Material if doing so restricts exercise of + the Licensed Rights by any recipient of the Licensed + Material. + - 6. No endorsement. Nothing in this Public License + constitutes or may be construed as permission to assert or + imply that You are, or that Your use of the Licensed + Material is, connected with, or sponsored, endorsed, or + granted official status by, the Licensor or others + designated to receive attribution as provided in Section + 3(a)(1)(A)(i). + - b. Other rights. + - 1. 
Moral rights, such as the right of integrity, are not + licensed under this Public License, nor are publicity, + privacy, and/or other similar personality rights; however, + to the extent possible, the Licensor waives and/or agrees + not to assert any such rights held by the Licensor to the + limited extent necessary to allow You to exercise the + Licensed Rights, but not otherwise. + - 2. Patent and trademark rights are not licensed under this + Public License. + - 3. To the extent possible, the Licensor waives any right to + collect royalties from You for the exercise of the Licensed + Rights, whether directly or through a collecting society + under any voluntary or waivable statutory or compulsory + licensing scheme. In all other cases the Licensor expressly + reserves any right to collect such royalties, including when + the Licensed Material is used other than for NonCommercial + purposes. + +- Section 3 – License Conditions. + + Your exercise of the Licensed Rights is expressly made subject to + the following conditions. + + - a. Attribution. + - 1. If You Share the Licensed Material (including in modified + form), You must: + - A. retain the following if it is supplied by the + Licensor with the Licensed Material: + - i. identification of the creator(s) of the Licensed + Material and any others designated to receive + attribution, in any reasonable manner requested by + the Licensor (including by pseudonym if designated); + - ii. a copyright notice; + - iii. a notice that refers to this Public License; + - iv. a notice that refers to the disclaimer of + warranties; + - v. a URI or hyperlink to the Licensed Material to + the extent reasonably practicable; + - B. indicate if You modified the Licensed Material and + retain an indication of any previous modifications; and + - C. indicate the Licensed Material is licensed under this + Public License, and include the text of, or the URI or + hyperlink to, this Public License. + - 2. You may satisfy the conditions in Section 3(a)(1) in any + reasonable manner based on the medium, means, and context in + which You Share the Licensed Material. For example, it may + be reasonable to satisfy the conditions by providing a URI + or hyperlink to a resource that includes the required + information. + - 3. If requested by the Licensor, You must remove any of the + information required by Section 3(a)(1)(A) to the extent + reasonably practicable. + - 4. If You Share Adapted Material You produce, the Adapter's + License You apply must not prevent recipients of the Adapted + Material from complying with this Public License. + +- Section 4 – Sui Generis Database Rights. + + Where the Licensed Rights include Sui Generis Database Rights that + apply to Your use of the Licensed Material: + + - a. for the avoidance of doubt, Section 2(a)(1) grants You the + right to extract, reuse, reproduce, and Share all or a + substantial portion of the contents of the database for + NonCommercial purposes only; + - b. if You include all or a substantial portion of the database + contents in a database in which You have Sui Generis Database + Rights, then the database in which You have Sui Generis Database + Rights (but not its individual contents) is Adapted Material; + and + - c. You must comply with the conditions in Section 3(a) if You + Share all or a substantial portion of the contents of the + database. 
+ + For the avoidance of doubt, this Section 4 supplements and does not + replace Your obligations under this Public License where the + Licensed Rights include other Copyright and Similar Rights. + +- Section 5 – Disclaimer of Warranties and Limitation of Liability. + + - a. Unless otherwise separately undertaken by the Licensor, to + the extent possible, the Licensor offers the Licensed Material + as-is and as-available, and makes no representations or + warranties of any kind concerning the Licensed Material, whether + express, implied, statutory, or other. This includes, without + limitation, warranties of title, merchantability, fitness for a + particular purpose, non-infringement, absence of latent or other + defects, accuracy, or the presence or absence of errors, whether + or not known or discoverable. Where disclaimers of warranties + are not allowed in full or in part, this disclaimer may not + apply to You. + - b. To the extent possible, in no event will the Licensor be + liable to You on any legal theory (including, without + limitation, negligence) or otherwise for any direct, special, + indirect, incidental, consequential, punitive, exemplary, or + other losses, costs, expenses, or damages arising out of this + Public License or use of the Licensed Material, even if the + Licensor has been advised of the possibility of such losses, + costs, expenses, or damages. Where a limitation of liability is + not allowed in full or in part, this limitation may not apply to + You. + - c. The disclaimer of warranties and limitation of liability + provided above shall be interpreted in a manner that, to the + extent possible, most closely approximates an absolute + disclaimer and waiver of all liability. + +- Section 6 – Term and Termination. + + - a. This Public License applies for the term of the Copyright and + Similar Rights licensed here. However, if You fail to comply + with this Public License, then Your rights under this Public + License terminate automatically. + - b. Where Your right to use the Licensed Material has terminated + under Section 6(a), it reinstates: + + - 1. automatically as of the date the violation is cured, + provided it is cured within 30 days of Your discovery of the + violation; or + - 2. upon express reinstatement by the Licensor. + + For the avoidance of doubt, this Section 6(b) does not affect + any right the Licensor may have to seek remedies for Your + violations of this Public License. + + - c. For the avoidance of doubt, the Licensor may also offer the + Licensed Material under separate terms or conditions or stop + distributing the Licensed Material at any time; however, doing + so will not terminate this Public License. + - d. Sections 1, 5, 6, 7, and 8 survive termination of this Public + License. + +- Section 7 – Other Terms and Conditions. + + - a. The Licensor shall not be bound by any additional or + different terms or conditions communicated by You unless + expressly agreed. + - b. Any arrangements, understandings, or agreements regarding the + Licensed Material not stated herein are separate from and + independent of the terms and conditions of this Public License. + +- Section 8 – Interpretation. + + - a. For the avoidance of doubt, this Public License does not, and + shall not be interpreted to, reduce, limit, restrict, or impose + conditions on any use of the Licensed Material that could + lawfully be made without permission under this Public License. + - b. 
To the extent possible, if any provision of this Public + License is deemed unenforceable, it shall be automatically + reformed to the minimum extent necessary to make it enforceable. + If the provision cannot be reformed, it shall be severed from + this Public License without affecting the enforceability of the + remaining terms and conditions. + - c. No term or condition of this Public License will be waived + and no failure to comply consented to unless expressly agreed to + by the Licensor. + - d. Nothing in this Public License constitutes or may be + interpreted as a limitation upon, or waiver of, any privileges + and immunities that apply to the Licensor or You, including from + the legal processes of any jurisdiction or authority. + +Creative Commons is not a party to its public licenses. Notwithstanding, +Creative Commons may elect to apply one of its public licenses to +material it publishes and in those instances will be considered the +"Licensor." The text of the Creative Commons public licenses is +dedicated to the public domain under the CC0 Public Domain Dedication. +Except for the limited purpose of indicating that material is shared +under a Creative Commons public license or as otherwise permitted by the +Creative Commons policies published at creativecommons.org/policies, +Creative Commons does not authorize the use of the trademark "Creative +Commons" or any other trademark or logo of Creative Commons without its +prior written consent including, without limitation, in connection with +any unauthorized modifications to any of its public licenses or any +other arrangements, understandings, or agreements concerning use of +licensed material. For the avoidance of doubt, this paragraph does not +form part of the public licenses. + +Creative Commons may be contacted at creativecommons.org. 
diff --git a/docs/API/available_datasets.md b/docs/API/available_datasets.md deleted file mode 100644 index fa630b8a..00000000 --- a/docs/API/available_datasets.md +++ /dev/null @@ -1,3 +0,0 @@ -# Available Datasets - -::: openqdc.datasets diff --git a/docs/API/basedataset.md b/docs/API/basedataset.md new file mode 100644 index 00000000..cdaeee77 --- /dev/null +++ b/docs/API/basedataset.md @@ -0,0 +1 @@ +::: openqdc.datasets.base diff --git a/docs/API/datasets/alchemy.md b/docs/API/datasets/alchemy.md new file mode 100644 index 00000000..096774c3 --- /dev/null +++ b/docs/API/datasets/alchemy.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.alchemy diff --git a/docs/API/datasets/ani.md b/docs/API/datasets/ani.md new file mode 100644 index 00000000..4f79f587 --- /dev/null +++ b/docs/API/datasets/ani.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.ani diff --git a/docs/API/datasets/comp6.md b/docs/API/datasets/comp6.md new file mode 100644 index 00000000..e473e211 --- /dev/null +++ b/docs/API/datasets/comp6.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.comp6 diff --git a/docs/API/datasets/des.md b/docs/API/datasets/des.md new file mode 100644 index 00000000..dbff5035 --- /dev/null +++ b/docs/API/datasets/des.md @@ -0,0 +1 @@ +::: openqdc.datasets.interaction.des diff --git a/docs/API/datasets/gdml.md b/docs/API/datasets/gdml.md new file mode 100644 index 00000000..a91cf993 --- /dev/null +++ b/docs/API/datasets/gdml.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.gdml diff --git a/docs/API/datasets/geom.md b/docs/API/datasets/geom.md new file mode 100644 index 00000000..f290eb93 --- /dev/null +++ b/docs/API/datasets/geom.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.geom.GEOM diff --git a/docs/API/datasets/iso_17.md b/docs/API/datasets/iso_17.md new file mode 100644 index 00000000..01a04e67 --- /dev/null +++ b/docs/API/datasets/iso_17.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.iso_17 diff --git a/docs/API/datasets/l7.md b/docs/API/datasets/l7.md new file mode 100644 index 00000000..512e7f37 --- /dev/null +++ b/docs/API/datasets/l7.md @@ -0,0 +1 @@ +::: openqdc.datasets.interaction.l7 diff --git a/docs/API/datasets/md22.md b/docs/API/datasets/md22.md new file mode 100644 index 00000000..d793b5cf --- /dev/null +++ b/docs/API/datasets/md22.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.md22 diff --git a/docs/API/datasets/metcalf.md b/docs/API/datasets/metcalf.md new file mode 100644 index 00000000..58566b02 --- /dev/null +++ b/docs/API/datasets/metcalf.md @@ -0,0 +1 @@ +::: openqdc.datasets.interaction.metcalf diff --git a/docs/API/datasets/molecule3d.md b/docs/API/datasets/molecule3d.md new file mode 100644 index 00000000..d7b6a5a4 --- /dev/null +++ b/docs/API/datasets/molecule3d.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.molecule3d diff --git a/docs/API/datasets/multixcqm9.md b/docs/API/datasets/multixcqm9.md new file mode 100644 index 00000000..55993cd7 --- /dev/null +++ b/docs/API/datasets/multixcqm9.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.multixcqm9 diff --git a/docs/API/datasets/nabladft.md b/docs/API/datasets/nabladft.md new file mode 100644 index 00000000..a69d68d7 --- /dev/null +++ b/docs/API/datasets/nabladft.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.nabladft diff --git a/docs/API/datasets/orbnet_denali.md b/docs/API/datasets/orbnet_denali.md new file mode 100644 index 00000000..1b4ee6a7 --- /dev/null +++ b/docs/API/datasets/orbnet_denali.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.orbnet_denali diff --git a/docs/API/datasets/pcqm.md 
b/docs/API/datasets/pcqm.md new file mode 100644 index 00000000..6cd1b92b --- /dev/null +++ b/docs/API/datasets/pcqm.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.pcqm diff --git a/docs/API/datasets/proteinfragments.md b/docs/API/datasets/proteinfragments.md new file mode 100644 index 00000000..d5aa28a5 --- /dev/null +++ b/docs/API/datasets/proteinfragments.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.proteinfragments diff --git a/docs/API/datasets/qm1b.md b/docs/API/datasets/qm1b.md new file mode 100644 index 00000000..b92dfff4 --- /dev/null +++ b/docs/API/datasets/qm1b.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.qm1b diff --git a/docs/API/datasets/qm7x.md b/docs/API/datasets/qm7x.md new file mode 100644 index 00000000..d649b40d --- /dev/null +++ b/docs/API/datasets/qm7x.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.qm7x diff --git a/docs/API/datasets/qmugs.md b/docs/API/datasets/qmugs.md new file mode 100644 index 00000000..06773b68 --- /dev/null +++ b/docs/API/datasets/qmugs.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.qmugs diff --git a/docs/API/datasets/qmx.md b/docs/API/datasets/qmx.md new file mode 100644 index 00000000..b7343767 --- /dev/null +++ b/docs/API/datasets/qmx.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.qmx diff --git a/docs/API/datasets/revmd17.md b/docs/API/datasets/revmd17.md new file mode 100644 index 00000000..e63ba031 --- /dev/null +++ b/docs/API/datasets/revmd17.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.revmd17 diff --git a/docs/API/datasets/sn2_rxn.md b/docs/API/datasets/sn2_rxn.md new file mode 100644 index 00000000..9095532c --- /dev/null +++ b/docs/API/datasets/sn2_rxn.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.sn2_rxn diff --git a/docs/API/datasets/solvated_peptides.md b/docs/API/datasets/solvated_peptides.md new file mode 100644 index 00000000..a6139c12 --- /dev/null +++ b/docs/API/datasets/solvated_peptides.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.solvated_peptides diff --git a/docs/API/datasets/spice.md b/docs/API/datasets/spice.md new file mode 100644 index 00000000..c0e95b79 --- /dev/null +++ b/docs/API/datasets/spice.md @@ -0,0 +1,2 @@ + +::: openqdc.datasets.potential.spice diff --git a/docs/API/datasets/splinter.md b/docs/API/datasets/splinter.md new file mode 100644 index 00000000..00789cfa --- /dev/null +++ b/docs/API/datasets/splinter.md @@ -0,0 +1 @@ +::: openqdc.datasets.interaction.splinter diff --git a/docs/API/datasets/tmqm.md b/docs/API/datasets/tmqm.md new file mode 100644 index 00000000..70b56781 --- /dev/null +++ b/docs/API/datasets/tmqm.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.tmqm diff --git a/docs/API/datasets/transition1x.md b/docs/API/datasets/transition1x.md new file mode 100644 index 00000000..63eceaa3 --- /dev/null +++ b/docs/API/datasets/transition1x.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.transition1x diff --git a/docs/API/datasets/vqm24.md b/docs/API/datasets/vqm24.md new file mode 100644 index 00000000..ed117b9f --- /dev/null +++ b/docs/API/datasets/vqm24.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.vqm24 diff --git a/docs/API/datasets/waterclusters.md b/docs/API/datasets/waterclusters.md new file mode 100644 index 00000000..f1f90883 --- /dev/null +++ b/docs/API/datasets/waterclusters.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.waterclusters diff --git a/docs/API/datasets/waterclusters3_30.md b/docs/API/datasets/waterclusters3_30.md new file mode 100644 index 00000000..3f0ccf7f --- /dev/null +++ b/docs/API/datasets/waterclusters3_30.md @@ -0,0 +1 @@ +::: 
openqdc.datasets.potential.waterclusters3_30
diff --git a/docs/API/datasets/x40.md b/docs/API/datasets/x40.md
new file mode 100644
index 00000000..799738c5
--- /dev/null
+++ b/docs/API/datasets/x40.md
@@ -0,0 +1 @@
+::: openqdc.datasets.interaction.x40
diff --git a/docs/API/formats.md b/docs/API/formats.md
new file mode 100644
index 00000000..fab98169
--- /dev/null
+++ b/docs/API/formats.md
@@ -0,0 +1 @@
+::: openqdc.datasets.structure
diff --git a/docs/API/methods.md b/docs/API/methods.md
index 7814334e..ce1d94a4 100644
--- a/docs/API/methods.md
+++ b/docs/API/methods.md
@@ -1,3 +1,7 @@
 # QM Methods
-::: openqdc.methods
+::: openqdc.methods.enums
+
+# Isolated Atom Energies
+
+::: openqdc.methods.atom_energies
diff --git a/docs/API/regressor.md b/docs/API/regressor.md
new file mode 100644
index 00000000..dff0ad98
--- /dev/null
+++ b/docs/API/regressor.md
@@ -0,0 +1 @@
+::: openqdc.utils.regressor
diff --git a/docs/API/units.md b/docs/API/units.md
new file mode 100644
index 00000000..0401bdc4
--- /dev/null
+++ b/docs/API/units.md
@@ -0,0 +1,3 @@
+# UNITS
+
+::: openqdc.utils.units
diff --git a/docs/API/utils.md b/docs/API/utils.md
new file mode 100644
index 00000000..35fae5c8
--- /dev/null
+++ b/docs/API/utils.md
@@ -0,0 +1 @@
+::: openqdc.utils
diff --git a/docs/_overrides/main.html b/docs/_overrides/main.html
deleted file mode 100644
index 2eafd76b..00000000
--- a/docs/_overrides/main.html
+++ /dev/null
@@ -1,46 +0,0 @@
-{% extends "base.html" %}
-
-{% block content %}
-{{ super() }}
-
-
-{% endblock content %}
diff --git a/docs/assets/StorageView.png b/docs/assets/StorageView.png
new file mode 100644
index 00000000..8d398926
Binary files /dev/null and b/docs/assets/StorageView.png differ
diff --git a/docs/assets/qdc_logo.png b/docs/assets/qdc_logo.png
new file mode 100644
index 00000000..a8138dcc
Binary files /dev/null and b/docs/assets/qdc_logo.png differ
diff --git a/docs/cli.md b/docs/cli.md
new file mode 100644
index 00000000..8e2cd53b
--- /dev/null
+++ b/docs/cli.md
@@ -0,0 +1,113 @@
+# CLI for dataset downloading and uploading
+You can quickly download, fetch, preprocess and upload openQDC datasets using the command line interface (CLI).
+
+## Datasets
+Print a formatted table of the available openQDC datasets and some information.
+
+Usage:
+
+    openqdc datasets [OPTIONS]
+
+Options:
+
+    --help    Show this message and exit.
+
+## Cache
+Get the current local cache path of openQDC.
+
+Usage:
+
+    openqdc cache [OPTIONS]
+
+Options:
+
+    --help    Show this message and exit.
+
+
+## Download
+Download preprocessed ml-ready datasets from the main openQDC hub.
+
+Usage:
+
+    openqdc download DATASETS... [OPTIONS]
+
+Options:
+
+    --help    Show this message and exit.
+    --overwrite    Whether to force the re-download of the datasets and overwrite the current cached dataset. [default: no-overwrite]
+    --cache-dir    Path to the cache. If not provided, the default cache directory (.cache/openqdc/) will be used. [default: None]
+    --as-zarr    Whether to use a zarr format for the datasets instead of memmap. [default: no-as-zarr]
+    --gs    Which source to use for downloading. If True, Google Storage will be used. Otherwise, AWS S3 will be used. [default: no-gs]
+
+Example:
+
+    openqdc download Spice
+
+## Fetch
+Download the raw dataset files from the main openQDC hub.
+
+Note:
+
+    Special case: if the dataset is "all", "potential" or "interaction", all datasets of that group will be fetched.
+
+Usage:
+
+    openqdc fetch DATASETS... [OPTIONS]
+
+Options:
+
+    --help    Show this message and exit.
+    --overwrite    Whether to overwrite or force the re-download of the raw files. [default: no-overwrite]
+    --cache-dir    Path to the cache. If not provided, the default cache directory (.cache/openqdc/) will be used. [default: None]
+
+Example:
+
+    openqdc fetch Spice
+
+## Preprocess
+Preprocess a raw dataset (previously fetched) into an openqdc dataset and optionally push it to remote.
+
+Usage:
+
+    openqdc preprocess DATASETS... [OPTIONS]
+
+Options:
+
+    --help    Show this message and exit.
+    --overwrite    Whether to overwrite the current cached datasets. [default: overwrite]
+    --upload    Whether to attempt the upload to the remote storage. Must have write permissions. [default: no-upload]
+    --as-zarr    Whether to preprocess as a zarr format or a memmap format. [default: no-as-zarr]
+
+Example:
+
+    openqdc preprocess Spice QMugs
+
+## Upload
+Upload a preprocessed dataset to the remote storage.
+
+Usage:
+
+    openqdc upload DATASETS... [OPTIONS]
+
+Options:
+
+    --help    Show this message and exit.
+    --overwrite    Whether to overwrite the remote files if they are present. [default: overwrite]
+    --as-zarr    Whether to upload the zarr files if available. [default: no-as-zarr]
+
+Example:
+
+    openqdc upload Spice --overwrite
+
+## Convert
+Convert a preprocessed dataset from a memmap dataset to a zarr dataset.
+
+Usage:
+
+    openqdc convert DATASETS... [OPTIONS]
+
+Options:
+
+    --help    Show this message and exit.
+    --overwrite    Whether to overwrite the current zarr cached datasets. [default: no-overwrite]
+    --download    Whether to force the re-download of the memmap datasets. [default: no-download]
diff --git a/docs/contribute.md b/docs/contribute.md
new file mode 100644
index 00000000..e0e22721
--- /dev/null
+++ b/docs/contribute.md
@@ -0,0 +1,59 @@
+# Contribute
+
+The following documents the development lifecycle of OpenQDC.
+
+## Set up a dev environment
+
+```bash
+mamba env create -n openqdc -f env.yml
+mamba activate openqdc
+pip install -e .
+```
+
+## Pre-commit installation
+
+```bash
+pre-commit install
+pre-commit run --all-files
+```
+
+## Continuous Integration
+
+OpenQDC uses Github Actions to:
+
+- **Build and test** `openQDC`.
+  - Multiple combinations of OS and Python versions are tested.
+- **Check** the code:
+  - Formatting with `black`.
+  - Static type check with `mypy`.
+  - Modules import formatting with `isort`.
+  - Pre-commit hooks.
+- **Documentation**:
+  - Google docstring format.
+  - Build and deploy the documentation on `main` and for every new git tag.
+
+
+## Run tests
+
+```bash
+pytest
+```
+
+## Build the documentation
+
+You can build and serve the documentation locally with:
+
+```bash
+# Build and serve the doc
+mike serve
+```
+
+or with
+
+```bash
+mkdocs serve
+```
+
+### Multi-versioning
+
+The doc is built for each push on `main` and for every git tag using [mike](https://github.com/jimporter/mike). Everything is automated using Github Actions. Please refer to the official mike documentation for the details.
diff --git a/docs/css/custom-openqdc.css b/docs/css/custom-openqdc.css
new file mode 100644
index 00000000..a1d97cf7
--- /dev/null
+++ b/docs/css/custom-openqdc.css
@@ -0,0 +1,92 @@
+:root {
+    --openqdc-primary: #201342;
+    --openqdc-secondary: #4A1E7E;
+
+    /* Primary color shades */
+    --md-primary-fg-color: var(--openqdc-primary);
+    --md-primary-fg-color--light: var(--openqdc-primary);
+    --md-primary-fg-color--dark: var(--openqdc-primary);
+    --md-primary-bg-color: var(--openqdc-secondary);
+    --md-primary-bg-color--light: var(--openqdc-secondary);
+    --md-text-link-color: var(--openqdc-secondary);
+
+    /* Accent color shades */
+    --md-accent-fg-color: var(--openqdc-secondary);
+    --md-accent-fg-color--transparent: var(--openqdc-secondary);
+    --md-accent-bg-color: var(--openqdc-secondary);
+    --md-accent-bg-color--light: var(--openqdc-secondary);
+  }
+
+  :root>* {
+    /* Code block color shades */
+    --md-code-bg-color: hsla(0, 0%, 96%, 1);
+    --md-code-fg-color: hsla(200, 18%, 26%, 1);
+
+    /* Footer */
+    --md-footer-bg-color: var(--openqdc-primary);
+    /* --md-footer-bg-color--dark: hsla(0, 0%, 0%, 0.32); */
+    --md-footer-fg-color: var(--openqdc-secondary);
+    --md-footer-fg-color--light: var(--openqdc-secondary);
+    --md-footer-fg-color--lighter: var(--openqdc-secondary);
+
+  }
+
+  .md-header {
+    background-image: linear-gradient(to right, #131036, #4A1E7E);
+  }
+
+  .md-footer {
+    background-image: linear-gradient(to right, #131036, #4A1E7E);
+  }
+
+  .md-tabs {
+    background-image: linear-gradient(to right, #F4F6F9, #b39bce);
+  }
+
+  .md-header__topic {
+    color: rgb(255, 255, 255);
+  }
+
+  .md-source__repository,
+  .md-source__icon,
+  .md-search__input,
+  .md-search__input::placeholder,
+  .md-search__input~.md-search__icon,
+  .md-footer__inner.md-grid,
+  .md-copyright__highlight,
+  .md-copyright,
+  .md-footer-meta.md-typeset a,
+  .md-version {
+    color: rgb(255, 255, 255) !important;
+  }
+
+  .md-search__form {
+    background-color: rgba(255, 255, 255, 0.2);
+  }
+
+  .md-search__input {
+    color: #222222 !important;
+  }
+
+  .md-header__topic {
+    color: rgb(255, 255, 255);
+    font-size: 1.4em;
+  }
+
+  /* Increase the size of the logo */
+  .md-header__button.md-logo img,
+  .md-header__button.md-logo svg {
+    height: 2rem !important;
+  }
+
+  /* Reduce the margin around the logo */
+  .md-header__button.md-logo {
+    margin: 0.4em;
+    padding: 0.4em;
+  }
+
+  /* Remove the `In` and `Out` block in rendered Jupyter notebooks */
+  .md-container .jp-Cell-outputWrapper .jp-OutputPrompt.jp-OutputArea-prompt,
+  .md-container .jp-Cell-inputWrapper .jp-InputPrompt.jp-InputArea-prompt {
+    display: none !important;
+  }
diff --git a/docs/data_storage.md b/docs/data_storage.md
new file mode 100644
index 00000000..b24bec3b
--- /dev/null
+++ b/docs/data_storage.md
@@ -0,0 +1,33 @@
+## Dataset structure
+
+For a dataset with N geometries, M atoms across all geometries, ne energy labels,
+and nf force labels, we use zarr or memory-mapped arrays of various sizes:
+
+- (M, 5) for the atomic numbers (1), charges (1), and positions (3) of individual geometries;
+
+- (N, 2) for the beginning and end indices of each geometry in the previous array;
+
+- (N, ne) for the energy labels of each geometry, extendable to store other geometry-level QM properties such as the HOMO-LUMO gap;
+
+- (M, nf, 3) for the force labels of each geometry, extendable to store other atom-level QM properties.
+
+
+The memory-mapped files efficiently access data stored on disk or in the cloud without reading
+them into memory, enabling training on machines with smaller RAM than the dataset size and
+accommodating concurrent reads in multi-GPU training. This allows for very efficient indexing,
+batching and iteration.
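+
+As an illustration of the layout above, here is a minimal sketch of how one geometry can be
+sliced out of the flat arrays. The array names and shapes are illustrative assumptions for
+this sketch, not the exact files openQDC writes:
+
+```python
+import numpy as np
+
+# Illustrative toy shapes: M atoms in total, N geometries, ne energy labels.
+M, N, ne = 7, 2, 1
+
+# In practice these would be np.memmap / zarr arrays backed by the cached files;
+# plain arrays keep the sketch self-contained.
+atomic_inputs = np.zeros((M, 5), dtype=np.float32)  # atomic number, charge, x, y, z
+position_idx_range = np.array([[0, 3], [3, 7]], dtype=np.int32)  # begin/end row per geometry
+energies = np.zeros((N, ne), dtype=np.float64)  # energy labels per geometry
+
+
+def get_geometry(i):
+    """Recover the i-th geometry as a no-copy view into the flat atom table."""
+    begin, end = position_idx_range[i]
+    return atomic_inputs[begin:end], energies[i]
+
+
+atoms, labels = get_geometry(1)  # (4, 5) view and its energy labels
+```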
+ + +The memory-mapped files efficiently access data stored on disk or in the cloud without reading +them into memory, enabling training on machines with smaller RAM than the dataset size and +accommodating concurrent reads in multi-GPU training. This allows for very efficient indexing, +batching and iteration. + +![](assets/StorageView.png) + + +## Formats + +We currently support the following formats: + +1) Zarr : https://zarr.readthedocs.io/en/stable/index.html + +2) Memmap : https://numpy.org/doc/stable/index.html diff --git a/docs/dataset_upload.md b/docs/dataset_upload.md new file mode 100644 index 00000000..e4740f75 --- /dev/null +++ b/docs/dataset_upload.md @@ -0,0 +1,69 @@ +# How to Add a Dataset to OpenQDC + +Do you think that OpenQDC is missing some important dataset? Do you think your dataset would be a good fit for OpenQDC? +If so, you can contribute to OpenQDC by adding your dataset to the OpenQDC repository in two ways: + +1. Opening a PR to add a new dataset +2. Request a new dataset through Google Form + +## OpenQDC PR Guidelines + +Implement your dataset in the OpenQDC repository by following the guidelines below: + +### Dataset class + +- The dataset class should be implemented in the `openqdc/datasets` directory. +- The dataset class should inherit from the `openqdc.datasets.base.BaseDataset` class. +- Add your `dataset.py` file to the `openqdc/datasets/potential` or `openqdc/datasets/interaction/` directory based on the type of energy. +- Implement the following for your dataset: + - Add the metadata of the dataset: + - Docstrings for the dataset class. Docstrings should report links and references to the dataset. A small description and if possible, the sampling strategy used to generate the dataset. + - `__links__`: Dictionary of name and link to download the dataset. + - `__name__`: Name of the dataset. This will create a folder with the name of the dataset in the cache directory. + - The original units for the dataset `__energy_unit__` and `__distance_unit__`. + - `__force_mask__`: Boolean to indicate if the dataset has forces. Or if multiple forces are present. A list of booleans. + - `__energy_methods__`: List of the `QmMethod` methods present in the dataset. + - `read_raw_entries(self)` -> `List[Dict[str, Any]]`: Preprocess the raw dataset and return a list of dictionaries containing the data. For a better overview of the data format. Look at data storage. This data should have the following keys: + - `atomic_inputs` : Atomic inputs of the molecule. numpy.Float32. + - `name`: Atomic numbers of the atoms in the molecule. numpy.Object. + - `subset`: Positions of the atoms in the molecule. numpy.Object. + - `energies`: Energies of the molecule. numpy.Float64. + - `n_atoms`: Number of atoms in the molecule. numpy.Int32 + - `forces`: Forces of the molecule. [Optional] numpy.Float32. + - Add the dataset import to the `openqdc/datasets//__init__.py` file and to `openqdc/__init__.py`. + +### Test the dataset + +Try to run the openQDC CLI pipeline with the dataset you implemented. + +Run the following command to download the dataset: + +- Fetch the dataset files +```bash +openqdc fetch DATASET_NAME +``` + +- Preprocess the dataset +```bash +openqdc preprocess DATASET_NAME +``` + +- Load it on python and check if the dataset is correctly loaded. +```python +from openqdc import DATASET_NAME +ds=DATASET_NAME() +``` + +If the dataset is correctly loaded, you can open a PR to add the dataset to OpenQDC. + +- Select for your PR the `dataset` label. 
+
+### Test the dataset
+
+Try to run the openQDC CLI pipeline with the dataset you implemented.
+
+Run the following commands to test your implementation:
+
+- Fetch the dataset files
+```bash
+openqdc fetch DATASET_NAME
+```
+
+- Preprocess the dataset
+```bash
+openqdc preprocess DATASET_NAME
+```
+
+- Load it in Python and check if the dataset is correctly loaded.
+```python
+from openqdc import DATASET_NAME
+ds = DATASET_NAME()
+```
+
+If the dataset is correctly loaded, you can open a PR to add the dataset to OpenQDC.
+
+- Select the `dataset` label for your PR.
+
+Our team will review your PR and provide feedback if necessary. If everything is correct, your dataset will be added to the OpenQDC remote storage.
+
+## OpenQDC Google Form
+
+Alternatively, you can ask the OpenQDC main development team to take care of the dataset upload for you.
+You can fill out the Google Form [here](https://docs.google.com/forms/d/e/1FAIpQLSeh0YHRn-OoqPpUbrL7G-EOu3LtZC24rtQWwbjJaZ-2V8P2vQ/viewform?usp=sf_link).
+
+The openQDC team strives to provide high-quality curation and uploads, so please be patient while the team reviews the dataset and carries out the necessary steps to ensure it is uploaded correctly.
diff --git a/docs/index.md b/docs/index.md
index 264211f1..db497b10 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -1,30 +1,65 @@
-# openQDC
+# Overview
 
-Open Quantum Data Commons
+OpenQDC is a Python library to work with quantum datasets. It's a package aimed at providing a simple and efficient way to download, load and utilize various datasets, and it provides a way to standardize the data for easy use in machine learning models.
 
-## Setup Datasets
+- 🐍 Simple pythonic API
+- 🕹️ ML-Ready: all you manipulate are `torch.Tensor`, `jax.Array` or `numpy.Array` objects.
+- ⚛️ Quantum Ready: the quantum methods are checked and standardized to provide additional values.
+- ✅ Standardized: the datasets are written in standard and performant formats with annotated metadata like units and labels.
+- 🧠 Performance matters: read and write multiple formats (memmap, zarr, xyz, etc).
+- 📈 Data: have access to 1.5+ billion datapoints.
 
-Use the scripts in `setup/` to download the datasets. For more information, see the [README](setup/README.md) in the `setup/` directory.
+Visit our website at TOFILL.
+
+## Installation
+
+Use mamba:
 
-# Install the library in dev mode
 ```bash
-# Install the deps
-mamba env create -n qdc -f env.yml
+mamba install -c conda-forge openqdc
+```
 
-# Activate the environment
-mamba activate qdc
+_**Tips:** You can replace `mamba` with `conda`._
 
-# Install the qdc library in dev mode
-pip install -e .
+_**Note:** We highly recommend using a [Conda Python distribution](https://github.com/conda-forge/miniforge) to install OpenQDC. The package is also pip installable if you need it: `pip install openqdc`._
 
-```
+## Quick API Tour
 
-## Development lifecycle
+```python
+from openqdc import Spice
 
-### Tests
+# Load the original dataset
+dataset = Spice()
 
-You can run tests locally with:
+# Load the dataset with different units
+dataset = Spice(
+    energy_unit = "kcal/mol",
+    distance_unit = "ang",
+    energy_type = "formation",
+    array_format = "torch"
+)
 
-```bash
-pytest .
+# Access the data
+data = dataset[0]
+
+# Get relevant statistics
+dataset.get_statistics()
+
+# Get dataset metadata
+dataset.average_n_atoms
+dataset.chemical_species
+dataset.charges
+
+# Compute physical descriptors
+dataset.calculate_descriptors(
+    descriptor_name = "soap"
+)
 ```
+
+## How to cite
+
+Please cite OpenQDC if you use it in your research: [![DOI](zenodo_badge)](zenodo_link).
+
+## Compatibilities
+
+OpenQDC is compatible with Python >= 3.8 and is tested on Linux, MacOS and Windows.
diff --git a/docs/licensing.md b/docs/licensing.md
new file mode 100644
index 00000000..ec5a3857
--- /dev/null
+++ b/docs/licensing.md
@@ -0,0 +1,3 @@
+```
+{!LICENSE!}
+```
diff --git a/docs/normalization_e0s.md b/docs/normalization_e0s.md
new file mode 100644
index 00000000..426e7d0d
--- /dev/null
+++ b/docs/normalization_e0s.md
@@ -0,0 +1,38 @@
+# Overview of QM Methods and Normalization
+
+OpenQDC supports 250+ QM methods and provides a way to standardize and categorize
+the different levels of theory used for Quantum Mechanics Single Point Calculations,
+to add value and information to the datasets.
+
+## Level of Theory
+
+To avoid inconsistencies, levels of theory are standardized and categorized into Python Enums
+consisting of a functional, a basis set, and a correction method.
+OpenQDC covers more than 106 functionals, 20 basis sets, and 11
+correction methods.
+OpenQDC also provides the computed isolated atom energies `e0` for each QM method.
+
+
+## Normalization
+
+We support "physical" and "regression" normalization of energies in order to conserve the
+size extensivity of chemical systems.
+Through this normalization, OpenQDC can transform the potential energy into atomization energy
+by subtracting the isolated atom energies `e0`, a physically interpretable and
+extensivity-conserving normalization method. Alternatively, we precompute the average
+contribution of each atom species to the potential energy via linear or ridge regression,
+centering the distribution at 0 and providing uncertainty estimation for the computed
+values. Predicted atomic energies can also be scaled to approximate a standard normal distribution.
+
+### Physical Normalization
+
+`e0` energies are calculated for each atom in the dataset at the appropriate level of theory and then subtracted from
+the potential energy to obtain the atomization energy. This normalization method is physically interpretable and
+only removes the atom energy contribution from the potential energy.
+
+
+### Regression Normalization
+
+`e0` energies are calculated for each atom in the dataset by fitting a regression model to the potential energy.
+The `e0` energies are then subtracted from the potential energy to obtain the atomization energy. This normalization
+provides uncertainty estimation for the computed values and removes part of the interatomic energy contribution from the potential energy.
+The resulting formation energy is centered at 0.
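+
+As a usage sketch, both normalization schemes can be selected when loading a dataset
+(assuming the Spice dataset is available locally); the `energy_type` and `regressor_kwargs`
+options follow the `BaseDataset` constructor:
+
+```python
+from openqdc import Spice
+
+# Physical normalization: subtract the precomputed isolated atom energies `e0`.
+ds_formation = Spice(energy_type="formation")
+
+# Regression normalization: `e0` contributions fitted per atom species
+# with a linear or ridge solver.
+ds_regression = Spice(
+    energy_type="regression",
+    regressor_kwargs={"solver_type": "ridge", "sub_sample": None, "stride": 1},
+)
+```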
diff --git a/docs/usage.md b/docs/usage.md
new file mode 100644
index 00000000..af62f453
--- /dev/null
+++ b/docs/usage.md
@@ -0,0 +1,42 @@
+# Usage
+
+## How to use
+
+OpenQDC has been designed to be used with a single import:
+
+```python
+import openqdc as qdc
+dataset = qdc.QM9()
+```
+
+All `openQDC` functions are available under `qdc`.
+Or if you want to directly import a specific dataset:
+
+```python
+from openqdc import Spice
+# Spice dataset with distance unit in angstrom instead of bohr
+dataset = Spice(distance_unit="ang",
+    array_format = "jax"
+)
+dataset[0] # dict of jax arrays
+```
+
+Or if you prefer handling `ase.Atoms` objects:
+
+```python
+dataset.get_ase_atoms(0)
+```
+
+## Iterators
+
+OpenQDC provides a simple way to get the data as iterators:
+
+```python
+for data in dataset.as_iter(atoms=True):
+    print(data)  # Atoms object
+    break
+```
+
+## Lazy loading
+
+OpenQDC uses lazy loading to dynamically expose all its API without imposing a long import time during `import openqdc as qdc`. In case of trouble you can always disable lazy loading by setting the environment variable `OPENQDC_DISABLE_LAZY_LOADING` to `1`.
diff --git a/env.yml b/env.yml
index 16ccc3c2..87a9ccac 100644
--- a/env.yml
+++ b/env.yml
@@ -11,10 +11,15 @@ dependencies:
   - gcsfs
   - typer
   - prettytable
+  - s3fs
+  - pydantic
+  - python-dotenv
 
   # Scientific
   - pandas
   - numpy
+  - zarr
 
   # Chem
   - datamol #==0.9.0
@@ -36,6 +41,7 @@ dependencies:
   - ruff
 
   # Doc
+  - mike
   - mkdocs
   - mkdocs-material
   - mkdocs-material-extensions
diff --git a/mkdocs.yml b/mkdocs.yml
index caac43c9..fdb8856a 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -1,39 +1,91 @@
-site_name: "Open Quantum Data Commons (openQDC)"
+site_name: "OpenQDC"
 site_description: "I don't know... Something about data and Quantum stuff I guess :D"
-site_url: "https://github.com/OpenDrugDiscovery/openQDC"
 repo_url: "https://github.com/OpenDrugDiscovery/openQDC"
 repo_name: "openQDC"
 
 copyright: Copyright 2023 Valence Labs
+site_url: "https://github.com/OpenDrugDiscovery/openQDC"
 remote_branch: "privpage"
 use_directory_urls: false
 docs_dir: "docs"
 
+# Fail on warnings to detect issues with types and docstrings
+strict: true
+
 nav:
   - Overview: index.md
+  - Usage:
+    - Base usage : usage.md
+    - CLI: cli.md
   - Available Datasets: datasets.md
-  #- Tutorials:
-  #  #- Really hard example: tutorials/usage.ipynb
+  - QM methods: normalization_e0s.md
+  - Data structure: data_storage.md
+  - Tutorials:
+    - Really hard example: tutorials/usage.ipynb
   - API:
-    - Datasets: API/available_datasets.md
-    - Isolated Atoms Energies: API/isolated_atom_energies.md
+    - QM methods: API/methods.md
+    - Normalization regressor: API/regressor.md
+    - Main class: API/basedataset.md
+    - Format loading: API/formats.md
+    - Datasets:
+      - Potential Energy:
+        - Alchemy : API/datasets/alchemy.md
+        - ANI : API/datasets/ani.md
+        - Spice : API/datasets/spice.md
+        - GEOM : API/datasets/geom.md
+        - Qmugs : API/datasets/qmugs.md
+        - ISO_17 : API/datasets/iso_17.md
+        - Comp6 : API/datasets/comp6.md
+        - GDML : API/datasets/gdml.md
+        - Molecule3D : API/datasets/molecule3d.md
+        - Orbnet Denali : API/datasets/orbnet_denali.md
+        - SN2 RXN : API/datasets/sn2_rxn.md
+        - QM7X : API/datasets/qm7x.md
+        - QM1B : API/datasets/qm1b.md
+        - NablaDFT : API/datasets/nabladft.md
+        - Solvated Peptides : API/datasets/solvated_peptides.md
+        - Waterclusters3_30 : API/datasets/waterclusters3_30.md
+        - SCAN Waterclusters : API/datasets/waterclusters.md
+        - TMQM : API/datasets/tmqm.md
+        - PCQM : API/datasets/pcqm.md
+        - RevMD17 : API/datasets/revmd17.md
+        - MD22 : API/datasets/md22.md
+        - Transition1X : API/datasets/transition1x.md
+        - MultixcQM9 : API/datasets/multixcqm9.md
+        - QMX : API/datasets/qmx.md
+        - Protein Fragments : API/datasets/proteinfragments.md
+        - VQM24 : API/datasets/vqm24.md
+      - Interaction Energy:
+        - DES : API/datasets/des.md
+        - L7 : API/datasets/l7.md
+        - X40 : API/datasets/x40.md
+        - Metcalf : API/datasets/metcalf.md
+        - Splinter : API/datasets/splinter.md
+    - Units: API/units.md
+    - Utils: API/utils.md
+  - Contribute:
+    - Maintaining: contribute.md
+    - Add a dataset: dataset_upload.md
+  - License: licensing.md
+
 
 theme:
   name: material
-  custom_dir: docs/_overrides
-  palette:
-    primary: teal
-    accent: purple
+  #custom_dir: docs/_overrides
   features:
     - navigation.tabs
-    - navigation.expand
+    #- navigation.expand
+  #favicon: assets/qdc_logo.png
+  logo: assets/qdc_logo.png
 
 extra_css:
   - css/custom.css
+  - css/custom-openqdc.css
 
 extra_javascript:
   - javascripts/config.js
  - 
https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js + #- https://unpkg.com/mermaid@10.9.0/dist/mermaid.min.js markdown_extensions: - admonition @@ -53,11 +105,14 @@ markdown_extensions: - toc: permalink: true +watch: + - openqdc/ + plugins: - search - mkdocstrings: - watch: - - openqdc/ + #watch: + # - openqdc/ handlers: python: setup_commands: @@ -69,7 +124,11 @@ plugins: rendering: show_root_heading: yes heading_level: 3 - show_if_no_docstring: true + show_if_no_docstring: false - mkdocs-jupyter: execute: False # kernel_name: python3 + +extra: + version: + provider: mike diff --git a/openqdc/__init__.py b/openqdc/__init__.py index 7e2eb2cf..c6be72d4 100644 --- a/openqdc/__init__.py +++ b/openqdc/__init__.py @@ -15,6 +15,7 @@ def get_project_root(): "__version__": "openqdc._version", "BaseDataset": "openqdc.datasets.base", # POTENTIAL + "Alchemy": "openqdc.datasets.potential.alchemy", "ANI1": "openqdc.datasets.potential.ani", "ANI1CCX": "openqdc.datasets.potential.ani", "ANI1CCX_V2": "openqdc.datasets.potential.ani", @@ -39,6 +40,7 @@ def get_project_root(): "NablaDFT": "openqdc.datasets.potential.nabladft", "SolvatedPeptides": "openqdc.datasets.potential.solvated_peptides", "WaterClusters": "openqdc.datasets.potential.waterclusters3_30", + "SCANWaterClusters": "openqdc.datasets.potential.waterclusters", "TMQM": "openqdc.datasets.potential.tmqm", "PCQM_B3LYP": "openqdc.datasets.potential.pcqm", "PCQM_PM6": "openqdc.datasets.potential.pcqm", @@ -47,6 +49,13 @@ def get_project_root(): "Transition1X": "openqdc.datasets.potential.transition1x", "MultixcQM9": "openqdc.datasets.potential.multixcqm9", "MultixcQM9_V2": "openqdc.datasets.potential.multixcqm9", + "QM7": "openqdc.datasets.potential.qmx", + "QM7b": "openqdc.datasets.potential.qmx", + "QM8": "openqdc.datasets.potential.qmx", + "QM9": "openqdc.datasets.potential.qmx", + "ProteinFragments": "openqdc.datasets.potential.proteinfragments", + "MDDataset": "openqdc.datasets.potential.proteinfragments", + "VQM24": "openqdc.datasets.potential.vqm24", # INTERACTION "DES5M": "openqdc.datasets.interaction.des", "DES370K": "openqdc.datasets.interaction.des", @@ -58,6 +67,7 @@ def get_project_root(): "Splinter": "openqdc.datasets.interaction.splinter", # DEBUG "Dummy": "openqdc.datasets.potential.dummy", + "PredefinedDataset": "openqdc.datasets.potential.dummy", # ALL "AVAILABLE_DATASETS": "openqdc.datasets", "AVAILABLE_POTENTIAL_DATASETS": "openqdc.datasets.potential", @@ -105,9 +115,10 @@ def __dir__(): from .datasets.interaction.x40 import X40 # POTENTIAL + from .datasets.potential.alchemy import Alchemy from .datasets.potential.ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X, ANI2X from .datasets.potential.comp6 import COMP6 - from .datasets.potential.dummy import Dummy + from .datasets.potential.dummy import Dummy, PredefinedDataset from .datasets.potential.gdml import GDML from .datasets.potential.geom import GEOM from .datasets.potential.iso_17 import ISO17 @@ -117,13 +128,17 @@ def __dir__(): from .datasets.potential.nabladft import NablaDFT from .datasets.potential.orbnet_denali import OrbnetDenali from .datasets.potential.pcqm import PCQM_B3LYP, PCQM_PM6 + from .datasets.potential.proteinfragments import MDDataset, ProteinFragments from .datasets.potential.qm1b import QM1B, QM1B_SMALL from .datasets.potential.qm7x import QM7X, QM7X_V2 from .datasets.potential.qmugs import QMugs, QMugs_V2 + from .datasets.potential.qmx import QM7, QM8, QM9, QM7b from .datasets.potential.revmd17 import RevMD17 from .datasets.potential.sn2_rxn import 
SN2RXN
     from .datasets.potential.solvated_peptides import SolvatedPeptides
     from .datasets.potential.spice import Spice, SpiceV2, SpiceVL2
     from .datasets.potential.tmqm import TMQM
     from .datasets.potential.transition1x import Transition1X
+    from .datasets.potential.vqm24 import VQM24
+    from .datasets.potential.waterclusters import SCANWaterClusters
     from .datasets.potential.waterclusters3_30 import WaterClusters
diff --git a/openqdc/cli.py b/openqdc/cli.py
index 1d985090..7b32c9ae 100644
--- a/openqdc/cli.py
+++ b/openqdc/cli.py
@@ -1,3 +1,4 @@
+import os
 from typing import List, Optional
 
 import typer
@@ -12,27 +13,40 @@
     AVAILABLE_INTERACTION_DATASETS,
     AVAILABLE_POTENTIAL_DATASETS,
 )
+from openqdc.utils.io import get_local_cache
 
 app = typer.Typer(help="OpenQDC CLI")
 
 
 def sanitize(dictionary):
+    """
+    Sanitize dataset names to be used in the CLI.
+    """
     return {k.lower().replace("_", "").replace("-", ""): v for k, v in dictionary.items()}
 
 
SANITIZED_AVAILABLE_DATASETS = sanitize(AVAILABLE_DATASETS)
 
 
-def exist_dataset(dataset):
+def exist_dataset(dataset) -> bool:
+    """
+    Check if dataset is available in the openQDC datasets.
+    """
     if dataset not in sanitize(AVAILABLE_DATASETS):
         logger.error(f"{dataset} is not available. Please open an issue on Github for the team to look into it.")
         return False
     return True
 
 
-def format_entry(empty_dataset):
+def format_entry(empty_dataset, max_num_to_display: int = 6):
+    """
+    Format the entry for the table.
+
+    max_num_to_display:
+        Maximum number of energy methods to display, to keep the table format
+        readable for datasets with many energy methods (e.g. MultixcQM9).
+    """
     energy_methods = [str(x) for x in empty_dataset.__energy_methods__]
-    max_num_to_display = 6
+
+    if len(energy_methods) > max_num_to_display:
         entry = ",".join(energy_methods[:max_num_to_display]) + "..."
     else:
@@ -46,7 +60,7 @@ def download(
     overwrite: Annotated[
         bool,
         typer.Option(
-            help="Whether to overwrite or force the re-download of the datasets.",
+            help="Whether to force the re-download of the datasets and overwrite the current cached dataset.",
         ),
     ] = False,
     cache_dir: Annotated[
@@ -55,6 +69,19 @@
         typer.Option(
             help="Path to the cache. If not provided, the default cache directory (.cache/openqdc/) will be used.",
         ),
     ] = None,
+    as_zarr: Annotated[
+        bool,
+        typer.Option(
+            help="Whether to use a zarr format for the datasets instead of memmap.",
+        ),
+    ] = False,
+    gs: Annotated[
+        bool,
+        typer.Option(
+            help="Which source to use for downloading. If True, Google Storage will be used. "
+            + "Otherwise, AWS S3 will be used.",
+        ),
+    ] = False,
 ):
     """
     Download preprocessed ml-ready datasets from the main openQDC hub.
 
     Example:
         openqdc download Spice QMugs
     """
+    if gs:
+        os.environ["OPENQDC_DOWNLOAD_API"] = "gs"
+
     for dataset in list(map(lambda x: x.lower().replace("_", ""), datasets)):
         if exist_dataset(dataset):
-            if SANITIZED_AVAILABLE_DATASETS[dataset].no_init().is_cached() and not overwrite:
+            ds = SANITIZED_AVAILABLE_DATASETS[dataset].no_init()
+            ds.read_as_zarr = as_zarr
+            if ds.is_cached() and not overwrite:
                 logger.info(f"{dataset} is already cached. Skipping download")
             else:
-                SANITIZED_AVAILABLE_DATASETS[dataset](overwrite_local_cache=True, cache_dir=cache_dir)
+                SANITIZED_AVAILABLE_DATASETS[dataset](
+                    overwrite_local_cache=True, cache_dir=cache_dir, read_as_zarr=as_zarr, skip_statistics=True
+                )
 
 
 @app.command()
 def datasets():
     """
-    Print a table of the available openQDC datasets and some informations.
+    Print a formatted table of the available openQDC datasets and some information.
     """
     table = PrettyTable(["Name", "Type of Energy", "Forces", "Level of theory"])
     for dataset in AVAILABLE_DATASETS:
@@ -98,7 +132,7 @@ def fetch(
     overwrite: Annotated[
         bool,
         typer.Option(
-            help="Whether to overwrite or force the re-download of the files.",
+            help="Whether to overwrite or force the re-download of the raw files.",
         ),
     ] = False,
     cache_dir: Annotated[
@@ -109,17 +143,14 @@
         ),
     ] = None,
 ):
     """
-    Download the raw datasets files from the main openQDC hub.
-    overwrite: bool = False,
-        If True, the files will be re-downloaded and overwritten.
-    cache_dir: Optional[str] = None,
-        Path to the cache. If not provided, the default cache directory will be used.
-    Special case: if the dataset is "all", "potential", "interaction".
-        all: all available datasets will be downloaded.
-        potential: all the potential datasets will be downloaded
-        interaction: all the interaction datasets will be downloaded
-    Example:
-        openqdc fetch Spice
+    Download the raw dataset files from the main openQDC hub.\n
+    Special case: if the dataset is "all", "potential", "interaction".\n
+    all: all available datasets will be downloaded.\n
+    potential: all the potential datasets will be downloaded\n
+    interaction: all the interaction datasets will be downloaded\n\n
+
+    Example:\n
+        openqdc fetch Spice
     """
     if datasets[0].lower() == "all":
         dataset_names = list(sanitize(AVAILABLE_DATASETS).keys())
@@ -143,18 +174,27 @@ def preprocess(
     overwrite: Annotated[
         bool,
         typer.Option(
-            help="Whether to overwrite or force the re-download of the datasets.",
+            help="Whether to overwrite the current cached datasets.",
         ),
     ] = True,
     upload: Annotated[
         bool,
         typer.Option(
-            help="Whether to try the upload to the remote storage.",
+            help="Whether to attempt the upload to the remote storage. Must have write permissions.",
+        ),
+    ] = False,
+    as_zarr: Annotated[
+        bool,
+        typer.Option(
+            help="Whether to preprocess as a zarr format or a memmap format.",
         ),
     ] = False,
 ):
     """
     Preprocess a raw dataset (previously fetched) into a openqdc dataset and optionally push it to remote.
+
+    Example:
+        openqdc preprocess Spice QMugs
     """
     for dataset in list(map(lambda x: x.lower().replace("_", ""), datasets)):
         if exist_dataset(dataset):
@@ -166,5 +206,137 @@
             raise e
 
 
+@app.command()
+def upload(
+    datasets: List[str],
+    overwrite: Annotated[
+        bool,
+        typer.Option(
+            help="Whether to overwrite the remote files if they are present.",
+        ),
+    ] = True,
+    as_zarr: Annotated[
+        bool,
+        typer.Option(
+            help="Whether to upload the zarr files if available.",
+        ),
+    ] = False,
+):
+    """
+    Upload a preprocessed dataset to the remote storage.
+
+    Example:
+        openqdc upload Spice --overwrite
+    """
+    for dataset in list(map(lambda x: x.lower().replace("_", ""), datasets)):
+        if exist_dataset(dataset):
+            logger.info(f"Uploading {SANITIZED_AVAILABLE_DATASETS[dataset].__name__}")
+            try:
+                SANITIZED_AVAILABLE_DATASETS[dataset](skip_statistics=True).upload(overwrite=overwrite, as_zarr=as_zarr)
+            except Exception as e:
+                logger.error(f"Error while uploading {dataset}. {e}. 
Did you preprocess the dataset first?") + raise e + + +@app.command() +def convert( + datasets: List[str], + overwrite: Annotated[ + bool, + typer.Option( + help="Whether to overwrite the current zarr cached datasets.", + ), + ] = False, + download: Annotated[ + bool, + typer.Option( + help="Whether to force the re-download of the memmap datasets.", + ), + ] = False, +): + """ + Convert a preprocessed dataset from a memmap dataset to a zarr dataset. + """ + import os + from os.path import join as p_join + + import numpy as np + import zarr + + from openqdc.utils.io import load_pkl + + def silent_remove(filename): + """ + Zarr zip files are currently not overwritable. This function is used to remove the file if it exists. + """ + try: + os.remove(filename) + except OSError: + pass + + for dataset in list(map(lambda x: x.lower().replace("_", ""), datasets)): + if exist_dataset(dataset): + logger.info(f"Converting {SANITIZED_AVAILABLE_DATASETS[dataset].__name__}") + try: + ds = SANITIZED_AVAILABLE_DATASETS[dataset](overwrite_local_cache=download, skip_statistics=True) + # os.makedirs(p_join(ds.root, "zips", ds.__name__), exist_ok=True) + + pkl = load_pkl(p_join(ds.preprocess_path, "props.pkl")) + metadata = p_join(ds.preprocess_path, "metadata.zip") + if overwrite: + silent_remove(metadata) + group = zarr.group(zarr.storage.ZipStore(metadata)) + for key, value in pkl.items(): + # sub=group.create_group(key) + if key in ["name", "subset"]: + data = group.create_dataset(key, shape=value[0].shape, dtype=value[0].dtype) + data[:] = value[0][:] + data2 = group.create_dataset(key + "_ptr", shape=value[1].shape, dtype=np.int32) + data2[:] = value[1][:] + else: + data = group.create_dataset(key, shape=value.shape, dtype=value.dtype) + data[:] = value[:] + + force_attrs = { + "unit": str(ds.force_unit), + "level_of_theory": ds.force_methods, + } + + energy_attrs = {"unit": str(ds.energy_unit), "level_of_theory": ds.energy_methods} + + atomic_inputs_attrs = { + "unit": str(ds.distance_unit), + } + attrs = {"forces": force_attrs, "energies": energy_attrs, "atomic_inputs": atomic_inputs_attrs} + + # os.makedirs(p_join(ds.root, "zips", ds.__name__), exist_ok=True) + for key, value in ds.data.items(): + if key not in ds.data_keys: + continue + print(key, value.shape) + + zarr_path = p_join(ds.preprocess_path, key + ".zip") # ds.__name__, + if overwrite: + silent_remove(zarr_path) + z = zarr.open( + zarr.storage.ZipStore(zarr_path), "w", zarr_version=2, shape=value.shape, dtype=value.dtype + ) + z[:] = value[:] + if key in attrs: + z.attrs.update(attrs[key]) + + except Exception as e: + logger.error(f"Error while converting {dataset}. {e}. 
Did you preprocess the dataset first?") + raise e + + +@app.command() +def cache(): + """ + Get the current local cache path of openQDC + """ + print(f"openQDC local cache:\n {get_local_cache()}") + + if __name__ == "__main__": app() diff --git a/openqdc/datasets/base.py b/openqdc/datasets/base.py index 026bfd75..8a480125 100644 --- a/openqdc/datasets/base.py +++ b/openqdc/datasets/base.py @@ -1,13 +1,18 @@ """The BaseDataset defining shared functionality between all datasets.""" import os -import pickle as pkl + +try: + from collections.abc import Iterable +except ImportError: + from collections import Iterable from functools import partial from itertools import compress from os.path import join as p_join from typing import Callable, Dict, List, Optional, Union import numpy as np +from ase import Atoms from ase.io.extxyz import write_extxyz from loguru import logger from sklearn.utils import Bunch @@ -22,6 +27,7 @@ StatisticManager, TotalEnergyStats, ) +from openqdc.datasets.structure import MemMapDataset, ZarrDataset from openqdc.utils.constants import MAX_CHARGE, NB_ATOMIC_FEATURES from openqdc.utils.descriptors import get_descriptor from openqdc.utils.exceptions import ( @@ -32,7 +38,6 @@ copy_exists, dict_to_atoms, get_local_cache, - pull_locally, push_remote, set_cache_dir, ) @@ -76,6 +81,7 @@ class BaseDataset(DatasetPropertyMixIn): energy_target_names = [] force_target_names = [] + read_as_zarr = False __energy_methods__ = [] __force_mask__ = [] __isolated_atom_energies__ = [] @@ -99,7 +105,9 @@ def __init__( cache_dir: Optional[str] = None, recompute_statistics: bool = False, transform: Optional[Callable] = None, - regressor_kwargs={ + skip_statistics: bool = False, + read_as_zarr: bool = False, + regressor_kwargs: Dict = { "solver_type": "linear", "sub_sample": None, "stride": 1, @@ -107,29 +115,28 @@ def __init__( ) -> None: """ - Parameters - ---------- - energy_unit - Energy unit to convert dataset to. Supported units: ["kcal/mol", "kj/mol", "hartree", "ev"] - distance_unit - Distance unit to convert dataset to. Supported units: ["ang", "nm", "bohr"] - array_format - Format to return arrays in. Supported formats: ["numpy", "torch", "jax"] - energy_type - Type of isolated atom energy to use for the dataset. Default: "formation" - Supported types: ["formation", "regression", "null", None] - overwrite_local_cache - Whether to overwrite the locally cached dataset. - cache_dir - Cache directory location. Defaults to "~/.cache/openqdc" - recompute_statistics - Whether to recompute the statistics of the dataset. - transform, optional - transformation to apply to the __getitem__ calls - regressor_kwargs - Dictionary of keyword arguments to pass to the regressor. - Default: {"solver_type": "linear", "sub_sample": None, "stride": 1} - solver_type can be one of ["linear", "ridge"] + Parameters: + energy_unit: + Energy unit to convert dataset to. Supported units: ["kcal/mol", "kj/mol", "hartree", "ev"] + distance_unit: + Distance unit to convert dataset to. Supported units: ["ang", "nm", "bohr"] + array_format: + Format to return arrays in. Supported formats: ["numpy", "torch", "jax"] + energy_type: + Type of isolated atom energy to use for the dataset. Default: "formation" + Supported types: ["formation", "regression", "null", None] + overwrite_local_cache: + Whether to overwrite the locally cached dataset. + cache_dir: + Cache directory location. Defaults to "~/.cache/openqdc" + recompute_statistics: + Whether to recompute the statistics of the dataset. 
+ transform: + transformation to apply to the __getitem__ calls + regressor_kwargs: + Dictionary of keyword arguments to pass to the regressor. + Default: {"solver_type": "linear", "sub_sample": None, "stride": 1} + solver_type can be one of ["linear", "ridge"] """ set_cache_dir(cache_dir) # self._init_lambda_fn() @@ -138,8 +145,10 @@ def __init__( self.recompute_statistics = recompute_statistics self.regressor_kwargs = regressor_kwargs self.transform = transform + self.read_as_zarr = read_as_zarr self.energy_type = energy_type if energy_type is not None else "null" self.refit_e0s = recompute_statistics or overwrite_local_cache + self.skip_statistics = skip_statistics if not self.is_preprocessed(): raise DatasetNotAvailableError(self.__name__) else: @@ -152,6 +161,12 @@ def _init_lambda_fn(self): self._fn_distance = lambda x: x self._fn_forces = lambda x: x + @property + def dataset_wrapper(self): + if not hasattr(self, "_dataset_wrapper"): + self._dataset_wrapper = ZarrDataset() if self.read_as_zarr else MemMapDataset() + return self._dataset_wrapper + @property def config(self): assert len(self.__links__) > 0, "No links provided for fetching" @@ -171,7 +186,8 @@ def _post_init( ) -> None: self._set_units(None, None) self._set_isolated_atom_energies() - self._precompute_statistics(overwrite_local_cache=overwrite_local_cache) + if not self.skip_statistics: + self._precompute_statistics(overwrite_local_cache=overwrite_local_cache) self._set_units(energy_unit, distance_unit) self._convert_data() self._set_isolated_atom_energies() @@ -331,6 +347,10 @@ def convert_forces(self, x): def set_energy_unit(self, value: str): """ Set a new energy unit for the dataset. + + Parameters: + value: + New energy unit to set. """ # old_unit = self.energy_unit # self.__energy_unit__ = value @@ -340,6 +360,10 @@ def set_energy_unit(self, value: str): def set_distance_unit(self, value: str): """ Set a new distance unit for the dataset. + + Parameters: + value: + New distance unit to set. """ # old_unit = self.distance_unit # self.__distance_unit__ = value @@ -351,9 +375,22 @@ def set_array_format(self, format: str): self.array_format = format def read_raw_entries(self): + """ + Preprocess the raw (aka from the fetched source) into a list of dictionaries. + """ raise NotImplementedError - def collate_list(self, list_entries): + def collate_list(self, list_entries: List[Dict]) -> Dict: + """ + Collate a list of entries into a single dictionary. + + Parameters: + list_entries: + List of dictionaries containing the entries to collate. + + Returns: + Dictionary containing the collated entries. + """ # concatenate entries res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) for key in list_entries[0]} @@ -364,54 +401,29 @@ def collate_list(self, list_entries): return res - def save_preprocess(self, data_dict, upload=False, overwrite=True): + def save_preprocess( + self, data_dict: Dict[str, np.ndarray], upload: bool = False, overwrite: bool = True, as_zarr: bool = False + ): """ Save the preprocessed data to the cache directory and optionally upload it to the remote storage. - data_dict : dict - Dictionary containing the preprocessed data. - upload : bool, Defult: False - Whether to upload the preprocessed data to the remote storage or only saving it locally. - overwrite : bool, Default: False - Whether to overwrite the preprocessed data if it already exists. - Only used if upload is True. Cache is always overwritten locally. 
+
+        Parameters:
+            data_dict:
+                Dictionary containing the preprocessed data.
+            upload:
+                Whether to upload the preprocessed data to the remote storage or only save it locally.
+            overwrite:
+                Whether to overwrite the preprocessed data if it already exists.
+                Only used if upload is True. Cache is always overwritten locally.
+            as_zarr:
+                Whether to save the data in zarr format instead of memmap.
         """
         # save memmaps
         logger.info("Preprocessing data and saving it to cache.")
-        for key in self.data_keys:
-            local_path = p_join(self.preprocess_path, f"{key}.mmap")
-            out = np.memmap(local_path, mode="w+", dtype=data_dict[key].dtype, shape=data_dict[key].shape)
-            out[:] = data_dict.pop(key)[:]
-            out.flush()
-            if upload:
-                push_remote(local_path, overwrite=overwrite)
-
-        # save smiles and subset
-        local_path = p_join(self.preprocess_path, "props.pkl")
-
-        # assert that (required) pkl keys are present in data_dict
-        assert all([key in data_dict.keys() for key in self.pkl_data_keys])
-
-        # store unique and inverse indices for str-based pkl keys
-        for key in self.pkl_data_keys:
-            if self.pkl_data_types[key] == str:
-                data_dict[key] = np.unique(data_dict[key], return_inverse=True)
-
-        with open(local_path, "wb") as f:
-            pkl.dump(data_dict, f)
+        paths = self.dataset_wrapper.save_preprocess(
+            self.preprocess_path, self.data_keys, data_dict, self.pkl_data_keys, self.pkl_data_types
+        )
         if upload:
-            push_remote(local_path, overwrite=overwrite)
-
-    def _convert_on_loading(self, x, key):
-        if key == "energies":
-            return self.convert_energy(x)
-        elif key == "forces":
-            return self.convert_forces(x)
-        elif key == "atomic_inputs":
-            x = np.array(x, dtype=np.float32)
-            x[:, -3:] = self.convert_distance(x[:, -3:])
-            return x
-        else:
-            return x
+            for local_path in paths:
+                push_remote(local_path, overwrite=overwrite)  # make it async?
 
     def read_preprocess(self, overwrite_local_cache=False):
         logger.info("Reading preprocessed data.")
@@ -421,62 +433,106 @@
             Distance: {self.distance_unit},\n\
             Forces: {self.force_unit if self.force_methods else 'None'}"
         )
-        self.data = {}
-        for key in self.data_keys:
-            filename = p_join(self.preprocess_path, f"{key}.mmap")
-            pull_locally(filename, overwrite=overwrite_local_cache)
-            self.data[key] = np.memmap(filename, mode="r", dtype=self.data_types[key]).reshape(*self.data_shapes[key])
-
-        filename = p_join(self.preprocess_path, "props.pkl")
-        pull_locally(filename, overwrite=overwrite_local_cache)
-        with open(filename, "rb") as f:
-            tmp = pkl.load(f)
-            all_pkl_keys = set(tmp.keys()) - set(self.data_keys)
-            # assert required pkl_keys are present in all_pkl_keys
-            assert all([key in all_pkl_keys for key in self.pkl_data_keys])
-            for key in all_pkl_keys:
-                x = tmp.pop(key)
-                if len(x) == 2:
-                    self.data[key] = x[0][x[1]]
-                else:
-                    self.data[key] = x
+        self.data = self.dataset_wrapper.load_data(
+            self.preprocess_path,
+            self.data_keys,
+            self.data_types,
+            self.data_shapes,
+            self.pkl_data_keys,
+            overwrite_local_cache,
+        )  # this should be async if possible
         for key in self.data:
             logger.info(f"Loaded {key} with shape {self.data[key].shape}, dtype {self.data[key].dtype}")
 
-    def is_preprocessed(self):
+    def _convert_on_loading(self, x, key):
+        if key == "energies":
+            return self.convert_energy(x)
+        elif key == "forces":
+            return self.convert_forces(x)
+        elif key == "atomic_inputs":
+            x = np.array(x, dtype=np.float32)
+            x[:, -3:] = self.convert_distance(x[:, -3:])
+            return x
+        else:
+            return x
+
+    def is_preprocessed(self) -> bool:
         """
         Check if the dataset is preprocessed and available online or locally.
+
+        Returns:
+            True if the dataset is available remotely or locally, False otherwise.
         """
-        predicats = [copy_exists(p_join(self.preprocess_path, f"{key}.mmap")) for key in self.data_keys]
-        predicats += [copy_exists(p_join(self.preprocess_path, "props.pkl"))]
+        predicats = [
+            copy_exists(p_join(self.preprocess_path, self.dataset_wrapper.add_extension(f"{key}")))
+            for key in self.data_keys
+        ]
+        predicats += [copy_exists(p_join(self.preprocess_path, file)) for file in self.dataset_wrapper._extra_files]
         return all(predicats)
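The inline memmap handling removed above now lives behind the `MemMapDataset`/`ZarrDataset` wrappers. For readers new to the on-disk format, a minimal sketch of the same memmap round-trip, with an illustrative file name and shape:

```python
import numpy as np

# Write: flatten to disk exactly like the removed save_preprocess code did.
energies = np.arange(6, dtype=np.float64).reshape(3, 2)
out = np.memmap("energies.mmap", mode="w+", dtype=energies.dtype, shape=energies.shape)
out[:] = energies[:]
out.flush()

# Read: open read-only and reshape from the recorded shape/dtype metadata,
# mirroring what read_preprocess now delegates to the wrapper.
loaded = np.memmap("energies.mmap", mode="r", dtype=np.float64).reshape(3, 2)
assert np.allclose(energies, loaded)
```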
-    def is_cached(self):
+    def is_cached(self) -> bool:
         """
         Check if the dataset is cached locally.
+
+        Returns:
+            True if the dataset is cached locally, False otherwise.
         """
-        predicats = [os.path.exists(p_join(self.preprocess_path, f"{key}.mmap")) for key in self.data_keys]
-        predicats += [os.path.exists(p_join(self.preprocess_path, "props.pkl"))]
+        predicats = [
+            os.path.exists(p_join(self.preprocess_path, self.dataset_wrapper.add_extension(f"{key}")))
+            for key in self.data_keys
+        ]
+        predicats += [os.path.exists(p_join(self.preprocess_path, file)) for file in self.dataset_wrapper._extra_files]
         return all(predicats)
 
-    def preprocess(self, upload: bool = False, overwrite: bool = True):
+    def preprocess(self, upload: bool = False, overwrite: bool = True, as_zarr: bool = True):
         """
         Preprocess the dataset and save it.
-        upload : bool, Defult: False
-            Whether to upload the preprocessed data to the remote storage or only saving it locally.
-        overwrite : bool, Default: False
-            Whether to overwrite the preprocessed data if it already exists.
-            Only used if upload is True. Cache is always overwritten locally.
+
+        Parameters:
+            upload:
+                Whether to upload the preprocessed data to the remote storage or only save it locally.
+            overwrite:
+                Whether to overwrite the preprocessed data if it already exists.
+                Only used if upload is True. Cache is always overwritten locally.
+            as_zarr:
+                Whether to save the data as zarr files
         """
         if overwrite or not self.is_preprocessed():
             entries = self.read_raw_entries()
             res = self.collate_list(entries)
-            self.save_preprocess(res, upload, overwrite)
+            self.save_preprocess(res, upload, overwrite, as_zarr)
+
+    def upload(self, overwrite: bool = False, as_zarr: bool = False):
+        """
+        Upload the preprocessed data to the remote storage. Must be called after preprocess and
+        requires write privileges.
+
+        Parameters:
+            overwrite:
+                Whether to overwrite the remote data if it already exists
+            as_zarr:
+                Whether to upload the data as zarr files
+        """
+        for key in self.data_keys:
+            local_path = p_join(self.preprocess_path, f"{key}.mmap" if not as_zarr else f"{key}.zip")
+            push_remote(local_path, overwrite=overwrite)
+        local_path = p_join(self.preprocess_path, "props.pkl" if not as_zarr else "metadata.zip")
+        push_remote(local_path, overwrite=overwrite)
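`upload` assumes `preprocess` already produced the local files. The intended sequence is roughly the following sketch (assuming the raw data was fetched beforehand and you have write access to the remote storage; `no_init` is used as in the CLI commands above):

```python
from openqdc.datasets import Spice  # any registered dataset behaves the same

ds = Spice.no_init()  # skip __init__ checks, as the CLI does before preprocessing
ds.preprocess(upload=False, overwrite=True, as_zarr=True)  # write zarr zips locally
ds.upload(overwrite=True, as_zarr=True)  # push {key}.zip and metadata.zip to remote
```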
-    def save_xyz(self, idx: int, energy_method: int = 0, path: Optional[str] = None, ext=True):
+    def save_xyz(self, idx: int, energy_method: int = 0, path: Optional[str] = None, ext: bool = True):
         """
-        Save the entry at index idx as an extxyz file.
+        Save a single entry at index idx as an extxyz file.
+
+        Parameters:
+            idx:
+                Index of the entry
+            energy_method:
+                Index of the energy method to use
+            path:
+                Path to save the xyz file. If None, the current working directory is used.
+            ext:
+                Whether to include additional information like forces and other metadata (extxyz format)
         """
         if path is None:
             path = os.getcwd()
@@ -486,6 +542,12 @@
     def to_xyz(self, energy_method: int = 0, path: Optional[str] = None):
         """
         Save dataset as single xyz file (extended xyz format).
+
+        Parameters:
+            energy_method:
+                Index of the energy method to use
+            path:
+                Path to save the xyz file
         """
         with open(p_join(path if path else os.getcwd(), f"{self.__name__}.xyz"), "w") as f:
             for atoms in tqdm(
@@ -495,16 +557,20 @@
             ):
                 write_extxyz(f, atoms, append=True)
 
-    def get_ase_atoms(self, idx: int, energy_method: int = 0, ext=True):
+    def get_ase_atoms(self, idx: int, energy_method: int = 0, ext: bool = True) -> Atoms:
         """
         Get the ASE atoms object for the entry at index idx.
 
-        Parameters
-        ----------
-        idx : int
-            Index of the entry.
-        ext : bool, optional
-            Whether to include additional informations
+        Parameters:
+            idx:
+                Index of the entry.
+            energy_method:
+                Index of the energy method to use
+            ext:
+                Whether to include additional information
+
+        Returns:
+            ASE atoms object
         """
         entry = self[idx]
         at = dict_to_atoms(entry, ext=ext, energy_method=energy_method)
@@ -537,24 +603,23 @@
         """
         Compute the descriptors for the dataset.
 
-        Parameters
-        ----------
-        descriptor_name : str
-            Name of the descriptor to use. Supported descriptors are ["soap"]
-        chemical_species : Optional[List[str]], optional
-            List of chemical species to use for the descriptor computation, by default None.
-            If None, the chemical species of the dataset are used.
-        n_samples : Optional[Union[List[int],int, float]], optional
-            Number of samples to use for the computation, by default None. If None, all the dataset is used.
-            If a list of integers is provided, the descriptors are computed for each of the specified idx of samples.
-        progress : bool, optional
-            Whether to show a progress bar, by default True.
-        **descriptor_kwargs : dict
-            Keyword arguments to pass to the descriptor instantiation of the model.
-
-        Returns
-        -------
-        Dict[str, np.ndarray]
+        Parameters:
+            descriptor_name:
+                Name of the descriptor to use. Supported descriptors are ["soap"]
+            chemical_species:
+                List of chemical species to use for the descriptor computation, by default None.
+                If None, the chemical species of the dataset are used.
+            n_samples:
+                Number of samples to use for the computation, by default None.
+                If None, all the dataset is used.
+                If a list of integers is provided, the descriptors are computed for
+                each of the specified idx of samples.
+            progress:
+                Whether to show a progress bar, by default True.
+            **descriptor_kwargs:
+                Keyword arguments to pass to the descriptor instantiation of the model.
+
+        Returns:
             Dictionary containing the following keys:
                 - values : np.ndarray of shape (N, M) containing the descriptors for the dataset
                 - idxs : np.ndarray of shape (N,) containing the indices of the samples used
@@ -577,14 +642,18 @@
             datum["idxs"] = idxs
         return datum
 
-    def as_iter(self, atoms: bool = False, energy_method: int = 0):
+    def as_iter(self, atoms: bool = False, energy_method: int = 0) -> Iterable:
         """
         Return the dataset as an iterator.
- Parameters - ---------- - atoms : bool, optional - Whether to return the items as ASE atoms object, by default False + Parameters: + atoms: + Whether to return the items as ASE atoms object, by default False + energy_method: + Index of the energy method to use + + Returns: + Iterator of the dataset """ func = partial(self.get_ase_atoms, energy_method=energy_method) if atoms else self.__getitem__ @@ -592,12 +661,21 @@ def as_iter(self, atoms: bool = False, energy_method: int = 0): for i in range(len(self)): yield func(i) - def get_statistics(self, return_none: bool = True): + def __iter__(self): + for idxs in range(len(self)): + yield self[idxs] + + def get_statistics(self, return_none: bool = True) -> Dict: """ Get the converted statistics of the dataset. - return_none : bool, optional - Whether to return None if the statistics for the forces are not available, by default True - Otherwise, the statistics for the forces are set to 0.0 + + Parameters: + return_none : + Whether to return None if the statistics for the forces are not available, by default True + Otherwise, the statistics for the forces are set to 0.0 + + Returns: + Dictionary containing the statistics of the dataset """ selected_stats = self.statistics.get_results() if len(selected_stats) == 0: diff --git a/openqdc/datasets/interaction/des.py b/openqdc/datasets/interaction/des.py index d90be07e..6788c41c 100644 --- a/openqdc/datasets/interaction/des.py +++ b/openqdc/datasets/interaction/des.py @@ -74,13 +74,19 @@ def _create_subsets(self, **kwargs): class DES370K(BaseInteractionDataset, IDES): """ - DE Shaw Research interaction energy of over 370K - small molecule dimers as described in the paper: - - Quantum chemical benchmark databases of gold-standard dimer interaction energies. - Donchev, A.G., Taube, A.G., Decolvenaere, E. et al. - Sci Data 8, 55 (2021). - https://doi.org/10.1038/s41597-021-00833-x + DE Shaw 370K (DES370K) is a dataset of 3,691 distinct dimers with 370K unique geometries with interaction energies + computed at CCSD(T)/CBS level of theory. It consists of 392 closed-shell chemical species (both neutral molecules + and ions) including water and functional groups found in proteins. Dimer geometries are generated using + QM-based optimization with DF-LMP2/aVDZ level of theory and MD-based from condensed phase MD simulations. + + Usage: + ```python + from openqdc.datasets import DES370K + dataset = DES370K() + ``` + + Reference: + https://www.nature.com/articles/s41597-021-00833-x """ __name__ = "des370k_interaction" @@ -173,13 +179,18 @@ def read_raw_entries(self) -> List[Dict]: class DES5M(DES370K): """ - DE Shaw Research interaction energy calculations for - over 5M small molecule dimers as described in the paper: - - Quantum chemical benchmark databases of gold-standard dimer interaction energies. - Donchev, A.G., Taube, A.G., Decolvenaere, E. et al. - Sci Data 8, 55 (2021). - https://doi.org/10.1038/s41597-021-00833-x + DE Shaw 5M (DES5M) is a dataset of 3,691 distinct dimers with 5,000,000 unique geometries with interaction energies + computed using SNS-MP2, a machine learning approach. The unique geometries are generated similar to DES370K using + QM based optimization and MD simulations. 
+
+    Usage:
+    ```python
+    from openqdc.datasets import DES5M
+    dataset = DES5M()
+    ```
+
+    Reference:
+        https://www.nature.com/articles/s41597-021-00833-x
     """
 
     __name__ = "des5m_interaction"
@@ -242,18 +253,19 @@ class DES5M(DES370K):
 
 
 class DESS66(DES370K):
     """
-    DE Shaw Research interaction energy
-    estimates of all 66 conformers from
-    the original S66 dataset as described
-    in the paper:
-
-    Quantum chemical benchmark databases of gold-standard dimer interaction energies.
-    Donchev, A.G., Taube, A.G., Decolvenaere, E. et al.
-    Sci Data 8, 55 (2021).
-    https://doi.org/10.1038/s41597-021-00833-x
-
-    Data was downloaded from Zenodo:
-    https://zenodo.org/records/5676284
+    DESS66 is a dataset consisting of 66 molecular complexes from the S66 dataset with CCSD(T)/CBS
+    dimer interaction energies with 1 equilibrium geometry each, giving 66 conformers in total.
+    The protocol for estimating energies is based on the DES370K paper.
+
+    Usage:
+    ```python
+    from openqdc.datasets import DESS66
+    dataset = DESS66()
+    ```
+
+    Reference:
+        https://www.nature.com/articles/s41597-021-00833-x\n
+        S66: https://pubs.acs.org/doi/10.1021/ct2002946
     """
 
     __name__ = "des_s66"
@@ -266,19 +278,18 @@ def _create_subsets(self, **kwargs):
 
 
 class DESS66x8(DESS66):
     """
-    DE Shaw Research interaction energy
-    estimates of all 528 conformers from
-    the original S66x8 dataset as described
-    in the paper:
-
-    Quantum chemical benchmark databases of gold-standard dimer interaction energies.
-    Donchev, A.G., Taube, A.G., Decolvenaere, E. et al.
-    Sci Data 8, 55 (2021).
-    https://doi.org/10.1038/s41597-021-00833-x
-
-    Data was downloaded from Zenodo:
-
-    https://zenodo.org/records/5676284
+    DESS66x8 is a dataset consisting of 66 molecular complexes from the S66 dataset with CCSD(T)/CBS
+    dimer interaction energies, with 8 geometries along the dissociation curve for each complex
+    (including the equilibrium geometry), giving 528 conformers in total. The protocol for estimating
+    energies is based on the DES370K paper.
+
+    Usage:
+    ```python
+    from openqdc.datasets import DESS66x8
+    dataset = DESS66x8()
+    ```
+
+    Reference:
+        https://www.nature.com/articles/s41597-021-00833-x
     """
 
     __name__ = "des_s66x8"
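To make the rewritten docstrings concrete, loading one of the DES datasets and inspecting a single entry looks roughly like this (a sketch; `get_ase_atoms` is documented further up in this diff):

```python
from openqdc.datasets import DES370K

ds = DES370K(energy_unit="kcal/mol")
atoms = ds.get_ase_atoms(0)  # first dimer as an ase.Atoms object
print(atoms.get_chemical_formula())
```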
diff --git a/openqdc/datasets/interaction/l7.py b/openqdc/datasets/interaction/l7.py
index 75a63cd5..7307638c 100644
--- a/openqdc/datasets/interaction/l7.py
+++ b/openqdc/datasets/interaction/l7.py
@@ -7,15 +7,18 @@
 
 class L7(YamlDataset):
     """
-    The L7 interaction energy dataset as described in:
-
-    Accuracy of Quantum Chemical Methods for Large Noncovalent Complexes
-    Robert Sedlak, Tomasz Janowski, Michal Pitoňák, Jan Řezáč, Peter Pulay, and Pavel Hobza
-    Journal of Chemical Theory and Computation 2013 9 (8), 3364-3374
-    DOI: 10.1021/ct400036b
-
-    Data was downloaded and extracted from:
-    http://cuby4.molecular.cz/dataset_l7.html
+    The L7 interaction energy dataset consists of 7 dispersion stabilized non-covalent complexes with
+    energies labelled using semi-empirical and quantum mechanical methods. The initial geometries are
+    taken from crystal X-ray data and optimized with a DFT method specific to the complex.
+
+    Usage:
+    ```python
+    from openqdc.datasets import L7
+    dataset = L7()
+    ```
+
+    Reference:
+        https://pubs.acs.org/doi/10.1021/ct400036b
     """
 
     __name__ = "l7"
diff --git a/openqdc/datasets/interaction/metcalf.py b/openqdc/datasets/interaction/metcalf.py
index faf5324f..889370e8 100644
--- a/openqdc/datasets/interaction/metcalf.py
+++ b/openqdc/datasets/interaction/metcalf.py
@@ -84,20 +84,19 @@ def read_xyz(fname, subset):
 
 class Metcalf(BaseInteractionDataset):
     """
-    Hydrogen-bonded dimers of NMA with 126 molecules as described in:
-
-    Approaches for machine learning intermolecular interaction energies and
-    application to energy components from symmetry adapted perturbation theory.
-    Derek P. Metcalf, Alexios Koutsoukas, Steven A. Spronk, Brian L. Claus,
-    Deborah A. Loughney, Stephen R. Johnson, Daniel L. Cheney, C. David Sherrill;
-    J. Chem. Phys. 21 February 2020; 152 (7): 074103.
-    https://doi.org/10.1063/1.5142636
-
-    Further details:
-    "Hydrogen-bonded dimers involving N-methylacetamide (NMA) and 126 molecules
-    (46 donors and 80 acceptors; Figs. 2 and 3) were used. Optimized geometries
-    for the 126 individual monomers were obtained and paired with NMA in broad
-    arrays of spatial configurations to generate thousands of complexes for training.
+    Metcalf is a dataset consisting of 126 hydrogen-bonded dimers involving N-methylacetamide (NMA) with 14,744 to
+    156,704 geometries/configurations for each complex. The geometries are optimized using the RI-MP2 method and
+    the cc-pVTZ basis set. SAPT(0) calculations are performed to compute interaction energies and their various
+    components.
+
+    Usage:
+    ```python
+    from openqdc.datasets import Metcalf
+    dataset = Metcalf()
+    ```
+
+    Reference:
+        https://doi.org/10.1063/1.5142636
     """
 
     __name__ = "metcalf"
diff --git a/openqdc/datasets/interaction/splinter.py b/openqdc/datasets/interaction/splinter.py
index bda10129..6ba3b4d5 100644
--- a/openqdc/datasets/interaction/splinter.py
+++ b/openqdc/datasets/interaction/splinter.py
@@ -12,13 +12,18 @@
 
 class Splinter(BaseInteractionDataset):
     """
-    A dataset of over 1.7 million protein-ligand
-    interactions as described in the paper:
+    Splinter consists of 30,416 dimer pairs with over 1.5 million geometries. The geometries are generated
+    by quantum mechanical optimization with the B3LYP-D3/aug-cc-pV(D+d)Z level of theory. The interaction energies
+    and their various components are computed using the SAPT0/aug-cc-pV(D+d)Z method.
 
-    A quantum chemical interaction energy dataset for accurately modeling protein-ligand interactions.
-    Spronk, S.A., Glick, Z.L., Metcalf, D.P. et al.
-    Sci Data 10, 619 (2023).
-    https://doi.org/10.1038/s41597-023-02443-1
+    Usage:
+    ```python
+    from openqdc.datasets import Splinter
+    dataset = Splinter()
+    ```
+
+    Reference:
+        https://doi.org/10.1038/s41597-023-02443-1
     """
 
     __energy_unit__ = "kcal/mol"
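Interaction datasets can also be streamed entry by entry; a hedged sketch using `as_iter`, which this PR documents on `BaseDataset`:

```python
from openqdc.datasets import Splinter

ds = Splinter()
it = ds.as_iter(atoms=True, energy_method=0)  # generator of ase.Atoms objects
first = next(it)
print(first.info)  # extxyz-style metadata attached when building the Atoms object
```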
diff --git a/openqdc/datasets/interaction/x40.py b/openqdc/datasets/interaction/x40.py
index 64da5d87..d56a976d 100644
--- a/openqdc/datasets/interaction/x40.py
+++ b/openqdc/datasets/interaction/x40.py
@@ -8,16 +8,21 @@
 
 class X40(YamlDataset):
     """
-    X40 interaction dataset of 40 dimer pairs as
-    introduced in the following paper:
-
-    Benchmark Calculations of Noncovalent Interactions of Halogenated Molecules
-    Jan Řezáč, Kevin E. Riley, and Pavel Hobza
-    Journal of Chemical Theory and Computation 2012 8 (11), 4285-4292
-    DOI: 10.1021/ct300647k
-
-    Dataset retrieved and processed from:
-    http://cuby4.molecular.cz/dataset_x40.html
+    X40 interaction dataset of 40 noncovalent complexes of organic halides, halohydrides, and halogen molecules
+    where the halogens participate in various interaction types such as electrostatic interactions, London
+    dispersion, hydrogen bonds, halogen bonding, halogen-pi interactions and stacking of halogenated aromatic
+    molecules. For each complex 10 geometries are generated, resulting in 400 geometries in the dataset. The
+    geometries are optimized using the MP2 level of theory with the cc-pVTZ basis set, whereas the interaction
+    energies are computed at the CCSD(T)/CBS level of theory.
+
+    Usage:
+    ```python
+    from openqdc.datasets import X40
+    dataset = X40()
+    ```
+
+    Reference:
+        https://pubs.acs.org/doi/10.1021/ct300647k
     """
 
     __name__ = "x40"
diff --git a/openqdc/datasets/io.py b/openqdc/datasets/io.py
index 1e621f72..7316768b 100644
--- a/openqdc/datasets/io.py
+++ b/openqdc/datasets/io.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Callable, List, Optional
+from typing import Callable, Dict, List, Optional
 
 import datamol as dm
 import numpy as np
@@ -17,6 +17,8 @@ def try_retrieve(obj, callable, default=None):
 
 
 class FromFileDataset(BaseDataset, ABC):
+    """Abstract class for datasets that read from a common format file like xyz, netcdf, gro, hdf5, etc."""
+
     def __init__(
         self,
         path: List[str],
@@ -28,6 +30,7 @@ def __init__(
         array_format: Optional[str] = "numpy",
         level_of_theory: Optional[QmMethod] = None,
         transform: Optional[Callable] = None,
+        skip_statistics: bool = False,
         regressor_kwargs={
             "solver_type": "linear",
             "sub_sample": None,
@@ -35,18 +38,37 @@ def __init__(
         },
     ):
         """
-        Create a dataset from a xyz file.
+        Create a dataset from a list of files.
 
         Parameters
         ----------
         path : List[str]
             The path to the file or a list of paths.
+        dataset_name : Optional[str], optional
+            The name of the dataset, by default None.
+        energy_type : Optional[str], optional
+            The type of isolated atom energy, by default "regression".
+            Supported types: ["formation", "regression", "null", None]
+        energy_unit
+            Energy unit of the dataset. Default is "hartree".
+        distance_unit
+            Distance unit of the dataset. Default is "ang".
+        level_of_theory: Optional[QmMethod, str]
+            The level of theory of the dataset.
+            Used if energy_type is "formation" to fetch the correct isolated atom energies.
+        transform, optional
+            transformation to apply to the __getitem__ calls
+        skip_statistics : bool, optional
+            Whether to skip the computation of the dataset statistics, by default False.
+        regressor_kwargs
+            Dictionary of keyword arguments to pass to the regressor.
+            Default: {"solver_type": "linear", "sub_sample": None, "stride": 1}
+            solver_type can be one of ["linear", "ridge"]
         """
         self.path = [path] if isinstance(path, str) else path
         self.__name__ = self.__class__.__name__ if dataset_name is None else dataset_name
         self.recompute_statistics = True
         self.refit_e0s = True
         self.energy_type = energy_type
+        self.skip_statistics = skip_statistics
         self.__energy_unit__ = energy_unit
         self._original_unit = self.energy_unit
         self.__distance_unit__ = distance_unit
@@ -62,29 +84,19 @@ def __init__(
         self.set_array_format(array_format)
         self._post_init(True, energy_unit, distance_unit)
 
-    def __str__(self):
-        return self.__name__.lower()
-
-    def __repr__(self):
-        return str(self)
-
     @abstractmethod
     def read_as_atoms(self, path: str) -> List[Atoms]:
         """
-        Method that reads a path and return a list of Atoms objects.
+ Method that reads a file and return a list of Atoms objects. + path : str + The path to the file. """ raise NotImplementedError - def collate_list(self, list_entries): - res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) for key in list_entries[0]} - csum = np.cumsum(res.get("n_atoms")) - x = np.zeros((csum.shape[0], 2), dtype=np.int32) - x[1:, 0], x[:, 1] = csum[:-1], csum - res["position_idx_range"] = x - - return res - - def read_raw_entries(self): + def read_raw_entries(self) -> List[Dict]: + """ + Process the files and return a list of data objects. + """ entries_list = [] for path in self.path: for entry in self.read_as_atoms(path): @@ -96,6 +108,11 @@ def _read_and_preprocess(self): self.data = self.collate_list(entries_list) def _convert_to_record(self, obj: Atoms): + """ + Convert an Atoms object to a record for the openQDC dataset processing. + obj : Atoms + The ase.Atoms object to convert + """ name = obj.info.get("name", None) subset = obj.info.get("subset", str(self)) positions = obj.positions @@ -116,8 +133,18 @@ def _convert_to_record(self, obj: Atoms): n_atoms=np.array([len(positions)], dtype=np.int32), ) + def __str__(self): + return self.__name__.lower() + + def __repr__(self): + return str(self) + class XYZDataset(FromFileDataset): + """ + Baseclass to read datasets from xyz and extxyz files. + """ + def read_as_atoms(self, path): from ase.io import iread diff --git a/openqdc/datasets/potential/__init__.py b/openqdc/datasets/potential/__init__.py index b59fcad7..35721dde 100644 --- a/openqdc/datasets/potential/__init__.py +++ b/openqdc/datasets/potential/__init__.py @@ -1,6 +1,7 @@ +from .alchemy import Alchemy from .ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X, ANI2X from .comp6 import COMP6 -from .dummy import Dummy +from .dummy import Dummy, PredefinedDataset from .gdml import GDML from .geom import GEOM from .iso_17 import ISO17 @@ -10,18 +11,23 @@ from .nabladft import NablaDFT from .orbnet_denali import OrbnetDenali from .pcqm import PCQM_B3LYP, PCQM_PM6 +from .proteinfragments import MDDataset, ProteinFragments from .qm1b import QM1B, QM1B_SMALL from .qm7x import QM7X, QM7X_V2 from .qmugs import QMugs, QMugs_V2 +from .qmx import QM7, QM8, QM9, QM7b from .revmd17 import RevMD17 from .sn2_rxn import SN2RXN from .solvated_peptides import SolvatedPeptides from .spice import Spice, SpiceV2, SpiceVL2 from .tmqm import TMQM from .transition1x import Transition1X +from .vqm24 import VQM24 +from .waterclusters import SCANWaterClusters from .waterclusters3_30 import WaterClusters AVAILABLE_POTENTIAL_DATASETS = { + "Alchemy": Alchemy, "ANI1": ANI1, "ANI1CCX": ANI1CCX, "ANI1CCX_V2": ANI1CCX_V2, @@ -42,6 +48,10 @@ "QMugs_V2": QMugs_V2, "QM1B": QM1B, "QM1B_SMALL": QM1B_SMALL, + "QM7": QM7, + "QM7b": QM7b, + "QM8": QM8, + "QM9": QM9, "SN2RXN": SN2RXN, "SolvatedPeptides": SolvatedPeptides, "Spice": Spice, @@ -50,8 +60,12 @@ "TMQM": TMQM, "Transition1X": Transition1X, "WaterClusters": WaterClusters, + "SCANWaterClusters": SCANWaterClusters, "MultixcQM9": MultixcQM9, "MultixcQM9_V2": MultixcQM9_V2, "RevMD17": RevMD17, "MD22": MD22, + "VQM24": VQM24, + "ProteinFragments": ProteinFragments, + "MDDataset": MDDataset, } diff --git a/openqdc/datasets/potential/alchemy.py b/openqdc/datasets/potential/alchemy.py new file mode 100644 index 00000000..24c17cd9 --- /dev/null +++ b/openqdc/datasets/potential/alchemy.py @@ -0,0 +1,95 @@ +from os.path import join as p_join + +import datamol as dm +import numpy as np +import pandas as pd +from tqdm import tqdm + 
+from openqdc.datasets.base import BaseDataset
+from openqdc.methods import PotentialMethod
+from openqdc.utils.molecule import get_atomic_number_and_charge
+
+# ['gdb_idx', 'atom number', 'zpve\n(Ha, zero point vibrational energy)',
+# 'Cv\n(cal/molK, heat capacity at 298.15 K)', 'gap\n(Ha, LUMO-HOMO)',
+# 'G\n(Ha, Free energy at 298.15 K)', 'HOMO\n(Ha, energy of HOMO)',
+# 'U\n(Ha, internal energy at 298.15 K)', 'alpha\n(a_0^3, Isotropic polarizability)',
+# 'U0\n(Ha, internal energy at 0 K)', 'H\n(Ha, enthalpy at 298.15 K)',
+# 'LUMO\n(Ha, energy of LUMO)', 'mu\n(D, dipole moment)',
+# 'R2\n(a_0^2, electronic spatial extent)']
+
+
+def read_mol(file, energy):
+    try:
+        mol = dm.read_sdf(file, remove_hs=False)[0]
+        positions = mol.GetConformer().GetPositions()
+        x = get_atomic_number_and_charge(mol)
+        n_atoms = positions.shape[0]
+        res = dict(
+            atomic_inputs=np.concatenate((x, positions), axis=-1, dtype=np.float32).reshape(-1, 5),
+            name=np.array([dm.to_smiles(mol)]),
+            energies=np.array([energy], dtype=np.float64)[:, None],
+            n_atoms=np.array([n_atoms], dtype=np.int32),
+            subset=np.array([f"atoms_{n_atoms}"]),
+        )
+
+    except Exception as e:
+        print(f"Skipping due to {e}")
+        res = None
+
+    return res
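`read_mol` above packs each molecule into the openQDC record schema; in particular `atomic_inputs` is a flat (n_atoms, 5) float32 array. An illustrative sketch of that layout (the values are made up):

```python
import numpy as np

# Columns: [atomic_number, formal_charge, x, y, z], positions in angstrom.
z_and_charge = np.array([[6, 0], [8, 0]], dtype=np.float32)  # a C and an O atom
positions = np.array([[0.0, 0.0, 0.0], [1.2, 0.0, 0.0]], dtype=np.float32)
atomic_inputs = np.concatenate((z_and_charge, positions), axis=-1).reshape(-1, 5)
assert atomic_inputs.shape == (2, 5)
```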
+# B3LYP/6-31G(2df,p) model with the density fitting
+# approximation for electron repulsion integrals. The auxiliary basis is cc-pVDZ-jkfit.
+
+
+class Alchemy(BaseDataset):
+    """
+    Alchemy comprises 119,487 organic molecules with up to 14 heavy atoms, sampled from the GDB MedChem database.
+    Molecular properties are calculated using PySCF's implementation of the DFT Kohn-Sham method at the B3LYP level
+    with the basis set 6-31G(2df,p). The equilibrium geometry is optimized in three passes. First, OpenBabel is used
+    to parse the SMILES string and build the Cartesian coordinates with MMFF94 force field optimization. Second,
+    HF/STO3G is used to generate the preliminary geometry. Third, for the final pass of geometry relaxation, the
+    B3LYP/6-31G(2df,p) model with the density fitting approximation for electron repulsion integrals is used. The
+    auxiliary basis cc-pVDZ-jkfit is employed in density fitting to build the Coulomb matrix and the HF exchange
+    matrix.
+
+    Usage:
+    ```python
+    from openqdc.datasets import Alchemy
+    dataset = Alchemy()
+    ```
+
+    Reference:
+        https://arxiv.org/abs/1906.09427
+        https://alchemy.tencent.com/
+    """
+
+    __name__ = "alchemy"
+
+    __energy_methods__ = [
+        PotentialMethod.WB97X_6_31G_D,  # "wb97x/6-31g(d)"
+    ]
+
+    energy_target_names = [
+        "ωB97x:6-31G(d) Energy",
+    ]
+
+    __energy_unit__ = "hartree"
+    __distance_unit__ = "ang"
+    __forces_unit__ = "hartree/ang"
+    __links__ = {"alchemy.zip": "https://alchemy.tencent.com/data/alchemy-v20191129.zip"}
+
+    def read_raw_entries(self):
+        dir_path = p_join(self.root, "Alchemy-v20191129")
+        full_csv = pd.read_csv(p_join(dir_path, "final_version.csv"))
+        energies = full_csv["U0\n(Ha, internal energy at 0 K)"].tolist()
+        atom_folder = full_csv["atom number"]
+        gdb_idx = full_csv["gdb_idx"]
+        idxs = full_csv.index.tolist()
+        samples = []
+        for i in tqdm(idxs):
+            sdf_file = p_join(dir_path, f"atom_{atom_folder[i]}", f"{gdb_idx[i]}.sdf")
+            energy = energies[i]
+            samples.append(read_mol(sdf_file, energy))
+        return samples
diff --git a/openqdc/datasets/potential/ani.py b/openqdc/datasets/potential/ani.py
index bcff384f..aac35635 100644
--- a/openqdc/datasets/potential/ani.py
+++ b/openqdc/datasets/potential/ani.py
@@ -39,19 +39,22 @@ def extract_ani2_entries(properties):
 
 class ANI1(BaseDataset):
     """
-    The ANI-1 dataset is a collection of 22 x 10^6 structural conformations from 57,000 distinct small
-    organic molecules with energy labels calculated using DFT. The molecules
-    contain 4 distinct atoms, C, N, O and H.
-
-    Usage
+    The ANI-1 dataset is a collection of 22 x 10^6 structural conformations from 57,000 distinct small organic
+    molecules. The molecules contain 4 distinct atoms, C, N, O and H. Electronic structure calculations use the
+    wB97x density functional and the 6-31G(d) basis set. To generate structures, SMILES strings for the molecules
+    are converted to 3D conformations using RDKit. These 3D structures are then pre-optimized to a stationary
+    point using the MMFF94 force field. Finally, geometries are optimized to energy minima using the chosen DFT
+    level.
+
+    Usage:
     ```python
     from openqdc.datasets import ANI1
     dataset = ANI1()
     ```
 
     References:
-    - ANI-1: https://www.nature.com/articles/sdata2017193
-    - Github: https://github.com/aiqm/ANI1x_datasets
+        https://www.nature.com/articles/sdata2017193\n
+        https://github.com/aiqm/ANI1x_datasets
     """
 
     __name__ = "ani1"
@@ -79,9 +82,6 @@ def config(self):
         return dict(dataset_name="ani", links=self.__links__)
 
     def __smiles_converter__(self, x):
-        """util function to convert string to smiles: useful if the smiles is
-        encoded in a different format than its display format
-        """
         return "-".join(x.decode("ascii").split("-")[:-1])
 
     @property
@@ -96,64 +96,23 @@ def read_raw_entries(self):
         return samples
 
 
-class ANI1CCX(ANI1):
-    """
-    ANI1-CCX is a dataset of 500k conformers subsampled from the 5.5M conformers of ANI-1X dataset. The selected
-    conformations are then labelled using a high accuracy CCSD(T)*/CBS method.
-
-    Usage
-    ```python
-    from openqdc.datasets import ANI1CCX
-    dataset = ANI1CCX()
-    ```
-
-    References:
-    - ANI-1ccx: https://doi.org/10.1038/s41467-019-10827-4
-    - Github: https://github.com/aiqm/ANI1x_datasets
-    """
-
-    __name__ = "ani1ccx"
-    __energy_unit__ = "hartree"
-    __distance_unit__ = "ang"
-    __forces_unit__ = "hartree/ang"
-
-    __energy_methods__ = [
-        PotentialMethod.NONE,  # "ccsd(t)/cbs",
-        PotentialMethod.NONE,  # "ccsd(t)/cc-pvdz",
-        PotentialMethod.NONE,  # "ccsd(t)/cc-pvtz",
-        PotentialMethod.NONE,  # "tccsd(t)/cc-pvdz",
-    ]
-
-    energy_target_names = [
-        "CCSD(T)*:CBS Total Energy",
-        "NPNO-CCSD(T):cc-pVDZ Correlation Energy",
-        "NPNO-CCSD(T):cc-pVTZ Correlation Energy",
-        "TPNO-CCSD(T):cc-pVDZ Correlation Energy",
-    ]
-    force_target_names = []
-    __links__ = {"ani1x.hdf5.gz": "https://zenodo.org/record/4081694/files/292.hdf5.gz"}
-
-    def __smiles_converter__(self, x):
-        """util function to convert string to smiles: useful if the smiles is
-        encoded in a different format than its display format
-        """
-        return x
-
-
 class ANI1X(ANI1):
     """
     The ANI-1X dataset consists of ANI-1 molecules + some molecules added using active learning, which leads to
-    a total of 5,496,771 conformers with 63,865 unique molecules.
+    a total of 5,496,771 conformers with 63,865 unique molecules. Databases of molecules like GDB-11, ChEMBL,
+    generated amino acids and 2-amino acid peptides are used for sampling new molecules. One of four techniques
+    is used for sampling conformations: (1) molecular dynamics, (2) normal mode sampling, (3) dimer sampling and
+    (4) torsion sampling.
 
-    Usage
+    Usage:
     ```python
     from openqdc.datasets import ANI1X
     dataset = ANI1X()
     ```
 
     References:
-    - ANI-1x: https://doi.org/10.1063/1.5023802
-    - Github: https://github.com/aiqm/ANI1x_datasets
+        https://doi.org/10.1063/1.5023802\n
+        https://github.com/aiqm/ANI1x_datasets
     """
 
     __name__ = "ani1x"
@@ -162,14 +121,14 @@ class ANI1X(ANI1):
     __forces_unit__ = "hartree/ang"
 
     __energy_methods__ = [
-        "hf/cc-pvdz",
-        "hf/cc-pvqz",
-        "hf/cc-pvtz",
-        "mp2/cc-pvdz",
-        "mp2/cc-pvqz",
-        "mp2/cc-pvtz",
-        "wb97x/6-31g(d)",
-        "wb97x/cc-pvtz",
+        PotentialMethod.NONE,  # "hf/cc-pvdz",
+        PotentialMethod.NONE,  # "hf/cc-pvqz",
+        PotentialMethod.NONE,  # "hf/cc-pvtz",
+        PotentialMethod.NONE,  # "mp2/cc-pvdz",
+        PotentialMethod.NONE,  # "mp2/cc-pvqz",
+        PotentialMethod.NONE,  # "mp2/cc-pvtz",
+        PotentialMethod.NONE,  # "wb97x/6-31g(d)",
+        PotentialMethod.NONE,  # "wb97x/cc-pvtz",
     ]
 
     energy_target_names = [
@@ -194,6 +153,47 @@ class ANI1X(ANI1):
     def convert_forces(self, x):
         return super().convert_forces(x) * 0.529177249  # correct the Dataset error
 
+    def __smiles_converter__(self, x):
+        return x
+
+
+class ANI1CCX(ANI1):
+    """
+    ANI1-CCX is a dataset of 500k conformers subsampled from the 5.5M conformers of the ANI-1X dataset using active
+    learning. The conformations are labelled using a high accuracy CCSD(T)*/CBS method.
+ + Usage: + ```python + from openqdc.datasets import ANI1CCX + dataset = ANI1CCX() + ``` + + References: + https://doi.org/10.1038/s41467-019-10827-4\n + https://github.com/aiqm/ANI1x_datasets + """ + + __name__ = "ani1ccx" + __energy_unit__ = "hartree" + __distance_unit__ = "ang" + __forces_unit__ = "hartree/ang" + + __energy_methods__ = [ + PotentialMethod.NONE, # "ccsd(t)/cbs", + PotentialMethod.NONE, # "ccsd(t)/cc-pvdz", + PotentialMethod.NONE, # "ccsd(t)/cc-pvtz", + PotentialMethod.NONE, # "tccsd(t)/cc-pvdz", + ] + + energy_target_names = [ + "CCSD(T)*:CBS Total Energy", + "NPNO-CCSD(T):cc-pVDZ Correlation Energy", + "NPNO-CCSD(T):cc-pVTZ Correlation Energy", + "TPNO-CCSD(T):cc-pVDZ Correlation Energy", + ] + force_target_names = [] + __links__ = {"ani1x.hdf5.gz": "https://zenodo.org/record/4081694/files/292.hdf5.gz"} + def __smiles_converter__(self, x): """util function to convert string to smiles: useful if the smiles is encoded in a different format than its display format @@ -202,6 +202,21 @@ def __smiles_converter__(self, x): class ANI1CCX_V2(ANI1CCX): + """ + ANI1CCX_V2 is an extension of the ANI1CCX dataset with additional PM6 and GFN2_xTB labels + for each conformation. + + Usage: + ```python + from openqdc.datasets import ANI1CCX_V2 + dataset = ANI1CCX_V2() + ``` + + References: + https://doi.org/10.1038/s41467-019-10827-4\n + https://github.com/aiqm/ANI1x_datasets + """ + __name__ = "ani1ccx_v2" __energy_methods__ = ANI1CCX.__energy_methods__ + [PotentialMethod.PM6, PotentialMethod.GFN2_XTB] @@ -211,19 +226,20 @@ class ANI1CCX_V2(ANI1CCX): class ANI2X(ANI1): """ - The ANI-2X dataset was constructed using active learning from modified versions of GDB-11, CheMBL, - and s66x8. It adds three new elements (F, Cl, S) resulting in 4.6 million conformers from 13k - chemical isomers, optimized using the LBFGS algorithm and labeled with ωB97X/6-31G*. + The ANI-2X dataset was constructed using active learning from modified versions of GDB-11, CheMBL, and s66x8. + It adds three new elements (F, Cl, S) resulting in 4.6 million conformers from 13k chemical isomers, optimized + using the LBFGS algorithm and labeled with ωB97X/6-31G*. The same sampling techniques as done in ANI-1X are + used for generating geometries. - Usage + Usage: ```python - from openqdc.datasets import ANI@X + from openqdc.datasets import ANI2X dataset = ANI2X() ``` References: - - ANI-2x: https://doi.org/10.1021/acs.jctc.0c00121 - - Github: https://github.com/aiqm/ANI1x_datasets + https://doi.org/10.1021/acs.jctc.0c00121 + https://github.com/aiqm/ANI1x_datasets """ __name__ = "ani2x" @@ -258,9 +274,6 @@ class ANI2X(ANI1): } def __smiles_converter__(self, x): - """util function to convert string to smiles: useful if the smiles is - encoded in a different format than its display format - """ return x def read_raw_entries(self): diff --git a/openqdc/datasets/potential/comp6.py b/openqdc/datasets/potential/comp6.py index d5998e0a..fe24825c 100644 --- a/openqdc/datasets/potential/comp6.py +++ b/openqdc/datasets/potential/comp6.py @@ -7,19 +7,43 @@ class COMP6(BaseDataset): """ - COMP6 is a benchmark suite consisting of broad regions of bio-chemical and organic space - developed for testing the ANI-1x potential. It is curated from 6 benchmark sets: - S66x8, ANI Molecular Dynamics, GDB7to9, GDB10to13, DrugBank, and Tripeptides. + COMP6 is a benchmark suite consisting of broad regions of bio-chemical and organic space developed for testing the + ANI-1x potential. 
It is curated from 6 benchmark sets: S66x8, ANI-MD, GDB7to9, GDB10to13, DrugBank, and
+    Tripeptides. Energies and forces for all non-equilibrium molecular conformations are calculated using
+    the wB97x density functional with the 6-31G(d) basis set. The dataset also includes Hirshfeld charges and
+    molecular dipoles.
 
-    Usage
+    Details of the benchmark sets are as follows:
+        S66x8: Consists of 66 dimeric systems involving hydrogen bonding, pi-pi stacking, London interactions and
+        mixed influence interactions.\n
+        ANI Molecular Dynamics (ANI-MD): Forces from the ANI-1x potential are used for running 1ns vacuum molecular
+        dynamics with a 0.25fs time step at 300K using the Langevin thermostat for 14 well-known drug molecules and
+        2 small proteins. A random subsample of 128 frames from each 1ns trajectory is selected, and reference DFT
+        single point calculations are performed to calculate energies and forces.\n
+        GDB7to9: Consists of 1500 molecules, 500 each with 7, 8 and 9 heavy atoms, subsampled from the GDB-11
+        dataset. The initial structures are randomly embedded into 3D space using RDKit and are optimized with tight
+        convergence criteria. Normal modes/force constants are computed using the reference DFT model. Finally,
+        diverse normal mode sampling (DNMS) is carried out to generate non-equilibrium conformations.\n
+        GDB10to13: Consists of 3000 molecules, 500 each with 10 and 11 heavy atoms subsampled from GDB-11
+        and 1000 each with 12 and 13 heavy atoms subsampled from GDB-13. Non-equilibrium conformations are
+        generated via DNMS.\n
+        Tripeptides: Consists of 248 random tripeptides. Structures are optimized similar to GDB7to9.\n
+        DrugBank: Consists of 837 molecules subsampled from the original DrugBank database of real drug molecules.
+        Structures are optimized similar to GDB7to9.
+
+    Usage:
     ```python
     from openqdc.datasets import COMP6
     dataset = COMP6()
     ```
 
     References:
-    - https://aip.scitation.org/doi/abs/10.1063/1.5023802
-    - Github: https://github.com/isayev/COMP6
+        https://aip.scitation.org/doi/abs/10.1063/1.5023802\n
+        https://github.com/isayev/COMP6\n
+        S66x8: https://pubs.rsc.org/en/content/articlehtml/2016/cp/c6cp00688d\n
+        GDB-11: https://pubmed.ncbi.nlm.nih.gov/15674983/\n
+        GDB-13: https://pubmed.ncbi.nlm.nih.gov/19505099/\n
+        DrugBank: https://pubs.acs.org/doi/10.1021/ja902302h
     """
 
     __name__ = "comp6"
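Since COMP6 is aimed at benchmarking, a typical entry point is the converted statistics and the per-entry energy labels; a hedged sketch (the exact statistics keys depend on the configured estimators):

```python
from openqdc.datasets import COMP6

ds = COMP6(energy_unit="ev", distance_unit="ang")
stats = ds.get_statistics()  # converted dataset statistics
print(list(stats.keys()))
print(ds[0]["energies"])     # per-method energy labels of the first entry
```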
diff --git a/openqdc/datasets/potential/gdml.py b/openqdc/datasets/potential/gdml.py
index 24f283e6..24c74754 100644
--- a/openqdc/datasets/potential/gdml.py
+++ b/openqdc/datasets/potential/gdml.py
@@ -8,25 +8,32 @@
 class GDML(BaseDataset):
     """
     Gradient Domain Machine Learning (GDML) is a dataset consisting of samples from ab initio
-    molecular dynamics (AIMD) trajectories. The dataset consists of,
-    - Benzene: 627000 samples
-    - Uracil: 133000 samples
-    - Naptalene: 326000 samples
-    - Aspirin: 211000 samples
-    - Salicylic Acid: 320000 samples
-    - Malonaldehyde: 993000 samples
-    - Ethanol: 555000 samples
-    - Toluene: 100000 samples
+    molecular dynamics (AIMD) trajectories at a resolution of 0.5fs. Energy and force labels for
+    each conformation are computed using the PBE + vdW-TS electronic structure method.
 
-    Usage
+    The dataset consists of the following trajectories:
+        Benzene: 627000 samples\n
+        Uracil: 133000 samples\n
+        Naphthalene: 326000 samples\n
+        Aspirin: 211000 samples\n
+        Salicylic Acid: 320000 samples\n
+        Malonaldehyde: 993000 samples\n
+        Ethanol: 555000 samples\n
+        Toluene: 100000 samples\n
+
+    Usage:
     ```python
     from openqdc.datasets import GDML
     dataset = GDML()
     ```
 
     References:
-    - https://www.science.org/doi/10.1126/sciadv.1603015
-    - http://www.sgdml.org/#datasets
+        https://www.science.org/doi/10.1126/sciadv.1603015\n
+        http://www.sgdml.org/#datasets
     """
 
     __name__ = "gdml"
diff --git a/openqdc/datasets/potential/geom.py b/openqdc/datasets/potential/geom.py
index d07a3d93..7c86b1e3 100644
--- a/openqdc/datasets/potential/geom.py
+++ b/openqdc/datasets/potential/geom.py
@@ -61,9 +61,11 @@ def read_mol(mol_id: str, mol_dict, base_path: str, partition: str) -> Dict[str,
 
 class GEOM(BaseDataset):
     """
-    The Geometric Ensemble Of Molecules (GEOM) dataset contains 37 million conformers for 133,000 molecules
-    from QM9, and 317,000 molecules with experimental data related to biophysics, physiology,
-    and physical chemistry. The dataset is generated using the GFN2-xTB semi-empirical method.
+    Geometric Ensemble Of Molecules (GEOM) dataset contains 37 million conformers for 133,000 molecules
+    from QM9, and 317,000 molecules with experimental data related to biophysics, physiology, and physical chemistry.
+    For each molecule, the initial structure is generated with RDKit, optimized with the GFN2-xTB energy method, and
+    the lowest energy conformer is fed to the CREST software. CREST uses metadynamics to explore the
+    conformational space of each molecule. Energies in the dataset are computed using the semi-empirical GFN2-xTB method.
 
     Usage:
     ```python
@@ -72,8 +74,9 @@ class GEOM(BaseDataset):
     ```
 
     References:
-    - https://www.nature.com/articles/s41597-022-01288-4
-    - https://github.com/learningmatter-mit/geom
+        https://www.nature.com/articles/s41597-022-01288-4\n
+        https://github.com/learningmatter-mit/geom\n
+        CREST Software: https://pubs.rsc.org/en/content/articlelanding/2020/cp/c9cp06869d
     """
 
     __name__ = "geom"
diff --git a/openqdc/datasets/potential/iso_17.py b/openqdc/datasets/potential/iso_17.py
index fe6aab5c..5672650b 100644
--- a/openqdc/datasets/potential/iso_17.py
+++ b/openqdc/datasets/potential/iso_17.py
@@ -7,11 +7,12 @@
 
 class ISO17(BaseDataset):
     """
-    ISO17 dataset consists of the largest set of isomers from the QM9 dataset that consists of a fixed
-    composition of atoms (C7O2H10) arranged in different chemically valid structures. It consist
-    of 129 molecules, each containing 5,000 conformational geometries, energies and forces with a resolution
-    of 1 femtosecond in the molecular dynamics trajectories. The simulations were carried out using the
-    Perdew-Burke-Ernzerhof (PBE) functional and the Tkatchenko-Scheffler (TS) van der Waals correction method.
+    ISO17 dataset consists of the largest set of isomers from the QM9 dataset that consists of a fixed composition of
+    atoms (C7O2H10) arranged in different chemically valid structures. It consists of 129 molecules, each containing
+    5,000 conformational geometries, energies and forces with a resolution of 1 fs in the molecular dynamics
+    trajectories. The simulations were carried out using density functional theory (DFT) in the generalized gradient
+    approximation (GGA) with the Perdew-Burke-Ernzerhof (PBE) functional and the Tkatchenko-Scheffler (TS) van der
+    Waals correction method.
Usage: ```python @@ -20,7 +21,11 @@ class ISO17(BaseDataset): ``` References: - - https://paperswithcode.com/dataset/iso17 + https://arxiv.org/abs/1706.08566\n + https://arxiv.org/abs/1609.08259\n + https://www.nature.com/articles/sdata201422\n + https://pubmed.ncbi.nlm.nih.gov/10062328/\n + https://pubmed.ncbi.nlm.nih.gov/19257665/ """ __name__ = "iso_17" diff --git a/openqdc/datasets/potential/md22.py b/openqdc/datasets/potential/md22.py index b9976426..0eb4a72c 100644 --- a/openqdc/datasets/potential/md22.py +++ b/openqdc/datasets/potential/md22.py @@ -40,6 +40,22 @@ def create_path(filename, root): class MD22(RevMD17): + """ + MD22 consists of molecular dynamics (MD) trajectories of four major classes of biomolecules and supramolecules, + ranging from a small peptide with 42 atoms to a double-walled nanotube with 370 atoms. The simulation trajectories + are sampled at 400K and 500K with a resolution of 1fs. Potential energy and forces are computed using the PBE+MBD + level of theory. + + Usage: + ```python + from openqdc.datasets import MD22 + dataset = MD22() + ``` + + Reference: + https://arxiv.org/abs/2209.14865 + """ + __name__ = "md22" __links__ = { f"{x}.npz": f"http://www.quantum-machine.org/gdml/repo/datasets/md22_{x}.npz" diff --git a/openqdc/datasets/potential/molecule3d.py b/openqdc/datasets/potential/molecule3d.py index ec1dbd00..fa4f4683 100644 --- a/openqdc/datasets/potential/molecule3d.py +++ b/openqdc/datasets/potential/molecule3d.py @@ -67,9 +67,10 @@ def _read_sdf(sdf_path: str, properties_path: str) -> List[Dict[str, np.ndarray] class Molecule3D(BaseDataset): """ - Molecule3D dataset consists of 3,899,647 molecules with ground state geometries and energies - calculated at the B3LYP/6-31G* level of theory. The molecules are extracted from the - PubChem database and cleaned by removing invalid molecule files. + Molecule3D dataset consists of 3,899,647 molecules with equilibrium geometries and energies calculated at the + B3LYP/6-31G* level of theory. The molecules are extracted from the PubChem database and cleaned by removing + molecules with invalid molecule files, with SMILES conversion error, RDKIT warnings, sanitization problems, + or with damaged log files. Usage: ```python @@ -78,8 +79,8 @@ class Molecule3D(BaseDataset): ``` References: - - https://arxiv.org/abs/2110.01717 - - https://github.com/divelab/MoleculeX + https://arxiv.org/abs/2110.01717\n + https://github.com/divelab/MoleculeX """ __name__ = "molecule3d" diff --git a/openqdc/datasets/potential/multixcqm9.py b/openqdc/datasets/potential/multixcqm9.py index 70dab1ea..41d7a4dc 100644 --- a/openqdc/datasets/potential/multixcqm9.py +++ b/openqdc/datasets/potential/multixcqm9.py @@ -37,20 +37,21 @@ def read_xyz_files(folder_path): class MultixcQM9(BaseDataset): """ - MultixcQM9 is a dataset of molecular and reaction energies from - multi-level quantum chemical methods consisting of 133 K QM9 molecules - calculated with 76 different DFT functionals and three different basis sets - (228 energy numbers for each molecule) + 1 GFN2-XTB calculation. + MultixcQM9 is a dataset of molecular and reaction energies from multi-level quantum chemical methods consisting + of 133K QM9 molecules geometries calculated with 76 different DFT functionals and three different basis sets + resulting in 228 energy values for each molecule along with semi-empirical method GFN2-xTB. Geometries for the + molecules are used directly from Kim et al. which uses G4MP2 method. 
     Usage:
     ```python
-    from openqdc.datasets import NablaDFT
-    dataset = NablaDFT()
+    from openqdc.datasets import MultixcQM9
+    dataset = MultixcQM9()
     ```

     References:
-    - https://www.nature.com/articles/s41597-023-02690-2
-    - https://github.com/chemsurajit/largeDFTdata
+        https://www.nature.com/articles/s41597-023-02690-2\n
+        https://github.com/chemsurajit/largeDFTdata\n
+        https://www.nature.com/articles/s41597-019-0121-7
     """

     __name__ = "multixcqm9"
diff --git a/openqdc/datasets/potential/nabladft.py b/openqdc/datasets/potential/nabladft.py
index 4700ade5..f83f1c00 100644
--- a/openqdc/datasets/potential/nabladft.py
+++ b/openqdc/datasets/potential/nabladft.py
@@ -52,7 +52,11 @@ class NablaDFT(BaseDataset):
     """
     NablaDFT is a dataset constructed from a subset of the
     [Molecular Sets (MOSES) dataset](https://github.com/molecularsets/moses) consisting of 1 million molecules
-    with 5,340,152 unique conformations generated using ωB97X-D/def2-SVP level of theory.
+    with 5,340,152 unique conformations. Conformations for each molecule are generated in two steps. First, a set of
+    conformations is generated with RDKit. Second, Butina clustering is applied to these conformations, clusters
+    covering 95% of the conformations are selected, and their centroids form the final set. This results in 1-62
+    conformations per molecule. Quantum properties are then computed with the Kohn-Sham method at the
+    ωB97X-D/def2-SVP level of theory.

     Usage:
     ```python
@@ -61,8 +65,8 @@ class NablaDFT(BaseDataset):
     ```

     References:
-    - https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D
-    - https://github.com/AIRI-Institute/nablaDFT
+        https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D\n
+        https://github.com/AIRI-Institute/nablaDFT
     """

     __name__ = "nabladft"
@@ -76,6 +80,15 @@ class NablaDFT(BaseDataset):
     __forces_unit__ = "hartree/bohr"
     __links__ = {"nabladft.db": "https://n-usr-31b1j.s3pd12.sbercloud.ru/b-usr-31b1j-qz9/data/moses_db/dataset_full.db"}

+    @property
+    def data_types(self):
+        return {
+            "atomic_inputs": np.float32,
+            "position_idx_range": np.int32,
+            "energies": np.float32,
+            "forces": np.float32,
+        }
+
     @requires_package("nablaDFT")
     def read_raw_entries(self):
         from nablaDFT.dataset import HamiltonianDatabase
diff --git a/openqdc/datasets/potential/orbnet_denali.py b/openqdc/datasets/potential/orbnet_denali.py
index 6a7c3f47..1dd70468 100644
--- a/openqdc/datasets/potential/orbnet_denali.py
+++ b/openqdc/datasets/potential/orbnet_denali.py
@@ -36,10 +36,14 @@ def read_archive(mol_id, conf_dict, base_path, energy_target_names: List[str]) -

 class OrbnetDenali(BaseDataset):
     """
-    Orbnet Denali is a collection of 2.3 million conformers from 212,905 unique molecules. It performs
-    DFT (ωB97X-D3/def2-TZVP) calculations on molecules and geometries consisting of organic molecules
-    and chemistries, with protonation and tautomeric states, non-covalent interactions, common salts,
-    and counterions, spanning the most common elements in bio and organic chemistry.
+    Orbnet Denali is a collection of 2.3 million conformers from 212,905 unique molecules. Molecules include a range
+    of organic molecules with protonation and tautomeric states, non-covalent interactions, common salts, and
+    counterions, spanning the most common elements in bio and organic chemistry. Geometries are generated in two
+    steps. First, four energy-minimized conformations are generated for each molecule using the ENTOS BREEZE
+    conformer generator.
+    Second, using the four energy-minimized conformers, non-equilibrium geometries are generated using
+    normal mode sampling at 300K or ab initio molecular dynamics (AIMD) for 200 fs at 500K, both at the GFN1-xTB
+    level of theory. Energies are calculated with the DFT method ωB97X-D3/def2-TZVP and the semi-empirical method
+    GFN1-xTB.

     Usage:
     ```python
@@ -48,8 +52,8 @@ class OrbnetDenali(BaseDataset):
     ```

     References:
-    - https://arxiv.org/pdf/2107.00299.pdf
-    - https://figshare.com/articles/dataset/OrbNet_Denali_Training_Data/14883867
+        https://arxiv.org/abs/2107.00299\n
+        https://figshare.com/articles/dataset/OrbNet_Denali_Training_Data/14883867
     """

     __name__ = "orbnet_denali"
@@ -74,13 +78,6 @@ def read_raw_entries(self):
             for mol_id, group in df.groupby("mol_id")
         }

-        # print(df.head())
-        # tmp = df.to_dict('index')
-        # for i, k in enumerate(tmp):
-        #     print(k, tmp[k])
-        #     if i > 10:
-        #         break
-        # exit()
         fn = lambda x: read_archive(x[0], x[1], self.root, self.energy_target_names)
         res = dm.parallelized(fn, list(labels.items()), scheduler="threads", n_jobs=-1, progress=True)
         samples = sum(res, [])
diff --git a/openqdc/datasets/potential/pcqm.py b/openqdc/datasets/potential/pcqm.py
index 535b90dc..cd32b838 100644
--- a/openqdc/datasets/potential/pcqm.py
+++ b/openqdc/datasets/potential/pcqm.py
@@ -66,6 +66,23 @@ def read_preprocessed_archive(path):

 class PCQM_PM6(BaseDataset):
+    """
+    PubChemQC PM6 (PCQM_PM6) is an exhaustive dataset containing 221 million organic molecules with optimized
+    molecular geometries and electronic properties. To generate the dataset, only molecules with a molecular weight
+    below 1000 g/mol are considered from the PubChem FTP site. The initial structure is generated using OpenBabel
+    and then optimized with the semi-empirical method PM6. The energies are also computed using the PM6 method.
+
+    Usage:
+    ```python
+    from openqdc.datasets import PCQM_PM6
+    dataset = PCQM_PM6()
+    ```
+
+    References:
+        https://pubs.acs.org/doi/abs/10.1021/acs.jcim.0c00740
+    """
+
     __name__ = "pubchemqc_pm6"

     __energy_methods__ = [PotentialMethod.PM6]
@@ -93,6 +110,15 @@ def collate_list(self, list_entries):
             res = None
         return res

+    @property
+    def data_types(self):
+        return {
+            "atomic_inputs": np.float32,
+            "position_idx_range": np.int32,
+            "energies": np.float32,
+            "forces": np.float32,
+        }
+
     def read_raw_entries(self):
         arxiv_paths = glob(p_join(self.root, f"{self.__energy_methods__[0]}", "*.pkl"))
         f = lambda x: self.collate_list(read_preprocessed_archive(x))
@@ -150,6 +176,21 @@ def collate_and_save_list(self, list_entries):

 class PCQM_B3LYP(PCQM_PM6):
+    """
+    PubChemQC B3LYP/6-31G* (PCQM_B3LYP) comprises 85 million molecules ranging from essential compounds to
+    biomolecules. The geometries for the molecules are optimized using PM6. Using the optimized geometry,
+    the electronic structure and properties are calculated using the B3LYP/6-31G* method.
+
+    Usage:
+    ```python
+    from openqdc.datasets import PCQM_B3LYP
+    dataset = PCQM_B3LYP()
+    ```
+
+    References:
+        https://arxiv.org/abs/2305.18454
+    """
+
     __name__ = "pubchemqc_b3lyp"
     __energy_methods__ = ["b3lyp/6-31g*"]
     energy_target_names = ["b3lyp"]
diff --git a/openqdc/datasets/potential/proteinfragments.py b/openqdc/datasets/potential/proteinfragments.py
new file mode 100644
index 00000000..d6289750
--- /dev/null
+++ b/openqdc/datasets/potential/proteinfragments.py
@@ -0,0 +1,192 @@
+import os
+from os.path import join as p_join
+
+import numpy as np
+from tqdm import tqdm
+
+from openqdc.datasets.base import BaseDataset
+from openqdc.methods import PotentialMethod
+from openqdc.utils.io import get_local_cache
+from openqdc.utils.package_utils import requires_package
+
+
+def convert_entries(r, e, f, z, subset):
+    coordinates = r
+    species = z
+    forces = f
+    energies = e
+    n_atoms = coordinates.shape[0]
+    flattened_coordinates = coordinates[:].reshape((-1, 3))
+    xs = np.stack((species[:].flatten(), np.zeros(flattened_coordinates.shape[0])), axis=-1)
+    res = dict(
+        name=np.array([subset]),
+        subset=np.array([subset]),
+        energies=energies[:].reshape((-1, 1)).astype(np.float64),
+        atomic_inputs=np.concatenate((xs, flattened_coordinates), axis=-1, dtype=np.float32),
+        n_atoms=np.array([n_atoms], dtype=np.int32),
+        forces=forces[:].reshape(-1, 3, 1).astype(np.float32),
+    )
+    return res
+
+
+@requires_package("apsw")
+def read_db(path):
+    # Convert every record of the SQLite database into an openQDC entry.
+    database = Database(path)
+    subset = os.path.basename(path).split(".")[0]
+    n = len(database)
+    entries = []
+    for entry in tqdm(range(n)):
+        q, s, z, r, e, f, d = database[entry]
+        entries.append(convert_entries(r, e, f, z, subset))
+    return entries
+
+
+class Database:
+    @requires_package("apsw")
+    def __init__(self, filename):
+        import apsw
+
+        self.cursor = apsw.Connection(filename, flags=apsw.SQLITE_OPEN_READONLY).cursor()
+
+    def __len__(self):
+        return self.cursor.execute("""SELECT * FROM metadata WHERE id=1""").fetchone()[-1]
+
+    def __getitem__(self, idx):
+        data = self.cursor.execute("""SELECT * FROM data WHERE id=""" + str(idx)).fetchone()
+        return self._unpack_data_tuple(data)
+
+    def _deblob(self, buffer, dtype, shape=None):
+        array = np.frombuffer(buffer, dtype)
+        if not np.little_endian:
+            array = array.byteswap()
+        array.shape = shape
+        return np.copy(array)
+
+    def _unpack_data_tuple(self, data):
+        n = len(data[3]) // 4  # A single int32 is 4 bytes long.
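+        # Row layout of `data`, as used below and documented by the upstream
+        # database: data[1]=total charge, data[2]=number of unpaired electrons,
+        # data[3]=atomic numbers (int32 blob), data[4]=positions [Å],
+        # data[5]=energy [eV], data[6]=forces [eV/Å], data[7]=dipole [e*Å].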
+        q = np.asarray([0.0 if data[1] is None else data[1]], dtype=np.float32)
+        s = np.asarray([0.0 if data[2] is None else data[2]], dtype=np.float32)
+        z = self._deblob(data[3], dtype=np.int32, shape=(n,))
+        r = self._deblob(data[4], dtype=np.float32, shape=(n, 3))
+        e = np.asarray([0.0 if data[5] is None else data[5]], dtype=np.float32)
+        f = self._deblob(data[6], dtype=np.float32, shape=(n, 3))
+        d = self._deblob(data[7], dtype=np.float32, shape=(1, 3))
+        return q, s, z, r, e, f, d
+
+
+class ProteinFragments(BaseDataset):
+    """
+    ProteinFragments is a dataset of protein fragment geometries;
+    the data was generated with a top-down and a bottom-up approach:
+
+    Top-down:
+        Fragments are generated by cutting out a spherical
+        region around an atom (including solvent molecules)
+        and saturating all dangling bonds.
+        Sampling was done with molecular dynamics (MD)
+        using a conventional force field at room temperature.
+
+    Bottom-up:
+        Fragments are generated by constructing chemical graphs
+        of one to eight non-hydrogen atoms.
+        Sampling of multiple conformers per fragment was done with
+        MD simulations at high temperatures or normal mode sampling.
+
+    Usage:
+    ```python
+    from openqdc.datasets import ProteinFragments
+    dataset = ProteinFragments()
+    ```
+
+    References:
+        https://www.science.org/doi/10.1126/sciadv.adn4397
+    """
+
+    __name__ = "proteinfragments"
+    # PBE0/def2-TZVPP+MBD
+    __energy_methods__ = [
+        PotentialMethod.WB97X_6_31G_D,  # "wb97x/6-31g(d)"
+    ]
+
+    energy_target_names = [
+        "ωB97x:6-31G(d) Energy",
+    ]
+    # PBE0/def2-TZVPP+MBD
+    __energy_unit__ = "ev"
+    __distance_unit__ = "ang"
+    __forces_unit__ = "ev/ang"
+    __links__ = {
+        f"{name}.db": f"https://zenodo.org/records/10720941/files/{name}.db?download=1"
+        for name in ["general_protein_fragments"]
+    }
+
+    @property
+    def root(self):
+        return p_join(get_local_cache(), "proteinfragments")
+
+    @property
+    def config(self):
+        assert len(self.__links__) > 0, "No links provided for fetching"
+        return dict(dataset_name="proteinfragments", links=self.__links__)
+
+    @property
+    def preprocess_path(self):
+        path = p_join(self.root, "preprocessed", self.__name__)
+        os.makedirs(path, exist_ok=True)
+        return path
+
+    def read_raw_entries(self):
+        samples = []
+        for name in self.__links__:
+            raw_path = p_join(self.root, f"{name}")
+            samples.extend(read_db(raw_path))
+        return samples
+
+
+class MDDataset(ProteinFragments):
+    """
+    MDDataset is a subset of the ProteinFragments dataset that was
+    generated from molecular dynamics with their model.
+    The sampling was done with molecular dynamics
+    at room temperature (300K) in various solvent phases:
+
+    Subsets:
+        Polyalanine:
+            All the polyalanine peptides are sampled in the gas phase.
+            AceAla15Lys is a polyalanine peptide capped with an N-terminal acetyl group
+            and a protonated lysine residue at the C-terminus;
+            AceAla15Nme is a polyalanine peptide capped with an N-terminal acetyl group
+            and a C-terminal N-methyl amide group\n
+        Crambin: the 46-residue protein crambin in aqueous solution (25,257 atoms)
+
+    Usage:
+    ```python
+    from openqdc.datasets import MDDataset
+    dataset = MDDataset()
+    ```
+
+    References:
+        https://www.science.org/doi/10.1126/sciadv.adn4397
+    """
+
+    __name__ = "mddataset"
+
+    __links__ = {
+        f"{name}.db": f"https://zenodo.org/records/10720941/files/{name}.db?download=1"
+        for name in ["acala15nme_folding_clusters", "crambin", "minimahopping_acala15lysh", "minimahopping_acala15nme"]
+    }
diff --git a/openqdc/datasets/potential/qm1b.py b/openqdc/datasets/potential/qm1b.py
index 5e10ed23..edccae0d 100644
--- a/openqdc/datasets/potential/qm1b.py
+++ b/openqdc/datasets/potential/qm1b.py
@@ -78,11 +78,11 @@ def extract_from_row(row, file_idx=None):

 class QM1B(BaseDataset):
     """
-    QM1B is a low-resolution DFT dataset generated using PySCF IPU.
-    It is composed of one billion training examples containing 9-11 heavy atoms.
-    It was created by taking 1.09M SMILES strings from the GDB-11 database and
-    computing molecular properties (e.g. HOMO-LUMO gap) for a set of up to 1000
-    conformers per molecule at the B3LYP/STO-3G level of theory.
+    QM1B is a dataset containing 1 billion conformations for 1.09M small molecules generated using a custom
+    PySCF library that incorporates hardware acceleration via IPUs. The molecules contain 9-11 heavy atoms and are
+    subsampled from the GDB-11 database. For each molecule, up to 1000 geometries are generated using RDKit.
+    Electronic properties for each conformation are then calculated using the density functional B3LYP
+    and the basis set STO-3G.

     Usage:
     ```python
@@ -91,8 +91,8 @@ class QM1B(BaseDataset):
     ```

     References:
-    - https://arxiv.org/pdf/2311.01135
-    - https://github.com/graphcore-research/qm1b-dataset/
+        https://arxiv.org/pdf/2311.01135\n
+        https://github.com/graphcore-research/qm1b-dataset/
     """

     __name__ = "qm1b"
@@ -144,8 +144,7 @@ def extract_parallel(df, i):

 class QM1B_SMALL(QM1B):
     """
-    QM1B_SMALL is a subset of the QM1B dataset containing a
-    maximum of 15 random conformers per molecule.
+    QM1B_SMALL is a subset of the QM1B dataset containing a maximum of 15 random conformers per molecule.

     Usage:
     ```python
diff --git a/openqdc/datasets/potential/qm7x.py b/openqdc/datasets/potential/qm7x.py
index 7bf1323c..351162c0 100644
--- a/openqdc/datasets/potential/qm7x.py
+++ b/openqdc/datasets/potential/qm7x.py
@@ -35,12 +35,15 @@ def read_mol(mol_h5, mol_name, energy_target_names, force_target_names):

 class QM7X(BaseDataset):
     """
-    QM7X is a collection of almost 4.2 million conformers from 6,950 unique molecules. It contains DFT
-    energy and force labels at the PBE0+MBD level of theory. It consists of structures for molecules with
-    up to seven heavy (C, N, O, S, Cl) atoms from the GDB13 database. For each molecule, (meta-)stable
-    equilibrium structures including constitutional/structural isomers and stereoisomers are
-    searched using density-functional tight binding (DFTB). Then, for each (meta-)stable structure, 100
-    off-equilibrium structures are obtained and labeled with PBE0+MBD.
+    QM7X is a collection of almost 4.2 million conformers from 6,950 unique organic molecules. The molecules with
+    up to seven heavy (C, N, O, S, Cl) atoms are considered from the GDB13 database.
+    For generating conformations, OpenBabel is utilized to get an initial structure using the MMFF94 force field.
+    Using the initial structure, meta-stable conformational isomers are generated using the Confab tool along with
+    the MMFF94 force field. The structure is then re-optimized with density-functional tight binding (DFTB)
+    supplemented with many-body dispersion (MBD) interactions. The lowest-energy structure is then taken as the
+    final equilibrium conformer. Additionally, non-equilibrium conformations are generated by displacing the
+    equilibrium geometry along a linear combination of normal mode coordinates computed at the DFTB3+MBD level
+    within the harmonic approximation. The dataset provides energy values for each geometry computed with both the
+    PBE0+MBD and DFTB3+MBD methods.

     Usage:
     ```python
@@ -49,8 +52,8 @@ class QM7X(BaseDataset):
     ```

     References:
-    - https://arxiv.org/abs/2006.15139
-    - https://zenodo.org/records/4288677
+        https://arxiv.org/abs/2006.15139\n
+        https://zenodo.org/records/4288677
     """

     __name__ = "qm7x"
@@ -59,9 +62,9 @@ class QM7X(BaseDataset):

     energy_target_names = ["ePBE0+MBD", "eDFTB+MBD"]

-    __force_mask__ = [True, True]
+    __force_mask__ = [True, False]

-    force_target_names = ["pbe0FOR", "vdwFOR"]
+    force_target_names = ["pbe0FOR"]

     __energy_unit__ = "ev"
     __distance_unit__ = "ang"
@@ -81,6 +84,16 @@ def read_raw_entries(self):

 class QM7X_V2(QM7X):
+    """
+    QM7X_V2 is an extension of the QM7X dataset containing PM6 labels for each of the 4.2M geometries.
+
+    Usage:
+    ```python
+    from openqdc.datasets import QM7X_V2
+    dataset = QM7X_V2()
+    ```
+    """
+
     __name__ = "qm7x_v2"
     __energy_methods__ = QM7X.__energy_methods__ + [PotentialMethod.PM6]
     __force_mask__ = QM7X.__force_mask__ + [False]
diff --git a/openqdc/datasets/potential/qmugs.py b/openqdc/datasets/potential/qmugs.py
index 6cc38900..b819b214 100644
--- a/openqdc/datasets/potential/qmugs.py
+++ b/openqdc/datasets/potential/qmugs.py
@@ -38,8 +38,9 @@ def read_mol(mol_dir):

 class QMugs(BaseDataset):
     """
     The QMugs dataset contains 2 million conformers for 665k biologically and pharmacologically relevant molecules
-    extracted from the ChEMBL database. The atomic and molecular properties are calculated using both,
-    semi-empirical methods (GFN2-xTB) and DFT method (ωB97X-D/def2-SVP).
+    extracted from the ChEMBL database. Three geometries per molecule are generated and optimized using the GFN2-xTB
+    method. Using the optimized geometries, the atomic and molecular properties are calculated with both the
+    semi-empirical method GFN2-xTB and the DFT method ωB97X-D/def2-SVP.

     Usage:
     ```python
@@ -48,8 +49,9 @@ class QMugs(BaseDataset):
     ```

     References:
-    - https://www.nature.com/articles/s41597-022-01390-7#ethics
-    - https://www.research-collection.ethz.ch/handle/20.500.11850/482129
+        https://arxiv.org/abs/2107.00367\n
+        https://www.nature.com/articles/s41597-022-01390-7#ethics\n
+        https://www.research-collection.ethz.ch/handle/20.500.11850/482129
     """

     __name__ = "qmugs"
@@ -76,6 +78,16 @@ def read_raw_entries(self):

 class QMugs_V2(QMugs):
+    """
+    QMugs_V2 is an extension of the QMugs dataset containing PM6 labels for each of the 2M geometries.
+
+    Usage:
+    ```python
+    from openqdc.datasets import QMugs_V2
+    dataset = QMugs_V2()
+    ```
+    """
+
     __name__ = "qmugs_v2"
     __energy_methods__ = QMugs.__energy_methods__ + [PotentialMethod.PM6]
     energy_target_names = QMugs.energy_target_names + ["PM6"]
diff --git a/openqdc/datasets/potential/qmx.py b/openqdc/datasets/potential/qmx.py
new file mode 100644
index 00000000..2dfb8443
--- /dev/null
+++ b/openqdc/datasets/potential/qmx.py
@@ -0,0 +1,402 @@
+import os
+from abc import ABC
+from os.path import join as p_join
+
+import datamol as dm
+import numpy as np
+import pandas as pd
+
+from openqdc.datasets.base import BaseDataset
+from openqdc.methods import PotentialMethod
+from openqdc.utils import read_qc_archive_h5
+from openqdc.utils.io import get_local_cache
+from openqdc.utils.molecule import get_atomic_number_and_charge
+
+
+def extract_ani2_entries(properties):
+    coordinates = properties["coordinates"]
+    species = properties["species"]
+    forces = properties["forces"]
+    energies = properties["energies"]
+    n_atoms = coordinates.shape[1]
+    n_entries = coordinates.shape[0]
+    flattened_coordinates = coordinates[:].reshape((-1, 3))
+    xs = np.stack((species[:].flatten(), np.zeros(flattened_coordinates.shape[0])), axis=-1)
+    res = dict(
+        name=np.array(["ANI2"] * n_entries),
+        subset=np.array([str(n_atoms)] * n_entries),
+        energies=energies[:].reshape((-1, 1)).astype(np.float64),
+        atomic_inputs=np.concatenate((xs, flattened_coordinates), axis=-1, dtype=np.float32),
+        n_atoms=np.array([n_atoms] * n_entries, dtype=np.int32),
+        forces=forces[:].reshape(-1, 3, 1).astype(np.float32),
+    )
+    return res
+
+
+class QMX(ABC, BaseDataset):
+    """
+    QMX dataset base abstract class
+    """
+
+    __name__ = "qm9"
+
+    __energy_methods__ = [
+        PotentialMethod.WB97X_6_31G_D,  # "wb97x/6-31g(d)"
+    ]
+
+    energy_target_names = [
+        "ωB97x:6-31G(d) Energy",
+    ]
+
+    __energy_unit__ = "hartree"
+    __distance_unit__ = "bohr"
+    __forces_unit__ = "hartree/bohr"
+    __links__ = {}
+
+    @property
+    def root(self):
+        return p_join(get_local_cache(), "qmx")
+
+    @property
+    def preprocess_path(self):
+        path = p_join(self.root, "preprocessed", self.__name__)
+        os.makedirs(path, exist_ok=True)
+        return path
+
+    @property
+    def config(self):
+        assert len(self.__links__) > 0, "No links provided for fetching"
+        return dict(dataset_name="qmx", links=self.__links__)
+
+    def read_raw_entries(self):
+        raw_path = p_join(self.root, f"{self.__name__}.h5.gz")
+        samples = read_qc_archive_h5(raw_path, self.__name__, self.energy_target_names, None)
+        return samples
+
+
+# Columns of the QM8 csv file:
+# ['smiles', 'E1-CC2', 'E2-CC2', 'f1-CC2', 'f2-CC2', 'E1-PBE0', 'E2-PBE0', 'f1-PBE0', 'f2-PBE0',
+# 'E1-PBE0.1', 'E2-PBE0.1', 'f1-PBE0.1', 'f2-PBE0.1', 'E1-CAM', 'E2-CAM', 'f1-CAM', 'f2-CAM']
+class QM7(QMX):
+    """
+    QM7 is a dataset constructed from subsets of the GDB-13 database
+    (stable and synthetically accessible organic molecules)
+    containing up to seven “heavy” atoms.
+    The molecular conformations are optimized using DFT at the
+    PBE0/def2-TZVP level of theory.
+
+    Chemical species:
+        [C, N, O, S, H]
+
+    Usage:
+    ```python
+    from openqdc.datasets import QM7
+    dataset = QM7()
+    ```
+
+    References:
+        https://arxiv.org/pdf/1703.00564
+    """
+
+    __links__ = {"qm7.hdf5.gz": "https://zenodo.org/record/3588337/files/150.hdf5.gz?download=1"}
+    __name__ = "qm7"
+
+    energy_target_names = [
+        "B2PLYP-D3(BJ):aug-cc-pvdz",
+        "B2PLYP-D3(BJ):aug-cc-pvtz",
+        "B2PLYP-D3(BJ):def2-svp",
+        "B2PLYP-D3(BJ):def2-tzvp",
+        "B2PLYP-D3(BJ):sto-3g",
+        "B2PLYP-D3:aug-cc-pvdz",
+        "B2PLYP-D3:aug-cc-pvtz",
+        "B2PLYP-D3:def2-svp",
+        "B2PLYP-D3:def2-tzvp",
+        "B2PLYP-D3:sto-3g",
+        "B2PLYP-D3M(BJ):aug-cc-pvdz",
+        "B2PLYP-D3M(BJ):aug-cc-pvtz",
+        "B2PLYP-D3M(BJ):def2-svp",
+        "B2PLYP-D3M(BJ):def2-tzvp",
+        "B2PLYP-D3M(BJ):sto-3g",
+        "B2PLYP-D3M:aug-cc-pvdz",
+        "B2PLYP-D3M:aug-cc-pvtz",
+        "B2PLYP-D3M:def2-svp",
+        "B2PLYP-D3M:def2-tzvp",
+        "B2PLYP-D3M:sto-3g",
+        "B2PLYP:aug-cc-pvdz",
+        "B2PLYP:aug-cc-pvtz",
+        "B2PLYP:def2-svp",
+        "B2PLYP:def2-tzvp",
+        "B2PLYP:sto-3g",
+        "B3LYP-D3(BJ):aug-cc-pvdz",
+        "B3LYP-D3(BJ):aug-cc-pvtz",
+        "B3LYP-D3(BJ):def2-svp",
+        "B3LYP-D3(BJ):def2-tzvp",
+        "B3LYP-D3(BJ):sto-3g",
+        "B3LYP-D3:aug-cc-pvdz",
+        "B3LYP-D3:aug-cc-pvtz",
+        "B3LYP-D3:def2-svp",
+        "B3LYP-D3:def2-tzvp",
+        "B3LYP-D3:sto-3g",
+        "B3LYP-D3M(BJ):aug-cc-pvdz",
+        "B3LYP-D3M(BJ):aug-cc-pvtz",
+        "B3LYP-D3M(BJ):def2-svp",
+        "B3LYP-D3M(BJ):def2-tzvp",
+        "B3LYP-D3M(BJ):sto-3g",
+        "B3LYP-D3M:aug-cc-pvdz",
+        "B3LYP-D3M:aug-cc-pvtz",
+        "B3LYP-D3M:def2-svp",
+        "B3LYP-D3M:def2-tzvp",
+        "B3LYP-D3M:sto-3g",
+        "B3LYP:aug-cc-pvdz",
+        "B3LYP:aug-cc-pvtz",
+        "B3LYP:def2-svp",
+        "B3LYP:def2-tzvp",
+        "B3LYP:sto-3g",
+        "HF:aug-cc-pvdz",
+        "HF:aug-cc-pvtz",
+        "HF:def2-svp",
+        "HF:def2-tzvp",
+        "HF:sto-3g",
+        "MP2:aug-cc-pvdz",
+        "MP2:aug-cc-pvtz",
+        "MP2:def2-svp",
+        "MP2:def2-tzvp",
+        "MP2:sto-3g",
+        "PBE0:aug-cc-pvdz",
+        "PBE0:aug-cc-pvtz",
+        "PBE0:def2-svp",
+        "PBE0:def2-tzvp",
+        "PBE0:sto-3g",
+        "PBE:aug-cc-pvdz",
+        "PBE:aug-cc-pvtz",
+        "PBE:def2-svp",
+        "PBE:def2-tzvp",
+        "PBE:sto-3g",
+        "WB97M-V:aug-cc-pvdz",
+        "WB97M-V:aug-cc-pvtz",
+        "WB97M-V:def2-svp",
+        "WB97M-V:def2-tzvp",
+        "WB97M-V:sto-3g",
+        "WB97X-D:aug-cc-pvdz",
+        "WB97X-D:aug-cc-pvtz",
+        "WB97X-D:def2-svp",
+        "WB97X-D:def2-tzvp",
+        "WB97X-D:sto-3g",
+    ]
+
+    __energy_methods__ = [PotentialMethod.NONE for _ in range(len(energy_target_names))]  # "wb97x/6-31g(d)"
+
+
+class QM7b(QMX):
+    """
+    QM7b is a dataset constructed from subsets of the GDB-13 database
+    (stable and synthetically accessible organic molecules)
+    containing up to seven “heavy” atoms.
+    The molecular conformations are optimized using DFT at the
+    PBE0/def2-TZVP level of theory.
+
+    Chemical species:
+        [C, N, O, S, Cl, H]
+
+    Usage:
+    ```python
+    from openqdc.datasets import QM7b
+    dataset = QM7b()
+    ```
+
+    References:
+        https://arxiv.org/pdf/1703.00564
+    """
+
+    __links__ = {"qm7b.hdf5.gz": "https://zenodo.org/record/3588335/files/200.hdf5.gz?download=1"}
+    __name__ = "qm7b"
+    energy_target_names = [
+        "CCSD(T0):cc-pVDZ",
+        "HF:cc-pVDZ",
+        "HF:cc-pVTZ",
+        "MP2:cc-pVTZ",
+        "B2PLYP-D3:aug-cc-pvdz",
+        "B2PLYP-D3:aug-cc-pvtz",
+        "B2PLYP-D3:def2-svp",
+        "B2PLYP-D3:def2-tzvp",
+        "B2PLYP-D3:sto-3g",
+        "B2PLYP-D3M(BJ):aug-cc-pvdz",
+        "B2PLYP-D3M(BJ):aug-cc-pvtz",
+        "B2PLYP-D3M(BJ):def2-svp",
+        "B2PLYP-D3M(BJ):def2-tzvp",
+        "B2PLYP-D3M(BJ):sto-3g",
+        "B2PLYP-D3M:aug-cc-pvdz",
+        "B2PLYP-D3M:aug-cc-pvtz",
+        "B2PLYP-D3M:def2-svp",
+        "B2PLYP-D3M:def2-tzvp",
+        "B2PLYP-D3M:sto-3g",
+        "B2PLYP:aug-cc-pvdz",
+        "B2PLYP:aug-cc-pvtz",
+        "B2PLYP:def2-svp",
+        "B2PLYP:def2-tzvp",
+        "B2PLYP:sto-3g",
+        "B3LYP-D3(BJ):aug-cc-pvdz",
+        "B3LYP-D3(BJ):aug-cc-pvtz",
+        "B3LYP-D3(BJ):def2-svp",
+        "B3LYP-D3(BJ):def2-tzvp",
+        "B3LYP-D3(BJ):sto-3g",
+        "B3LYP-D3:aug-cc-pvdz",
+        "B3LYP-D3:aug-cc-pvtz",
+        "B3LYP-D3:def2-svp",
+        "B3LYP-D3:def2-tzvp",
+        "B3LYP-D3:sto-3g",
+        "B3LYP-D3M(BJ):aug-cc-pvdz",
+        "B3LYP-D3M(BJ):aug-cc-pvtz",
+        "B3LYP-D3M(BJ):def2-svp",
+        "B3LYP-D3M(BJ):def2-tzvp",
+        "B3LYP-D3M(BJ):sto-3g",
+        "B3LYP-D3M:aug-cc-pvdz",
+        "B3LYP-D3M:aug-cc-pvtz",
+        "B3LYP-D3M:def2-svp",
+        "B3LYP-D3M:def2-tzvp",
+        "B3LYP-D3M:sto-3g",
+        "B3LYP:aug-cc-pvdz",
+        "B3LYP:aug-cc-pvtz",
+        "B3LYP:def2-svp",
+        "B3LYP:def2-tzvp",
+        "B3LYP:sto-3g",
+        "HF:aug-cc-pvdz",
+        "HF:aug-cc-pvtz",
+        "HF:cc-pvtz",
+        "HF:def2-svp",
+        "HF:def2-tzvp",
+        "HF:sto-3g",
+        "PBE0:aug-cc-pvdz",
+        "PBE0:aug-cc-pvtz",
+        "PBE0:def2-svp",
+        "PBE0:def2-tzvp",
+        "PBE0:sto-3g",
+        "PBE:aug-cc-pvdz",
+        "PBE:aug-cc-pvtz",
+        "PBE:def2-svp",
+        "PBE:def2-tzvp",
+        "PBE:sto-3g",
+        "SVWN:sto-3g",
+        "WB97M-V:aug-cc-pvdz",
+        "WB97M-V:aug-cc-pvtz",
+        "WB97M-V:def2-svp",
+        "WB97M-V:def2-tzvp",
+        "WB97M-V:sto-3g",
+        "WB97X-D:aug-cc-pvdz",
+        "WB97X-D:aug-cc-pvtz",
+        "WB97X-D:def2-svp",
+        "WB97X-D:def2-tzvp",
+        "WB97X-D:sto-3g",
+    ]
+    __energy_methods__ = [PotentialMethod.NONE for _ in range(len(energy_target_names))]  # "wb97x/6-31g(d)"
+
+
+class QM8(QMX):
+    """QM8 is the subset of QM9 used in a study on modeling quantum
+    mechanical calculations of electronic spectra and excited
+    state energies (an increase in energy from the ground state) of small molecules
+    with up to eight heavy atoms.
+    Multiple methods were used, including
+    time-dependent density functional theories (TDDFT) and
+    the second-order approximate coupled-cluster method (CC2).
+    The molecular conformations are relaxed geometries computed using
+    DFT B3LYP with the 6-31G(2df,p) basis set.
+    For more information about the sampling, see the QM9 dataset.
+
+    Usage:
+    ```python
+    from openqdc.datasets import QM8
+    dataset = QM8()
+    ```
+
+    References:
+        https://arxiv.org/pdf/1504.01966
+    """
+
+    __name__ = "qm8"
+
+    __energy_methods__ = [
+        PotentialMethod.NONE,  # "wb97x/6-31g(d)"
+        PotentialMethod.NONE,
+        PotentialMethod.NONE,
+        PotentialMethod.NONE,
+        PotentialMethod.NONE,
+        PotentialMethod.NONE,
+        PotentialMethod.NONE,
+        PotentialMethod.NONE,
+    ]
+
+    __links__ = {
+        "qm8.csv": "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/qm8.csv",
+        "qm8.tar.gz": "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/gdb8.tar.gz",
+    }
+
+    def read_raw_entries(self):
+        df = pd.read_csv(p_join(self.root, "qm8.csv"))
+        mols = dm.read_sdf(p_join(self.root, "qm8.sdf"), sanitize=False, remove_hs=False)
+        samples = []
+        for idx_row, mol in zip(df.iterrows(), mols):
+            _, row = idx_row
+            positions = mol.GetConformer().GetPositions()
+            x = get_atomic_number_and_charge(mol)
+            n_atoms = positions.shape[0]
+            samples.append(
+                dict(
+                    atomic_inputs=np.concatenate((x, positions), axis=-1, dtype=np.float32).reshape(-1, 5),
+                    name=np.array([row["smiles"]]),
+                    energies=np.array(
+                        [
+                            row[
+                                ["E1-CC2", "E2-CC2", "E1-PBE0", "E2-PBE0", "E1-PBE0.1", "E2-PBE0.1", "E1-CAM", "E2-CAM"]
+                            ].tolist()
+                        ],
+                        dtype=np.float64,
+                    ).reshape(1, -1),
+                    n_atoms=np.array([n_atoms], dtype=np.int32),
+                    subset=np.array([f"{self.__name__}"]),
+                )
+            )
+        return samples
+
+
+class QM9(QMX):
+    """
+    QM9 is a dataset containing 134k molecules from subsets of the GDB-17 database,
+    with up to nine “heavy” atoms. All molecular properties are calculated at the B3LYP/6-31G(2df,p)
+    level of quantum chemistry. For each of the 134k molecules, equilibrium geometries are computed
+    by relaxing the geometries with the quantum mechanical method B3LYP.
+
+    Usage:
+    ```python
+    from openqdc.datasets import QM9
+    dataset = QM9()
+    ```
+
+    Reference:
+        https://www.nature.com/articles/sdata201422
+    """
+
+    __links__ = {"qm9.hdf5.gz": "https://zenodo.org/record/3588339/files/155.hdf5.gz?download=1"}
+    __name__ = "qm9"
+    energy_target_names = [
+        "Internal energy at 0 K",
+        "B3LYP:def2-svp",
+        "HF:cc-pvtz",
+        "HF:sto-3g",
+        "PBE:sto-3g",
+        "SVWN:sto-3g",
+        "WB97X-D:aug-cc-pvtz",
+        "WB97X-D:def2-svp",
+        "WB97X-D:def2-tzvp",
+    ]
+
+    __energy_methods__ = [
+        PotentialMethod.NONE,  # "wb97x/6-31g(d)"
+        PotentialMethod.NONE,
+        PotentialMethod.NONE,
+        PotentialMethod.NONE,
+        PotentialMethod.NONE,
+        PotentialMethod.NONE,
+        PotentialMethod.NONE,
+        PotentialMethod.NONE,
+        PotentialMethod.NONE,
+    ]
diff --git a/openqdc/datasets/potential/revmd17.py b/openqdc/datasets/potential/revmd17.py
index 613ce91f..425f9784 100644
--- a/openqdc/datasets/potential/revmd17.py
+++ b/openqdc/datasets/potential/revmd17.py
@@ -54,23 +54,27 @@ def create_path(filename, root):

 class RevMD17(BaseDataset):
     """
-    - Benzene: 627000 samples
-    - Uracil: 133000 samples
-    - Naptalene: 326000 samples
-    - Aspirin: 211000 samples
-    - Salicylic Acid: 320000 samples
-    - Malonaldehyde: 993000 samples
-    - Ethanol: 555000 samples
-    - Toluene: 100000 samples
-
-    Usage
+    Revised MD (RevMD17) improves upon the MD17 dataset by removing all the numerical noise present in the original
+    dataset. The data is generated from an ab-initio molecular dynamics (AIMD) simulation where forces and energies
+    are computed at the PBE/def2-SVP level of theory using very tight SCF convergence and a very dense DFT
+    integration grid.
+    The dataset contains the following molecules:
+        Benzene: 627000 samples\n
+        Uracil: 133000 samples\n
+        Naphthalene: 326000 samples\n
+        Aspirin: 211000 samples\n
+        Salicylic Acid: 320000 samples\n
+        Malonaldehyde: 993000 samples\n
+        Ethanol: 555000 samples\n
+        Toluene: 100000 samples\n
+
+    Usage:
     ```python
     from openqdc.datasets import RevMD17
     dataset = RevMD17()
     ```

     References:
-    - https://arxiv.org/abs/2007.09593
+        https://arxiv.org/abs/2007.09593
     """

     __name__ = "revmd17"
diff --git a/openqdc/datasets/potential/sn2_rxn.py b/openqdc/datasets/potential/sn2_rxn.py
index 29337573..2194775b 100644
--- a/openqdc/datasets/potential/sn2_rxn.py
+++ b/openqdc/datasets/potential/sn2_rxn.py
@@ -39,10 +39,12 @@ def extract_npz_entry(data):

 class SN2RXN(BaseDataset):
     """
-    This dataset probes chemical reactions of methyl halides with halide anions, i.e.
-    X- + CH3Y -> CH3X + Y-, and contains structures for all possible combinations of
-    X,Y = F, Cl, Br, I. It contains energy and forces for 452709 conformations calculated
-    at the DSD-BLYP-D3(BJ)/def2-TZVP level of theory.
+    This dataset probes chemical reactions of methyl halides with halide anions, i.e. X- + CH3Y -> CH3X + Y-, and
+    contains structures for all possible combinations of X,Y = F, Cl, Br, I. The conformations are generated by
+    running MD simulations at a temperature of 5000K with a time step of 0.1 fs using the Atomic Simulation
+    Environment (ASE). The forces are derived using the semi-empirical method PM7, structures are saved every 10
+    steps, and for each of them energies and forces are calculated at the DSD-BLYP-D3(BJ)/def2-TZVP level of theory.
+    The dataset contains 452,709 structures along with energies, forces and dipole moments.

     Usage:
     ```python
@@ -51,8 +53,8 @@ class SN2RXN(BaseDataset):
     ```

     References:
-    - https://doi.org/10.1021/acs.jctc.9b00181
-    - https://zenodo.org/records/2605341
+        https://doi.org/10.1021/acs.jctc.9b00181\n
+        https://zenodo.org/records/2605341
     """

     __name__ = "sn2_rxn"
diff --git a/openqdc/datasets/potential/solvated_peptides.py b/openqdc/datasets/potential/solvated_peptides.py
index 4fead36f..f00e1a05 100644
--- a/openqdc/datasets/potential/solvated_peptides.py
+++ b/openqdc/datasets/potential/solvated_peptides.py
@@ -7,10 +7,10 @@

 class SolvatedPeptides(BaseDataset):
     """
-    The solvated protein fragments dataset probes many-body intermolecular
-    interactions between "protein fragments" and water molecules.
-    It contains energy and forces for 2731180 structures calculated
-    at the revPBE-D3(BJ)/def2-TZVP level of theory.
+    The solvated protein fragments dataset probes many-body intermolecular interactions between "protein fragments"
+    and water molecules. Geometries are first optimized with the semi-empirical method PM7, and then MD simulations
+    are run at 1000K with a time step of 0.1 fs using the Atomic Simulation Environment (ASE). Structures are saved
+    every 10 steps, and energies, forces and dipole moments are calculated at the revPBE-D3(BJ)/def2-TZVP level of
+    theory.
     Usage:
     ```python
@@ -19,8 +19,8 @@ class SolvatedPeptides(BaseDataset):
     ```

     References:
-    - https://doi.org/10.1021/acs.jctc.9b00181
-    - https://zenodo.org/records/2605372
+        https://doi.org/10.1021/acs.jctc.9b00181\n
+        https://zenodo.org/records/2605372
     """

     __name__ = "solvated_peptides"
diff --git a/openqdc/datasets/potential/spice.py b/openqdc/datasets/potential/spice.py
index 27525bb4..2f8cc36f 100644
--- a/openqdc/datasets/potential/spice.py
+++ b/openqdc/datasets/potential/spice.py
@@ -40,9 +40,12 @@ def read_record(r, obj):

 class Spice(BaseDataset):
     """
-    The Spice dataset consists of 1.1 million conformations for a diverse set of 19k unique molecules consisting of
-    small molecules, dimers, dipeptides, and solvated amino acids. It consists of both forces and energies calculated
-    at the {\omega}B97M-D3(BJ)/def2-TZVPPD level of theory.
+    Spice dataset consists of 1.1 million conformations for a diverse set of 19k unique molecules consisting of
+    small molecules, dimers, dipeptides, and solvated amino acids. Conformations are first generated with RDKit,
+    and then molecular dynamics simulations for 100 ps at 500K using OpenMM and the Amber force field are used to
+    generate 100 high-energy conformations. Low-energy conformations are then generated by L-BFGS energy
+    minimization and molecular dynamics for 1 ps at 100K. Forces and energies for the conformations are calculated
+    at the wB97M-D3(BJ)/def2-TZVPPD level of theory.

     Usage:
     ```python
@@ -51,8 +54,8 @@ class Spice(BaseDataset):
     ```

     References:
-    - https://arxiv.org/abs/2209.10702
-    - https://github.com/openmm/spice-dataset
+        https://arxiv.org/abs/2209.10702\n
+        https://github.com/openmm/spice-dataset
     """

     __name__ = "spice"
@@ -96,10 +99,11 @@ def read_raw_entries(self):

 class SpiceV2(Spice):
     """
-    SpiceV2 dataset augmented with amino acids complexes, water boxes,
-    pubchem solvated molecules.
-    It consists of both forces and energies calculated
-    at the {\omega}B97M-D3(BJ)/def2-TZVPPD level of theory.
+    SpiceV2 dataset augments the Spice data with amino acid complexes, water boxes, and PubChem solvated molecules.
+    The main changes include (1) over 13,000 new PubChem molecules, of which 1500 contain boron and 1900 contain
+    silicon, (2) 194,000 conformations of dimers containing amino acids and ligands, (3) 1000 water clusters to
+    improve the sampling of interactions in bulk water, (4) 1397 PubChem molecules solvated with a shell of water
+    molecules, and (5) fixes for bad calculations from the Spice dataset. The data generation process is the same
+    as for the Spice dataset.

     Usage:
     ```python
@@ -108,8 +112,8 @@ class SpiceV2(Spice):
     ```

     References:
-    - https://github.com/openmm/spice-dataset/releases/tag/2.0.0
-    - https://github.com/openmm/spice-dataset
+        https://github.com/openmm/spice-dataset/releases/tag/2.0.0\n
+        https://github.com/openmm/spice-dataset
     """

     __name__ = "spicev2"
@@ -150,6 +154,20 @@ def read_raw_entries(self):

 class SpiceVL2(SpiceV2):
+    """
+    SpiceVL2 is an extension of the SpiceV2 dataset with additional semi-empirical GFN2-xTB and PM6 energy methods.
+
+    Usage:
+    ```python
+    from openqdc.datasets import SpiceVL2
+    dataset = SpiceVL2()
+    ```
+
+    References:
+        https://github.com/openmm/spice-dataset/releases/tag/2.0.0\n
+        https://github.com/openmm/spice-dataset
+    """
+
     __name__ = "spice_vl2"

     __energy_methods__ = SpiceV2.__energy_methods__ + [PotentialMethod.GFN2_XTB, PotentialMethod.PM6]
diff --git a/openqdc/datasets/potential/tmqm.py b/openqdc/datasets/potential/tmqm.py
index 1da6901a..987fa10f 100644
--- a/openqdc/datasets/potential/tmqm.py
+++ b/openqdc/datasets/potential/tmqm.py
@@ -47,10 +47,10 @@ def read_xyz(fname, e_map):

 class TMQM(BaseDataset):
     """
-    The tmQM dataset contains the geometries of a large transition metal-organic
-    compound space with a large variety of organic ligands and 30 transition metals.
-    It contains energy labels for 86,665 mononuclear complexe calculated
-    at the TPSSh-D3BJ/def2-SV DFT level of theory.
+    tmQM dataset contains the geometries of a large transition metal-organic compound space with a large variety of
+    organic ligands and 30 transition metals. It contains energy labels for 86,665 mononuclear complexes calculated
+    at the TPSSh-D3BJ/def2-SV DFT level of theory. Structures are first extracted from the Cambridge Structural
+    Database and then optimized in the gas phase with the extended tight-binding method GFN2-xTB.

     Usage:
     ```python
@@ -59,8 +59,8 @@ class TMQM(BaseDataset):
     ```

     References:
-    - https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041
-    - https://github.com/bbskjelstad/tmqm
+        https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041\n
+        https://github.com/bbskjelstad/tmqm
     """

     __name__ = "tmqm"
diff --git a/openqdc/datasets/potential/transition1x.py b/openqdc/datasets/potential/transition1x.py
index 8b5b4bc1..d15d71c1 100644
--- a/openqdc/datasets/potential/transition1x.py
+++ b/openqdc/datasets/potential/transition1x.py
@@ -39,9 +39,9 @@ def read_record(r, group):

 class Transition1X(BaseDataset):
     """
-    The Transition1x dataset contains structures from 10k organic reaction pathways of various types.
-    It contains DFT energy and force labels for 9.6 mio. conformers calculated at the
-    wB97x/6-31-G(d) level of theory.
+    Transition1x dataset contains structures from 10k organic reaction pathways of various types. It contains energy
+    and force labels for 9.6 million conformers calculated at the wB97x/6-31G(d) level of theory. The geometries and
+    the transition states are generated by running the Nudged Elastic Band (NEB) method with DFT.
     Usage:
     ```python
@@ -50,8 +50,8 @@ class Transition1X(BaseDataset):
     ```

     References:
-    - https://www.nature.com/articles/s41597-022-01870-w
-    - https://gitlab.com/matschreiner/Transition1x
+        https://www.nature.com/articles/s41597-022-01870-w\n
+        https://gitlab.com/matschreiner/Transition1x
     """

     __name__ = "transition1x"
diff --git a/openqdc/datasets/potential/vqm24.py b/openqdc/datasets/potential/vqm24.py
new file mode 100644
index 00000000..1710e1dd
--- /dev/null
+++ b/openqdc/datasets/potential/vqm24.py
@@ -0,0 +1,82 @@
+import os
+from os.path import join as p_join
+
+import numpy as np
+
+from openqdc.datasets.base import BaseDataset
+from openqdc.methods import PotentialMethod
+
+
+def shape_atom_inputs(coords, atom_species):
+    xs = np.stack((atom_species, np.zeros_like(atom_species)), axis=-1)
+    return np.concatenate((xs, coords), axis=-1, dtype=np.float32)
+
+
+def read_npz_entry(raw_path):
+    samples = np.load(raw_path, allow_pickle=True)
+    # get name of file without extension
+    subset = os.path.basename(raw_path).split(".")[0]
+
+    # available npz fields: atoms, coordinates, compounds, graphs (SMILES),
+    # inchi, Etot, Eatomization
+    coordinates = np.concatenate(samples["coordinates"])
+    atom_species = np.concatenate(samples["atoms"]).ravel()
+    names = list(map(lambda x: x.split("_")[0], samples["compounds"]))
+    n_comps = len(names)
+
+    res = dict(
+        name=np.array(names),
+        subset=np.array([subset] * n_comps),
+        energies=samples["Etot"][:, None].astype(np.float64),
+        atomic_inputs=shape_atom_inputs(coordinates, atom_species),
+        n_atoms=np.array(list(map(lambda x: len(x), samples["coordinates"])), dtype=np.int32),
+    )
+    return res
+
+
+class VQM24(BaseDataset):
+    """
+    Vector-QM24 (VQM24) dataset consists of small organic and inorganic molecules with quantum mechanical
+    properties calculated at the ωB97X-D3/cc-pVDZ level of theory. This leads to 258,242 unique constitutional
+    isomers and 577,705 conformers of varying stoichiometries. Geometries are generated using GFN2-xTB and
+    relaxed with the DFT method ωB97X-D3/cc-pVDZ. The energy values are also calculated with the DFT method
+    ωB97X-D3/cc-pVDZ.
+
+    Usage:
+    ```python
+    from openqdc.datasets import VQM24
+    dataset = VQM24()
+    ```
+
+    Reference:
+        https://arxiv.org/abs/2405.05961
+    """
+
+    __name__ = "vqm24"
+
+    __energy_methods__ = [
+        PotentialMethod.WB97X_6_31G_D,  # "wb97x/6-31g(d)"
+    ]
+
+    energy_target_names = [
+        "ωB97x:6-31G(d) Energy",
+    ]
+    # ωB97X-D3/cc-pVDZ
+    __energy_unit__ = "hartree"
+    __distance_unit__ = "ang"
+    __forces_unit__ = "hartree/ang"
+    __links__ = {
+        f"{name}.npz": f"https://zenodo.org/records/11164951/files/{name}.npz?download=1"
+        for name in ["DFT_all", "DFT_saddles", "DFT_uniques", "DMC"]
+    }
+
+    def read_raw_entries(self):
+        samples = []
+        for name in self.__links__:
+            raw_path = p_join(self.root, f"{name}")
+            samples.append(read_npz_entry(raw_path))
+        return samples
diff --git a/openqdc/datasets/potential/waterclusters.py b/openqdc/datasets/potential/waterclusters.py
new file mode 100644
index 00000000..8c791474
--- /dev/null
+++ b/openqdc/datasets/potential/waterclusters.py
@@ -0,0 +1,175 @@
+from collections import defaultdict
+from os.path import join as p_join
+
+import numpy as np
+
+from openqdc.datasets.base import BaseDataset
+from openqdc.methods import PotentialMethod
+from openqdc.utils.package_utils import requires_package
+
+_default_basis_sets = {
+    "BEGDB_H2O": "aug-cc-pVQZ",
+    "WATER27": "aug-cc-pVQZ",
+    "H2O_alkali_clusters": "def2-QZVPPD",
+    "H2O_halide_clusters": "def2-QZVPPD",
+}
+
+
+@requires_package("monty")
+@requires_package("pymatgen")
+def read_geometries(fname, dataset):
+    from monty.serialization import loadfn
+
+    geometries = {k: v.to_ase_atoms() for k, v in loadfn(fname)[dataset].items()}
+    return geometries
+
+
+@requires_package("monty")
+def read_energies(fname, dataset):
+    from monty.serialization import loadfn
+
+    _energies = loadfn(fname)[dataset]
+    metadata_restrictions = {"basis_set": _default_basis_sets.get(dataset)}
+
+    functionals_to_return = []
+    for dfa, at_dfa_d in _energies.items():
+        functionals_to_return += [f"{dfa}" if dfa == at_dfa else f"{dfa}@{at_dfa}" for at_dfa in at_dfa_d]
+
+    energies = defaultdict(dict)
+    for f in functionals_to_return:
+        if "-FLOSIC" in f and "@" not in f:
+            func = f.split("-FLOSIC")[0]
+            at_f = "-FLOSIC"
+        else:
+            func = f.split("@")[0]
+            at_f = f.split("@")[-1]
+
+        if func not in _energies:
+            print(f"No functional {func} included in dataset " f"- available options:\n{', '.join(_energies.keys())}")
+        elif at_f not in _energies[func]:
+            print(
+                f"No @functional {at_f} included in {func} dataset "
+                f"- available options:\n{', '.join(_energies[func].keys())}"
+            )
+        else:
+            if isinstance(_energies[func][at_f], list):
+                for entry in _energies[func][at_f]:
+                    if all(entry["metadata"].get(k) == v for k, v in metadata_restrictions.items()):
+                        energies[f] = entry
+                        break
+            else:
+                energies[f] = _energies[func][at_f]
+    return dict(energies)
+
+
+def extract_desc(atom):
+    # Pull positions, atomic numbers, charges and the formula from an ASE Atoms object.
+    pos = atom.get_positions()
+    z = atom.get_atomic_numbers()
+    charges = atom.get_initial_charges()
+    formula = atom.get_chemical_formula()
+    return pos, z, charges, formula
+
+
+def format_geometry_and_entries(geometries, energies, subset):
+    entries_list = []
+    for entry, atoms in geometries.items():
+        pos, z, charges, formula = extract_desc(atoms)
+        energies_list = []
+        for level_of_theory, entry_en_dict in energies.items():
+            en = entry_en_dict.get(entry, np.nan)
+            energies_list.append(en)
+        energy_array = np.array(energies_list)
+        if subset in ["WATER27", "H2O_alkali_clusters", "H2O_halide_clusters"]:
"H2O_alkali_clusters", "H2O_halide_clusters"]: + # only the first 9 energies are available + energy_array.resize(19) + energy_array[energy_array == 0] = np.nan + res = dict( + atomic_inputs=np.concatenate( + (np.hstack((z[:, None], charges[:, None])), pos), axis=-1, dtype=np.float32 + ).reshape(-1, 5), + name=np.array([formula]), + energies=np.array(energy_array, dtype=np.float64).reshape(1, -1), + n_atoms=np.array([pos.shape[0]], dtype=np.int32), + subset=np.array([subset]), + ) + entries_list.append(res) + return entries_list + + +class SCANWaterClusters(BaseDataset): + """ + The SCAN Water Clusters dataset contains conformations of + neutral water clusters containing up to 20 monomers, charged water clusters, + and alkali- and halide-water clusters. This dataset consists of our data sets of water clusters: + the benchmark energy and geometry database (BEGDB) neutral water cluster subset; the WATER2723 set of 14 + neutral, 5 protonated, 7 deprotonated, and one auto-ionized water cluster; and two sets of + ion-water clusters M...(H2O)n, where M = Li+, Na+, K+, F−, Cl−, or Br−. + Water clusters were obtained from 10 nanosecond gas-phase molecular dynamics + simulations using AMBER 9 and optimized to obtain + lowest energy isomers were determined using MP2/aug-cc-pVDZ//MP2/6-31G* Gibbs free energies. + + + Chemical Species: + [H, O, Li, Na, K, F, Cl, Br] + + Usage: + ```python + from openqdc.datasets import SCANWaterClusters + dataset = SCANWaterClusters() + ``` + + References: + https://chemrxiv.org/engage/chemrxiv/article-details/662aaff021291e5d1db7d8ec\n + https://github.com/esoteric-ephemera/water_cluster_density_errors + """ + + __name__ = "scanwaterclusters" + + __energy_unit__ = "hartree" + __distance_unit__ = "ang" + __forces_unit__ = "hartree/ang" + energy_target_names = [ + "HF", + "HF-r2SCAN-DC4", + "SCAN", + "SCAN@HF", + "SCAN@r2SCAN50", + "r2SCAN", + "r2SCAN@HF", + "r2SCAN@r2SCAN50", + "r2SCAN50", + "r2SCAN100", + "r2SCAN10", + "r2SCAN20", + "r2SCAN25", + "r2SCAN30", + "r2SCAN40", + "r2SCAN60", + "r2SCAN70", + "r2SCAN80", + "r2SCAN90", + ] + __energy_methods__ = [PotentialMethod.NONE for _ in range(len(energy_target_names))] + force_target_names = [] + # 27 # 9 level + subsets = ["BEGDB_H2O", "WATER27", "H2O_alkali_clusters", "H2O_halide_clusters"] + __links__ = { + "geometries.json.gz": "https://github.com/esoteric-ephemera/water_cluster_density_errors/blob/main/data_files/geometries.json.gz?raw=True", # noqa + "total_energies.json.gz": "https://github.com/esoteric-ephemera/water_cluster_density_errors/blob/main/data_files/total_energies.json.gz?raw=True", # noqa + } + + def read_raw_entries(self): + entries = [] # noqa + for i, subset in enumerate(self.subsets): + geometries = read_geometries(p_join(self.root, "geometries.json.gz"), subset) + energies = read_energies(p_join(self.root, "total_energies.json.gz"), subset) + datum = {} + for k in energies: + _ = energies[k].pop("metadata") + datum[k] = energies[k]["total_energies"] + entries.extend(format_geometry_and_entries(geometries, datum, subset)) + return entries diff --git a/openqdc/datasets/potential/waterclusters3_30.py b/openqdc/datasets/potential/waterclusters3_30.py index f4c7d88e..a52b9e17 100644 --- a/openqdc/datasets/potential/waterclusters3_30.py +++ b/openqdc/datasets/potential/waterclusters3_30.py @@ -53,6 +53,10 @@ class WaterClusters(BaseDataset): clusters of sizes n = 3 - 30. The cluster structures are derived and labeled with the TTM2.1-F ab-initio based interaction potential for water. 
+    Sampling was done with the Monte Carlo Temperature Basin Paving (MCTBP) method.
+
+    Chemical Species:
+        [H, O]

     Usage:
     ```python
@@ -61,8 +65,8 @@ class WaterClusters(BaseDataset):
     ```

     References:
-    - https://doi.org/10.1063/1.5128378
-    - https://sites.uw.edu/wdbase/database-of-water-clusters/
+        https://doi.org/10.1063/1.5128378\n
+        https://sites.uw.edu/wdbase/database-of-water-clusters/
     """

     __name__ = "waterclusters3_30"
diff --git a/openqdc/datasets/statistics.py b/openqdc/datasets/statistics.py
index d471387b..6b1adeb5 100644
--- a/openqdc/datasets/statistics.py
+++ b/openqdc/datasets/statistics.py
@@ -2,7 +2,7 @@
 from copy import deepcopy
 from dataclasses import asdict, dataclass
 from os.path import join as p_join
-from typing import Optional
+from typing import Callable, Dict, Optional

 import numpy as np
 from loguru import logger
@@ -17,9 +17,15 @@ class StatisticsResults:
     """

     def to_dict(self):
+        """
+        Convert the class to a dictionary
+        """
         return asdict(self)

-    def transform(self, func):
+    def transform(self, func: Callable):
+        """
+        Apply a function to all the attributes of the class
+        """
         for k, v in self.to_dict().items():
             if v is not None:
                 setattr(self, k, func(v))
@@ -55,6 +61,14 @@ class StatisticManager:
     """

     def __init__(self, dataset, recompute: bool = False, *statistic_calculators: "AbstractStatsCalculator"):
+        """
+        dataset : openqdc.datasets.base.BaseDataset
+            The dataset object to compute the statistics for
+        recompute : bool, default = False
+            Flag to recompute the statistics
+        *statistic_calculators : AbstractStatsCalculator
+            Statistic calculators to run
+        """
         self._state = {}
         self._results = {}
         self._statistic_calculators = [
@@ -63,7 +77,7 @@ def __init__(self, dataset, recompute: bool = False, *statistic_calculators: "Ab
         ]

     @property
-    def state(self) -> dict:
+    def state(self) -> Dict:
         """
         Return the dictionary state of the manager
         """
@@ -120,7 +134,7 @@ class AbstractStatsCalculator(ABC):
     """
     Abstract class that defines the interface for all
     the calculators object and the methods to
-    compute the statistics
+    compute the statistics.
     """

     # State Dependencies of the calculator to skip part of the calculation
@@ -140,6 +154,28 @@ def __init__(
         atom_charges: Optional[np.ndarray] = None,
         forces: Optional[np.ndarray] = None,
     ):
+        """
+        name : str
+            Name of the dataset for saving and loading.
+        energy_type : str, default = None
+            Type of the energy for the computation of the statistics. Used for loading and saving.
+ force_recompute : bool, default = False + Flag to force the recomputation of the statistics + energies : np.ndarray, default = None + Energies of the dataset + n_atoms : np.ndarray, default = None + Number of atoms in the dataset + atom_species : np.ndarray, default = None + Atomic species of the dataset + position_idx_range : np.ndarray, default = None + Position index range of the dataset + e0_matrix : np.ndarray, default = None + Isolated atom energies matrix of the dataset + atom_charges : np.ndarray, default = None + Atomic charges of the dataset + forces : np.ndarray, default = None + Forces of the dataset + """ self.name = name self.energy_type = energy_type self.force_recompute = force_recompute @@ -149,6 +185,7 @@ def __init__( self.e0_matrix = e0_matrix self.n_atoms = n_atoms self.atom_species_charges_tuple = (atom_species, atom_charges) + self._root = p_join(get_local_cache(), self.name) if atom_species is not None and atom_charges is not None: # by value not reference self.atom_species_charges_tuple = np.concatenate((atom_species[:, None], atom_charges[:, None]), axis=-1) @@ -159,7 +196,7 @@ def has_forces(self) -> bool: @property def preprocess_path(self): - path = p_join(self.root, "preprocessed", str(self) + ".pkl") + path = p_join(self.root, "statistics", self.name + f"_{str(self)}" + ".pkl") return path @property @@ -167,14 +204,14 @@ def root(self): """ Path to the dataset folder """ - return p_join(get_local_cache(), self.name) + return self._root @classmethod def from_openqdc_dataset(cls, dataset, recompute: bool = False): """ - Create a calculator object from a dataset object + Create a calculator object from a dataset object. """ - return cls( + obj = cls( name=dataset.__name__, force_recompute=recompute, energy_type=dataset.energy_type, @@ -186,6 +223,8 @@ def from_openqdc_dataset(cls, dataset, recompute: bool = False): atom_charges=dataset.data["atomic_inputs"][:, 1].ravel(), e0_matrix=dataset.__isolated_atom_energies__, ) + obj._root = dataset.root # set to the dataset root in case of multiple datasets + return obj @abstractmethod def compute(self) -> StatisticsResults: @@ -214,7 +253,7 @@ def attempt_load(self) -> bool: logger.warning(f"Statistics for {str(self)} not found. Computing...") return False - def _setup_deps(self, state: dict) -> None: + def _setup_deps(self, state: Dict) -> None: """ Check if the dependencies of calculators are satisfied from the state object and set the attributes of the calculator @@ -226,7 +265,7 @@ def _setup_deps(self, state: dict) -> None: for dep in self.state_dependency: setattr(self, dep, state[dep]) - def write_state(self, update: dict) -> None: + def write_state(self, update: Dict) -> None: """ Write/update the state dictionary with the update dictionary @@ -235,7 +274,7 @@ def write_state(self, update: dict) -> None: """ self.state.update(update) - def run(self, state: dict) -> None: + def run(self, state: Dict) -> None: """ Main method to run the calculator. 
Setup the dependencies from the state dictionary
diff --git a/openqdc/datasets/structure.py b/openqdc/datasets/structure.py
new file mode 100644
index 00000000..f6dc077e
--- /dev/null
+++ b/openqdc/datasets/structure.py
@@ -0,0 +1,276 @@
+import pickle as pkl
+from abc import ABC, abstractmethod
+from os import PathLike
+from os.path import join as p_join
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import zarr
+
+from openqdc.utils.io import pull_locally
+
+
+class GeneralStructure(ABC):
+    """
+    Abstract factory class for dataset storage types in the openQDC package.
+    """
+
+    _ext: Optional[str] = None
+    _extra_files: Optional[List[str]] = None
+
+    @property
+    def ext(self):
+        return self._ext
+
+    @property
+    @abstractmethod
+    def load_fn(self) -> Callable:
+        """
+        Function to use for loading the data.
+        Must be implemented by the child class.
+
+        Returns:
+            the function to use for loading the data
+        """
+        raise NotImplementedError
+
+    def add_extension(self, filename: str) -> str:
+        """
+        Add the correct extension to a filename
+
+        Parameters:
+            filename: the filename to add the extension to
+
+        Returns:
+            the filename with the extension
+        """
+        return filename + self.ext
+
+    @abstractmethod
+    def save_preprocess(
+        self,
+        preprocess_path: Union[str, PathLike],
+        data_keys: List[str],
+        data_dict: Dict[str, np.ndarray],
+        extra_data_keys: List[str],
+        extra_data_types: Dict[str, type],
+    ) -> List[str]:
+        """
+        Save the preprocessed data to the cache directory and optionally upload it to the remote storage.
+        Must be implemented by the child class.

+        Parameters:
+            preprocess_path: path to the preprocessed data file
+            data_keys: list of keys to load from the data file
+            data_dict: dictionary of data to save
+            extra_data_keys: list of keys to load from the extra data file
+            extra_data_types: dictionary of data types for each key
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def load_extra_files(
+        self,
+        data: Dict[str, np.ndarray],
+        preprocess_path: Union[str, PathLike],
+        data_keys: List[str],
+        pkl_data_keys: List[str],
+        overwrite: bool,
+    ):
+        """
+        Load extra files required to define other types of data.
+        Must be implemented by the child class.
+
+        Parameters:
+            data: dictionary of data to load
+            preprocess_path: path to the preprocessed data file
+            data_keys: list of keys to load from the data file
+            pkl_data_keys: list of keys to load from the extra files
+            overwrite: whether to overwrite the local cache
+        """
+        raise NotImplementedError
+
+    def join_and_ext(self, path: Union[str, PathLike], filename: str) -> Union[str, PathLike]:
+        """
+        Join a path and a filename and add the correct extension.
+
+        Parameters:
+            path: the path to join
+            filename: the filename to join
+
+        Returns:
+            the joined path with the correct extension
+        """
+        return p_join(path, self.add_extension(filename))
+
+    def load_data(
+        self,
+        preprocess_path: Union[str, PathLike],
+        data_keys: List[str],
+        data_types: Dict[str, np.dtype],
+        data_shapes: Dict[str, Tuple[int, int]],
+        extra_data_keys: List[str],
+        overwrite: bool,
+    ):
+        """
+        Main method to load the data from a filetype structure like memmap or zarr.
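+        Arrays listed in `data_keys` are pulled to the local cache if needed, opened
+        with `load_fn`, unpacked, and reshaped before the extra files are loaded.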
+
+    def load_data(
+        self,
+        preprocess_path: Union[str, PathLike],
+        data_keys: List[str],
+        data_types: Dict[str, np.dtype],
+        data_shapes: Dict[str, Tuple[int, int]],
+        extra_data_keys: List[str],
+        overwrite: bool,
+    ):
+        """
+        Main method to load the data from a filetype structure like memmap or zarr.
+
+        Parameters:
+            preprocess_path: path to the preprocessed data file
+            data_keys: list of keys to load from the data file
+            data_types: dictionary of data types for each key
+            data_shapes: dictionary of shapes for each key
+            extra_data_keys: list of keys to load from the extra data file
+            overwrite: whether to overwrite the local cache
+        """
+        data = {}
+        for key in data_keys:
+            filename = self.join_and_ext(preprocess_path, key)
+            pull_locally(filename, overwrite=overwrite)
+            data[key] = self.load_fn(filename, mode="r", dtype=data_types[key])
+            data[key] = self.unpack(data[key])
+            data[key] = data[key].reshape(*data_shapes[key])
+
+        data = self.load_extra_files(data, preprocess_path, data_keys, extra_data_keys, overwrite)
+        return data
+
+    def unpack(self, data: any) -> any:
+        """
+        Unpack the data from the loaded file.
+
+        Parameters:
+            data: the data to unpack
+
+        Returns:
+            the unpacked data
+        """
+        return data
+
+
+class MemMapDataset(GeneralStructure):
+    """
+    Dataset structure for memory-mapped numpy arrays and props.pkl files.
+    """
+
+    _ext = ".mmap"
+    _extra_files = ["props.pkl"]
+
+    @property
+    def load_fn(self):
+        return np.memmap
+
+    def save_preprocess(self, preprocess_path, data_keys, data_dict, extra_data_keys, extra_data_types) -> List[str]:
+        local_paths = []
+        for key in data_keys:
+            local_path = self.join_and_ext(preprocess_path, key)
+            out = np.memmap(local_path, mode="w+", dtype=data_dict[key].dtype, shape=data_dict[key].shape)
+            out[:] = data_dict.pop(key)[:]
+            out.flush()
+            local_paths.append(local_path)
+
+        # save smiles and subset
+        local_path = p_join(preprocess_path, "props.pkl")
+
+        # assert that (required) pkl keys are present in data_dict
+        assert all([key in data_dict.keys() for key in extra_data_keys])
+
+        # store unique and inverse indices for str-based pkl keys
+        for key in extra_data_keys:
+            if extra_data_types[key] == str:
+                data_dict[key] = np.unique(data_dict[key], return_inverse=True)
+
+        with open(local_path, "wb") as f:
+            pkl.dump(data_dict, f)
+
+        local_paths.append(local_path)
+        return local_paths
+
+    def load_extra_files(self, data, preprocess_path, data_keys, pkl_data_keys, overwrite):
+        filename = p_join(preprocess_path, "props.pkl")
+        pull_locally(filename, overwrite=overwrite)
+        with open(filename, "rb") as f:
+            tmp = pkl.load(f)
+            all_pkl_keys = set(tmp.keys()) - set(data_keys)
+            # assert required pkl_keys are present in all_pkl_keys
+            assert all([key in all_pkl_keys for key in pkl_data_keys])
+            for key in all_pkl_keys:
+                x = tmp.pop(key)
+                if len(x) == 2:
+                    # a (unique values, inverse indices) tuple saved for str-based keys
+                    data[key] = x[0][x[1]]
+                else:
+                    data[key] = x
+        return data
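+
+
+# Illustrative MemMapDataset round trip (variable names and the path are
+# hypothetical, not part of the package API):
+#
+#   structure = MemMapDataset()
+#   structure.save_preprocess(
+#       preprocess_path="cache/my_dataset/preprocessed",
+#       data_keys=["energies"],
+#       data_dict={"energies": energies, "name": names, "subset": subsets},
+#       extra_data_keys=["name", "subset"],
+#       extra_data_types={"name": str, "subset": str},
+#   )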
+ """ + + _ext = ".zip" + _extra_files = ["metadata.zip"] + _zarr_version = 2 + + @property + def load_fn(self): + return zarr.open + + def unpack(self, data): + return data[:] + + def save_preprocess(self, preprocess_path, data_keys, data_dict, extra_data_keys, extra_data_types) -> List[str]: + # os.makedirs(p_join(ds.root, "zips", ds.__name__), exist_ok=True) + local_paths = [] + for key, value in data_dict.items(): + if key not in data_keys: + continue + zarr_path = self.join_and_ext(preprocess_path, key) + value = data_dict.pop(key) + z = zarr.open( + zarr.storage.ZipStore(zarr_path), + "w", + zarr_version=self._zarr_version, + shape=value.shape, + dtype=value.dtype, + ) + z[:] = value[:] + local_paths.append(zarr_path) + # if key in attrs: + # z.attrs.update(attrs[key]) + + metadata = p_join(preprocess_path, "metadata.zip") + + group = zarr.group(zarr.storage.ZipStore(metadata)) + + for key in extra_data_keys: + if extra_data_types[key] == str: + data_dict[key] = np.unique(data_dict[key], return_inverse=True) + + for key, value in data_dict.items(): + # sub=group.create_group(key) + if key in ["name", "subset"]: + data = group.create_dataset(key, shape=value[0].shape, dtype=value[0].dtype) + data[:] = value[0][:] + data2 = group.create_dataset(key + "_ptr", shape=value[1].shape, dtype=np.int32) + data2[:] = value[1][:] + else: + data = group.create_dataset(key, shape=value.shape, dtype=value.dtype) + data[:] = value[:] + local_paths.append(metadata) + return local_paths + + def load_extra_files(self, data, preprocess_path, data_keys, pkl_data_keys, overwrite): + filename = self.join_and_ext(preprocess_path, "metadata") + pull_locally(filename, overwrite=overwrite) + tmp = self.load_fn(filename) + all_pkl_keys = set(tmp.keys()) - set(data_keys) + # assert required pkl_keys are present in all_pkl_keys + assert all([key in all_pkl_keys for key in pkl_data_keys]) + for key in all_pkl_keys: + if key not in pkl_data_keys: + data[key] = tmp[key][:][tmp[key][:]] + else: + data[key] = tmp[key][:] + return data + + # TODO: checksum , maybe convert to archive instead of zips diff --git a/openqdc/methods/atom_energies.py b/openqdc/methods/atom_energies.py index fed41dc4..523ff171 100644 --- a/openqdc/methods/atom_energies.py +++ b/openqdc/methods/atom_energies.py @@ -1,6 +1,6 @@ import ast import pkgutil -from typing import Tuple +from typing import Dict, Tuple import numpy as np from loguru import logger @@ -18,19 +18,15 @@ atom_energy_collection = {k.lower(): v for k, v in atom_energy_collection.items()} -def to_e_matrix(atom_energies: dict) -> np.ndarray: +def to_e_matrix(atom_energies: Dict) -> np.ndarray: """ Get the matrix of isolated atom energies for a dict of non-null values calculates - Parameters - ---------- - atom_energies: dict - Dict of energies computed for a given QM method. - Keys are pairs of (atom, charge) and values are energy values + Parameters: + atom_energies: Dict of energies computed for a given QM method. 
diff --git a/openqdc/methods/atom_energies.py b/openqdc/methods/atom_energies.py
index fed41dc4..523ff171 100644
--- a/openqdc/methods/atom_energies.py
+++ b/openqdc/methods/atom_energies.py
@@ -1,6 +1,6 @@
 import ast
 import pkgutil
-from typing import Tuple
+from typing import Dict, Tuple
 
 import numpy as np
 from loguru import logger
@@ -18,19 +18,15 @@
 atom_energy_collection = {k.lower(): v for k, v in atom_energy_collection.items()}
 
 
-def to_e_matrix(atom_energies: dict) -> np.ndarray:
+def to_e_matrix(atom_energies: Dict) -> np.ndarray:
     """
-    Get the matrix of isolated atom energies for a dict of non-null values calculates
+    Get the matrix of isolated atom energies for a dict of non-null values.
 
-    Parameters
-    ----------
-    atom_energies: dict
-        Dict of energies computed for a given QM method.
-        Keys are pairs of (atom, charge) and values are energy values
+    Parameters:
+        atom_energies: Dict of energies computed for a given QM method.
+            Keys are pairs of (atom, charge) and values are energy values
 
-    Returns
-    -------
-    np.ndarray of shape (MAX_ATOMIC_NUMBER, 2 * MAX_CHARGE + 1)
+    Returns:
+        np.ndarray of shape (MAX_ATOMIC_NUMBER, 2 * MAX_CHARGE + 1)
         Matrix containing the isolated atom energies for each atom and charge
         written in the form:
 
                         |   | -2 | -1 | 0 | +1 | +2 | <- charges
diff --git a/openqdc/utils/download_api.py b/openqdc/utils/download_api.py
index c96f3d91..c73c752d 100644
--- a/openqdc/utils/download_api.py
+++ b/openqdc/utils/download_api.py
@@ -14,7 +14,9 @@
 import gdown
 import requests
 import tqdm
-from aiohttp import ClientTimeout
+
+# from aiohttp import ClientTimeout
+from dotenv import load_dotenv
 from fsspec import AbstractFileSystem
 from fsspec.callbacks import TqdmCallback
 from fsspec.implementations.local import LocalFileSystem
@@ -27,25 +29,39 @@
 @dataclass
 class FileSystem:
     """
-    A class to handle file system operations
+    A basic class to handle file system operations
     """
 
     public_endpoint: Optional[AbstractFileSystem] = None
     private_endpoint: Optional[AbstractFileSystem] = None
     local_endpoint: AbstractFileSystem = LocalFileSystem()
 
+    def __init__(self):
+        load_dotenv()  # load environment variables from .env
+        self.KEY = os.getenv("CLOUDFARE_KEY", None)
+        self.SECRET = os.getenv("CLOUDFARE_SECRET", None)
+
     @property
     def public(self):
+        """
+        Return the public remote filesystem with read permission
+        """
         self.connect()
         return self.public_endpoint
 
     @property
     def private(self):
+        """
+        Return the private remote filesystem with write permission
+        """
         self.connect()
         return self.private_endpoint
 
     @property
     def local(self):
+        """
+        Return the local filesystem
+        """
         return self.local_endpoint
 
     @property
@@ -57,23 +73,29 @@ def is_connected(self):
 
     def connect(self):
         """
-        Attempt connection to the public and private endpoints
+        Attempt connection to the public and private remote endpoints
        """
         if not self.is_connected:
             with warnings.catch_warnings():
                 warnings.simplefilter("ignore")  # No quota warning
                 self.public_endpoint = self.get_default_endpoint("public")
                 self.private_endpoint = self.get_default_endpoint("private")
-            self.public_endpoint.client_kwargs = {"timeout": ClientTimeout(total=3600, connect=1000)}
+            # self.public_endpoint.client_kwargs = {"timeout": ClientTimeout(total=3600, connect=1000)}
 
     def get_default_endpoint(self, endpoint: str) -> AbstractFileSystem:
         """
         Return a default endpoint for the given str [public, private]
         """
         if endpoint == "private":
-            return fsspec.filesystem("gs")
+            return fsspec.filesystem(
+                "s3",
+                key=self.KEY,
+                secret=self.SECRET,
+                endpoint_url=ioqdc.request_s3fs_config()["endpoint_url"],
+            )
         elif endpoint == "public":
-            return fsspec.filesystem("https")
+            # return fsspec.filesystem("https")
+            return fsspec.filesystem("s3", **ioqdc.request_s3fs_config())
         else:
             return self.local_endpoint
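The private endpoint now authenticates with credentials read through python-dotenv, so write access expects a local .env file defining CLOUDFARE_KEY and CLOUDFARE_SECRET (spelling as in the code above). A minimal sanity check, assuming such a file exists in the working directory:

    import os
    from dotenv import load_dotenv

    load_dotenv()  # reads .env from the working directory
    assert os.getenv("CLOUDFARE_KEY") and os.getenv("CLOUDFARE_SECRET")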
diff --git a/openqdc/utils/io.py b/openqdc/utils/io.py
index 5e039960..08f69e1a 100644
--- a/openqdc/utils/io.py
+++ b/openqdc/utils/io.py
@@ -3,6 +3,8 @@
 import json
 import os
 import pickle as pkl
+
+# from os.path import join as p_join
 from typing import Dict, List, Optional
 
 import fsspec
@@ -23,6 +25,12 @@
     "~/.cache/openqdc" if "OPENQDC_CACHE_DIR" not in os.environ else os.path.normpath(os.environ["OPENQDC_CACHE_DIR"])
 )
 
+_OPENQDC_DOWNLOAD_API = {
+    "s3": "/openqdc/v1",
+    # "https" : "https://storage.openqdc.org/v1",
+    "gs": "https://storage.googleapis.com/qmdata-public/openqdc",
+}
+
 
 def set_cache_dir(d):
     r"""
@@ -54,9 +62,11 @@ def get_remote_cache(write_access=False) -> str:
     """
     Returns the entry point based on the write access.
     """
     if write_access:
-        remote_cache = "gs://qmdata-public/openqdc"
+        remote_cache = "openqdc/v1"  # "gs://qmdata-public/openqdc"
     else:
-        remote_cache = "https://storage.googleapis.com/qmdata-public/openqdc"
+        remote_cache = _OPENQDC_DOWNLOAD_API.get(os.environ.get("OPENQDC_DOWNLOAD_API", "s3"))
     return remote_cache
@@ -78,6 +88,7 @@ def pull_locally(local_path, overwrite=False):
     """
     Retrieve file from remote gs path or local cache
     """
+    remote_path = local_path.replace(get_local_cache(), get_remote_cache())
     os.makedirs(os.path.dirname(local_path), exist_ok=True)
     if not os.path.exists(local_path) or overwrite:
@@ -85,6 +96,15 @@
     return local_path
 
 
+def request_s3fs_config():
+    import httpx
+
+    response = httpx.get("https://storage.openqdc.org/config.json")
+    response.raise_for_status()
+    config = response.json()
+    return config
+
+
 def copy_exists(local_path):
     remote_path = local_path.replace(get_local_cache(), get_remote_cache())
     return os.path.exists(local_path) or API.exists(remote_path)
@@ -150,8 +170,8 @@ def load_hdf5_file(hdf5_file_path: str):
 
     # inorder to enable multiprocessing:
     # https://github.com/fsspec/gcsfs/issues/379#issuecomment-839929801
-    fsspec.asyn.iothread[0] = None
-    fsspec.asyn.loop[0] = None
+    # fsspec.asyn.iothread[0] = None
+    # fsspec.asyn.loop[0] = None
 
     return file
@@ -177,7 +197,7 @@ def load_xyz(path):
     return MolFromXYZFile(path)
 
 
-def dict_to_atoms(d: dict, ext: bool = False, energy_method: int = 0) -> Atoms:
+def dict_to_atoms(d: Dict, ext: bool = False, energy_method: int = 0) -> Atoms:
     """
     Converts dictionary to ase atoms object
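With these changes, pull_locally derives the remote location by prefix substitution on the cached path; under the default OPENQDC_DOWNLOAD_API=s3 the mapping looks like this (paths illustrative):

    # local:  ~/.cache/openqdc/qm7/energies.mmap
    # remote: /openqdc/v1/qm7/energies.mmap   (local cache prefix replaced by the s3 entry)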
diff --git a/openqdc/utils/package_utils.py b/openqdc/utils/package_utils.py
index 990f6cb3..e1381dad 100644
--- a/openqdc/utils/package_utils.py
+++ b/openqdc/utils/package_utils.py
@@ -1,3 +1,4 @@
+# from openFF package
 import importlib
 from functools import wraps
 from typing import Any, Callable, TypeVar
diff --git a/openqdc/utils/preprocess.py b/openqdc/utils/preprocess.py
deleted file mode 100644
index 1171a68c..00000000
--- a/openqdc/utils/preprocess.py
+++ /dev/null
@@ -1,38 +0,0 @@
-"""Dataset preprocessing."""
-
-import click
-import numpy as np
-from loguru import logger
-
-from openqdc import AVAILABLE_DATASETS
-
-options = list(AVAILABLE_DATASETS.values())
-options_map = {d.__name__.lower(): d for d in options}
-
-
-@click.command()
-@click.option("--dataset", "-d", type=str, default="ani1", help="Dataset name or index.")
-@click.option("--upload", "-u", type=bool, default=False, help="Try to upload it to the remote storage.")
-def preprocess(dataset, upload):
-    if dataset not in options_map:
-        dataset_id = int(dataset)
-        data_class = options[dataset_id]
-    else:
-        data_class = options_map[dataset]
-
-    data_class.no_init().preprocess(upload=upload, overwrite=True)
-    data = data_class()
-    logger.info(f"Preprocessing {data.__name__}")
-
-    n = len(data)
-    for i in np.random.choice(n, 3, replace=False):
-        x = data[i]
-        print(x.name, x.subset, end=" ")
-        for k in x:
-            if isinstance(x[k], np.ndarray):
-                print(k, x[k].shape, end=" ")
-        print()
-
-
-if __name__ == "__main__":
-    preprocess()
diff --git a/openqdc/utils/regressor.py b/openqdc/utils/regressor.py
index 1d3e50ad..0c23d9b4 100644
--- a/openqdc/utils/regressor.py
+++ b/openqdc/utils/regressor.py
@@ -7,8 +7,6 @@
 import pandas as pd
 from loguru import logger
 
-SubSampleFrac = Union[float, int]
-
 
 def non_nan_idxs(array):
     """
@@ -24,7 +22,18 @@ class Solver(ABC):
 
     @staticmethod
     @abstractmethod
-    def solve(X, Y):
+    def solve(X: np.ndarray, Y: np.ndarray) -> Tuple[np.ndarray, Optional[np.ndarray]]:
+        """
+        Main method to solve the regression problem.
+        Must be implemented in all the subclasses.
+
+        Parameters:
+            X: Input features of shape (n_samples, n_species)
+            Y: Target values of shape (n_samples,) (energy values for the regression)
+
+        Returns:
+            Tuple of the fitted isolated atom energies and the estimated uncertainty.
+        """
         pass
 
     def __call__(self, X, Y):
@@ -38,7 +47,26 @@ def __repr__(self):
 
 
 class Regressor:
-    """Regressor class for preparing and solving regression problem for isolated atom energies."""
+    """
+    Regressor class for preparing and solving regression problem for isolated atom energies.
+    An isolated atom energy regression problem is defined as:\n
+    X = [n_samples, n_species] (number of atoms of each species per sample)\n
+    Y = [n_samples, ] (energies)\n
+    The regression problem is solved by solving the linear system X E0 = Y.
+
+    Example:
+        For a system of 2 samples (H2O, CH4)\n
+        n_species = 3, n_samples = 2\n
+        H2O = 2H, 1O -> X = [2, 1, 0]\n
+        CH4 = 1C, 4H -> X = [4, 0, 1]\n
+        X = [[2, 1, 0],
+             [4, 0, 1]]\n
+        Y = [10, 20]\n
+        X E0 = Y\n
+        Linear system to solve:\n
+        [[2 eH + 1 eO + 0 eC],
+         [4 eH + 0 eO + 1 eC]] = [10, 20]\n
+    """
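+    # Worked check of the docstring example (illustrative, using numpy's
+    # least squares to solve X E0 = Y for the per-species energies):
+    #   >>> import numpy as np
+    #   >>> X = np.array([[2.0, 1.0, 0.0], [4.0, 0.0, 1.0]])
+    #   >>> Y = np.array([10.0, 20.0])
+    #   >>> E0, *_ = np.linalg.lstsq(X, Y, rcond=None)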
 
     solver: Solver
 
     def __init__(
         self,
         energies: np.ndarray,
         atomic_numbers: np.ndarray,
         position_idx_range: np.ndarray,
         solver_type: str = "linear",
         stride: int = 1,
-        subsample: Optional[SubSampleFrac] = None,
+        subsample: Optional[Union[float, int]] = None,
         remove_nan: bool = True,
-        *args,
-        **kwargs,
+        *args: any,
+        **kwargs: any,
     ):
         """
-        Parameters
-        ----------
-        energies
-            numpy array of energies in the shape (n_samples, n_energy_methods)
-        atomic_numbers
-            numpy array of atomic numbers in the shape (n_atoms,)
-        position_idx_range
-            array of shape (n_samples, 2) containing the start and end indices of the atoms in the dataset
-        stride
-            Stride to use for the regression.
-        subsample
-            Sumsample the dataset. If a float, it is interpreted as a fraction of the dataset to use.
-            If >1 it is interpreted as the number of samples to use.
-        remove_nan
-            Sanitize the dataset by removing energies samples with NaN values.
+        Regressor class for preparing and solving regression problem for isolated atom energies.
+
+        Parameters:
+            energies:
+                numpy array of energies in the shape (n_samples, n_energy_methods)
+            atomic_numbers:
+                numpy array of atomic numbers in the shape (n_atoms,)
+            position_idx_range:
+                array of shape (n_samples, 2) containing the start and end indices of the atoms in the dataset
+            solver_type: Type of solver to use. ["linear", "ridge"]
+            stride: Stride to use for the regression.
+            subsample: Subsample the dataset.
+                If a float, it is interpreted as a fraction of the dataset to use.
+                If >1 it is interpreted as the number of samples to use.
+            remove_nan: Sanitize the dataset by removing energies samples with NaN values.
+            *args: Additional arguments to be passed to the regressor.
+            **kwargs: Additional keyword arguments to be passed to the regressor.
         """
         self.subsample = subsample
         self.stride = stride
@@ -87,7 +117,19 @@
     @classmethod
-    def from_openqdc_dataset(cls, dataset, *args, **kwargs):
+    def from_openqdc_dataset(cls, dataset: any, *args: any, **kwargs: any) -> "Regressor":
+        """
+        Initialize the regressor object from an openqdc dataset. This is the default method.
+        *args and **kwargs are passed to the __init__ method and depend on the specific regressor.
+
+        Parameters:
+            dataset: openqdc dataset object.
+            *args: Additional arguments to be passed to the regressor.
+            **kwargs: Additional keyword arguments to be passed to the regressor.
+
+        Returns:
+            Instance of the regressor class.
+        """
         energies = dataset.data["energies"]
         position_idx_range = dataset.data["position_idx_range"]
         atomic_numbers = dataset.data["atomic_inputs"][:, 0].astype("int32")
@@ -116,12 +158,11 @@ def _downsample(self):
         self.update_hparams({"idxs": idxs})
 
     def _get_solver(self):
-        if self.solver_type == "linear":
+        try:
+            return AVAILABLE_SOLVERS[self.solver_type]()
+        except KeyError:
+            logger.warning(f"Unknown solver type {self.solver_type}, defaulting to linear regression.")
             return LinearSolver()
-        elif self.solver_type == "ridge":
-            return RidgeSolver()
-        logger.warning(f"Unknown solver type {self.solver_type}, defaulting to linear regression.")
-        return LinearSolver()
 
     def _prepare_inputs(self) -> Tuple[np.ndarray, np.ndarray]:
         logger.info("Preparing inputs for regression.")
@@ -137,6 +178,9 @@ def _prepare_inputs(self) -> Tuple[np.ndarray, np.ndarray]:
         self.y = B
 
     def solve(self):
+        """
+        Solve the regression problem and return the predicted isolated energies and the estimated uncertainty.
+        """
         logger.info(f"Solving regression with {self.solver}.")
         E0_list, cov_list = [], []
         for energy_idx in range(self.y.shape[1]):
@@ -157,6 +201,11 @@ def __call__(self):
 
 
 def atom_standardization(X, y):
+    """
+    Standardize the energies and the atom counts.
+    This will make the calculated uncertainty more
+    meaningful.
+    """
     X_norm = X.sum()
     X = X / X_norm
     y = y / X_norm
@@ -165,7 +214,14 @@
 
 class LinearSolver(Solver):
-    _regr_str = "LinearRegression"
+    """
+    Linear regression solver.
+
+    Note:
+        No uncertainty is returned, as it is typically negligible.
+    """
+
+    _regr_str = "linear"
 
     @staticmethod
     def solve(X, y):
@@ -175,7 +231,11 @@
 
 class RidgeSolver(Solver):
-    _regr_str = "RidgeRegression"
+    """
+    Ridge regression solver.
+    """
+
+    _regr_str = "ridge"
 
     @staticmethod
     def solve(X, y):
@@ -189,3 +249,10 @@
         cov = np.sqrt(sigma2 * np.einsum("ij,kj,kl,li->i", Ainv, X, X, Ainv))
         mean = mean + y_mean.reshape([-1])
         return mean, cov
+
+
+AVAILABLE_SOLVERS = {
+    cls._regr_str: cls
+    for str_name, cls in globals().items()
+    if isinstance(cls, type) and issubclass(cls, Solver) and str_name != "Solver"  # Exclude the base class
+}
diff --git a/openqdc/utils/units.py b/openqdc/utils/units.py
index d8613a58..898faf8a 100644
--- a/openqdc/utils/units.py
+++ b/openqdc/utils/units.py
@@ -1,11 +1,14 @@
 """
-Unit conversion utils.
+Units conversion utilities module.
 
-Energy units:
-    ["kcal/mol", "kj/mol", "hartree", "ev"]
+Available Energy units:
+    ["kcal/mol", "kj/mol", "hartree", "ev", "mev", "ryd"]
 
-Distance units:
+Available Distance units:
     ["ang", "nm", "bohr"]
+
+Available Force units:
+    Combinations between Energy and Distance units
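+
+Example (the factor follows the "ev" -> "kcal/mol" entry registered below):
+    >>> from openqdc.utils.units import get_conversion
+    >>> get_conversion("ev", "kcal/mol")(1.0)
+    23.0605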
 """
 
 from enum import Enum, unique
@@ -40,7 +43,16 @@ class EnergyTypeConversion(ConversionEnum, StrEnum):
     MEV = "mev"
     RYD = "ryd"
 
-    def to(self, energy: "EnergyTypeConversion"):
+    def to(self, energy: "EnergyTypeConversion") -> Callable[[float], float]:
+        """
+        Get the conversion function to convert the energy to the desired units.
+
+        Parameters:
+            energy: energy unit to convert to
+
+        Returns:
+            callable to convert the energy to the desired units
+        """
         return get_conversion(str(self), str(energy))
 
 
@@ -54,7 +66,17 @@ class DistanceTypeConversion(ConversionEnum, StrEnum):
     NM = "nm"
     BOHR = "bohr"
 
-    def to(self, distance: "DistanceTypeConversion", fraction: bool = False):
+    def to(self, distance: "DistanceTypeConversion", fraction: bool = False) -> Callable[[float], float]:
+        """
+        Get the conversion function to convert the distance to the desired units.
+
+        Parameters:
+            distance: distance unit to convert to
+            fraction: whether to convert the distance itself (distance^1) or its inverse (distance^-1)
+
+        Returns:
+            callable to convert the distance to the desired units
+        """
         return get_conversion(str(self), str(distance)) if not fraction else get_conversion(str(distance), str(self))
 
 
@@ -91,33 +113,32 @@ def __init__(self, energy: EnergyTypeConversion, distance: DistanceTypeConversio
 
     def __str__(self):
         return f"{self.energy}/{self.distance}"
 
-    def to(self, energy: EnergyTypeConversion, distance: DistanceTypeConversion):
+    def to(self, energy: EnergyTypeConversion, distance: DistanceTypeConversion) -> Callable[[float], float]:
+        """
+        Get the conversion function to convert the force to the desired units.
+
+        Parameters:
+            energy: energy unit to convert to
+            distance: distance unit to convert to
+
+        Returns:
+            callable to convert the force to the desired units
+        """
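+        # The composition below converts the energy part and the inverse
+        # distance part separately, e.g. hartree/bohr -> kcal/mol/ang applies
+        # (hartree -> kcal/mol) then (bohr^-1 -> ang^-1).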
         return lambda x: self.distance.to(distance, fraction=True)(self.energy.to(energy)(x))
 
 
 class Conversion:
     """
-    Conversion from one unit system to another.
-
-    Attributes
-    ----------
-    name
-        A human-readable name for the conversion
-    fn:
-        The callable to compute the conversion
+    Conversion from one unit system to another, defined by a name and a callable.
     """
 
     def __init__(self, in_unit: str, out_unit: str, func: Callable[[float], float]):
         """
-        Parameters
-        ----------
-        in_unit
-            String defining the units of the current values
-        out_unit
-            String defining the target units
-        func
-            The callable to compute the conversion
+        Parameters:
+            in_unit: String defining the units of the current values
+            out_unit: String defining the target units
+            func: The callable to compute the conversion
         """
 
         name = "convert_" + in_unit.lower().strip() + "_to_" + out_unit.lower().strip()
@@ -129,11 +150,20 @@ def __init__(self, in_unit: str, out_unit: str, func: Callable[[float], float]):
         self.fn = func
 
     def __call__(self, x):
-        """Convert measure"""
         return self.fn(x)
 
 
-def get_conversion(in_unit: str, out_unit: str):
+def get_conversion(in_unit: str, out_unit: str) -> Callable[[float], float]:
+    """
+    Utility function to get the conversion function between two units.
+
+    Parameters:
+        in_unit: The input unit
+        out_unit: The output unit
+
+    Returns:
+        The conversion function
+    """
     name = "convert_" + in_unit.lower().strip() + "_to_" + out_unit.lower().strip()
     if in_unit.lower().strip() == out_unit.lower().strip():
         return lambda x: x
@@ -142,6 +172,8 @@
     return CONVERSION_REGISTRY[name]
 
 
+# Conversion definitions
+
 # ev conversion
 Conversion("ev", "kcal/mol", lambda x: x * 23.0605)
 Conversion("ev", "hartree", lambda x: x * 0.0367493)
diff --git a/pyproject.toml b/pyproject.toml
index 43f414b8..d5e6a002 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -44,7 +44,10 @@ dependencies = [
     "ase" ,
     "gdown",
     "h5py >= 3.8.0" ,
-    "dscribe"
+    "dscribe",
+    "zarr",
+    "python-dotenv",
+    "s3fs",
 ]
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 00000000..0718b5fb
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,8 @@
+[pytest]
+tmp_path_retention_policy = none
+filterwarnings =
+    ignore::DeprecationWarning
+    ignore::UserWarning
+
+markers =
+    download: tests for datasets downloading
diff --git a/tests/test_download.py b/tests/test_download.py
new file mode 100644
index 00000000..dd53d405
--- /dev/null
+++ b/tests/test_download.py
@@ -0,0 +1,15 @@
+from os.path import join as p_join
+from pathlib import Path
+
+import pytest
+
+from openqdc.datasets import QM7
+
+
+@pytest.mark.download
+def test_API_download(tmp_path, monkeypatch):
+    monkeypatch.chdir(tmp_path)
+    ds = QM7(cache_dir=tmp_path)
+    for filename in ["energies.mmap", "position_idx_range.mmap", "atomic_inputs.mmap", "props.pkl"]:
+        assert (Path(p_join(tmp_path, ds.preprocess_path, filename))).exists()
+    monkeypatch.undo()
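The download marker registered in pytest.ini lets CI include or exclude this network-dependent test by selection; to run just this test locally (assuming the remote endpoint is reachable):

    python -m pytest tests/test_download.py -m download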