Merge branch 'main' of https://github.com/leojklarner/gauche

leojklarner · Dec 10, 2023 · 25e4067 · 25e4067
2 parents 8150d6d + be45d9a
commit 25e4067
Show file tree

Hide file tree

Showing 25 changed files with 190 additions and 73 deletions.
diff --git a/.github/workflows/build_documentation.yaml b/.github/workflows/build_documentation.yaml
@@ -11,9 +11,17 @@ jobs:
     steps:
       - uses: actions/checkout@v3
       - uses: actions/setup-python@v3
+        with:
+          python-version: "3.11"
+
+      - name: Install Pandoc
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y pandoc
+  
       - name: Install dependencies
         run: |
-          pip install sphinx furo myst_parser
+          pip install .[all,docs]
       - name: Sphinx build
         run: |
           sphinx-build docs/source _build

diff --git a/.requirements/docs.in b/.requirements/docs.in
@@ -1,2 +1,15 @@
+furo
+nbsphinx
+nbsphinx-link
+sphinx-copybutton
+m2r2
+nbstripout
+pandoc
+pydocstyle
 sphinx
-furo
+sphinx-inline-tabs
+sphinxext-opengraph
+sphinxcontrib-gtagjs
+ipython
+watermark
+sphinx_codeautolink
diff --git a/README.md b/README.md
@@ -86,7 +86,7 @@ The easiest way to get started with GAUCHE is to check out our tutorial notebook
 | [GP Regression on Molecules](https://leojklarner.github.io/gauche/notebooks/gp_regression_on_molecules.html)  |  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/leojklarner/gauche/blob/main/notebooks/GP%20Regression%20on%20Molecules.ipynb)   |
 | [Bayesian Optimisation Over Molecules](https://leojklarner.github.io/gauche/notebooks/bayesian_optimisation_over_molecules.html)  |  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/leojklarner/gauche/blob/main/notebooks/Bayesian%20Optimisation%20Over%20Molecules.ipynb)   |
 | [Multioutput Gaussian Processes for Multitask Learning](https://leojklarner.github.io/gauche/notebooks/multitask_gp_regression_on_molecules.html)  |  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/leojklarner/gauche/blob/main/notebooks/Multitask%20GP%20Regression%20on%20Molecules.ipynb)   |
-| [Training GPs on Graphs](https://leojklarner.github.io/gauche/notebooks/Training%20GPs%20on%20Graphs.html)  |  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/leojklarner/gauche/blob/main/notebooks/Training%20GPs%20on%20Graphs.ipynb)   |
+| [Training GPs on Graphs](https://leojklarner.github.io/gauche/notebooks/training_gps_on_graphs.html)  |  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/leojklarner/gauche/blob/main/notebooks/Training%20GPs%20on%20Graphs.ipynb)   |
 | [Sparse GP Regression for Big Molecular Data](https://leojklarner.github.io/gauche/notebooks/sparse_gp_regression_for_big_molecular_data.html)  |  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/leojklarner/gauche/blob/main/notebooks/Sparse%20GP%20Regression%20for%20Big%20Molecular%20Data.ipynb)   |
 |[Molecular Preference Learning](https://github.com/leojklarner/gauche/blob/main/notebooks/Molecular%20Preference%20Learning.ipynb)|[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/leojklarner/gauche/blob/main/notebooks/Molecular%20Preference%20Learning.ipynb) |
 |[Preferential Bayesian Optimisation](https://github.com/leojklarner/gauche/blob/main/notebooks/Preferential%20Bayesian%20Optimisation.ipynb)|[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/leojklarner/gauche/blob/main/notebooks/Preferential%20Bayesian%20Optimisation.ipynb) |

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -10,10 +10,12 @@
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
 #
-# import os
-# import sys
 
-# sys.path.insert(0, os.path.abspath("../../../gauche"))
+import os
+import sys
+
+print(sys.executable)
+sys.path.insert(0, os.path.abspath("."))
 
 
 # -- Project information -----------------------------------------------------
@@ -23,7 +25,7 @@
 author = "Ryan Rhys-Griffiths"
 
 # The full version, including alpha/beta/rc tags
-release = "0.1.0"
+# release = "1.0.0"
 
 
 # -- General configuration ---------------------------------------------------
@@ -33,15 +35,19 @@
 # ones.
 extensions = [
     "sphinx.ext.autodoc",
+    "sphinx.ext.autosummary",
     "sphinx.ext.intersphinx",
     "sphinx.ext.viewcode",
-    # "sphinx_copybutton",
-    # "sphinx_inline_tabs",
-    # "sphinxcontrib.gtagjs",
-    # "sphinxext.opengraph",
-    # "m2r2",
-    # "nbsphinx",
-    # "nbsphinx_link",
+    "sphinx_copybutton",
+    "sphinx_inline_tabs",
+    "sphinxcontrib.gtagjs",
+    "sphinxext.opengraph",
+    "m2r2",
+    "nbsphinx",
+    "nbsphinx_link",
+    "sphinx.ext.napoleon",
+    "sphinx_codeautolink",
+    # "sphinx_autorun",
 ]
 
 # Add any paths that contain templates here, relative to this directory.

diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -3,16 +3,63 @@
    You can adapt this file completely to your liking, but it should at least
    contain the root `toctree` directive.
 
-Welcome to GAUCHE's documentation!
-==================================
+Documentation
+==================
+
+.. image:: ../../imgs/gauche_banner_1.png
+    :width: 100%
+    :alt: GAUCHE Logo
+    :align: left
+
+**GAUCHE** is a collaborative, open-source software library that aims to make state-of-the-art probabilistic modelling and black-box optimisation techniques more easily accessible to scientific experts in chemistry, materials science and beyond. We provide 30+ bespoke kernels for molecules, chemical reactions and proteins and illustrate how they can be used for Gaussian processes and Bayesian optimisation in 10+ easy-to-adapt tutorial notebooks.
+
+`Paper (NeurIPS 2023) <https://arxiv.org/abs/2212.04450>`_
+
+Overview
+==========
+
+General-purpose Gaussian process (GP) and Bayesian optimisation (BO) libraries do not cater for molecular representations. Likewise, general-purpose molecular machine learning libraries do not consider GPs and BO. To bridge this gap, GAUCHE provides a modular, robust and easy-to-use framework of 30+ parallelisable and batch-GP-compatible implementations of string, fingerprint and graph kernels that operate on a range of widely-used molecular representations.
+
+.. image:: ../../imgs/gauche_overview.png
+    :width: 100 %
+    :alt: GAUCHE Overview
+    :align: left
+
 
+Kernels
+---------
 
-.. include:: readme.rst
+Standard GP packages typically assume continuous input spaces of low and fixed dimensionality. This makes it difficult to apply them to common molecular representations: molecular graphs are discrete objects, SMILES strings vary in length and topological fingerprints tend to be high-dimensional and sparse. To bridge this gap, GAUCHE provides:
+
+* **Fingerprint Kernels** that measure the similarity between bit/count vectors of descriptors by examining the degree to which their elements overlap.
+* **String Kernels** that measure the similarity between strings by examining the degree to which their sub-strings overlap.
+* **Graph Kernels** that measure the similarity between graphs by examining the degree to which certain substructural motifs overlap.
+
+Representations
+-----------------
+
+GAUCHE supports any representation that is based on bit/count vectors, strings or graphs. For rapid prototyping and benchmarking, we also provide a range of standard featurisation techniques for molecules, chemical reactions and proteins:
+
+.. list-table::
+   :header-rows: 1
+
+   * - Domain
+     - Representation
+   * - Molecules
+     - ECFP Fingerprints [1], rdkit Fragments, Fragprints, Graphs [2], SMILES [3], SELFIES [4]   
+   * - Chemical Reactions
+     - One-Hot Encoding, Data-Driven Reaction Fingerprints [6], Differential Reaction Fingerprints [5], Reaction SMARTS
+   * - Proteins  
+     - Sequences, Graphs [2]
+
+Getting Started
+-----------------
+
+The easiest way to get started with GAUCHE is to check out our tutorial notebooks:
 
 
 .. toctree::
-   :maxdepth: 2
-   :caption: Tutorials
+   :maxdepth: 1
 
    notebooks/gp_regression_on_molecules.nblink
    notebooks/bayesian_optimisation_over_molecules.nblink
@@ -26,6 +73,16 @@ Welcome to GAUCHE's documentation!
    notebooks/external_graph_kernels.nblink
 
 
+
+Extensions
+-----------------
+
+If there are any specific kernels or representations that you would like to see included in GAUCHE, please reach out or submit an issue/pull request.
+
+
+GAUCHE's API
+================
+
 .. toctree::
    :maxdepth: 3
    :caption: API Reference
@@ -34,11 +91,26 @@ Welcome to GAUCHE's documentation!
    modules/representations
    modules/dataloader
 
-
-
 Indices and tables
 ==================
 
 * :ref:`genindex`
 * :ref:`modindex`
 * :ref:`search`
+
+References
+==================
+
+.. _bibliography:
+
+[1] Rogers, D. and Hahn, M., 2010. `Extended-connectivity fingerprints. <https://pubs.acs.org/doi/abs/10.1021/ci100050t>`_ Journal of Chemical Information and Modeling, 50(5), pp.742-754.
+
+[2] Jamasb, A., Viñas Torné, R., Ma, E., Du, Y., Harris, C., Huang, K., Hall, D., Lió, P. and Blundell, T., 2022. `Graphein-a Python library for geometric deep learning and network analysis on biomolecular structures and interaction networks <https://proceedings.neurips.cc/paper_files/paper/2022/hash/ade039c1db0391106a3375bd2feb310a-Abstract-Conference.html>`_. Advances in Neural Information Processing Systems, 35, pp.27153-27167.
+
+[3] Weininger, D., 1988. `SMILES, a chemical language and information system. 1. Introduction to methodology and encoding rules. <https://pubs.acs.org/doi/pdf/10.1021/ci00057a005>`_ Journal of Chemical Information and Computer Sciences, 28(1), pp.31-36.
+
+[4] Krenn, M., Häse, F., Nigam, A., Friederich, P. and Aspuru-Guzik, A., 2020. `Self-referencing embedded strings (SELFIES): A 100% robust molecular string representation <https://iopscience.iop.org/article/10.1088/2632-2153/aba947/meta>`_. Machine Learning: Science and Technology, 1(4), p.045024.
+
+[5] Probst, D., Schwaller, P. and Reymond, J.L., 2022. `Reaction classification and yield prediction using the differential reaction fingerprint DRFP <https://pubs.rsc.org/en/content/articlehtml/2022/dd/d1dd00006c>`_. Digital Discovery, 1(2), pp.91-97.
+
+[6] Schwaller, P., Probst, D., Vaucher, A.C., Nair, V.H., Kreutter, D., Laino, T. and Reymond, J.L., 2021. `Mapping the space of chemical reactions using attention-based neural networks <https://www.nature.com/articles/s42256-020-00284-w>`_. Nature Machine Intelligence, 3(2), pp.144-152.
diff --git a/docs/source/modules/dataloader.rst b/docs/source/modules/dataloader.rst
@@ -8,7 +8,7 @@ Dataloader
 
 Molecular Properties
 ----------------------
-.. automodule:: gauche.dataloader.mol_prop_loader
+.. automodule:: gauche.dataloader.molprop_loader
     :members:
 
 Reaction Loader

diff --git a/docs/source/readme.rst b/docs/source/readme.rst
@@ -1 +0,0 @@
-.. mdinclude:: ../../README.md

diff --git a/gauche/dataloader/reaction_loader.py b/gauche/dataloader/reaction_loader.py
@@ -19,6 +19,16 @@
 
 
 class ReactionLoader(DataLoader):
+    """
+    Data loader class for reaction yield prediction
+    datasets with a single regression target.
+    Expects input to be a csv file with either multiple SMILES
+    columns or a single reaction SMARTS column.
+    Contains methods to validate the dataset and to
+    transform the SMILES/SMARTS strings into different
+    molecular representations.
+    """
+
     def __init__(self):
         super(ReactionLoader, self).__init__()
         self.task = "reaction_yield_prediction"

diff --git a/gauche/kernels/fingerprint_kernels/braun_blanquet_kernel.py b/gauche/kernels/fingerprint_kernels/braun_blanquet_kernel.py
@@ -16,7 +16,7 @@ def batch_braun_blanquet_sim(
     Braun-Blanquet similarity between two batched tensors, across last 2 dimensions.
     eps argument ensures numerical stability if all zero tensors are added.
 
-    <x1, x2> / max(|x1|, |x2|)
+    :math:`<x1, x2> / max(|x1|, |x2|)`
 
     Where || is the L1 norm and <.> is the inner product
 

diff --git a/gauche/kernels/fingerprint_kernels/dice_kernel.py b/gauche/kernels/fingerprint_kernels/dice_kernel.py
@@ -14,7 +14,7 @@ def batch_dice_sim(
     Dice similarity between two batched tensors, across last 2 dimensions.
     eps argument ensures numerical stability if all zero tensors are added.
 
-    (2 * <x1, x2>) / (|x1| + |x2|)
+    :math:`(2 * <x1, x2>) / (|x1| + |x2|)`
 
     Where || is the L1 norm and <.> is the inner product
 
@@ -50,10 +50,10 @@ class DiceKernel(Kernel):
 
      .. math::
 
-    \begin{equation*}
-     k_{\text{Dice}}(\mathbf{x}, \mathbf{x'}) = \frac{2\langle\mathbf{x},
-     \mathbf{x'}\rangle}{\left\lVert\mathbf{x}\right\rVert + \left\lVert\mathbf{x'}\right\rVert}
-    \end{equation*}
+        \begin{equation*}
+        k_{\text{Dice}}(\mathbf{x}, \mathbf{x'}) = \frac{2\langle\mathbf{x},
+        \mathbf{x'}\rangle}{\left\lVert\mathbf{x}\right\rVert + \left\lVert\mathbf{x'}\right\rVert}
+        \end{equation*}
 
     .. note::
 

diff --git a/gauche/kernels/fingerprint_kernels/faith_kernel.py b/gauche/kernels/fingerprint_kernels/faith_kernel.py
@@ -16,7 +16,7 @@ def batch_faith_sim(
     Faith similarity between two batched tensors, across last 2 dimensions.
     eps argument ensures numerical stability if all zero tensors are added.
 
-    (2 * <x1, x2>) + d / 2n
+    :math:`(2 * <x1, x2> + d) / (2n)`
 
     Where <.> is the inner product, d is the number of common zeros and n is the dimension of the input vectors
 

diff --git a/gauche/kernels/fingerprint_kernels/forbes_kernel.py b/gauche/kernels/fingerprint_kernels/forbes_kernel.py
@@ -16,9 +16,9 @@ def batch_forbes_sim(
     Forbes similarity between two batched tensors, across last 2 dimensions.
     eps argument ensures numerical stability if all zero tensors are added.
 
-    n * <x1, x2> / (|x1| + |x2|)
+    :math:`n * <x1, x2> / (|x1| + |x2|)`
 
-    Where <.> is the inner product, || is the L1 norm, and n is the dimension of the input vectors
+    Where <.> is the inner product, :math:`||` is the L1 norm, and n is the dimension of the input vectors
 
     Args:
         x1: `[b x n x d]` Tensor where b is the batch dimension

diff --git a/gauche/kernels/fingerprint_kernels/inner_product_kernel.py b/gauche/kernels/fingerprint_kernels/inner_product_kernel.py
@@ -16,7 +16,7 @@ def batch_inner_product_sim(
     Inner product similarity between two batched tensors, across last 2 dimensions.
     eps argument ensures numerical stability if all zero tensors are added.
 
-    <x1, x2>
+    :math:`<x1, x2>`
 
     Where <.> is the inner product
 
@@ -47,10 +47,10 @@ class InnerProductKernel(Kernel):
 
      .. math::
 
-    \begin{equation*}
-     k_{\text{Inner Product}}(\mathbf{x}, \mathbf{x'}) = \langle\mathbf{x},
-     \mathbf{x'}\rangle
-    \end{equation*}
+        \begin{equation*}
+        k_{\text{Inner Product}}(\mathbf{x}, \mathbf{x'}) = \langle\mathbf{x},
+        \mathbf{x'}\rangle
+        \end{equation*}
 
     .. note::
 

diff --git a/gauche/kernels/fingerprint_kernels/intersection_kernel.py b/gauche/kernels/fingerprint_kernels/intersection_kernel.py
@@ -17,7 +17,7 @@ def batch_intersection_sim(
     eps argument ensures numerical stability if all zero tensors are added. Must be
     used with binary-valued vectors only
 
-    <x1, x2> + <x1', x2'>
+    :math:`<x1, x2> + <x1', x2'>`
 
     Where <.> is the inner product and x1' and x2' denote the bit flipped vectors such
     that ones and zeros are interchanged

diff --git a/gauche/kernels/fingerprint_kernels/minmax_kernel.py b/gauche/kernels/fingerprint_kernels/minmax_kernel.py
@@ -14,9 +14,9 @@ def batch_minmax_sim(
     MinMax similarity between two batched tensors, across last 2 dimensions.
     eps argument ensures numerical stability if all zero tensors are added.
 
-    (|x1| + |x2| - |x1 - x2|) / (|x1| + |x2| + |x1 - x2|)
+    :math:`(|x1| + |x2| - |x1 - x2|) / (|x1| + |x2| + |x1 - x2|)`
 
-    Where || is the L1 norm
+    Where :math:`||` is the L1 norm
 
     Args:
         x1: `[b x n x d]` Tensor where b is the batch dimension
@@ -51,9 +51,9 @@ class MinMaxKernel(Kernel):
 
      .. math::
 
-    \begin{equation*}
-     k_{\text{MinMax}}(\mathbf{x}, \mathbf{x'}) = \frac{\sum_i \min(x_i, x'_i)}
-    \end{equation*}
+        \begin{equation*}
+        k_{\text{MinMax}}(\mathbf{x}, \mathbf{x'}) = \frac{\sum_i \min(x_i, x'_i)}{\sum_i \max(x_i, x'_i)}
+        \end{equation*}
 
     .. note::
 

diff --git a/gauche/kernels/fingerprint_kernels/otsuka_kernel.py b/gauche/kernels/fingerprint_kernels/otsuka_kernel.py
@@ -16,7 +16,7 @@ def batch_otsuka_sim(
     Otsuka similarity between two batched tensors, across last 2 dimensions.
     eps argument ensures numerical stability if all zero tensors are added.
 
-    <x1, x2> / sqrt(|x1| + |x2|)
+    :math:`<x1, x2> / sqrt(|x1| + |x2|)`
 
     Where || is the L1 norm and <.> is the inner product
 

diff --git a/gauche/kernels/fingerprint_kernels/rand_kernel.py b/gauche/kernels/fingerprint_kernels/rand_kernel.py
@@ -16,7 +16,7 @@ def batch_rand_sim(
     Rand similarity between two batched tensors, across last 2 dimensions.
     eps argument ensures numerical stability if all zero tensors are added.
 
-    <x1, x2> + d / n
+    :math:`(<x1, x2> + d) / n`
 
     Where <.> is the inner product, d is the number of common zeros and n is the dimensionality
 

diff --git a/gauche/kernels/fingerprint_kernels/rogers_tanimoto_kernel.py b/gauche/kernels/fingerprint_kernels/rogers_tanimoto_kernel.py
@@ -16,7 +16,7 @@ def batch_rogers_tanimoto_sim(
     Rogers-Tanimoto similarity between two batched tensors, across last 2 dimensions.
     eps argument ensures numerical stability if all zero tensors are added.
 
-    <x1, x2> + d / 2|x1| + 2|x2| - 3*<x1, x2> + d
+    :math:`(<x1, x2> + d) / (2|x1| + 2|x2| - 3*<x1, x2> + d)`
 
     Where || is the L1 norm and <.> is the inner product and d is the number of common zeros
 

diff --git a/gauche/kernels/fingerprint_kernels/russell_rao_kernel.py b/gauche/kernels/fingerprint_kernels/russell_rao_kernel.py
@@ -16,7 +16,7 @@ def batch_russell_rao_sim(
     Russell-Rao similarity between two batched tensors, across last 2 dimensions.
     eps argument ensures numerical stability if all zero tensors are added.
 
-    <x1, x2> / n
+    :math:`<x1, x2> / n`
 
     Where <.> is the inner product and n is the dimension of the vectors x1/x2