28 changes: 14 additions & 14 deletions .pre-commit-config.yaml
@@ -2,7 +2,7 @@ default_language_version:
   python: python3
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.6.0
+    rev: v6.0.0
     hooks:
       - id: check-added-large-files
       - id: check-byte-order-marker
@@ -21,69 +21,69 @@ repos:
       - id: trailing-whitespace
         exclude: .gitignore
   - repo: https://github.com/psf/black-pre-commit-mirror
-    rev: 24.4.2
+    rev: 25.9.0
     hooks:
       - id: black-jupyter
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.5.1
+    rev: v0.13.2
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix]
   - repo: https://github.com/rbubley/mirrors-prettier
-    rev: v3.3.2
+    rev: v3.6.2
     hooks:
       - id: prettier
   - repo: https://github.com/google/yamlfmt
-    rev: v0.13.0
+    rev: v0.17.2
     hooks:
       - id: yamlfmt
   - repo: https://github.com/Yelp/detect-secrets
     rev: v1.5.0
     hooks:
       - id: detect-secrets
   - repo: https://github.com/pappasam/toml-sort
-    rev: v0.23.1
+    rev: v0.24.3
     hooks:
       - id: toml-sort-fix
         exclude: poetry.lock
   - repo: https://github.com/codespell-project/codespell
-    rev: v2.3.0
+    rev: v2.4.1
     hooks:
       - id: codespell
         additional_dependencies: [".[toml]"]
   - repo: https://github.com/sqlfluff/sqlfluff
-    rev: 3.1.0
+    rev: 3.4.2
     hooks:
       - id: sqlfluff-fix
   - repo: https://github.com/hadolint/hadolint
-    rev: v2.13.0-beta
+    rev: v2.14.0
     hooks:
       - id: hadolint-docker
   - repo: https://github.com/jsh9/markdown-toc-creator
-    rev: 0.0.6
+    rev: 0.0.10
     hooks:
       - id: markdown-toc-creator
   - repo: https://github.com/jumanjihouse/pre-commit-hooks
     rev: 3.0.0
     hooks:
       - id: check-mailmap
   - repo: https://github.com/python-poetry/poetry
-    rev: 1.8.0
+    rev: 2.2.1
     hooks:
       - id: poetry-check
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.10.1
+    rev: v1.18.2
     hooks:
       - id: mypy
         additional_dependencies:
           - types-requests
   - repo: https://github.com/srstevenson/nb-clean
-    rev: 3.3.0
+    rev: 4.0.1
     hooks:
       - id: nb-clean
         args: [--preserve-cell-outputs, --remove-empty-cells]
   - repo: https://github.com/abravalheri/validate-pyproject
-    rev: v0.18
+    rev: v0.24.1
     hooks:
       - id: validate-pyproject
         additional_dependencies:
271 changes: 135 additions & 136 deletions data/tabular/ld50_catmos/meta.yaml

Large diffs are not rendered by default.

3 changes: 1 addition & 2 deletions data/tabular/mona/example_processing_and_templates.ipynb
@@ -20,7 +20,6 @@
 "from tqdm import tqdm\n",
 "\n",
 "# import datasets\n",
-"import rdkit\n",
 "import rdkit.Chem as Chem\n",
 "import rdkit.RDLogger as RDLogger"
 ]
@@ -1444,7 +1443,7 @@
 " k = md[\"name\"]\n",
 " v = md.get(\"value\", np.nan)\n",
 " df_row[\"md_\" + transform_key(k)] = v\n",
-" if not (v is np.nan):\n",
+" if v is not np.nan:\n",
 " md_keys.append(k)\n",
 " md_key_counter.update(md_keys)\n",
 " compounds = entry.get(\"compound\", [])\n",
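A note on the `if v is not np.nan` change above: besides reading better than `if not (v is np.nan)`, identity against `np.nan` works here only because `np.nan` is a module-level singleton. A minimal sketch of the distinction (variable names are illustrative):

import numpy as np

v = np.nan
print(v is not np.nan)  # False: same singleton object, identity holds
w = float("nan")        # an equal-looking NaN that is a different object
print(w is not np.nan)  # True: the identity test misses this NaN
print(np.isnan(w))      # True: a value-based check catches any NaN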
4 changes: 2 additions & 2 deletions data/tabular/ocp/transform.py
@@ -21,8 +21,8 @@ def uniCode2Latex(text: str) -> str:
         text = text.replace(chr(code), f"$_{code-8320}$")
 
     text = text.replace("\u0305", "$^-$")
-    text = text.replace("\u207A", "$^+$")
-    text = text.replace("\u207B", "$^-$")
+    text = text.replace("\u207a", "$^+$")
+    text = text.replace("\u207b", "$^-$")
     text = text.replace("\u2074", "$^4$")
     text = text.replace("\u2070", "$^0$")
     text = text.replace("\u2078", "$^1$")
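The `\u207A` to `\u207a` edits only lower-case the hex digits of the escape; both spell U+207A, so the replacement behavior is unchanged. A minimal sketch of what `uniCode2Latex` produces on such input (the import path and example strings are assumptions, not taken from the diff):

from transform import uniCode2Latex  # assumed import path for the module above

# the two escapes denote the same character, so the diff is cosmetic
assert "\u207A" == "\u207a"

print(uniCode2Latex("CO\u2082"))  # subscript two  -> 'CO$_2$'
print(uniCode2Latex("Na\u207a"))  # superscript +  -> 'Na$^+$'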
5 changes: 0 additions & 5 deletions data/tabular/orbnet_denali/develop_transform.ipynb
@@ -25,11 +25,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"from pathlib import Path\n",
 "from rdkit import Chem\n",
-"import matplotlib.pyplot as plt\n",
-"import numpy as np\n",
-"import os\n",
 "import pandas as pd\n",
 "from glob import glob"
 ]
@@ -474,7 +470,6 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"from rdkit.Chem import rdDetermineBonds\n",
 "from chemnlp.utils import xyz_to_mol"
 ]
 },
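With the direct `rdDetermineBonds` import gone, bond perception presumably happens inside `chemnlp.utils.xyz_to_mol`. A rough usage sketch, assuming the helper takes an XYZ path and returns an RDKit Mol (the signature and file name are guesses, not taken from the notebook):

from rdkit import Chem
from chemnlp.utils import xyz_to_mol

mol = xyz_to_mol("example.xyz")  # hypothetical path; signature assumed
print(Chem.MolToSmiles(mol))     # works if a valid Mol comes back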
1 change: 0 additions & 1 deletion docs/CONTRIBUTING.md
@@ -17,7 +17,6 @@ One of the most valuable contributions is implementing a dataset. Here's how to
 1. Choose a dataset from our [awesome list](https://github.com/kjappelbaum/awesome-chemistry-datasets) or add a new one there.
 2. Create an issue in this repository stating your intention to add the dataset.
 3. Make a Pull Request (PR) that adds a new folder in `data` with the following files:
-
    - `meta.yaml`: Describes the dataset (see structure below).
    - `transform.py`: Python code to transform the original dataset into a usable form.
 
6 changes: 0 additions & 6 deletions experiments/README.md
@@ -11,7 +11,6 @@ running on the cluster.
 
 1. [Create Environment](scripts/env_creation_neox.sh) -
    creates a basic conda environment for experiments.
-
    - Creates a conda environment at the prefix `CONDA_ENV_PATH` path.
      > Using the positional argument passed into the script
    - Clones `chemnlp` into your personal cluster `USER` directory.
@@ -28,7 +27,6 @@ running on the cluster.
 
 2. [Training Models](scripts/sbatch_train_neox.sh) -
    runs a GPT-NeoX training pipeline
-
    - creates a conda environment using the `env_creation_neox.sh` script.
    - runs the GPT-NeoX `train.py` script using the user configuration
      > as GPT-NeoX configurations can be combined, the PEFT configurations are held
@@ -48,7 +46,6 @@ running on the cluster.
 
 1. [Create Environment](scripts/env_creation_hf.sh) -
    creates a basic conda environment for experiments.
-
    - Creates a conda environment at the prefix `CONDA_ENV_PATH` path.
      > Using the positional argument passed into the script
    - Clones `chemnlp` into your personal cluster `USER` directory.
@@ -65,7 +62,6 @@ running on the cluster.
 
 2. [Single Node Models](scripts/sbatch_train_hf.sh) -
    runs a Hugging Face training pipeline across devices
-
    - creates a conda environment using the `env_creation_hf.sh` script.
    - runs the Hugging Face `run_tune.py` script with the user configuration
 
@@ -81,7 +77,6 @@ running on the cluster.
 
 3. [Multi Node Models](scripts/sbatch_train_hf_multinode.sh) -
    runs a Hugging Face training pipeline across nodes
-
    - creates a conda environment using the `env_creation_hf.sh` script.
    - runs the Hugging Face `run_tune.py` script with the user configuration
 
@@ -97,7 +92,6 @@ running on the cluster.
 
 4. [Grid Search](scripts/run_grid_search.py) -
    runs a grid search across training pipeline configuration options
-
    - Update the upper-case parameters at the top of the script
    - The script runs an exhaustive set of experiments across all permutations
 
25 changes: 20 additions & 5 deletions experiments/ablations/continued_pretrain.py
@@ -57,7 +57,13 @@ def load_model(
 
 
 def train(
-    model, tokenizer, dataset, run_name: str, batch_size: int = 64, max_seq_length=2048, eval_dataset=None
+    model,
+    tokenizer,
+    dataset,
+    run_name: str,
+    batch_size: int = 64,
+    max_seq_length=2048,
+    eval_dataset=None,
 ):
     wandb.init(project="chemnlp-ablations", name=run_name)
     trainer = UnslothTrainer(
@@ -83,8 +89,8 @@ def train(
             lr_scheduler_type="linear",
             seed=3407,
             output_dir=f"outputs_{run_name}",
-            eval_strategy = 'steps' if eval_dataset is not None else 'no',
-            eval_steps = 10_000 if eval_dataset is not None else None
+            eval_strategy="steps" if eval_dataset is not None else "no",
+            eval_steps=10_000 if eval_dataset is not None else None,
         ),
     )
 
@@ -138,9 +144,18 @@ def run(
     )
 
     dataset = create_dataset(tokenizer, data_files)
-    eval_dataset = create_dataset(tokenizer, eval_data_files) if eval_data_files else None
+    eval_dataset = (
+        create_dataset(tokenizer, eval_data_files) if eval_data_files else None
+    )
 
-    train(model, tokenizer, dataset, run_name, batch_size=batch_size, eval_dataset=eval_dataset)
+    train(
+        model,
+        tokenizer,
+        dataset,
+        run_name,
+        batch_size=batch_size,
+        eval_dataset=eval_dataset,
+    )
 
 
 if __name__ == "__main__":
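Beyond the black-style reformatting, the logic is unchanged: evaluation is wired up only when `eval_data_files` is given. A minimal sketch of the same conditional-evaluation pattern with plain Hugging Face `TrainingArguments` (illustrative values; recent transformers versions call the key `eval_strategy`, older ones `evaluation_strategy`):

from transformers import TrainingArguments

def make_args(run_name: str, eval_dataset=None) -> TrainingArguments:
    # evaluate every 10k steps only when an eval set exists, else disable it
    return TrainingArguments(
        output_dir=f"outputs_{run_name}",
        eval_strategy="steps" if eval_dataset is not None else "no",
        eval_steps=10_000 if eval_dataset is not None else None,
    )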
2 changes: 1 addition & 1 deletion experiments/configs/data_configs/hf_data.yml
@@ -1,7 +1,7 @@
 model_name: "EleutherAI/pythia-1b"
 context_length: 2048
 dataset_name: "EleutherAI/pile"
-dataset_args: {"name": "pubmed", "split": "train"}
+dataset_args: { "name": "pubmed", "split": "train" }
 batch_size: 1
 string_key: "text"
 save_path: "/fsx/proj-chemnlp/data/example_tokenised"
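The only change here is prettier adding padding inside the inline mapping; the YAML is semantically identical. Presumably `dataset_name` and `dataset_args` feed straight into `datasets.load_dataset`, roughly as below (a sketch under that assumption, not the repo's actual loader code):

from datasets import load_dataset

# mirrors dataset_name / dataset_args from the YAML above
dataset = load_dataset("EleutherAI/pile", name="pubmed", split="train")
print(dataset[0]["text"][:100])  # string_key selects the text field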