28 changes: 14 additions & 14 deletions .pre-commit-config.yaml
@@ -2,7 +2,7 @@ default_language_version:
   python: python3
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.6.0
+    rev: v6.0.0
     hooks:
       - id: check-added-large-files
       - id: check-byte-order-marker
@@ -21,69 +21,69 @@ repos:
       - id: trailing-whitespace
         exclude: .gitignore
   - repo: https://github.com/psf/black-pre-commit-mirror
-    rev: 24.4.2
+    rev: 25.9.0
     hooks:
       - id: black-jupyter
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.5.1
+    rev: v0.13.2
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix]
   - repo: https://github.com/rbubley/mirrors-prettier
-    rev: v3.3.2
+    rev: v3.6.2
     hooks:
       - id: prettier
   - repo: https://github.com/google/yamlfmt
-    rev: v0.13.0
+    rev: v0.17.2
     hooks:
       - id: yamlfmt
   - repo: https://github.com/Yelp/detect-secrets
     rev: v1.5.0
     hooks:
       - id: detect-secrets
   - repo: https://github.com/pappasam/toml-sort
-    rev: v0.23.1
+    rev: v0.24.3
     hooks:
       - id: toml-sort-fix
         exclude: poetry.lock
   - repo: https://github.com/codespell-project/codespell
-    rev: v2.3.0
+    rev: v2.4.1
     hooks:
       - id: codespell
         additional_dependencies: [".[toml]"]
   - repo: https://github.com/sqlfluff/sqlfluff
-    rev: 3.1.0
+    rev: 3.4.2
     hooks:
       - id: sqlfluff-fix
   - repo: https://github.com/hadolint/hadolint
-    rev: v2.13.0-beta
+    rev: v2.14.0
     hooks:
       - id: hadolint-docker
   - repo: https://github.com/jsh9/markdown-toc-creator
-    rev: 0.0.6
+    rev: 0.0.10
     hooks:
       - id: markdown-toc-creator
   - repo: https://github.com/jumanjihouse/pre-commit-hooks
     rev: 3.0.0
     hooks:
       - id: check-mailmap
   - repo: https://github.com/python-poetry/poetry
-    rev: 1.8.0
+    rev: 2.2.1
     hooks:
       - id: poetry-check
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.10.1
+    rev: v1.18.2
     hooks:
       - id: mypy
         additional_dependencies:
           - types-requests
   - repo: https://github.com/srstevenson/nb-clean
-    rev: 3.3.0
+    rev: 4.0.1
     hooks:
       - id: nb-clean
         args: [--preserve-cell-outputs, --remove-empty-cells]
   - repo: https://github.com/abravalheri/validate-pyproject
-    rev: v0.18
+    rev: v0.24.1
     hooks:
       - id: validate-pyproject
         additional_dependencies:
271 changes: 135 additions & 136 deletions data/tabular/ld50_catmos/meta.yaml

Large diffs are not rendered by default.

3 changes: 1 addition & 2 deletions data/tabular/mona/example_processing_and_templates.ipynb
@@ -20,7 +20,6 @@
 "from tqdm import tqdm\n",
 "\n",
 "# import datasets\n",
-"import rdkit\n",
 "import rdkit.Chem as Chem\n",
 "import rdkit.RDLogger as RDLogger"
 ]
@@ -1444,7 +1443,7 @@
 " k = md[\"name\"]\n",
 " v = md.get(\"value\", np.nan)\n",
 " df_row[\"md_\" + transform_key(k)] = v\n",
-" if not (v is np.nan):\n",
+" if v is not np.nan:\n",
 " md_keys.append(k)\n",
 " md_key_counter.update(md_keys)\n",
 " compounds = entry.get(\"compound\", [])\n",
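A note on the `if v is not np.nan` change above: besides reading better than `if not (v is np.nan)`, identity against `np.nan` works here only because `np.nan` is a module-level singleton. A minimal sketch of the distinction (variable names are illustrative):

import numpy as np

v = np.nan
print(v is not np.nan)  # False: same singleton object, identity holds
w = float("nan")        # an equal-looking NaN that is a different object
print(w is not np.nan)  # True: the identity test misses this NaN
print(np.isnan(w))      # True: a value-based check catches any NaN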
4 changes: 2 additions & 2 deletions data/tabular/ocp/transform.py
@@ -21,8 +21,8 @@ def uniCode2Latex(text: str) -> str:
         text = text.replace(chr(code), f"$_{code-8320}$")
 
     text = text.replace("\u0305", "$^-$")
-    text = text.replace("\u207A", "$^+$")
-    text = text.replace("\u207B", "$^-$")
+    text = text.replace("\u207a", "$^+$")
+    text = text.replace("\u207b", "$^-$")
     text = text.replace("\u2074", "$^4$")
     text = text.replace("\u2070", "$^0$")
     text = text.replace("\u2078", "$^1$")
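The `\u207A` to `\u207a` edits only lower-case the hex digits of the escape; both spell U+207A, so the replacement behavior is unchanged. A minimal sketch of what `uniCode2Latex` produces on such input (the import path and example strings are assumptions, not taken from the diff):

from transform import uniCode2Latex  # assumed import path for the module above

# the two escapes denote the same character, so the diff is cosmetic
assert "\u207A" == "\u207a"

print(uniCode2Latex("CO\u2082"))  # subscript two  -> 'CO$_2$'
print(uniCode2Latex("Na\u207a"))  # superscript +  -> 'Na$^+$'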
5 changes: 0 additions & 5 deletions data/tabular/orbnet_denali/develop_transform.ipynb
@@ -25,11 +25,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"from pathlib import Path\n",
 "from rdkit import Chem\n",
-"import matplotlib.pyplot as plt\n",
-"import numpy as np\n",
-"import os\n",
 "import pandas as pd\n",
 "from glob import glob"
 ]
@@ -474,7 +470,6 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"from rdkit.Chem import rdDetermineBonds\n",
 "from chemnlp.utils import xyz_to_mol"
 ]
 },
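With the direct `rdDetermineBonds` import gone, bond perception presumably happens inside `chemnlp.utils.xyz_to_mol`. A rough usage sketch, assuming the helper takes an XYZ path and returns an RDKit Mol (the signature and file name are guesses, not taken from the notebook):

from rdkit import Chem
from chemnlp.utils import xyz_to_mol

mol = xyz_to_mol("example.xyz")  # hypothetical path; signature assumed
print(Chem.MolToSmiles(mol))     # works if a valid Mol comes back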
1 change: 0 additions & 1 deletion docs/CONTRIBUTING.md
@@ -17,7 +17,6 @@ One of the most valuable contributions is implementing a dataset. Here's how to
 1. Choose a dataset from our [awesome list](https://github.com/kjappelbaum/awesome-chemistry-datasets) or add a new one there.
 2. Create an issue in this repository stating your intention to add the dataset.
 3. Make a Pull Request (PR) that adds a new folder in `data` with the following files:
-
    - `meta.yaml`: Describes the dataset (see structure below).
    - `transform.py`: Python code to transform the original dataset into a usable form.
 
6 changes: 0 additions & 6 deletions experiments/README.md
@@ -11,7 +11,6 @@ running on the cluster.
 
 1. [Create Environment](scripts/env_creation_neox.sh) -
    creates a basic conda environment for experiments.
-
    - Creates a conda environment at the prefix `CONDA_ENV_PATH` path.
      > Using the positional argument passed into the script
    - Clones `chemnlp` into your personal cluster `USER` directory.
@@ -28,7 +27,6 @@ running on the cluster.
 
 2. [Training Models](scripts/sbatch_train_neox.sh) -
    runs a GPT-NeoX training pipeline
-
    - creates a conda environment using the `env_creation_neox.sh` script.
    - runs the GPT-NeoX `train.py` script using the user configuration
      > as GPT-NeoX configurations can be combined, the PEFT configurations are held
@@ -48,7 +46,6 @@ running on the cluster.
 
 1. [Create Environment](scripts/env_creation_hf.sh) -
    creates a basic conda environment for experiments.
-
    - Creates a conda environment at the prefix `CONDA_ENV_PATH` path.
      > Using the positional argument passed into the script
    - Clones `chemnlp` into your personal cluster `USER` directory.
@@ -65,7 +62,6 @@ running on the cluster.
 
 2. [Single Node Models](scripts/sbatch_train_hf.sh) -
    runs a Hugging Face training pipeline across devices
-
    - creates a conda environment using the `env_creation_hf.sh` script.
    - runs the Hugging Face `run_tune.py` script with the user configuration
 
@@ -81,7 +77,6 @@ running on the cluster.
 
 3. [Multi Node Models](scripts/sbatch_train_hf_multinode.sh) -
    runs a Hugging Face training pipeline across nodes
-
    - creates a conda environment using the `env_creation_hf.sh` script.
    - runs the Hugging Face `run_tune.py` script with the user configuration
 
@@ -97,7 +92,6 @@ running on the cluster.
 
 4. [Grid Search](scripts/run_grid_search.py) -
    runs a grid search across training pipeline configuration options
-
    - Update the upper-case parameters at the top of the script
    - The script runs an exhaustive set of experiments across all permutations
 
25 changes: 20 additions & 5 deletions experiments/ablations/continued_pretrain.py
@@ -57,7 +57,13 @@ def load_model(
 
 
 def train(
-    model, tokenizer, dataset, run_name: str, batch_size: int = 64, max_seq_length=2048, eval_dataset=None
+    model,
+    tokenizer,
+    dataset,
+    run_name: str,
+    batch_size: int = 64,
+    max_seq_length=2048,
+    eval_dataset=None,
 ):
     wandb.init(project="chemnlp-ablations", name=run_name)
     trainer = UnslothTrainer(
@@ -83,8 +89,8 @@ def train(
             lr_scheduler_type="linear",
             seed=3407,
             output_dir=f"outputs_{run_name}",
-            eval_strategy = 'steps' if eval_dataset is not None else 'no',
-            eval_steps = 10_000 if eval_dataset is not None else None
+            eval_strategy="steps" if eval_dataset is not None else "no",
+            eval_steps=10_000 if eval_dataset is not None else None,
         ),
     )
 
@@ -138,9 +144,18 @@ def run(
     )
 
     dataset = create_dataset(tokenizer, data_files)
-    eval_dataset = create_dataset(tokenizer, eval_data_files) if eval_data_files else None
+    eval_dataset = (
+        create_dataset(tokenizer, eval_data_files) if eval_data_files else None
+    )
 
-    train(model, tokenizer, dataset, run_name, batch_size=batch_size, eval_dataset=eval_dataset)
+    train(
+        model,
+        tokenizer,
+        dataset,
+        run_name,
+        batch_size=batch_size,
+        eval_dataset=eval_dataset,
+    )
 
 
 if __name__ == "__main__":
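Beyond the black-style reformatting, the logic is unchanged: evaluation is wired up only when `eval_data_files` is given. A minimal sketch of the same conditional-evaluation pattern with plain Hugging Face `TrainingArguments` (illustrative values; recent transformers versions call the key `eval_strategy`, older ones `evaluation_strategy`):

from transformers import TrainingArguments

def make_args(run_name: str, eval_dataset=None) -> TrainingArguments:
    # evaluate every 10k steps only when an eval set exists, else disable it
    return TrainingArguments(
        output_dir=f"outputs_{run_name}",
        eval_strategy="steps" if eval_dataset is not None else "no",
        eval_steps=10_000 if eval_dataset is not None else None,
    )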
2 changes: 1 addition & 1 deletion experiments/configs/data_configs/hf_data.yml
@@ -1,7 +1,7 @@
 model_name: "EleutherAI/pythia-1b"
 context_length: 2048
 dataset_name: "EleutherAI/pile"
-dataset_args: {"name": "pubmed", "split": "train"}
+dataset_args: { "name": "pubmed", "split": "train" }
 batch_size: 1
 string_key: "text"
 save_path: "/fsx/proj-chemnlp/data/example_tokenised"
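The only change here is prettier adding padding inside the inline mapping; the YAML is semantically identical. Presumably `dataset_name` and `dataset_args` feed straight into `datasets.load_dataset`, roughly as below (a sketch under that assumption, not the repo's actual loader code):

from datasets import load_dataset

# mirrors dataset_name / dataset_args from the YAML above
dataset = load_dataset("EleutherAI/pile", name="pubmed", split="train")
print(dataset[0]["text"][:100])  # string_key selects the text field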