38 changes: 38 additions & 0 deletions .github/workflows/ci.yml
@@ -0,0 +1,38 @@
+name: CI
+
+run-name: "CI #${{ github.run_number }}"
+
+on:
+  push:
+    branches:
+      - main
+      - develop
+  pull_request:
+
+jobs:
+  quality:
+    name: Lint and Test
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+
+      - name: Ruff lint
+        run: ruff check src tests
+
+      - name: Ruff format check
+        run: ruff format --check src tests
+
+      - name: Run tests
+        run: pytest -q
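Note that the three quality gates mirror what a contributor can run locally: `ruff check src tests`, `ruff format --check src tests`, and `pytest -q` are the exact commands the workflow executes, so a branch that passes them locally (with the dependencies from requirements.txt installed) should also pass CI.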
13 changes: 10 additions & 3 deletions Dockerfile
@@ -14,8 +14,15 @@ WORKDIR /workspace
 
 # Copy dependency lists first to leverage Docker layer caching
 COPY requirements.txt /workspace/requirements.txt
-# Install Python dependencies
-RUN uv pip install -r /workspace/requirements.txt
+
+# Create a virtual environment and install Python dependencies
+RUN uv venv /opt/venv \
+    && . /opt/venv/bin/activate \
+    && uv pip install -r /workspace/requirements.txt
+
+# Ensure the virtual environment is used by default
+ENV VIRTUAL_ENV=/opt/venv
+ENV PATH="/opt/venv/bin:$PATH"
 
 # Copy the rest of the repository into the container
 COPY . /workspace
@@ -24,4 +31,4 @@ COPY . /workspace
 ENV PYTHONPATH=/workspace/src
 
 # Set entrypoint
-ENTRYPOINT ["bash"]
\ No newline at end of file
+ENTRYPOINT ["bash"]
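Because `ENV PATH="/opt/venv/bin:$PATH"` puts the virtual environment's bin directory first, every later `RUN` instruction and the interactive `bash` entrypoint resolve `python` and `pip` to the venv by default; `docker compose run --rm research which python` should print `/opt/venv/bin/python` if the build worked. Setting `VIRTUAL_ENV` alongside it follows the convention many tools (including uv) use to detect the active environment.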
2 changes: 1 addition & 1 deletion Makefile
@@ -34,4 +34,4 @@ run:
 
 # Download the MolMole dataset into the data/ directory
 download:
-	docker compose run --rm research python -m molmole_research.downloader --dataset doxa-friend/MolMole_Patent300 --out data/images
\ No newline at end of file
+	docker compose run --rm research python -m molmole_research.downloader --dataset doxa-friend/MolMole_Patent300 --out data/images
6 changes: 2 additions & 4 deletions docker-compose.yml
@@ -1,5 +1,3 @@
-version: '3.9'
-
 services:
   research:
     build: .
@@ -12,8 +10,8 @@ services:
       # Provide your OpenAI API key via environment variable. When left unset
       # the extractor will attempt to read from ~/.config/openai or prompt the user.
       OPENAI_API_KEY: ${OPENAI_API_KEY:-}
-      # Optionally override the API base if using a selfhosted endpoint
+      # Optionally override the API base if using a self-hosted endpoint
       OPENAI_API_BASE: ${OPENAI_API_BASE:-}
     tty: true
     stdin_open: true
-    command: ["bash"]
\ No newline at end of file
+    command: ["bash"]
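Dropping the top-level `version` key is correct for Compose v2, which ignores it and warns that it is obsolete. The `${OPENAI_API_KEY:-}` syntax substitutes an empty string when the variable is unset, so the service still starts without credentials; to supply them, export the variable in the host shell first (`export OPENAI_API_KEY=<your key>`) before running `docker compose run --rm research`.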
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -5,4 +5,5 @@ exclude = [".venv", "build", "dist"]
 
 [tool.pytest.ini_options]
 addopts = "-ra --strict-markers"
-testpaths = ["tests"]
\ No newline at end of file
+testpaths = ["tests"]
+pythonpath = ["src"]
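pytest's `pythonpath` ini option (built into pytest since 7.0, so covered by the `pytest>=7.4.0` pin below) prepends `src` to `sys.path` at startup, letting the tests import the package from the src layout without an editable install. A minimal sketch of what this enables — the test file name is hypothetical; the version string comes from `src/molmole_research/__init__.py` in this diff:

```python
# tests/test_import.py — hypothetical example; relies on pythonpath = ["src"]
import molmole_research


def test_package_is_importable_from_src_layout():
    # __version__ is set at the bottom of src/molmole_research/__init__.py
    assert molmole_research.__version__ == "0.1.0"
```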
4 changes: 2 additions & 2 deletions requirements.txt
@@ -5,12 +5,12 @@ huggingface_hub>=0.21.0
 pillow>=10.0.0
 typer[all]>=0.12.0
 tqdm>=4.66.0
-rdkit-pypi>=2023.9.5
+rdkit>=2023.9.5
 
 # YAML parsing for the runner
 pyyaml>=6.0
 
 # Development dependencies
 pytest>=7.4.0
 pytest-mock>=3.10.0
-ruff>=0.1.5
\ No newline at end of file
+ruff>=0.1.5
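The swap from `rdkit-pypi` to `rdkit` is more than cosmetic: `rdkit-pypi` was the unofficial community wheel and has been deprecated since the RDKit team began publishing official wheels under the `rdkit` name. Both distributions expose the same `rdkit` import package, so no code changes are needed. A quick smoke test, mirroring the canonicalization evaluator.py depends on:

```python
# Verify the renamed dependency still provides canonical SMILES support.
from rdkit import Chem

# Two spellings of ethanol canonicalize to the same string.
assert Chem.CanonSmiles("OCC") == Chem.CanonSmiles("CCO")
```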
2 changes: 1 addition & 1 deletion src/molmole_research/__init__.py
@@ -20,4 +20,4 @@
     "runner",
 ]
 
-__version__ = "0.1.0"
\ No newline at end of file
+__version__ = "0.1.0"
8 changes: 3 additions & 5 deletions src/molmole_research/downloader.py
@@ -45,9 +45,7 @@ def download_dataset(
         "doxa-friend/MolMole_Patent300", help="HuggingFace dataset identifier"
     ),
     split: str = typer.Option("train", help="Which split to download (e.g., train/validation)"),
-    out: Path = typer.Option(
-        Path("data/images"), help="Output directory for images and labels"
-    ),
+    out: Path = typer.Option(Path("data/images"), help="Output directory for images and labels"),
 ) -> None:
     """Download the specified dataset and save it locally.
 
@@ -66,7 +64,7 @@ def download_dataset(
     except Exception as exc:  # pragma: no cover - network errors
         typer.echo(
             f"Failed to download dataset {dataset}. Please ensure you have access and"
-            " have run `huggingface-cli login` if required. Error: {exc}"
+            f" have run `huggingface-cli login` if required. Error: {exc}"
        )
         raise typer.Exit(1)
 
@@ -112,4 +110,4 @@ def download_dataset(
 
 
 if __name__ == "__main__":
-    app()
\ No newline at end of file
+    app()
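The one-character fix above is worth noting: Python only interpolates `{exc}` in string literals that carry the `f` prefix, and since adjacent literals are concatenated before formatting, the unprefixed second fragment printed the literal text `{exc}` instead of the error. A minimal reproduction:

```python
exc = ValueError("401 Unauthorized")
print("Error: {exc}")   # old behavior: prints the literal "Error: {exc}"
print(f"Error: {exc}")  # fixed: prints "Error: 401 Unauthorized"
```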
9 changes: 4 additions & 5 deletions src/molmole_research/evaluator.py
@@ -17,9 +17,10 @@
 
 import json
 from pathlib import Path
-from typing import Dict, List, Tuple
+from typing import Dict, List
 
 import typer
+
 try:
     # RDKit is used for canonical SMILES and InChI generation. It may not be
     # available in all environments, so we make its import optional and
@@ -79,9 +80,7 @@ def evaluate(
     dataset_dir: Path = typer.Option(
         Path("data/images"), exists=True, help="Directory containing labels.json"
     ),
-    out: Path = typer.Option(
-        Path("results"), help="Directory where metrics will be saved"
-    ),
+    out: Path = typer.Option(Path("results"), help="Directory where metrics will be saved"),
 ) -> None:
     """Evaluate predictions against ground truth SMILES.
 
@@ -158,4 +157,4 @@ def evaluate(
 
 
 if __name__ == "__main__":
-    app()
\ No newline at end of file
+    app()
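evaluator.py's comparison logic is not part of this diff, but the import block above indicates how it works: predictions and ground truth are both canonicalized through RDKit so that equivalent SMILES spellings compare equal. A minimal sketch of that idea, under the assumption that unparseable predictions count as incorrect:

```python
from rdkit import Chem


def smiles_match(pred: str, truth: str) -> bool:
    """Compare two SMILES strings by canonical form rather than raw text."""
    mol_pred = Chem.MolFromSmiles(pred)
    mol_truth = Chem.MolFromSmiles(truth)
    if mol_pred is None or mol_truth is None:
        return False  # assumption: a SMILES RDKit cannot parse is a miss
    return Chem.MolToSmiles(mol_pred) == Chem.MolToSmiles(mol_truth)
```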
19 changes: 10 additions & 9 deletions src/molmole_research/extractor.py
@@ -18,12 +18,11 @@
 import base64
 import datetime as dt
 import json
-import os
 from pathlib import Path
-from typing import Any, Dict, Iterable, List, Optional
+from typing import List, Optional
 
 import typer
-from PIL import Image
+
 try:
     # Attempt to import the OpenAI client library. This dependency is optional
     # because it may not be available in environments without network access or
@@ -129,12 +128,13 @@ def default_prompt() -> str:
 
 @app.command("run")
 def run_extraction(
-    model: str = typer.Option(
-        ..., "--model", help="Name of the vision model to use (e.g. gpt-4o)"
-    ),
+    model: str = typer.Option(..., "--model", help="Name of the vision model to use (e.g. gpt-4o)"),
     dataset_dir: Path = typer.Option(
-        Path("data/images/images"), exists=True, file_okay=False, dir_okay=True,
-        help="Directory containing images downloaded by the downloader"
+        Path("data/images/images"),
+        exists=True,
+        file_okay=False,
+        dir_okay=True,
+        help="Directory containing images downloaded by the downloader",
     ),
     out: Path = typer.Option(
         Path("results"), help="Directory where the JSONL results will be saved"
@@ -180,6 +180,7 @@ def run_extraction(
                 max_tokens=max_tokens,
             )
         except Exception as exc:  # pragma: no cover - network errors
+            typer.echo(f"Failed to process {img_path.name}: {exc}")
             text = ""
         record = {"file_name": img_path.name, "text": text}
         fh.write(json.dumps(record) + "\n")
@@ -188,4 +189,4 @@
 
 
 if __name__ == "__main__":
-    app()
\ No newline at end of file
+    app()
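Two things are worth calling out here. First, the added `typer.echo` turns what used to be a silent failure (an empty `text` written to the JSONL with no trace) into a visible per-image diagnostic. Second, `call_openai_model`'s body is outside this diff; for orientation, a hypothetical sketch of how such a helper is commonly written against the OpenAI chat completions API — the prompt handling and return shape here are assumptions, not the project's actual code:

```python
import base64
from pathlib import Path

from openai import OpenAI


def call_openai_model(
    image_path: Path,
    model: str,
    prompt: str,
    temperature: float = 0.0,
    max_tokens: int = 512,
) -> str:
    """Send one image plus an instruction prompt to a vision-capable model."""
    client = OpenAI()  # reads OPENAI_API_KEY (and OPENAI_BASE_URL) from the environment
    b64 = base64.b64encode(image_path.read_bytes()).decode("ascii")
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{b64}"},
                    },
                ],
            }
        ],
        temperature=temperature,
        max_tokens=max_tokens,
    )
    return response.choices[0].message.content or ""
```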
6 changes: 2 additions & 4 deletions src/molmole_research/runner.py
@@ -57,9 +57,7 @@ def _find_latest_prediction(out_dir: Path, model_prefix: str) -> Optional[Path]:
 
 @app.command("run")
 def run_experiments(
-    config: Path = typer.Option(
-        None, exists=False, help="YAML file with experiment definitions"
-    ),
+    config: Path = typer.Option(None, exists=False, help="YAML file with experiment definitions"),
     dataset_dir: Path = typer.Option(
         Path("data/images/images"), exists=True, help="Directory of dataset images"
     ),
@@ -166,4 +164,4 @@ def run_experiments(
 
 
 if __name__ == "__main__":
-    app()
\ No newline at end of file
+    app()
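The experiment file format itself is not shown in this PR; `pyyaml` in requirements.txt and the `--config` option only tell us it is YAML. Purely as a hypothetical illustration of the shape such a file might take and how the runner could load it (every field name here is invented):

```python
import yaml

# Hypothetical experiments.yaml content — field names are invented for illustration.
CONFIG_TEXT = """
experiments:
  - model: gpt-4o
    temperature: 0.0
    max_tokens: 512
  - model: gpt-4o-mini
    temperature: 0.0
    max_tokens: 512
"""

for exp in yaml.safe_load(CONFIG_TEXT)["experiments"]:
    print(exp["model"], exp["temperature"])
```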
2 changes: 1 addition & 1 deletion tests/__init__.py
@@ -3,4 +3,4 @@
 This file exists to make ``tests`` a Python package. Having a package
 allows relative imports within the test suite and is required when using
 pytest with certain plugins.
-"""
\ No newline at end of file
+"""
20 changes: 9 additions & 11 deletions tests/test_downloader.py
@@ -9,9 +9,7 @@
 from __future__ import annotations
 
 import json
-from pathlib import Path
 
-import pytest
 from PIL import Image
 import datasets
 
@@ -28,11 +26,13 @@ def _make_dummy_dataset(num_items: int = 2):
         images.append(img)
         smiles.append("C")  # simplest molecule
         file_names.append(f"item_{i}.png")
-    return datasets.Dataset.from_dict({
-        "image": images,
-        "smiles": smiles,
-        "file_name": file_names,
-    })
+    return datasets.Dataset.from_dict(
+        {
+            "image": images,
+            "smiles": smiles,
+            "file_name": file_names,
+        }
+    )
 
 
 def test_downloader_saves_images_and_labels(monkeypatch, tmp_path):
@@ -46,9 +46,7 @@ def fake_load_dataset(*args, **kwargs): # noqa: D401
     monkeypatch.setattr(datasets, "load_dataset", fake_load_dataset)
 
     out_dir = tmp_path / "download"
-    download_dataset.callback(  # type: ignore[attr-defined]
-        dataset="dummy", split="train", out=out_dir
-    )
+    download_dataset(dataset="dummy", split="train", out=out_dir)
     # Check that images were saved
     images_dir = out_dir / "images"
     assert images_dir.exists() and images_dir.is_dir()
@@ -61,4 +59,4 @@ def fake_load_dataset(*args, **kwargs): # noqa: D401
     labels = json.loads(labels_path.read_text())
     assert len(labels) == 3
     for entry in labels:
-        assert entry["smiles"] == "C"
\ No newline at end of file
+        assert entry["smiles"] == "C"
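The switch from `download_dataset.callback(...)` to a plain call (repeated below in test_evaluator.py and test_extractor.py) reflects a difference between Click and Typer: Click's `@click.command()` wraps the function in a `Command` object whose underlying function lives at `.callback`, while Typer's `@app.command()` registers the command and returns the original function unchanged, so there is no `.callback` attribute and the function can be invoked directly once every option is passed explicitly. A small demonstration:

```python
import click
import typer


@click.command()
def hello_click():
    """A Click command: the decorator returns a Command wrapper."""


app = typer.Typer()


@app.command()
def hello_typer():
    """A Typer command: the decorator returns the plain function."""


assert hello_click.callback is not None      # Click keeps the function at .callback
assert not hasattr(hello_typer, "callback")  # Typer has nothing to unwrap
```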
6 changes: 2 additions & 4 deletions tests/test_evaluator.py
@@ -9,8 +9,6 @@
 import json
 from pathlib import Path
 
-import pytest
-
 from molmole_research import evaluator
 
 
@@ -47,7 +45,7 @@ def test_evaluator_accuracy(tmp_path):
     _write_predictions(pred_path, preds)
 
     results_dir = tmp_path / "results"
-    evaluator.evaluate.callback(pred=pred_path, dataset_dir=dataset_dir, out=results_dir)
+    evaluator.evaluate(pred=pred_path, dataset_dir=dataset_dir, out=results_dir)
 
     # Load metrics
     metrics_files = list(results_dir.iterdir())
@@ -58,4 +56,4 @@ def test_evaluator_accuracy(tmp_path):
     # Only the first prediction is correct
     assert metrics["correct_smiles"] == 1
     assert metrics["correct_inchi"] == 1
-    assert abs(metrics["accuracy_smiles"] - 0.5) < 1e-6
\ No newline at end of file
+    assert abs(metrics["accuracy_smiles"] - 0.5) < 1e-6
13 changes: 9 additions & 4 deletions tests/test_extractor.py
@@ -10,7 +10,6 @@
 import json
 from pathlib import Path
 
-import pytest
 from PIL import Image
 
 from molmole_research import extractor
@@ -35,8 +34,14 @@ def fake_call_openai_model(*args, **kwargs): # noqa: D401
     monkeypatch.setattr(extractor, "call_openai_model", fake_call_openai_model)
 
     out_dir = tmp_path / "results"
-    extractor.run_extraction.callback(
-        model="gpt-4o-test", dataset_dir=dataset_dir, out=out_dir, api_base=None, api_key=None, temperature=0.0, max_tokens=32
+    extractor.run_extraction(
+        model="gpt-4o-test",
+        dataset_dir=dataset_dir,
+        out=out_dir,
+        api_base=None,
+        api_key=None,
+        temperature=0.0,
+        max_tokens=32,
     )
     # There should be exactly one JSONL file in out_dir
     files = list(out_dir.iterdir())
@@ -46,4 +51,4 @@ def fake_call_openai_model(*args, **kwargs): # noqa: D401
     assert len(contents) == 2
     for line in contents:
         record = json.loads(line)
-        assert record["text"] == "C"
\ No newline at end of file
+        assert record["text"] == "C"