From 4d106eb46b8acfe84d71349486f1a23bcde1311a Mon Sep 17 00:00:00 2001
From: Dmitry
Date: Mon, 27 Oct 2025 22:39:27 +0100
Subject: [PATCH] Fix Docker build and add CI workflow

The Docker build broke because `uv pip install` was invoked without a
virtual environment, which current uv releases refuse unless --system
is passed. Install dependencies into a dedicated venv at /opt/venv and
put it first on PATH instead. While at it:

* add a GitHub Actions workflow that runs ruff and pytest on pushes to
  main/develop and on pull requests
* replace the deprecated rdkit-pypi package with rdkit
* drop the obsolete top-level `version` key from docker-compose.yml
* point pytest at the src/ layout via `pythonpath` and call the Typer
  commands directly in tests instead of going through `.callback`
* fix a missing f-string prefix in the downloader's error message and
  add missing trailing newlines
---
 .github/workflows/ci.yml           | 38 ++++++++++++++++++++++++++++++
 Dockerfile                         | 13 +++++---
 Makefile                           |  2 +-
 docker-compose.yml                 |  6 ++---
 pyproject.toml                     |  3 ++-
 requirements.txt                   |  4 ++--
 src/molmole_research/__init__.py   |  2 +-
 src/molmole_research/downloader.py |  8 +++----
 src/molmole_research/evaluator.py  |  9 ++++---
 src/molmole_research/extractor.py  | 19 ++++++++-------
 src/molmole_research/runner.py     |  6 ++---
 tests/__init__.py                  |  2 +-
 tests/test_downloader.py           | 20 +++++++---------
 tests/test_evaluator.py            |  6 ++---
 tests/test_extractor.py            | 13 ++++++----
 15 files changed, 96 insertions(+), 55 deletions(-)
 create mode 100644 .github/workflows/ci.yml

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..092a110
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,38 @@
+name: CI
+
+run-name: "CI #${{ github.run_number }}"
+
+on:
+  push:
+    branches:
+      - main
+      - develop
+  pull_request:
+
+jobs:
+  quality:
+    name: Lint and Test
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+
+      - name: Ruff lint
+        run: ruff check src tests
+
+      - name: Ruff format check
+        run: ruff format --check src tests
+
+      - name: Run tests
+        run: pytest -q
diff --git a/Dockerfile b/Dockerfile
index fff2123..4c0fb5d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -14,8 +14,15 @@ WORKDIR /workspace
 # Copy dependency lists first to leverage Docker layer caching
 COPY requirements.txt /workspace/requirements.txt

-# Install Python dependencies
-RUN uv pip install -r /workspace/requirements.txt
+
+# Create a virtual environment and install Python dependencies
+RUN uv venv /opt/venv \
+    && . /opt/venv/bin/activate \
+    && uv pip install -r /workspace/requirements.txt
+
+# Ensure the virtual environment is used by default
+ENV VIRTUAL_ENV=/opt/venv
+ENV PATH="/opt/venv/bin:$PATH"

 # Copy the rest of the repository into the container
 COPY . /workspace
@@ -24,4 +31,4 @@ COPY . /workspace
 ENV PYTHONPATH=/workspace/src

 # Set entrypoint
-ENTRYPOINT ["bash"]
\ No newline at end of file
+ENTRYPOINT ["bash"]
diff --git a/Makefile b/Makefile
index 846539a..00739bc 100644
--- a/Makefile
+++ b/Makefile
@@ -34,4 +34,4 @@ run:

 # Download the MolMole dataset into the data/ directory
 download:
-	docker compose run --rm research python -m molmole_research.downloader --dataset doxa-friend/MolMole_Patent300 --out data/images
\ No newline at end of file
+	docker compose run --rm research python -m molmole_research.downloader --dataset doxa-friend/MolMole_Patent300 --out data/images
diff --git a/docker-compose.yml b/docker-compose.yml
index 8c25ff8..72a5e48 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,5 +1,3 @@
-version: '3.9'
-
 services:
   research:
     build: .
@@ -12,8 +10,8 @@ services:
       # Provide your OpenAI API key via environment variable. When left unset
      # the extractor will attempt to read from ~/.config/openai or prompt the user.
       OPENAI_API_KEY: ${OPENAI_API_KEY:-}
-      # Optionally override the API base if using a self‑hosted endpoint
+      # Optionally override the API base if using a self-hosted endpoint
       OPENAI_API_BASE: ${OPENAI_API_BASE:-}
     tty: true
     stdin_open: true
-    command: ["bash"]
\ No newline at end of file
+    command: ["bash"]
diff --git a/pyproject.toml b/pyproject.toml
index c6dbb6f..f73ef51 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,4 +5,5 @@ exclude = [".venv", "build", "dist"]

 [tool.pytest.ini_options]
 addopts = "-ra --strict-markers"
-testpaths = ["tests"]
\ No newline at end of file
+testpaths = ["tests"]
+pythonpath = ["src"]
diff --git a/requirements.txt b/requirements.txt
index 90697e4..b45c7d2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,7 +5,7 @@ huggingface_hub>=0.21.0
 pillow>=10.0.0
 typer[all]>=0.12.0
 tqdm>=4.66.0
-rdkit-pypi>=2023.9.5
+rdkit>=2023.9.5

 # YAML parsing for the runner
 pyyaml>=6.0
@@ -13,4 +13,4 @@ pyyaml>=6.0
 # Development dependencies
 pytest>=7.4.0
 pytest-mock>=3.10.0
-ruff>=0.1.5
\ No newline at end of file
+ruff>=0.1.5
diff --git a/src/molmole_research/__init__.py b/src/molmole_research/__init__.py
index eec758d..7a9c337 100644
--- a/src/molmole_research/__init__.py
+++ b/src/molmole_research/__init__.py
@@ -20,4 +20,4 @@
     "runner",
 ]

-__version__ = "0.1.0"
\ No newline at end of file
+__version__ = "0.1.0"
diff --git a/src/molmole_research/downloader.py b/src/molmole_research/downloader.py
index 0bec445..ed194ae 100644
--- a/src/molmole_research/downloader.py
+++ b/src/molmole_research/downloader.py
@@ -45,9 +45,7 @@ def download_dataset(
         "doxa-friend/MolMole_Patent300", help="HuggingFace dataset identifier"
     ),
     split: str = typer.Option("train", help="Which split to download (e.g., train/validation)"),
-    out: Path = typer.Option(
-        Path("data/images"), help="Output directory for images and labels"
-    ),
+    out: Path = typer.Option(Path("data/images"), help="Output directory for images and labels"),
 ) -> None:
     """Download the specified dataset and save it locally.

@@ -66,7 +64,7 @@ def download_dataset(
     except Exception as exc:  # pragma: no cover - network errors
         typer.echo(
             f"Failed to download dataset {dataset}. Please ensure you have access and"
-            " have run `huggingface-cli login` if required. Error: {exc}"
+            f" have run `huggingface-cli login` if required. Error: {exc}"
         )
         raise typer.Exit(1)

@@ -112,4 +110,4 @@ def download_dataset(


 if __name__ == "__main__":
-    app()
\ No newline at end of file
+    app()
diff --git a/src/molmole_research/evaluator.py b/src/molmole_research/evaluator.py
index 3298cdc..395e0bc 100644
--- a/src/molmole_research/evaluator.py
+++ b/src/molmole_research/evaluator.py
@@ -17,9 +17,10 @@

 import json
 from pathlib import Path
-from typing import Dict, List, Tuple
+from typing import Dict, List

 import typer
+
 try:
     # RDKit is used for canonical SMILES and InChI generation. It may not be
     # available in all environments, so we make its import optional and
@@ -79,9 +80,7 @@ def evaluate(
     dataset_dir: Path = typer.Option(
         Path("data/images"), exists=True, help="Directory containing labels.json"
     ),
-    out: Path = typer.Option(
-        Path("results"), help="Directory where metrics will be saved"
-    ),
+    out: Path = typer.Option(Path("results"), help="Directory where metrics will be saved"),
 ) -> None:
     """Evaluate predictions against ground truth SMILES.

@@ -158,4 +157,4 @@ def evaluate(


 if __name__ == "__main__":
-    app()
\ No newline at end of file
+    app()
diff --git a/src/molmole_research/extractor.py b/src/molmole_research/extractor.py
index f9b63b4..09847fe 100644
--- a/src/molmole_research/extractor.py
+++ b/src/molmole_research/extractor.py
@@ -18,12 +18,11 @@
 import base64
 import datetime as dt
 import json
-import os
 from pathlib import Path
-from typing import Any, Dict, Iterable, List, Optional
+from typing import List, Optional

 import typer
-from PIL import Image
+
 try:
     # Attempt to import the OpenAI client library. This dependency is optional
     # because it may not be available in environments without network access or
@@ -129,12 +128,13 @@ def default_prompt() -> str:

 @app.command("run")
 def run_extraction(
-    model: str = typer.Option(
-        ..., "--model", help="Name of the vision model to use (e.g. gpt-4o)"
-    ),
+    model: str = typer.Option(..., "--model", help="Name of the vision model to use (e.g. gpt-4o)"),
     dataset_dir: Path = typer.Option(
-        Path("data/images/images"), exists=True, file_okay=False, dir_okay=True,
-        help="Directory containing images downloaded by the downloader"
+        Path("data/images/images"),
+        exists=True,
+        file_okay=False,
+        dir_okay=True,
+        help="Directory containing images downloaded by the downloader",
     ),
     out: Path = typer.Option(
         Path("results"), help="Directory where the JSONL results will be saved"
@@ -180,6 +180,7 @@ def run_extraction(
                 max_tokens=max_tokens,
             )
         except Exception as exc:  # pragma: no cover - network errors
+            typer.echo(f"Failed to process {img_path.name}: {exc}")
             text = ""
         record = {"file_name": img_path.name, "text": text}
         fh.write(json.dumps(record) + "\n")
@@ -188,4 +189,4 @@ def run_extraction(


 if __name__ == "__main__":
-    app()
\ No newline at end of file
+    app()
diff --git a/src/molmole_research/runner.py b/src/molmole_research/runner.py
index 4445eeb..af28d9f 100644
--- a/src/molmole_research/runner.py
+++ b/src/molmole_research/runner.py
@@ -57,9 +57,7 @@ def _find_latest_prediction(out_dir: Path, model_prefix: str) -> Optional[Path]:

 @app.command("run")
 def run_experiments(
-    config: Path = typer.Option(
-        None, exists=False, help="YAML file with experiment definitions"
-    ),
+    config: Path = typer.Option(None, exists=False, help="YAML file with experiment definitions"),
     dataset_dir: Path = typer.Option(
         Path("data/images/images"), exists=True, help="Directory of dataset images"
     ),
@@ -166,4 +164,4 @@ def run_experiments(


 if __name__ == "__main__":
-    app()
\ No newline at end of file
+    app()
diff --git a/tests/__init__.py b/tests/__init__.py
index 66c1c7a..ccab5a0 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -3,4 +3,4 @@
 This file exists to make ``tests`` a Python package. Having a package allows
 relative imports within the test suite and is required when using pytest
 with certain plugins.
-""" \ No newline at end of file +""" diff --git a/tests/test_downloader.py b/tests/test_downloader.py index c0f5723..ab3b8f0 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -9,9 +9,7 @@ from __future__ import annotations import json -from pathlib import Path -import pytest from PIL import Image import datasets @@ -28,11 +26,13 @@ def _make_dummy_dataset(num_items: int = 2): images.append(img) smiles.append("C") # simplest molecule file_names.append(f"item_{i}.png") - return datasets.Dataset.from_dict({ - "image": images, - "smiles": smiles, - "file_name": file_names, - }) + return datasets.Dataset.from_dict( + { + "image": images, + "smiles": smiles, + "file_name": file_names, + } + ) def test_downloader_saves_images_and_labels(monkeypatch, tmp_path): @@ -46,9 +46,7 @@ def fake_load_dataset(*args, **kwargs): # noqa: D401 monkeypatch.setattr(datasets, "load_dataset", fake_load_dataset) out_dir = tmp_path / "download" - download_dataset.callback( # type: ignore[attr-defined] - dataset="dummy", split="train", out=out_dir - ) + download_dataset(dataset="dummy", split="train", out=out_dir) # Check that images were saved images_dir = out_dir / "images" assert images_dir.exists() and images_dir.is_dir() @@ -61,4 +59,4 @@ def fake_load_dataset(*args, **kwargs): # noqa: D401 labels = json.loads(labels_path.read_text()) assert len(labels) == 3 for entry in labels: - assert entry["smiles"] == "C" \ No newline at end of file + assert entry["smiles"] == "C" diff --git a/tests/test_evaluator.py b/tests/test_evaluator.py index 7d697b6..31b05b6 100644 --- a/tests/test_evaluator.py +++ b/tests/test_evaluator.py @@ -9,8 +9,6 @@ import json from pathlib import Path -import pytest - from molmole_research import evaluator @@ -47,7 +45,7 @@ def test_evaluator_accuracy(tmp_path): _write_predictions(pred_path, preds) results_dir = tmp_path / "results" - evaluator.evaluate.callback(pred=pred_path, dataset_dir=dataset_dir, out=results_dir) + evaluator.evaluate(pred=pred_path, dataset_dir=dataset_dir, out=results_dir) # Load metrics metrics_files = list(results_dir.iterdir()) @@ -58,4 +56,4 @@ def test_evaluator_accuracy(tmp_path): # Only the first prediction is correct assert metrics["correct_smiles"] == 1 assert metrics["correct_inchi"] == 1 - assert abs(metrics["accuracy_smiles"] - 0.5) < 1e-6 \ No newline at end of file + assert abs(metrics["accuracy_smiles"] - 0.5) < 1e-6 diff --git a/tests/test_extractor.py b/tests/test_extractor.py index 8ed0b80..576b341 100644 --- a/tests/test_extractor.py +++ b/tests/test_extractor.py @@ -10,7 +10,6 @@ import json from pathlib import Path -import pytest from PIL import Image from molmole_research import extractor @@ -35,8 +34,14 @@ def fake_call_openai_model(*args, **kwargs): # noqa: D401 monkeypatch.setattr(extractor, "call_openai_model", fake_call_openai_model) out_dir = tmp_path / "results" - extractor.run_extraction.callback( - model="gpt-4o-test", dataset_dir=dataset_dir, out=out_dir, api_base=None, api_key=None, temperature=0.0, max_tokens=32 + extractor.run_extraction( + model="gpt-4o-test", + dataset_dir=dataset_dir, + out=out_dir, + api_base=None, + api_key=None, + temperature=0.0, + max_tokens=32, ) # There should be exactly one JSONL file in out_dir files = list(out_dir.iterdir()) @@ -46,4 +51,4 @@ def fake_call_openai_model(*args, **kwargs): # noqa: D401 assert len(contents) == 2 for line in contents: record = json.loads(line) - assert record["text"] == "C" \ No newline at end of file + assert record["text"] == "C"