From 06cfb849b0d8d378fe2d5360642c4b46764360ee Mon Sep 17 00:00:00 2001
From: Liam Keegan
Date: Mon, 30 Sep 2024 10:59:15 +0200
Subject: [PATCH] Add CSV file upload
- replace single zip file upload with two file uploads: one h5 file, one csv file
- replace `/input_file` endpoint with `/input_h5_file` and `/input_csv_file`
- update model, endpoints and runner accordingly
- validate csv file on frontend
  - parse first line and extract column names
  - require that "barcode", "cdr3", "chain" are present
  - resolves #3
- validate file sizes
  - h5 must be less than 50MB
  - csv must be less than 10MB
  - resolves #2
- increase nginx/flask body limit to 100MB
- make runner into a package, add initial test using requests-mock
---
.github/workflows/ci.yml | 5 +-
README_DEPLOYMENT.md | 4 +-
backend/src/predicTCR_server/app.py | 33 ++++--
backend/src/predicTCR_server/model.py | 17 ++-
backend/tests/helpers/flask_test_utils.py | 9 +-
backend/tests/test_app.py | 25 ++---
frontend/Dockerfile | 2 +
frontend/nginx.conf | 2 +-
frontend/pnpm-lock.yaml | 11 --
frontend/src/components/SamplesTable.vue | 28 ++++-
frontend/src/utils/api-client.ts | 30 +++---
frontend/src/views/SamplesView.vue | 113 +++++++++++++++-----
runner/pyproject.toml | 33 ++++++
runner/src/predicTCR_runner/__init__.py | 1 +
runner/src/predicTCR_runner/main.py | 31 ++++++
runner/{ => src/predicTCR_runner}/runner.py | 66 ++++--------
runner/tests/__init__.py | 0
runner/tests/test_runner.py | 9 ++
18 files changed, 282 insertions(+), 137 deletions(-)
create mode 100644 runner/pyproject.toml
create mode 100644 runner/src/predicTCR_runner/__init__.py
create mode 100644 runner/src/predicTCR_runner/main.py
rename runner/{ => src/predicTCR_runner}/runner.py (72%)
create mode 100644 runner/tests/__init__.py
create mode 100644 runner/tests/test_runner.py
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index db2cb8a..67f3542 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -56,7 +56,6 @@ jobs:
name: "Docker website"
steps:
- uses: actions/checkout@v4
- - run: echo "VITE_REST_API_LOCATION=https://predictcr.lkeegan.dev/api" > frontend/.env
- run: docker compose build
- uses: docker/login-action@v3
with:
@@ -92,8 +91,8 @@ jobs:
- uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- - run: pip install pytest
- - run: python -m pytest -sv
+ - run: pip install -e .[tests]
+ - run: pytest -sv
runner-docker:
runs-on: ubuntu-latest
name: "Docker runner"
diff --git a/README_DEPLOYMENT.md b/README_DEPLOYMENT.md
index 335e77d..abffaea 100644
--- a/README_DEPLOYMENT.md
+++ b/README_DEPLOYMENT.md
@@ -6,7 +6,7 @@ Some information on how to deploy the website.
Production docker container images are automatically built by CI.
To deploy the latest version on a virtual machine with docker compose installed,
-download [docker compose.yml](https://raw.githubusercontent.com/ssciwr/predicTCR/main/docker compose.yml), then do
+download [docker-compose.yml](https://raw.githubusercontent.com/ssciwr/predicTCR/main/docker-compose.yml), then do
```
sudo docker compose pull
@@ -16,7 +16,7 @@ sudo docker compose up -d
The location of data directory, SSL keys and secret key should be set
either in env vars or in a file `.env` in the same location as the docker compose.yml.
-For example the current deployment on heicloud looks like this:
+For example the current test deployment on heicloud looks like this:
```
PREDICTCR_DATA="/home/ubuntu/predicTCR/docker_volume"
diff --git a/backend/src/predicTCR_server/app.py b/backend/src/predicTCR_server/app.py
index f5fefc0..f47e2c2 100644
--- a/backend/src/predicTCR_server/app.py
+++ b/backend/src/predicTCR_server/app.py
@@ -47,8 +47,8 @@ def create_app(data_path: str = "/predictcr_data"):
app.config["JWT_ACCESS_TOKEN_EXPIRES"] = datetime.timedelta(minutes=60)
app.config["SQLALCHEMY_DATABASE_URI"] = f"sqlite:///{data_path}/predicTCR.db"
app.config["SQLALCHEMY_TRACK_MODIFICATIONS"] = False
- # limit max file upload size to 20mb
- app.config["MAX_CONTENT_LENGTH"] = 20 * 1024 * 1024
+ # limit max file upload size to 100mb
+ app.config["MAX_CONTENT_LENGTH"] = 100 * 1024 * 1024
app.config["PREDICTCR_DATA_PATH"] = data_path
CORS(app)
@@ -147,9 +147,9 @@ def change_password():
def samples():
return get_samples(current_user.email)
- @app.route("/api/input_file", methods=["POST"])
+ @app.route("/api/input_h5_file", methods=["POST"])
@jwt_required()
- def input_file():
+ def input_h5_file():
sample_id = request.json.get("sample_id", None)
logger.info(
f"User {current_user.email} requesting results for sample {sample_id}"
@@ -163,7 +163,25 @@ def input_file():
if user_sample is None:
logger.info(f" -> sample {sample_id} not found")
return jsonify(message="Sample not found"), 400
- return flask.send_file(user_sample.input_file_path(), as_attachment=True)
+ return flask.send_file(user_sample.input_h5_file_path(), as_attachment=True)
+
+ @app.route("/api/input_csv_file", methods=["POST"])
+ @jwt_required()
+ def input_csv_file():
+ sample_id = request.json.get("sample_id", None)
+ logger.info(
+ f"User {current_user.email} requesting results for sample {sample_id}"
+ )
+ filters = {"id": sample_id}
+ if not current_user.is_admin and not current_user.is_runner:
+ filters["email"] = current_user.email
+ user_sample = db.session.execute(
+ db.select(Sample).filter_by(**filters)
+ ).scalar_one_or_none()
+ if user_sample is None:
+ logger.info(f" -> sample {sample_id} not found")
+ return jsonify(message="Sample not found"), 400
+ return flask.send_file(user_sample.input_csv_file_path(), as_attachment=True)
@app.route("/api/result", methods=["POST"])
@jwt_required()
@@ -199,10 +217,11 @@ def add_sample():
name = form_as_dict.get("name", "")
tumor_type = form_as_dict.get("tumor_type", "")
source = form_as_dict.get("source", "")
- infile = request.files.get("file")
+ h5_file = request.files.get("h5_file")
+ csv_file = request.files.get("csv_file")
logger.info(f"Adding sample {name} from {email}")
new_sample, error_message = add_new_sample(
- email, name, tumor_type, source, infile
+ email, name, tumor_type, source, h5_file, csv_file
)
if new_sample is not None:
logger.info(" - > success")
diff --git a/backend/src/predicTCR_server/model.py b/backend/src/predicTCR_server/model.py
index 9435943..df5be0e 100644
--- a/backend/src/predicTCR_server/model.py
+++ b/backend/src/predicTCR_server/model.py
@@ -50,8 +50,11 @@ def _base_path(self) -> pathlib.Path:
data_path = flask.current_app.config["PREDICTCR_DATA_PATH"]
return pathlib.Path(f"{data_path}/{self.id}")
- def input_file_path(self) -> pathlib.Path:
- return self._base_path() / "input.zip"
+ def input_h5_file_path(self) -> pathlib.Path:
+ return self._base_path() / "input.h5"
+
+ def input_csv_file_path(self) -> pathlib.Path:
+ return self._base_path() / "input.csv"
def result_file_path(self) -> pathlib.Path:
return self._base_path() / "result.zip"
@@ -110,6 +113,7 @@ def get_samples(email: str | None = None) -> list[Sample]:
def request_job() -> int | None:
+ # todo: go through running jobs and reset to queued if they have been running for more than e.g. 2 hrs
selected_samples = (
db.select(Sample)
.filter(Sample.status == Status.QUEUED)
@@ -288,6 +292,7 @@ def enable_user(email: str, enabled: bool) -> tuple[str, int]:
if user is None:
logger.info(f" -> Unknown email address '{email}'")
return f"Unknown email address {email}", 400
+ user.activated = True
user.enabled = enabled
db.session.commit()
return f"Account {email} activated", 200
@@ -345,7 +350,8 @@ def add_new_sample(
name: str,
tumor_type: str,
source: str,
- input_file: FileStorage,
+ h5_file: FileStorage,
+ csv_file: FileStorage,
) -> tuple[Sample | None, str]:
user = db.session.execute(
db.select(User).filter(User.email == email)
@@ -378,6 +384,7 @@ def add_new_sample(
)
db.session.add(new_sample)
db.session.commit()
- new_sample.input_file_path().parent.mkdir(parents=True, exist_ok=True)
- input_file.save(new_sample.input_file_path())
+ new_sample.input_h5_file_path().parent.mkdir(parents=True, exist_ok=True)
+ h5_file.save(new_sample.input_h5_file_path())
+ csv_file.save(new_sample.input_csv_file_path())
return new_sample, ""
diff --git a/backend/tests/helpers/flask_test_utils.py b/backend/tests/helpers/flask_test_utils.py
index ed74aa1..b2af717 100644
--- a/backend/tests/helpers/flask_test_utils.py
+++ b/backend/tests/helpers/flask_test_utils.py
@@ -1,8 +1,6 @@
import argon2
from predicTCR_server.model import User, Sample, db, Status
import pathlib
-import shutil
-import tempfile
def add_test_users(app):
@@ -42,10 +40,9 @@ def add_test_samples(app, data_path: pathlib.Path):
):
ref_dir = data_path / f"{sample_id}"
ref_dir.mkdir(parents=True, exist_ok=True)
- with tempfile.TemporaryDirectory() as tmp_dir:
- with open(f"{tmp_dir}/test.txt", "w") as f:
- f.write(name)
- shutil.make_archive(f"{ref_dir}/input", "zip", tmp_dir)
+ for input_file_type in ["h5", "csv"]:
+ with open(f"{ref_dir}/input.{input_file_type}", "w") as f:
+ f.write(input_file_type)
new_sample = Sample(
email="user@abc.xy",
name=name,
diff --git a/backend/tests/test_app.py b/backend/tests/test_app.py
index 16e6b97..bca8739 100644
--- a/backend/tests/test_app.py
+++ b/backend/tests/test_app.py
@@ -1,7 +1,7 @@
from __future__ import annotations
from typing import Dict
import io
-import zipfile
+import pytest
import pathlib
import predicTCR_server
import flask_test_utils as ftu
@@ -129,17 +129,18 @@ def test_samples_valid(client):
assert len(response.json) == 4
-def test_input_file_invalid(client):
+@pytest.mark.parametrize("input_file_type", ["h5", "csv"])
+def test_input_file_invalid(client, input_file_type: str):
# no auth header
response = client.post(
- "/api/input_file",
+ f"/api/input_{input_file_type}_file",
json={"sample_id": 2},
)
assert response.status_code == 401
# invalid sample id
headers = _get_auth_headers(client)
response = client.post(
- "/api/input_file",
+ f"/api/input_{input_file_type}_file",
json={"sample_id": 66},
headers=headers,
)
@@ -147,18 +148,17 @@ def test_input_file_invalid(client):
assert "not found" in response.json["message"]
-def test_input_file_valid(client):
+@pytest.mark.parametrize("input_file_type", ["h5", "csv"])
+def test_input_file_valid(client, input_file_type: str):
headers = _get_auth_headers(client)
response = client.post(
- "/api/input_file",
+ f"/api/input_{input_file_type}_file",
json={"sample_id": 2},
headers=headers,
)
assert response.status_code == 200
- zip_file = zipfile.ZipFile(io.BytesIO(response.data))
- filenames = [f.filename for f in zip_file.filelist]
- assert len(filenames) == 1
- assert "test.txt" in filenames
+ with io.BytesIO(response.data) as f:
+ assert input_file_type in f.read().decode("utf-8")
def test_result_invalid(client):
@@ -222,14 +222,15 @@ def test_admin_runner_token_invalid(client):
assert response.status_code == 400
-def test_admin_runner_token_valid(client):
+@pytest.mark.parametrize("input_file_type", ["h5", "csv"])
+def test_admin_runner_token_valid(client, input_file_type: str):
headers = _get_auth_headers(client, "admin@abc.xy", "admin")
response = client.get("/api/admin/runner_token", headers=headers)
assert response.status_code == 200
new_token = response.json["access_token"]
assert (
client.post(
- "/api/input_file",
+ f"/api/input_{input_file_type}_file",
json={"sample_id": 1},
headers={"Authorization": f"Bearer {new_token}"},
).status_code
diff --git a/frontend/Dockerfile b/frontend/Dockerfile
index 7750396..d4a261d 100644
--- a/frontend/Dockerfile
+++ b/frontend/Dockerfile
@@ -14,6 +14,8 @@ RUN pnpm install
COPY . .
+RUN echo "VITE_REST_API_LOCATION=/api" > .env
+
RUN pnpm run build-only
FROM nginx
diff --git a/frontend/nginx.conf b/frontend/nginx.conf
index 119d9a8..472d6c1 100644
--- a/frontend/nginx.conf
+++ b/frontend/nginx.conf
@@ -7,7 +7,7 @@ server {
ssl_certificate_key /predictcr_ssl_key.pem;
# Maximum file upload size
- client_max_body_size 20M;
+ client_max_body_size 100M;
# Improve HTTPS performance with session resumption
ssl_session_cache shared:SSL:10m;
diff --git a/frontend/pnpm-lock.yaml b/frontend/pnpm-lock.yaml
index 4613b4d..ccb3041 100644
--- a/frontend/pnpm-lock.yaml
+++ b/frontend/pnpm-lock.yaml
@@ -13,9 +13,6 @@ importers:
bootstrap-icons:
specifier: ^1.11.3
version: 1.11.3
- jsbarcode:
- specifier: ^3.11.6
- version: 3.11.6
pinia:
specifier: ^2.2.2
version: 2.2.2(typescript@5.5.4)(vue@3.4.38(typescript@5.5.4))
@@ -2799,12 +2796,6 @@ packages:
}
hasBin: true
- jsbarcode@3.11.6:
- resolution:
- {
- integrity: sha512-G5TKGyKY1zJo0ZQKFM1IIMfy0nF2rs92BLlCz+cU4/TazIc4ZH+X1GYeDRt7TKjrYqmPfTjwTBkU/QnQlsYiuA==,
- }
-
jsdom@24.1.1:
resolution:
{
@@ -6243,8 +6234,6 @@ snapshots:
dependencies:
argparse: 2.0.1
- jsbarcode@3.11.6: {}
-
jsdom@24.1.1:
dependencies:
cssstyle: 4.0.1
diff --git a/frontend/src/components/SamplesTable.vue b/frontend/src/components/SamplesTable.vue
index 9bcd760..47260e9 100644
--- a/frontend/src/components/SamplesTable.vue
+++ b/frontend/src/components/SamplesTable.vue
@@ -1,6 +1,10 @@
@@ -114,21 +153,39 @@ function add_sample() {
-
+
+
+
+
+
diff --git a/runner/pyproject.toml b/runner/pyproject.toml
new file mode 100644
index 0000000..9b72c2a
--- /dev/null
+++ b/runner/pyproject.toml
@@ -0,0 +1,33 @@
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "predicTCR_runner"
+description = "predicTCR runner"
+readme = "README.md"
+maintainers = [{ name = "Liam Keegan", email = "ssc@iwr.uni-heidelberg.de" }]
+dynamic = ["version"]
+requires-python = ">=3.12"
+license = { text = "MIT" }
+classifiers = [
+ "Programming Language :: Python :: 3",
+ "Operating System :: OS Independent",
+ "License :: OSI Approved :: MIT License",
+]
+dependencies = [
+ "requests",
+ "click",
+]
+
+[project.scripts]
+predicTCR_runner = "predicTCR_runner.main:main"
+
+[project.optional-dependencies]
+tests = ["pytest", "requests-mock", ]
+
+[tool.setuptools.dynamic]
+version = { attr = "predicTCR_runner.__version__" }
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
diff --git a/runner/src/predicTCR_runner/__init__.py b/runner/src/predicTCR_runner/__init__.py
new file mode 100644
index 0000000..f102a9c
--- /dev/null
+++ b/runner/src/predicTCR_runner/__init__.py
@@ -0,0 +1 @@
+__version__ = "0.0.1"
diff --git a/runner/src/predicTCR_runner/main.py b/runner/src/predicTCR_runner/main.py
new file mode 100644
index 0000000..6419315
--- /dev/null
+++ b/runner/src/predicTCR_runner/main.py
@@ -0,0 +1,31 @@
+from __future__ import annotations
+
+import click
+import logging
+from .runner import Runner
+
+
+@click.command(context_settings={"auto_envvar_prefix": "PREDICTCR"})
+@click.option("--api-url", type=str)
+@click.option("--jwt-token", type=str)
+@click.option("--poll-interval", type=int, default=5, show_default=True)
+@click.option(
+ "--log-level",
+ default="INFO",
+ type=click.Choice(
+ ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], case_sensitive=False
+ ),
+ help="Log level",
+ show_default=True,
+ show_choices=True,
+)
+def main(api_url, jwt_token, poll_interval, log_level):
+ logging.basicConfig(
+ level=log_level, format="%(levelname)s %(module)s.%(funcName)s :: %(message)s"
+ )
+ runner = Runner(api_url, jwt_token, poll_interval)
+ runner.start()
+
+
+if __name__ == "__main__":
+ main(auto_envvar_prefix="PREDICTCR")
diff --git a/runner/runner.py b/runner/src/predicTCR_runner/runner.py
similarity index 72%
rename from runner/runner.py
rename to runner/src/predicTCR_runner/runner.py
index 8d2d854..cbc2f68 100644
--- a/runner/runner.py
+++ b/runner/src/predicTCR_runner/runner.py
@@ -3,12 +3,9 @@
import requests
import time
import logging
-import click
import os
import tempfile
import shutil
-import zipfile
-import io
import subprocess
@@ -78,25 +75,28 @@ def _upload_result(self, sample_id: int, result_file: str):
def _run_job(self, sample_id: int):
self.logger.info(f"Starting job for sample id {sample_id}...")
- self.logger.debug("Downloading input file...")
- response = requests.post(
- url=f"{self.api_url}/input_file",
- json={"sample_id": sample_id},
- headers=self.auth_header,
- timeout=30,
- )
- if response.status_code != 200:
- self.logger.error(f"Failed to download input file: {response.content}")
- return self._report_job_failed(
- sample_id, f"Failed to download input file on {self.runner_hostname}"
- )
+ self.logger.debug("Downloading input files...")
with tempfile.TemporaryDirectory(delete=False) as tmpdir:
- try:
- zip_file = zipfile.ZipFile(io.BytesIO(response.content))
- self.logger.debug(
- f" - extracting {zip_file.namelist()} to {tmpdir}..."
+ for input_file_type in ["h5", "csv"]:
+ response = requests.post(
+ url=f"{self.api_url}/input_{input_file_type}_file",
+ json={"sample_id": sample_id},
+ headers=self.auth_header,
+ timeout=30,
)
- zip_file.extractall(tmpdir)
+ if response.status_code != 200:
+ self.logger.error(
+ f"Failed to download {input_file_type}: {response.content}"
+ )
+ return self._report_job_failed(
+ sample_id,
+ f"Failed to download {input_file_type} on {self.runner_hostname}",
+ )
+ input_file_name = f"input.{input_file_type}"
+ self.logger.debug(f" - writing {input_file_name} to {tmpdir}...")
+ with open(f"{tmpdir}/{input_file_name}", "wb") as input_file:
+ input_file.write(response.content)
+ try:
self.logger.debug(
f" - copying contents of scripts folder to {tmpdir}..."
)
@@ -121,29 +121,3 @@ def start(self):
self._run_job(job_id)
else:
time.sleep(self.poll_interval)
-
-
-@click.command()
-@click.option("--api-url", type=str)
-@click.option("--jwt-token", type=str)
-@click.option("--poll-interval", type=int, default=5, show_default=True)
-@click.option(
- "--log-level",
- default="INFO",
- type=click.Choice(
- ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], case_sensitive=False
- ),
- help="Log level",
- show_default=True,
- show_choices=True,
-)
-def main(api_url, jwt_token, poll_interval, log_level):
- logging.basicConfig(
- level=log_level, format="%(levelname)s %(module)s.%(funcName)s :: %(message)s"
- )
- runner = Runner(api_url, jwt_token, poll_interval)
- runner.start()
-
-
-if __name__ == "__main__":
- main(auto_envvar_prefix="PREDICTCR")
diff --git a/runner/tests/__init__.py b/runner/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/runner/tests/test_runner.py b/runner/tests/test_runner.py
new file mode 100644
index 0000000..f044eaa
--- /dev/null
+++ b/runner/tests/test_runner.py
@@ -0,0 +1,9 @@
+from predicTCR_runner.runner import Runner
+
+
+def test_runner_request_job(requests_mock):
+ requests_mock.post("http://api/runner/request_job", status_code=204)
+ runner = Runner(api_url="http://api", jwt_token="abc")
+ assert runner._request_job() is None
+ requests_mock.post("http://api/runner/request_job", json={"sample_id": 44})
+ assert runner._request_job() == 44