From 06cfb849b0d8d378fe2d5360642c4b46764360ee Mon Sep 17 00:00:00 2001
From: Liam Keegan <liam@keegan.ch>
Date: Mon, 30 Sep 2024 10:59:15 +0200
Subject: [PATCH] Add CSV file upload

- replace single zip file upload with two file uploads: one h5 file, one csv file
- replace `/input_file` endpoint with `/input_h5_file` and `/input_csv_file`
- update model, endpoints and runner accordingly
- validate csv file on frontend
  - parse first line and extract column names
  - require that "barcode", "cdr3", "chain" are present
- resolves #3
- validate file sizes
  - h5 must be less than 50MB
  - csv must be less than 10MB
  - resolves #2
- increase nginx/flask body limit to 100MB
- make runner into a package, add initial test using requests-mock
---
 .github/workflows/ci.yml                    |   5 +-
 README_DEPLOYMENT.md                        |   4 +-
 backend/src/predicTCR_server/app.py         |  33 ++++--
 backend/src/predicTCR_server/model.py       |  17 ++-
 backend/tests/helpers/flask_test_utils.py   |   9 +-
 backend/tests/test_app.py                   |  25 ++---
 frontend/Dockerfile                         |   2 +
 frontend/nginx.conf                         |   2 +-
 frontend/pnpm-lock.yaml                     |  11 --
 frontend/src/components/SamplesTable.vue    |  28 ++++-
 frontend/src/utils/api-client.ts            |  30 +++---
 frontend/src/views/SamplesView.vue          | 113 +++++++++++++++-----
 runner/pyproject.toml                       |  33 ++++++
 runner/src/predicTCR_runner/__init__.py     |   1 +
 runner/src/predicTCR_runner/main.py         |  31 ++++++
 runner/{ => src/predicTCR_runner}/runner.py |  66 ++++--------
 runner/tests/__init__.py                    |   0
 runner/tests/test_runner.py                 |   9 ++
 18 files changed, 282 insertions(+), 137 deletions(-)
 create mode 100644 runner/pyproject.toml
 create mode 100644 runner/src/predicTCR_runner/__init__.py
 create mode 100644 runner/src/predicTCR_runner/main.py
 rename runner/{ => src/predicTCR_runner}/runner.py (72%)
 create mode 100644 runner/tests/__init__.py
 create mode 100644 runner/tests/test_runner.py

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index db2cb8a..67f3542 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -56,7 +56,6 @@ jobs:
     name: "Docker website"
     steps:
       - uses: actions/checkout@v4
-      - run: echo "VITE_REST_API_LOCATION=https://predictcr.lkeegan.dev/api" > frontend/.env
       - run: docker compose build
       - uses: docker/login-action@v3
         with:
@@ -92,8 +91,8 @@ jobs:
       - uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
-      - run: pip install pytest
-      - run: python -m pytest -sv
+      - run: pip install -e .[tests]
+      - run: pytest -sv
   runner-docker:
     runs-on: ubuntu-latest
     name: "Docker runner"
diff --git a/README_DEPLOYMENT.md b/README_DEPLOYMENT.md
index 335e77d..abffaea 100644
--- a/README_DEPLOYMENT.md
+++ b/README_DEPLOYMENT.md
@@ -6,7 +6,7 @@ Some information on how to deploy the website.
 
 Production docker container images are automatically built by CI.
 To deploy the latest version on a virtual machine with docker compose installed,
-download [docker compose.yml](https://raw.githubusercontent.com/ssciwr/predicTCR/main/docker compose.yml), then do
+download [docker-compose.yml](https://raw.githubusercontent.com/ssciwr/predicTCR/main/docker-compose.yml), then do
 
 ```
 sudo docker compose pull
@@ -16,7 +16,7 @@ sudo docker compose up -d
 The location of data directory, SSL keys and secret key should be set
 either in env vars or in a file `.env` in the same location as the docker compose.yml.
 
-For example the current deployment on heicloud looks like this:
+For example the current test deployment on heicloud looks like this:
 
 ```
 PREDICTCR_DATA="/home/ubuntu/predicTCR/docker_volume"
diff --git a/backend/src/predicTCR_server/app.py b/backend/src/predicTCR_server/app.py
index f5fefc0..f47e2c2 100644
--- a/backend/src/predicTCR_server/app.py
+++ b/backend/src/predicTCR_server/app.py
@@ -47,8 +47,8 @@ def create_app(data_path: str = "/predictcr_data"):
     app.config["JWT_ACCESS_TOKEN_EXPIRES"] = datetime.timedelta(minutes=60)
     app.config["SQLALCHEMY_DATABASE_URI"] = f"sqlite:///{data_path}/predicTCR.db"
     app.config["SQLALCHEMY_TRACK_MODIFICATIONS"] = False
-    # limit max file upload size to 20mb
-    app.config["MAX_CONTENT_LENGTH"] = 20 * 1024 * 1024
+    # limit max file upload size to 100mb
+    app.config["MAX_CONTENT_LENGTH"] = 100 * 1024 * 1024
     app.config["PREDICTCR_DATA_PATH"] = data_path
 
     CORS(app)
@@ -147,9 +147,9 @@ def change_password():
     def samples():
         return get_samples(current_user.email)
 
-    @app.route("/api/input_file", methods=["POST"])
+    @app.route("/api/input_h5_file", methods=["POST"])
     @jwt_required()
-    def input_file():
+    def input_h5_file():
         sample_id = request.json.get("sample_id", None)
         logger.info(
             f"User {current_user.email} requesting results for sample {sample_id}"
@@ -163,7 +163,25 @@ def input_file():
         if user_sample is None:
             logger.info(f"  -> sample {sample_id} not found")
             return jsonify(message="Sample not found"), 400
-        return flask.send_file(user_sample.input_file_path(), as_attachment=True)
+        return flask.send_file(user_sample.input_h5_file_path(), as_attachment=True)
+
+    @app.route("/api/input_csv_file", methods=["POST"])
+    @jwt_required()
+    def input_csv_file():
+        sample_id = request.json.get("sample_id", None)
+        logger.info(
+            f"User {current_user.email} requesting csv input for sample {sample_id}"
+        )
+        filters = {"id": sample_id}
+        if not current_user.is_admin and not current_user.is_runner:
+            filters["email"] = current_user.email  # users only see their own samples
+        user_sample = db.session.execute(
+            db.select(Sample).filter_by(**filters)
+        ).scalar_one_or_none()
+        if user_sample is None:
+            logger.info(f"  -> sample {sample_id} not found")
+            return jsonify(message="Sample not found"), 400
+        return flask.send_file(user_sample.input_csv_file_path(), as_attachment=True)
 
     @app.route("/api/result", methods=["POST"])
     @jwt_required()
@@ -199,10 +217,11 @@ def add_sample():
         name = form_as_dict.get("name", "")
         tumor_type = form_as_dict.get("tumor_type", "")
         source = form_as_dict.get("source", "")
-        infile = request.files.get("file")
+        h5_file = request.files.get("h5_file")
+        csv_file = request.files.get("csv_file")
         logger.info(f"Adding sample {name} from {email}")
         new_sample, error_message = add_new_sample(
-            email, name, tumor_type, source, infile
+            email, name, tumor_type, source, h5_file, csv_file
         )
         if new_sample is not None:
             logger.info("  - > success")
diff --git a/backend/src/predicTCR_server/model.py b/backend/src/predicTCR_server/model.py
index 9435943..df5be0e 100644
--- a/backend/src/predicTCR_server/model.py
+++ b/backend/src/predicTCR_server/model.py
@@ -50,8 +50,11 @@ def _base_path(self) -> pathlib.Path:
         data_path = flask.current_app.config["PREDICTCR_DATA_PATH"]
         return pathlib.Path(f"{data_path}/{self.id}")
 
-    def input_file_path(self) -> pathlib.Path:
-        return self._base_path() / "input.zip"
+    def input_h5_file_path(self) -> pathlib.Path:
+        return self._base_path() / "input.h5"
+
+    def input_csv_file_path(self) -> pathlib.Path:
+        return self._base_path() / "input.csv"
 
     def result_file_path(self) -> pathlib.Path:
         return self._base_path() / "result.zip"
@@ -110,6 +113,7 @@ def get_samples(email: str | None = None) -> list[Sample]:
 
 
 def request_job() -> int | None:
+    # todo: go through running jobs and reset to queued if they have been running for more than e.g. 2 hrs
     selected_samples = (
         db.select(Sample)
         .filter(Sample.status == Status.QUEUED)
@@ -288,6 +292,7 @@ def enable_user(email: str, enabled: bool) -> tuple[str, int]:
     if user is None:
         logger.info(f"  -> Unknown email address '{email}'")
         return f"Unknown email address {email}", 400
+    user.activated = True
     user.enabled = enabled
     db.session.commit()
     return f"Account {email} activated", 200
@@ -345,7 +350,8 @@ def add_new_sample(
     name: str,
     tumor_type: str,
     source: str,
-    input_file: FileStorage,
+    h5_file: FileStorage,
+    csv_file: FileStorage,
 ) -> tuple[Sample | None, str]:
     user = db.session.execute(
         db.select(User).filter(User.email == email)
@@ -378,6 +384,7 @@ def add_new_sample(
     )
     db.session.add(new_sample)
     db.session.commit()
-    new_sample.input_file_path().parent.mkdir(parents=True, exist_ok=True)
-    input_file.save(new_sample.input_file_path())
+    new_sample.input_h5_file_path().parent.mkdir(parents=True, exist_ok=True)
+    h5_file.save(new_sample.input_h5_file_path())
+    csv_file.save(new_sample.input_csv_file_path())
     return new_sample, ""
diff --git a/backend/tests/helpers/flask_test_utils.py b/backend/tests/helpers/flask_test_utils.py
index ed74aa1..b2af717 100644
--- a/backend/tests/helpers/flask_test_utils.py
+++ b/backend/tests/helpers/flask_test_utils.py
@@ -1,8 +1,6 @@
 import argon2
 from predicTCR_server.model import User, Sample, db, Status
 import pathlib
-import shutil
-import tempfile
 
 
 def add_test_users(app):
@@ -42,10 +40,9 @@ def add_test_samples(app, data_path: pathlib.Path):
         ):
             ref_dir = data_path / f"{sample_id}"
             ref_dir.mkdir(parents=True, exist_ok=True)
-            with tempfile.TemporaryDirectory() as tmp_dir:
-                with open(f"{tmp_dir}/test.txt", "w") as f:
-                    f.write(name)
-                shutil.make_archive(f"{ref_dir}/input", "zip", tmp_dir)
+            for input_file_type in ["h5", "csv"]:
+                with open(f"{ref_dir}/input.{input_file_type}", "w") as f:
+                    f.write(input_file_type)
             new_sample = Sample(
                 email="user@abc.xy",
                 name=name,
diff --git a/backend/tests/test_app.py b/backend/tests/test_app.py
index 16e6b97..bca8739 100644
--- a/backend/tests/test_app.py
+++ b/backend/tests/test_app.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 from typing import Dict
 import io
-import zipfile
+import pytest
 import pathlib
 import predicTCR_server
 import flask_test_utils as ftu
@@ -129,17 +129,18 @@ def test_samples_valid(client):
     assert len(response.json) == 4
 
 
-def test_input_file_invalid(client):
+@pytest.mark.parametrize("input_file_type", ["h5", "csv"])
+def test_input_file_invalid(client, input_file_type: str):
     # no auth header
     response = client.post(
-        "/api/input_file",
+        f"/api/input_{input_file_type}_file",
         json={"sample_id": 2},
     )
     assert response.status_code == 401
     # invalid sample id
     headers = _get_auth_headers(client)
     response = client.post(
-        "/api/input_file",
+        f"/api/input_{input_file_type}_file",
         json={"sample_id": 66},
         headers=headers,
     )
@@ -147,18 +148,17 @@ def test_input_file_invalid(client):
     assert "not found" in response.json["message"]
 
 
-def test_input_file_valid(client):
+@pytest.mark.parametrize("input_file_type", ["h5", "csv"])
+def test_input_file_valid(client, input_file_type: str):
     headers = _get_auth_headers(client)
     response = client.post(
-        "/api/input_file",
+        f"/api/input_{input_file_type}_file",
         json={"sample_id": 2},
         headers=headers,
     )
     assert response.status_code == 200
-    zip_file = zipfile.ZipFile(io.BytesIO(response.data))
-    filenames = [f.filename for f in zip_file.filelist]
-    assert len(filenames) == 1
-    assert "test.txt" in filenames
+    with io.BytesIO(response.data) as f:
+        assert input_file_type in f.read().decode("utf-8")
 
 
 def test_result_invalid(client):
@@ -222,14 +222,15 @@ def test_admin_runner_token_invalid(client):
     assert response.status_code == 400
 
 
-def test_admin_runner_token_valid(client):
+@pytest.mark.parametrize("input_file_type", ["h5", "csv"])
+def test_admin_runner_token_valid(client, input_file_type: str):
     headers = _get_auth_headers(client, "admin@abc.xy", "admin")
     response = client.get("/api/admin/runner_token", headers=headers)
     assert response.status_code == 200
     new_token = response.json["access_token"]
     assert (
         client.post(
-            "/api/input_file",
+            f"/api/input_{input_file_type}_file",
             json={"sample_id": 1},
             headers={"Authorization": f"Bearer {new_token}"},
         ).status_code
diff --git a/frontend/Dockerfile b/frontend/Dockerfile
index 7750396..d4a261d 100644
--- a/frontend/Dockerfile
+++ b/frontend/Dockerfile
@@ -14,6 +14,8 @@ RUN pnpm install
 
 COPY . .
 
+RUN echo "VITE_REST_API_LOCATION=/api" > .env
+
 RUN pnpm run build-only
 
 FROM nginx
diff --git a/frontend/nginx.conf b/frontend/nginx.conf
index 119d9a8..472d6c1 100644
--- a/frontend/nginx.conf
+++ b/frontend/nginx.conf
@@ -7,7 +7,7 @@ server {
         ssl_certificate_key  /predictcr_ssl_key.pem;
 
         # Maximum file upload size
-        client_max_body_size 20M;
+        client_max_body_size 100M;
 
         # Improve HTTPS performance with session resumption
         ssl_session_cache shared:SSL:10m;
diff --git a/frontend/pnpm-lock.yaml b/frontend/pnpm-lock.yaml
index 4613b4d..ccb3041 100644
--- a/frontend/pnpm-lock.yaml
+++ b/frontend/pnpm-lock.yaml
@@ -13,9 +13,6 @@ importers:
       bootstrap-icons:
         specifier: ^1.11.3
         version: 1.11.3
-      jsbarcode:
-        specifier: ^3.11.6
-        version: 3.11.6
       pinia:
         specifier: ^2.2.2
         version: 2.2.2(typescript@5.5.4)(vue@3.4.38(typescript@5.5.4))
@@ -2799,12 +2796,6 @@ packages:
       }
     hasBin: true
 
-  jsbarcode@3.11.6:
-    resolution:
-      {
-        integrity: sha512-G5TKGyKY1zJo0ZQKFM1IIMfy0nF2rs92BLlCz+cU4/TazIc4ZH+X1GYeDRt7TKjrYqmPfTjwTBkU/QnQlsYiuA==,
-      }
-
   jsdom@24.1.1:
     resolution:
       {
@@ -6243,8 +6234,6 @@ snapshots:
     dependencies:
       argparse: 2.0.1
 
-  jsbarcode@3.11.6: {}
-
   jsdom@24.1.1:
     dependencies:
       cssstyle: 4.0.1
diff --git a/frontend/src/components/SamplesTable.vue b/frontend/src/components/SamplesTable.vue
index 9bcd760..47260e9 100644
--- a/frontend/src/components/SamplesTable.vue
+++ b/frontend/src/components/SamplesTable.vue
@@ -1,6 +1,10 @@
 <script setup lang="ts">
 // @ts-ignore
-import { download_input_file, download_result } from "@/utils/api-client";
+import {
+  download_input_csv_file,
+  download_input_h5_file,
+  download_result,
+} from "@/utils/api-client";
 import type { Sample } from "@/utils/types";
 
 defineProps<{
@@ -19,7 +23,8 @@ defineProps<{
       <th>Tumor type</th>
       <th>Source</th>
       <th>Status</th>
-      <th>Input file</th>
+      <th>Input H5 file</th>
+      <th>Input csv file</th>
       <th>Results</th>
     </tr>
     <tr v-for="sample in samples" :key="sample.id">
@@ -31,11 +36,26 @@ defineProps<{
       <td>{{ sample["source"] }}</td>
       <td>{{ sample["status"] }}</td>
       <td>
-        <a href="" @click.prevent="download_input_file(sample['id'])"> zip </a>
+        <a
+          href=""
+          @click.prevent="download_input_h5_file(sample.id, sample.name)"
+        >
+          input.h5
+        </a>
+      </td>
+      <td>
+        <a
+          href=""
+          @click.prevent="download_input_csv_file(sample.id, sample.name)"
+        >
+          input.csv
+        </a>
       </td>
       <td>
         <template v-if="sample.has_results_zip">
-          <a href="" @click.prevent="download_result(sample.id)">zip</a>
+          <a href="" @click.prevent="download_result(sample.id, sample.name)"
+            >zip</a
+          >
         </template>
         <template v-else> - </template>
       </td>
diff --git a/frontend/src/utils/api-client.ts b/frontend/src/utils/api-client.ts
index 8da00c1..f6d4527 100644
--- a/frontend/src/utils/api-client.ts
+++ b/frontend/src/utils/api-client.ts
@@ -3,7 +3,7 @@ import router from "@/router";
 import type { AxiosInstance } from "axios";
 import { useUserStore } from "@/stores/user";
 
-const apiClient: AxiosInstance = axios.create({
+export const apiClient: AxiosInstance = axios.create({
   baseURL: import.meta.env.VITE_REST_API_LOCATION,
   headers: {
     "Content-type": "application/json",
@@ -23,9 +23,6 @@ function download_file_from_endpoint(
 ) {
   apiClient
     .post(endpoint, json, {
-      headers: {
-        "Content-Type": "multipart/form-data",
-      },
       responseType: "blob",
     })
     .then((response) => {
@@ -43,27 +40,36 @@ function download_file_from_endpoint(
     });
 }
 
-function download_input_file(sample_id: number) {
+export function download_input_h5_file(sample_id: number, sample_name: string) {
   download_file_from_endpoint(
-    "input_file",
+    "input_h5_file",
     { sample_id: sample_id },
-    `${sample_id}_input_file.zip`,
+    `${sample_name}.h5`,
   );
 }
 
-function download_result(sample_id: number) {
+export function download_input_csv_file(
+  sample_id: number,
+  sample_name: string,
+) {
+  download_file_from_endpoint(
+    "input_csv_file",
+    { sample_id: sample_id },
+    `${sample_name}.csv`,
+  );
+}
+
+export function download_result(sample_id: number, sample_name: string) {
   download_file_from_endpoint(
     "result",
     { sample_id: sample_id },
-    `${sample_id}.zip`,
+    `${sample_name}.zip`,
   );
 }
 
-function logout() {
+export function logout() {
   const user = useUserStore();
   user.user = null;
   user.token = "";
   router.push({ name: "login" });
 }
-
-export { apiClient, logout, download_input_file, download_result };
diff --git a/frontend/src/views/SamplesView.vue b/frontend/src/views/SamplesView.vue
index 04be7dc..bc9b180 100644
--- a/frontend/src/views/SamplesView.vue
+++ b/frontend/src/views/SamplesView.vue
@@ -7,32 +7,72 @@ import type { Sample } from "@/utils/types";
 
 const tumor_types = ["lung", "breast", "other"];
 const sources = ["TIL", "PBMC", "other"];
+const required_columns = ["barcode", "cdr3", "chain"];
 
 const sample_name = ref("");
 const tumor_type = ref("lung");
 const source = ref("TIL");
-const selected_files = ref(null as null | FileList);
-const file_input_key = ref(0);
+const selected_h5_file = ref(null as null | File);
+const h5_file_input_key = ref(0);
+const selected_csv_file = ref(null as null | File);
+const csv_file_input_key = ref(0);
 const new_sample_error_message = ref("");
 
-function on_file_changed(event: Event) {
-  const max_upload_size_mb = 20;
-  let total_upload_size_bytes = 0;
+function on_h5_file_changed(event: Event) {
+  const max_upload_size_mb = 50;
   const target = event.target as HTMLInputElement;
   if (target.files != null && target.files.length > 0) {
-    selected_files.value = target.files;
-    for (const selected_file of target.files) {
-      total_upload_size_bytes += selected_file.size;
+    selected_h5_file.value = target.files[0];
+    if (selected_h5_file.value.size > 1024 * 1024 * max_upload_size_mb) {
+      selected_h5_file.value = null;
+      h5_file_input_key.value++;
+      window.alert(
+        `Provided h5 file exceeds maximum allowed upload size of ${max_upload_size_mb}MB`,
+      );
     }
-    if (total_upload_size_bytes > 1024 * 1024 * max_upload_size_mb) {
-      selected_files.value = null;
-      file_input_key.value++;
+  } else {
+    selected_h5_file.value = null;
+  }
+}
+
+async function validate_csv_file(file: File) {
+  // Only the header line is inspected. Accept CRLF as well as LF endings so the
+  // last column of a Windows-exported file doesn't keep a trailing "\r".
+  const text = await file.text();
+  const header = text.split(/\r?\n/, 1)[0];
+  // Tolerate whitespace and optional double quotes around column names
+  const columns = header
+    .split(",")
+    .map((column) => column.trim().replace(/^"(.*)"$/, "$1"));
+  for (const required_column of required_columns) {
+    if (!columns.includes(required_column)) {
+      console.log(`Missing header: ${required_column}`);
+      return false;
+    }
+  }
+  return true;
+}
+
+async function on_csv_file_changed(event: Event) {
+  const max_upload_size_mb = 10;
+  const target = event.target as HTMLInputElement;
+  if (target.files != null && target.files.length > 0) {
+    selected_csv_file.value = target.files[0];
+    if (selected_csv_file.value.size > 1024 * 1024 * max_upload_size_mb) {
+      selected_csv_file.value = null;
+      csv_file_input_key.value++;
+      window.alert(
+        `Provided csv file exceeds maximum allowed upload size of ${max_upload_size_mb}MB`,
+      );
+    } else if (!(await validate_csv_file(selected_csv_file.value as File))) {
+      selected_csv_file.value = null;
+      csv_file_input_key.value++;
       window.alert(
-        `Selected files exceed maximum upload size of ${max_upload_size_mb}MB`,
+        `Provided csv file doesn't contain the required columns ${required_columns}`,
       );
     }
   } else {
-    selected_files.value = null;
+    selected_csv_file.value = null;
   }
 }
 
@@ -56,11 +96,8 @@ function add_sample() {
   formData.append("name", sample_name.value);
   formData.append("tumor_type", tumor_type.value);
   formData.append("source", source.value);
-  if (selected_files.value !== null) {
-    for (const file of selected_files.value) {
-      formData.append("file", file);
-    }
-  }
+  formData.append("h5_file", selected_h5_file.value as File);
+  formData.append("csv_file", selected_csv_file.value as File);
   apiClient
     .post("sample", formData, {
       headers: {
@@ -78,8 +115,10 @@ function add_sample() {
       new_sample_error_message.value = error.response.data.message;
     });
   sample_name.value = "";
-  selected_files.value = null;
-  file_input_key.value++;
+  selected_h5_file.value = null;
+  h5_file_input_key.value++;
+  selected_csv_file.value = null;
+  csv_file_input_key.value++;
 }
 </script>
 
@@ -114,21 +153,39 @@ function add_sample() {
           </select>
         </p>
         <p>
-          <label for="input_file">Input file:</label>
+          <label for="input_h5_file">H5 input file:</label>
+          <input
+            type="file"
+            id="input_h5_file"
+            name="h5 file"
+            :multiple="false"
+            @change="on_h5_file_changed($event)"
+            :key="h5_file_input_key"
+            accept=".h5,.he5,.hdf5"
+            title="Select the h5 file to upload"
+          />
+        </p>
+        <p>
+          <label for="input_csv_file">CSV input file:</label>
           <input
             type="file"
-            id="input_file"
-            name="file"
-            :multiple="true"
-            @change="on_file_changed($event)"
-            :key="file_input_key"
-            title="Upload the input file"
+            id="input_csv_file"
+            name="csv file"
+            :multiple="false"
+            @change="on_csv_file_changed($event)"
+            :key="csv_file_input_key"
+            accept=".csv,.txt"
+            title="Select the csv file to upload"
           />
         </p>
         <p>
           <input
             type="submit"
-            :disabled="selected_files === null || sample_name.length === 0"
+            :disabled="
+              selected_h5_file === null ||
+              selected_csv_file === null ||
+              sample_name.length === 0
+            "
           />
         </p>
         <div class="error-message">
diff --git a/runner/pyproject.toml b/runner/pyproject.toml
new file mode 100644
index 0000000..9b72c2a
--- /dev/null
+++ b/runner/pyproject.toml
@@ -0,0 +1,33 @@
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "predicTCR_runner"
+description = "predicTCR runner"
+readme = "README.md"
+maintainers = [{ name = "Liam Keegan", email = "ssc@iwr.uni-heidelberg.de" }]
+dynamic = ["version"]
+requires-python = ">=3.10"
+license = { text = "MIT" }
+classifiers = [
+  "Programming Language :: Python :: 3",
+  "Operating System :: OS Independent",
+  "License :: OSI Approved :: MIT License",
+]
+dependencies = [
+  "requests",
+  "click",
+]
+
+[project.scripts]
+predicTCR_runner = "predicTCR_runner.main:main"
+
+[project.optional-dependencies]
+tests = ["pytest",  "requests-mock", ]
+
+[tool.setuptools.dynamic]
+version = { attr = "predicTCR_runner.__version__" }
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
diff --git a/runner/src/predicTCR_runner/__init__.py b/runner/src/predicTCR_runner/__init__.py
new file mode 100644
index 0000000..f102a9c
--- /dev/null
+++ b/runner/src/predicTCR_runner/__init__.py
@@ -0,0 +1 @@
+__version__ = "0.0.1"
diff --git a/runner/src/predicTCR_runner/main.py b/runner/src/predicTCR_runner/main.py
new file mode 100644
index 0000000..6419315
--- /dev/null
+++ b/runner/src/predicTCR_runner/main.py
@@ -0,0 +1,31 @@
+from __future__ import annotations
+
+import click
+import logging
+from .runner import Runner
+
+
+@click.command(context_settings={"auto_envvar_prefix": "PREDICTCR"})
+@click.option("--api-url", type=str)
+@click.option("--jwt-token", type=str)
+@click.option("--poll-interval", type=int, default=5, show_default=True)
+@click.option(
+    "--log-level",
+    default="INFO",
+    type=click.Choice(
+        ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], case_sensitive=False
+    ),
+    help="Log level",
+    show_default=True,
+    show_choices=True,
+)
+def main(api_url, jwt_token, poll_interval, log_level):
+    logging.basicConfig(
+        level=log_level, format="%(levelname)s %(module)s.%(funcName)s :: %(message)s"
+    )
+    runner = Runner(api_url, jwt_token, poll_interval)
+    runner.start()
+
+
+if __name__ == "__main__":
+    main(auto_envvar_prefix="PREDICTCR")
diff --git a/runner/runner.py b/runner/src/predicTCR_runner/runner.py
similarity index 72%
rename from runner/runner.py
rename to runner/src/predicTCR_runner/runner.py
index 8d2d854..cbc2f68 100644
--- a/runner/runner.py
+++ b/runner/src/predicTCR_runner/runner.py
@@ -3,12 +3,9 @@
 import requests
 import time
 import logging
-import click
 import os
 import tempfile
 import shutil
-import zipfile
-import io
 import subprocess
 
 
@@ -78,25 +75,28 @@ def _upload_result(self, sample_id: int, result_file: str):
 
     def _run_job(self, sample_id: int):
         self.logger.info(f"Starting job for sample id {sample_id}...")
-        self.logger.debug("Downloading input file...")
-        response = requests.post(
-            url=f"{self.api_url}/input_file",
-            json={"sample_id": sample_id},
-            headers=self.auth_header,
-            timeout=30,
-        )
-        if response.status_code != 200:
-            self.logger.error(f"Failed to download input file: {response.content}")
-            return self._report_job_failed(
-                sample_id, f"Failed to download input file on {self.runner_hostname}"
-            )
+        self.logger.debug("Downloading input files...")
         with tempfile.TemporaryDirectory(delete=False) as tmpdir:
-            try:
-                zip_file = zipfile.ZipFile(io.BytesIO(response.content))
-                self.logger.debug(
-                    f"  - extracting {zip_file.namelist()} to {tmpdir}..."
+            for input_file_type in ["h5", "csv"]:  # server routes: /input_<type>_file
+                response = requests.post(
+                    url=f"{self.api_url}/input_{input_file_type}_file",
+                    json={"sample_id": sample_id},
+                    headers=self.auth_header,
+                    timeout=30,
                 )
-                zip_file.extractall(tmpdir)
+                if response.status_code != 200:
+                    self.logger.error(
+                        f"Failed to download {input_file_type}: {response.content}"
+                    )
+                    return self._report_job_failed(
+                        sample_id,
+                        f"Failed to download {input_file_type} on {self.runner_hostname}",
+                    )
+                input_file_name = f"input.{input_file_type}"
+                self.logger.debug(f"  - writing {input_file_name} to {tmpdir}...")
+                with open(f"{tmpdir}/{input_file_name}", "wb") as input_file:
+                    input_file.write(response.content)
+            try:
                 self.logger.debug(
                     f"  - copying contents of scripts folder to {tmpdir}..."
                 )
@@ -121,29 +121,3 @@ def start(self):
                 self._run_job(job_id)
             else:
                 time.sleep(self.poll_interval)
-
-
-@click.command()
-@click.option("--api-url", type=str)
-@click.option("--jwt-token", type=str)
-@click.option("--poll-interval", type=int, default=5, show_default=True)
-@click.option(
-    "--log-level",
-    default="INFO",
-    type=click.Choice(
-        ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], case_sensitive=False
-    ),
-    help="Log level",
-    show_default=True,
-    show_choices=True,
-)
-def main(api_url, jwt_token, poll_interval, log_level):
-    logging.basicConfig(
-        level=log_level, format="%(levelname)s %(module)s.%(funcName)s :: %(message)s"
-    )
-    runner = Runner(api_url, jwt_token, poll_interval)
-    runner.start()
-
-
-if __name__ == "__main__":
-    main(auto_envvar_prefix="PREDICTCR")
diff --git a/runner/tests/__init__.py b/runner/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/runner/tests/test_runner.py b/runner/tests/test_runner.py
new file mode 100644
index 0000000..f044eaa
--- /dev/null
+++ b/runner/tests/test_runner.py
@@ -0,0 +1,9 @@
+from predicTCR_runner.runner import Runner
+
+
+def test_runner_request_job(requests_mock):
+    requests_mock.post("http://api/runner/request_job", status_code=204)
+    runner = Runner(api_url="http://api", jwt_token="abc")
+    assert runner._request_job() is None
+    requests_mock.post("http://api/runner/request_job", json={"sample_id": 44})
+    assert runner._request_job() == 44