From 6f699f78b4e502ac9c1fea2b8e8a09dae622adae Mon Sep 17 00:00:00 2001
From: Manuel Luypaert <luypaert@ebi.ac.uk>
Date: Wed, 6 Mar 2024 10:51:54 +0000
Subject: [PATCH 01/32] Built basic framework for seq retrieval component

---
 pipeline/seq_retrieval/Dockerfile       | 11 +++++++++
 pipeline/seq_retrieval/Makefile         |  2 ++
 pipeline/seq_retrieval/README.md        | 32 +++++++++++++++++++++++++
 pipeline/seq_retrieval/requirements.txt |  1 +
 pipeline/seq_retrieval/src/main.py      | 12 ++++++++++
 5 files changed, 58 insertions(+)
 create mode 100644 pipeline/seq_retrieval/Dockerfile
 create mode 100644 pipeline/seq_retrieval/Makefile
 create mode 100644 pipeline/seq_retrieval/README.md
 create mode 100644 pipeline/seq_retrieval/requirements.txt
 create mode 100644 pipeline/seq_retrieval/src/main.py

diff --git a/pipeline/seq_retrieval/Dockerfile b/pipeline/seq_retrieval/Dockerfile
new file mode 100644
index 00000000..949423a4
--- /dev/null
+++ b/pipeline/seq_retrieval/Dockerfile
@@ -0,0 +1,11 @@
+FROM python:3.12-alpine
+
+WORKDIR /usr/src/app
+
+COPY requirements.txt ./
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY src/ ./
+
+ENTRYPOINT [ "python", "main.py"]
+CMD [ "--help" ]
diff --git a/pipeline/seq_retrieval/Makefile b/pipeline/seq_retrieval/Makefile
new file mode 100644
index 00000000..d8a0daec
--- /dev/null
+++ b/pipeline/seq_retrieval/Makefile
@@ -0,0 +1,2 @@
+container-image:
+	docker build -t agr_pavi/seq_retrieval .
diff --git a/pipeline/seq_retrieval/README.md b/pipeline/seq_retrieval/README.md
new file mode 100644
index 00000000..242eeb1e
--- /dev/null
+++ b/pipeline/seq_retrieval/README.md
@@ -0,0 +1,32 @@
+# PAVI Sequence retrieval
+This subdirectory contains all code and configs for the PAVI sequence retrieval component.
+
+## Development
+In order to enable isolated local development that does not interfere with the global system python setup,
+a virtual environment is used to do code development and testing for this component.
+
+To start developing on a new system, create a virtual environment using the following command
+(having this directory as your working directory):
+```bash
+python3.12 -m venv ./venv
+```
+
+Then when developing this component, activated the virtual environment with:
+```bash
+source venv/bin/activate
+```
+Once the virtual environment is activated, all packages will be installed in this isolated virtual environment.
+To install all python package dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+To upgrade all packages (already installed) to the latest available version matching the requirements:
+```bash
+pip install -U -r requirements.txt
+```
+
+Once ending development for this component, deactivate the virtual environment with:
+```bash
+deactivate
+```
diff --git a/pipeline/seq_retrieval/requirements.txt b/pipeline/seq_retrieval/requirements.txt
new file mode 100644
index 00000000..5a1f42a8
--- /dev/null
+++ b/pipeline/seq_retrieval/requirements.txt
@@ -0,0 +1 @@
+click==8.1.*
diff --git a/pipeline/seq_retrieval/src/main.py b/pipeline/seq_retrieval/src/main.py
new file mode 100644
index 00000000..2ece0e0f
--- /dev/null
+++ b/pipeline/seq_retrieval/src/main.py
@@ -0,0 +1,12 @@
+import click
+
+@click.command()
+@click.option("--seq_id", required=True, help="The sequence ID to retrieve sequences for.")
+@click.option("--seq_strand", required=True, help="The sequence strand to retrieve sequences for.")
+@click.option("--genes", required=True, help="A list of gene maps to retrieve sequences for.")
+def main(seq_id, seq_strand, genes):
+    """Main method for sequence retrieval from JBrowse fastaidx indexed fasta files."""
+    click.echo(f"Received request to retrieve sequences for {seq_id} strand {seq_strand}!")
+
+if __name__ == '__main__':
+    main()

From 33396e2ba4effd05acc852f8ea43fbf27d6d2d8e Mon Sep 17 00:00:00 2001
From: Manuel Luypaert <luypaert@ebi.ac.uk>
Date: Thu, 7 Mar 2024 11:47:54 +0000
Subject: [PATCH 02/32] Updated input params to match single sequence retrieval
 and added input validation

---
 pipeline/seq_retrieval/src/main.py | 60 +++++++++++++++++++++++++++---
 1 file changed, 54 insertions(+), 6 deletions(-)

diff --git a/pipeline/seq_retrieval/src/main.py b/pipeline/seq_retrieval/src/main.py
index 2ece0e0f..b98edd08 100644
--- a/pipeline/seq_retrieval/src/main.py
+++ b/pipeline/seq_retrieval/src/main.py
@@ -1,12 +1,60 @@
 import click
+import json
+
+def validate_strand(ctx, param, value):
+    """Returns a normalised version of strings representing a strand.
+    Negative strand is normalised to '-', positive strand to '+'.
+    Throws a click.BadParameter exception if an unrecognised string was provided."""
+    POS_CHOICES = ['+', '+1', 'pos']
+    NEG_CHOICES = ['-', '-1', 'neg']
+    if   value in POS_CHOICES:
+        return '+'
+    elif value in NEG_CHOICES:
+        return '-'
+    else:
+        raise click.BadParameter(f"Must be one of {POS_CHOICES} for positive strand, or {NEG_CHOICES} for negative strand.")
+
+def process_seq_regions(ctx, param, value):
+    """Parse the seq_regions parameter value and validate it's structure.
+    Value is expected to be a JSON-formatted list of sequence regions to retrieve.
+    Each region should have:
+     * a 'start' property indicating the region start (inclusive)
+     * a 'end' property indicating the region end (inclusive)
+    Throws a click.BadParameter exception if value could not be parsed as JSON or had an invalid structure."""
+    seq_regions = None
+    try:
+        seq_regions = json.loads(value)
+    except:
+        raise click.BadParameter(f"Must be a valid JSON-formatted string.")
+    else:
+        if not isinstance(seq_regions, list):
+            raise click.BadParameter("Must be a valid list (JSON-array) of sequence regions to retrieve.")
+        for region in seq_regions:
+            if not isinstance(region, dict):
+                raise click.BadParameter(f"Region {region} is not a valid dict. All regions in seq_regions list must be valid dicts (JSON-objects).")
+            if 'start' not in region.keys():
+                raise click.BadParameter(f"Region {region} does not have a 'start' property, which is a required property.")
+            if 'end' not in region.keys():
+                raise click.BadParameter(f"Region {region} does not have a 'end' property, which is a required property.")
+            if not isinstance(region['start'], int):
+                raise click.BadParameter(f"'start' property of region {region} is not an integer. All positions must be integers.")
+            if not isinstance(region['end'], int):
+                raise click.BadParameter(f"'end' property of region {region} is not an integer. All positions must be integers.")
+
+        return seq_regions
 
 @click.command()
-@click.option("--seq_id", required=True, help="The sequence ID to retrieve sequences for.")
-@click.option("--seq_strand", required=True, help="The sequence strand to retrieve sequences for.")
-@click.option("--genes", required=True, help="A list of gene maps to retrieve sequences for.")
-def main(seq_id, seq_strand, genes):
-    """Main method for sequence retrieval from JBrowse fastaidx indexed fasta files."""
-    click.echo(f"Received request to retrieve sequences for {seq_id} strand {seq_strand}!")
+@click.option("--seq_id", type=click.STRING, required=True,
+              help="The sequence ID to retrieve sequences for.")
+@click.option("--seq_strand", type=click.STRING, default='+', callback=validate_strand,
+              help="The sequence strand to retrieve sequences for.")
+@click.option("--seq_regions", type=click.UNPROCESSED, required=True, callback=process_seq_regions,
+              help="A list of sequence regions to retrieve sequences for.")
+def main(seq_id, seq_strand, seq_regions):
+    """Main method for sequence retrieval from JBrowse fastaidx indexed fasta files.
+    Returns a single (transcript) sequence made by concatenating all sequence regions requested
+    (in specified order)."""
+    click.echo(f"Received request to retrieve sequences for {seq_id}, strand {seq_strand}, seq_regions {seq_regions}!")
 
 if __name__ == '__main__':
     main()

From 14090ac23d48fe10d7b2dd0cd5a1cc9b55e8ee25 Mon Sep 17 00:00:00 2001
From: Manuel Luypaert <luypaert@ebi.ac.uk>
Date: Thu, 7 Mar 2024 14:08:47 +0000
Subject: [PATCH 03/32] Added code for basic (+ strand) seq retrieval

---
 pipeline/seq_retrieval/requirements.txt |  1 +
 pipeline/seq_retrieval/src/faidx.py     | 19 +++++++++++++++++++
 pipeline/seq_retrieval/src/main.py      | 22 ++++++++++++++++++++--
 3 files changed, 40 insertions(+), 2 deletions(-)
 create mode 100644 pipeline/seq_retrieval/src/faidx.py

diff --git a/pipeline/seq_retrieval/requirements.txt b/pipeline/seq_retrieval/requirements.txt
index 5a1f42a8..9dc491f0 100644
--- a/pipeline/seq_retrieval/requirements.txt
+++ b/pipeline/seq_retrieval/requirements.txt
@@ -1 +1,2 @@
 click==8.1.*
+pysam==0.22.*
diff --git a/pipeline/seq_retrieval/src/faidx.py b/pipeline/seq_retrieval/src/faidx.py
new file mode 100644
index 00000000..f44e4aca
--- /dev/null
+++ b/pipeline/seq_retrieval/src/faidx.py
@@ -0,0 +1,19 @@
+"""
+Module containing all functions used to handle and access faidx files.
+"""
+import pysam
+
+def get_seq(fasta_file_path, seq_id, seq_start, seq_end, seq_strand):
+    """Return sequence found at `seq_id`:`seq_start`-`seq_end`:`seq_strand`
+    by reading from faidx files at `file_path`. Use 1-based, fully inclusive coordinates."""
+    try:
+        fasta_file = pysam.FastaFile(fasta_file_path)
+    except ValueError:
+        raise FileNotFoundError(f"Missing index file matching path {fasta_file_path}.")
+    except IOError:
+        raise IOError(f"Error while reading fasta file or index matching path {fasta_file_path}.")
+    else:
+        seq = fasta_file.fetch(reference=seq_id, start=seq_start-1, end=seq_end)
+
+        fasta_file.close()
+        return seq
diff --git a/pipeline/seq_retrieval/src/main.py b/pipeline/seq_retrieval/src/main.py
index b98edd08..f40dd174 100644
--- a/pipeline/seq_retrieval/src/main.py
+++ b/pipeline/seq_retrieval/src/main.py
@@ -1,6 +1,8 @@
 import click
 import json
 
+import faidx
+
 def validate_strand(ctx, param, value):
     """Returns a normalised version of strings representing a strand.
     Negative strand is normalised to '-', positive strand to '+'.
@@ -50,11 +52,27 @@ def process_seq_regions(ctx, param, value):
               help="The sequence strand to retrieve sequences for.")
 @click.option("--seq_regions", type=click.UNPROCESSED, required=True, callback=process_seq_regions,
               help="A list of sequence regions to retrieve sequences for.")
-def main(seq_id, seq_strand, seq_regions):
-    """Main method for sequence retrieval from JBrowse fastaidx indexed fasta files.
+@click.option("--fasta_file_path", type=click.STRING, required=True,
+              help="""Path to (local) faidx-indexed fasta file.\
+                   Assumes the fasta file is compressed,
+                   and an index file is found at `<fasta_file_path>.gzi`.""")
+def main(seq_id, seq_strand, seq_regions, fasta_file_path):
+    """Main method for sequence retrieval from JBrowse faidx indexed fasta files.
     Returns a single (transcript) sequence made by concatenating all sequence regions requested
     (in specified order)."""
     click.echo(f"Received request to retrieve sequences for {seq_id}, strand {seq_strand}, seq_regions {seq_regions}!")
 
+    #Retrieve sequence for each region
+    click.echo(f"\nRegion seqs:")
+    for region in seq_regions:
+        seq = faidx.get_seq(seq_id=seq_id, seq_start=region['start'], seq_end=region['end'], seq_strand=seq_strand,
+                            fasta_file_path=fasta_file_path)
+        click.echo(seq)
+        region['seq'] = seq
+
+    #Concatenate all regions into single sequence
+    seq_concat = ''.join(map(lambda region:region['seq'], seq_regions))
+    click.echo(f"\nSeq concat: {seq_concat}")
+
 if __name__ == '__main__':
     main()

From 29c4dff2ab3f66cb377ec6607a4a786304514f65 Mon Sep 17 00:00:00 2001
From: Manuel Luypaert <luypaert@ebi.ac.uk>
Date: Thu, 7 Mar 2024 16:27:01 +0000
Subject: [PATCH 04/32] Refactored faidx to enable more broad seq functions
 import

---
 pipeline/seq_retrieval/src/main.py                  | 4 ++--
 pipeline/seq_retrieval/src/{faidx.py => seq_fns.py} | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)
 rename pipeline/seq_retrieval/src/{faidx.py => seq_fns.py} (88%)

diff --git a/pipeline/seq_retrieval/src/main.py b/pipeline/seq_retrieval/src/main.py
index f40dd174..86b68e23 100644
--- a/pipeline/seq_retrieval/src/main.py
+++ b/pipeline/seq_retrieval/src/main.py
@@ -1,7 +1,7 @@
 import click
 import json
 
-import faidx
+import seq_fns
 
 def validate_strand(ctx, param, value):
     """Returns a normalised version of strings representing a strand.
@@ -65,7 +65,7 @@ def main(seq_id, seq_strand, seq_regions, fasta_file_path):
     #Retrieve sequence for each region
     click.echo(f"\nRegion seqs:")
     for region in seq_regions:
-        seq = faidx.get_seq(seq_id=seq_id, seq_start=region['start'], seq_end=region['end'], seq_strand=seq_strand,
+        seq = seq_fns.get_seq(seq_id=seq_id, seq_start=region['start'], seq_end=region['end'], seq_strand=seq_strand,
                             fasta_file_path=fasta_file_path)
         click.echo(seq)
         region['seq'] = seq
diff --git a/pipeline/seq_retrieval/src/faidx.py b/pipeline/seq_retrieval/src/seq_fns.py
similarity index 88%
rename from pipeline/seq_retrieval/src/faidx.py
rename to pipeline/seq_retrieval/src/seq_fns.py
index f44e4aca..a3b3d130 100644
--- a/pipeline/seq_retrieval/src/faidx.py
+++ b/pipeline/seq_retrieval/src/seq_fns.py
@@ -1,5 +1,5 @@
 """
-Module containing all functions used to handle and access faidx files.
+Module containing all functions used to handle and access sequences and sequence files.
 """
 import pysam
 

From 048b9f14679dff36f3d47d609c4bde70eff1e8d0 Mon Sep 17 00:00:00 2001
From: Manuel Luypaert <luypaert@ebi.ac.uk>
Date: Thu, 7 Mar 2024 19:19:09 +0000
Subject: [PATCH 05/32] Added support for correct negative strand sequence
 retrieval (and chaining)

---
 pipeline/seq_retrieval/requirements.txt |  1 +
 pipeline/seq_retrieval/src/main.py      | 13 ++++++++++--
 pipeline/seq_retrieval/src/seq_fns.py   | 27 +++++++++++++++++++++++--
 3 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/pipeline/seq_retrieval/requirements.txt b/pipeline/seq_retrieval/requirements.txt
index 9dc491f0..50fce435 100644
--- a/pipeline/seq_retrieval/requirements.txt
+++ b/pipeline/seq_retrieval/requirements.txt
@@ -1,2 +1,3 @@
 click==8.1.*
+biopython==1.83
 pysam==0.22.*
diff --git a/pipeline/seq_retrieval/src/main.py b/pipeline/seq_retrieval/src/main.py
index 86b68e23..61e9ca62 100644
--- a/pipeline/seq_retrieval/src/main.py
+++ b/pipeline/seq_retrieval/src/main.py
@@ -60,18 +60,27 @@ def main(seq_id, seq_strand, seq_regions, fasta_file_path):
     """Main method for sequence retrieval from JBrowse faidx indexed fasta files.
     Returns a single (transcript) sequence made by concatenating all sequence regions requested
     (in specified order)."""
+
     click.echo(f"Received request to retrieve sequences for {seq_id}, strand {seq_strand}, seq_regions {seq_regions}!")
 
-    #Retrieve sequence for each region
     click.echo(f"\nRegion seqs:")
     for region in seq_regions:
+        #If seq_strand is -, ensure seq_start < seq_end (swap as required)
+        if seq_strand == '-' and region['end'] < region['start']:
+            seq_start = region['end']
+            seq_end = region['start']
+
+            region['start'] = seq_start
+            region['end'] = seq_end
+
+        #Retrieve sequence for region
         seq = seq_fns.get_seq(seq_id=seq_id, seq_start=region['start'], seq_end=region['end'], seq_strand=seq_strand,
                             fasta_file_path=fasta_file_path)
         click.echo(seq)
         region['seq'] = seq
 
     #Concatenate all regions into single sequence
-    seq_concat = ''.join(map(lambda region:region['seq'], seq_regions))
+    seq_concat = seq_fns.chain_seq_region_seqs(seq_regions, seq_strand)
     click.echo(f"\nSeq concat: {seq_concat}")
 
 if __name__ == '__main__':
diff --git a/pipeline/seq_retrieval/src/seq_fns.py b/pipeline/seq_retrieval/src/seq_fns.py
index a3b3d130..cd9a5aac 100644
--- a/pipeline/seq_retrieval/src/seq_fns.py
+++ b/pipeline/seq_retrieval/src/seq_fns.py
@@ -1,6 +1,7 @@
 """
 Module containing all functions used to handle and access sequences and sequence files.
 """
+from Bio import Seq #Bio.Seq biopython submodule
 import pysam
 
 def get_seq(fasta_file_path, seq_id, seq_start, seq_end, seq_strand):
@@ -14,6 +15,28 @@ def get_seq(fasta_file_path, seq_id, seq_start, seq_end, seq_strand):
         raise IOError(f"Error while reading fasta file or index matching path {fasta_file_path}.")
     else:
         seq = fasta_file.fetch(reference=seq_id, start=seq_start-1, end=seq_end)
-
         fasta_file.close()
-        return seq
+
+        if seq_strand == '-':
+            seq = Seq.reverse_complement(seq)
+
+    return seq
+
+def chain_seq_region_seqs(seq_regions: list, seq_strand: str):
+    """
+    Chain multiple sequence region seq's together into one continuous sequence.
+    Regions are chained together in an order based on the 'start' position of each:
+     * Ascending order when positive strand
+     * Descending order when negative strand
+    """
+
+    sort_args = dict(key=lambda region: region['start'])
+
+    if seq_strand == '-':
+        sort_args['reverse'] = True
+
+    sorted_regions = seq_regions
+    seq_regions.sort(**sort_args)
+    chained_seq = ''.join(map(lambda region: region['seq'], sorted_regions))
+
+    return chained_seq

From 26fc87817fcef3321141a1854c8c2270e57b35f2 Mon Sep 17 00:00:00 2001
From: Manuel Luypaert <luypaert@ebi.ac.uk>
Date: Thu, 7 Mar 2024 19:20:24 +0000
Subject: [PATCH 06/32] Added additional python pysam and biopython package
 installation dependencies

---
 pipeline/seq_retrieval/Dockerfile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pipeline/seq_retrieval/Dockerfile b/pipeline/seq_retrieval/Dockerfile
index 949423a4..5043e735 100644
--- a/pipeline/seq_retrieval/Dockerfile
+++ b/pipeline/seq_retrieval/Dockerfile
@@ -2,6 +2,8 @@ FROM python:3.12-alpine
 
 WORKDIR /usr/src/app
 
+RUN apk add --no-cache build-base zlib-dev bzip2-dev xz-dev
+
 COPY requirements.txt ./
 RUN pip install --no-cache-dir -r requirements.txt
 

From 3965c482e69dc05ed923356161a0dacfb53195f5 Mon Sep 17 00:00:00 2001
From: Manuel Luypaert <luypaert@ebi.ac.uk>
Date: Fri, 8 Mar 2024 14:23:11 +0000
Subject: [PATCH 07/32] Updated fasta_file option to support remote URLs

---
 pipeline/seq_retrieval/requirements.txt       |  1 +
 .../src/data_mover/data_file_mover.py         | 75 +++++++++++++++++++
 pipeline/seq_retrieval/src/main.py            | 27 +++++--
 3 files changed, 97 insertions(+), 6 deletions(-)
 create mode 100644 pipeline/seq_retrieval/src/data_mover/data_file_mover.py

diff --git a/pipeline/seq_retrieval/requirements.txt b/pipeline/seq_retrieval/requirements.txt
index 50fce435..30887157 100644
--- a/pipeline/seq_retrieval/requirements.txt
+++ b/pipeline/seq_retrieval/requirements.txt
@@ -1,3 +1,4 @@
 click==8.1.*
 biopython==1.83
 pysam==0.22.*
+requests==2.31.*
diff --git a/pipeline/seq_retrieval/src/data_mover/data_file_mover.py b/pipeline/seq_retrieval/src/data_mover/data_file_mover.py
new file mode 100644
index 00000000..36bf885a
--- /dev/null
+++ b/pipeline/seq_retrieval/src/data_mover/data_file_mover.py
@@ -0,0 +1,75 @@
+"""
+Moving files to and from remote locations
+"""
+import os.path
+from pathlib import Path
+import requests
+from urllib.parse import urlparse, unquote
+
+_stored_files = dict()
+_DEFAULT_DIR = '/tmp/pavi/'
+
+def is_accessible_url(url: str):
+    """
+    Returns True when provided `url` is an accessible URL
+    """
+    response = requests.head(url)
+    if response.ok:
+        return True
+    else:
+        return False
+
+def fetch_file(url: str, dest_dir: str = _DEFAULT_DIR):
+    """
+    Fetch file from URL, return its local path.
+    """
+    local_path = None
+    if url not in _stored_files.keys():
+        url_components = urlparse(url)
+        if url_components.scheme == 'file':
+            filepath = url_components.netloc + url_components.path
+            local_path = find_local_file(filepath)
+        else:
+            local_path = download_from_url(url, dest_dir)
+        _stored_files[url] = local_path
+    else:
+        local_path = _stored_files[url]
+
+    return local_path
+
+def find_local_file(path: str):
+    """
+    Find a file locally based on path and return its absolute path.
+    If no file was found at given path, throws Exception.
+    """
+    if not os.path.exists(path):
+        raise FileNotFoundError(f"No file found at path '{path}'.")
+    else:
+        if not os.path.isfile(path):
+            raise FileNotFoundError(f"Specified path '{path}' exists but is not a file.")
+        else:
+            return Path(path).resolve()
+
+def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 * 1024):
+    url_components = urlparse(url)
+    if url_components.scheme in ['http', 'https']:
+
+        if not is_accessible_url(url):
+            raise ValueError(f"URL {url} is not accessible.")
+
+        Path(dest_dir).mkdir(parents=True, exist_ok=True)
+
+        filename = unquote(os.path.basename(url_components.path))
+        local_file_path = os.path.join(dest_dir, filename)
+
+        #Download file through streaming to support large files
+        response = requests.get(url, stream=True)
+
+        with open(local_file_path, mode="wb") as local_file:
+            for chunk in response.iter_content(chunk_size=chunk_size):
+                local_file.write(chunk)
+
+        return Path(local_file_path).resolve()
+    else:
+        #Currently not supported
+        raise ValueError(f"URL with scheme '{url_components.scheme}' is currently not supported.")
diff --git a/pipeline/seq_retrieval/src/main.py b/pipeline/seq_retrieval/src/main.py
index 61e9ca62..211d24d2 100644
--- a/pipeline/seq_retrieval/src/main.py
+++ b/pipeline/seq_retrieval/src/main.py
@@ -1,6 +1,8 @@
 import click
 import json
+from os import path
 
+import data_mover.data_file_mover as data_file_mover
 import seq_fns
 
 def validate_strand(ctx, param, value):
@@ -52,17 +54,30 @@ def process_seq_regions(ctx, param, value):
               help="The sequence strand to retrieve sequences for.")
 @click.option("--seq_regions", type=click.UNPROCESSED, required=True, callback=process_seq_regions,
               help="A list of sequence regions to retrieve sequences for.")
-@click.option("--fasta_file_path", type=click.STRING, required=True,
-              help="""Path to (local) faidx-indexed fasta file.\
-                   Assumes the fasta file is compressed,
-                   and an index file is found at `<fasta_file_path>.gzi`.""")
-def main(seq_id, seq_strand, seq_regions, fasta_file_path):
+@click.option("--fasta_file_url", type=click.STRING, required=True,
+              help="""URL to (faidx-indexed) fasta file to retrieve sequences from.\
+                   Assumes additional index files can be found at `<fasta_file_url>.fai`,
+                   and at `<fasta_file_url>.gzi` if the fastafile is compressed.
+                   Use "file://*" URL for local file or "http(s)://*" for remote files.""")
+def main(seq_id, seq_strand, seq_regions, fasta_file_url: str):
     """Main method for sequence retrieval from JBrowse faidx indexed fasta files.
     Returns a single (transcript) sequence made by concatenating all sequence regions requested
     (in specified order)."""
 
     click.echo(f"Received request to retrieve sequences for {seq_id}, strand {seq_strand}, seq_regions {seq_regions}!")
 
+    # Fetch the file(s)
+    local_fasta_file_path = data_file_mover.fetch_file(fasta_file_url)
+
+    # Fetch additional faidx index files in addition to fasta file itself
+    # (to the same location)
+    index_files = [fasta_file_url+'.fai']
+    if fasta_file_url.endswith('.gz'):
+        index_files.append(fasta_file_url+'.gzi')
+
+    for index_file in index_files:
+        data_file_mover.fetch_file(index_file)
+
     click.echo(f"\nRegion seqs:")
     for region in seq_regions:
         #If seq_strand is -, ensure seq_start < seq_end (swap as required)
@@ -75,7 +90,7 @@ def main(seq_id, seq_strand, seq_regions, fasta_file_path):
 
         #Retrieve sequence for region
         seq = seq_fns.get_seq(seq_id=seq_id, seq_start=region['start'], seq_end=region['end'], seq_strand=seq_strand,
-                            fasta_file_path=fasta_file_path)
+                            fasta_file_path=local_fasta_file_path)
         click.echo(seq)
         region['seq'] = seq
 

From 0d2400b248d1f01bcb4f7ae7c49fc2637ff6c1cd Mon Sep 17 00:00:00 2001
From: Manuel Luypaert <luypaert@ebi.ac.uk>
Date: Fri, 8 Mar 2024 14:25:27 +0000
Subject: [PATCH 08/32] Added Make targets for easy python dependency
 installation

---
 pipeline/seq_retrieval/Makefile | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/pipeline/seq_retrieval/Makefile b/pipeline/seq_retrieval/Makefile
index d8a0daec..63937f80 100644
--- a/pipeline/seq_retrieval/Makefile
+++ b/pipeline/seq_retrieval/Makefile
@@ -1,2 +1,19 @@
+.PHONY: check-venv-active
+
 container-image:
 	docker build -t agr_pavi/seq_retrieval .
+python-dependencies:
+	pip install -r requirements.txt
+python-dependencies-update:
+	pip install -U -r requirements.txt
+check-venv-active:
+ifeq ($(VIRTUAL_ENV),)
+	@echo 'No active python virtual environment found.'\
+		  'Please active the virtual environment first by running `source venv/bin/activate`,'\
+		  'or read README.md for instructions on how to set up a new one.'
+	@exit 1
+else
+	@:
+endif
+python-dependencies-dev: check-venv-active python-dependencies
+python-dependencies-dev-update: check-venv-active python-dependencies-update

From a759e4d5f767c70b5ce53c8d7cde88a4417ffe55 Mon Sep 17 00:00:00 2001
From: Manuel Luypaert <luypaert@ebi.ac.uk>
Date: Fri, 8 Mar 2024 14:25:54 +0000
Subject: [PATCH 09/32] Sorted python requirements alphabetically

---
 pipeline/seq_retrieval/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipeline/seq_retrieval/requirements.txt b/pipeline/seq_retrieval/requirements.txt
index 30887157..ae3751d8 100644
--- a/pipeline/seq_retrieval/requirements.txt
+++ b/pipeline/seq_retrieval/requirements.txt
@@ -1,4 +1,4 @@
-click==8.1.*
 biopython==1.83
+click==8.1.*
 pysam==0.22.*
 requests==2.31.*

From 47d62a9cc6d001ad731293663218bd171cb165ae Mon Sep 17 00:00:00 2001
From: Manuel Luypaert <luypaert@ebi.ac.uk>
Date: Fri, 8 Mar 2024 14:31:13 +0000
Subject: [PATCH 10/32] Minor docstring update

---
 pipeline/seq_retrieval/src/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipeline/seq_retrieval/src/main.py b/pipeline/seq_retrieval/src/main.py
index 211d24d2..69f047b8 100644
--- a/pipeline/seq_retrieval/src/main.py
+++ b/pipeline/seq_retrieval/src/main.py
@@ -62,7 +62,7 @@ def process_seq_regions(ctx, param, value):
 def main(seq_id, seq_strand, seq_regions, fasta_file_url: str):
     """Main method for sequence retrieval from JBrowse faidx indexed fasta files.
     Returns a single (transcript) sequence made by concatenating all sequence regions requested
-    (in specified order)."""
+    (in positional order defined by specified seq_strand)."""
 
     click.echo(f"Received request to retrieve sequences for {seq_id}, strand {seq_strand}, seq_regions {seq_regions}!")
 

From 50ced5c34ccd110c2e582683d87a23805573c9a3 Mon Sep 17 00:00:00 2001
From: Manuel Luypaert <luypaert@ebi.ac.uk>
Date: Fri, 8 Mar 2024 15:32:18 +0000
Subject: [PATCH 11/32] Added option to reuse local cache

---
 .../seq_retrieval/src/data_mover/data_file_mover.py | 13 ++++++++++---
 pipeline/seq_retrieval/src/main.py                  |  9 ++++++---
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/pipeline/seq_retrieval/src/data_mover/data_file_mover.py b/pipeline/seq_retrieval/src/data_mover/data_file_mover.py
index 36bf885a..8568bf21 100644
--- a/pipeline/seq_retrieval/src/data_mover/data_file_mover.py
+++ b/pipeline/seq_retrieval/src/data_mover/data_file_mover.py
@@ -19,7 +19,7 @@ def is_accessible_url(url: str):
     else:
         return False
 
-def fetch_file(url: str, dest_dir: str = _DEFAULT_DIR):
+def fetch_file(url: str, dest_dir: str = _DEFAULT_DIR, reuse_local_cache: bool = False):
     """
     Fetch file from URL, return its local path.
     """
@@ -30,7 +30,7 @@ def fetch_file(url: str, dest_dir: str = _DEFAULT_DIR):
             filepath = url_components.netloc + url_components.path
             local_path = find_local_file(filepath)
         else:
-            local_path = download_from_url(url, dest_dir)
+            local_path = download_from_url(url, dest_dir, reuse_local_cache=reuse_local_cache)
         _stored_files[url] = local_path
     else:
         local_path = _stored_files[url]
@@ -50,7 +50,7 @@ def find_local_file(path: str):
         else:
             return Path(path).resolve()
 
-def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 * 1024):
+def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 * 1024, reuse_local_cache: bool = False):
     url_components = urlparse(url)
     if url_components.scheme in ['http', 'https']:
 
@@ -62,6 +62,13 @@ def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 *
         filename = unquote(os.path.basename(url_components.path))
         local_file_path = os.path.join(dest_dir, filename)
 
+        if os.path.exists(local_file_path) and os.path.isfile(local_file_path):
+            if reuse_local_cache == True:
+                #Return the local file path without downloading new content
+                return Path(local_file_path).resolve()
+            else:
+                os.remove(local_file_path)
+
         #Download file through streaming to support large files
         response = requests.get(url, stream=True)
 
diff --git a/pipeline/seq_retrieval/src/main.py b/pipeline/seq_retrieval/src/main.py
index 69f047b8..8d8baa5b 100644
--- a/pipeline/seq_retrieval/src/main.py
+++ b/pipeline/seq_retrieval/src/main.py
@@ -59,7 +59,10 @@ def process_seq_regions(ctx, param, value):
                    Assumes additional index files can be found at `<fasta_file_url>.fai`,
                    and at `<fasta_file_url>.gzi` if the fastafile is compressed.
                    Use "file://*" URL for local file or "http(s)://*" for remote files.""")
-def main(seq_id, seq_strand, seq_regions, fasta_file_url: str):
+@click.option("--reuse_local_cache", is_flag=True,
+              help="""When defined and using remote `fasta_file_url`, reused local files
+              if file already exists at destination path, rather than re-downloading and overwritting.""")
+def main(seq_id, seq_strand, seq_regions, fasta_file_url: str, reuse_local_cache: bool):
     """Main method for sequence retrieval from JBrowse faidx indexed fasta files.
     Returns a single (transcript) sequence made by concatenating all sequence regions requested
     (in positional order defined by specified seq_strand)."""
@@ -67,7 +70,7 @@ def main(seq_id, seq_strand, seq_regions, fasta_file_url: str):
     click.echo(f"Received request to retrieve sequences for {seq_id}, strand {seq_strand}, seq_regions {seq_regions}!")
 
     # Fetch the file(s)
-    local_fasta_file_path = data_file_mover.fetch_file(fasta_file_url)
+    local_fasta_file_path = data_file_mover.fetch_file(fasta_file_url, reuse_local_cache=reuse_local_cache)
 
     # Fetch additional faidx index files in addition to fasta file itself
     # (to the same location)
@@ -76,7 +79,7 @@ def main(seq_id, seq_strand, seq_regions, fasta_file_url: str):
         index_files.append(fasta_file_url+'.gzi')
 
     for index_file in index_files:
-        data_file_mover.fetch_file(index_file)
+        data_file_mover.fetch_file(index_file, reuse_local_cache=reuse_local_cache)
 
     click.echo(f"\nRegion seqs:")
     for region in seq_regions:

From 44dbe05d88e6e6b326ef7f11f9be1bff47506699 Mon Sep 17 00:00:00 2001
From: Manuel Luypaert <luypaert@ebi.ac.uk>
Date: Fri, 8 Mar 2024 15:54:12 +0000
Subject: [PATCH 12/32] Prevent reads of partial streamed file after interrupts

---
 pipeline/seq_retrieval/src/data_mover/data_file_mover.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pipeline/seq_retrieval/src/data_mover/data_file_mover.py b/pipeline/seq_retrieval/src/data_mover/data_file_mover.py
index 8568bf21..fda72d68 100644
--- a/pipeline/seq_retrieval/src/data_mover/data_file_mover.py
+++ b/pipeline/seq_retrieval/src/data_mover/data_file_mover.py
@@ -70,12 +70,15 @@ def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 *
                 os.remove(local_file_path)
 
         #Download file through streaming to support large files
+        tmp_file_path = f"{local_file_path}.part"
         response = requests.get(url, stream=True)
 
-        with open(local_file_path, mode="wb") as local_file:
+        with open(tmp_file_path, mode="wb") as local_file:
             for chunk in response.iter_content(chunk_size=chunk_size):
                 local_file.write(chunk)
 
+        os.rename(tmp_file_path, local_file_path)
+
         return Path(local_file_path).resolve()
     else:
         #Currently not supported

From 3228d457c1aaaa39e59dc62a9f1e33e3928782ce Mon Sep 17 00:00:00 2001
From: Manuel Luypaert <luypaert@ebi.ac.uk>
Date: Mon, 11 Mar 2024 19:37:30 +0000
Subject: [PATCH 13/32] Refactor to improve code readability

---
 .../src/data_mover/data_file_mover.py         | 16 +++-
 pipeline/seq_retrieval/src/main.py            | 33 ++-----
 pipeline/seq_retrieval/src/seq_fns.py         | 42 --------
 .../seq_retrieval/src/seq_region/__init__.py  |  2 +
 .../src/seq_region/seq_region.py              | 95 +++++++++++++++++++
 5 files changed, 119 insertions(+), 69 deletions(-)
 delete mode 100644 pipeline/seq_retrieval/src/seq_fns.py
 create mode 100644 pipeline/seq_retrieval/src/seq_region/__init__.py
 create mode 100644 pipeline/seq_retrieval/src/seq_region/seq_region.py

diff --git a/pipeline/seq_retrieval/src/data_mover/data_file_mover.py b/pipeline/seq_retrieval/src/data_mover/data_file_mover.py
index fda72d68..f8463814 100644
--- a/pipeline/seq_retrieval/src/data_mover/data_file_mover.py
+++ b/pipeline/seq_retrieval/src/data_mover/data_file_mover.py
@@ -8,6 +8,14 @@
 
 _stored_files = dict()
 _DEFAULT_DIR = '/tmp/pavi/'
+_reuse_local_cache = False
+
+def set_local_cache_reuse(reuse: bool):
+    """
+    Set _reuse_local_cache (True or False, default False)
+    """
+    global _reuse_local_cache
+    _reuse_local_cache = reuse
 
 def is_accessible_url(url: str):
     """
@@ -19,10 +27,11 @@ def is_accessible_url(url: str):
     else:
         return False
 
-def fetch_file(url: str, dest_dir: str = _DEFAULT_DIR, reuse_local_cache: bool = False):
+def fetch_file(url: str, dest_dir: str = _DEFAULT_DIR, reuse_local_cache: bool = None):
     """
     Fetch file from URL, return its local path.
     """
+    global _stored_files
     local_path = None
     if url not in _stored_files.keys():
         url_components = urlparse(url)
@@ -50,7 +59,10 @@ def find_local_file(path: str):
         else:
             return Path(path).resolve()
 
-def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 * 1024, reuse_local_cache: bool = False):
+def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 * 1024, reuse_local_cache: bool = None):
+    if reuse_local_cache == None:
+        reuse_local_cache = _reuse_local_cache
+
     url_components = urlparse(url)
     if url_components.scheme in ['http', 'https']:
 
diff --git a/pipeline/seq_retrieval/src/main.py b/pipeline/seq_retrieval/src/main.py
index 8d8baa5b..e59fa976 100644
--- a/pipeline/seq_retrieval/src/main.py
+++ b/pipeline/seq_retrieval/src/main.py
@@ -3,7 +3,7 @@
 from os import path
 
 import data_mover.data_file_mover as data_file_mover
-import seq_fns
+from seq_region import SeqRegion, chain_seq_region_seqs
 
 def validate_strand(ctx, param, value):
     """Returns a normalised version of strings representing a strand.
@@ -69,36 +69,19 @@ def main(seq_id, seq_strand, seq_regions, fasta_file_url: str, reuse_local_cache
 
     click.echo(f"Received request to retrieve sequences for {seq_id}, strand {seq_strand}, seq_regions {seq_regions}!")
 
-    # Fetch the file(s)
-    local_fasta_file_path = data_file_mover.fetch_file(fasta_file_url, reuse_local_cache=reuse_local_cache)
+    data_file_mover.set_local_cache_reuse(reuse_local_cache)
 
-    # Fetch additional faidx index files in addition to fasta file itself
-    # (to the same location)
-    index_files = [fasta_file_url+'.fai']
-    if fasta_file_url.endswith('.gz'):
-        index_files.append(fasta_file_url+'.gzi')
-
-    for index_file in index_files:
-        data_file_mover.fetch_file(index_file, reuse_local_cache=reuse_local_cache)
-
-    click.echo(f"\nRegion seqs:")
+    seq_region_objs = []
     for region in seq_regions:
-        #If seq_strand is -, ensure seq_start < seq_end (swap as required)
-        if seq_strand == '-' and region['end'] < region['start']:
-            seq_start = region['end']
-            seq_end = region['start']
-
-            region['start'] = seq_start
-            region['end'] = seq_end
+        seq_region_objs.append(SeqRegion(seq_id=seq_id, start=region['start'], end=region['end'], strand=seq_strand,
+                                          fasta_file_url=fasta_file_url))
 
+    for seq_region in seq_region_objs:
         #Retrieve sequence for region
-        seq = seq_fns.get_seq(seq_id=seq_id, seq_start=region['start'], seq_end=region['end'], seq_strand=seq_strand,
-                            fasta_file_path=local_fasta_file_path)
-        click.echo(seq)
-        region['seq'] = seq
+        seq_region.fetch_seq()
 
     #Concatenate all regions into single sequence
-    seq_concat = seq_fns.chain_seq_region_seqs(seq_regions, seq_strand)
+    seq_concat = chain_seq_region_seqs(seq_region_objs, seq_strand)
     click.echo(f"\nSeq concat: {seq_concat}")
 
 if __name__ == '__main__':
diff --git a/pipeline/seq_retrieval/src/seq_fns.py b/pipeline/seq_retrieval/src/seq_fns.py
deleted file mode 100644
index cd9a5aac..00000000
--- a/pipeline/seq_retrieval/src/seq_fns.py
+++ /dev/null
@@ -1,42 +0,0 @@
-"""
-Module containing all functions used to handle and access sequences and sequence files.
-"""
-from Bio import Seq #Bio.Seq biopython submodule
-import pysam
-
-def get_seq(fasta_file_path, seq_id, seq_start, seq_end, seq_strand):
-    """Return sequence found at `seq_id`:`seq_start`-`seq_end`:`seq_strand`
-    by reading from faidx files at `file_path`. Use 1-based, fully inclusive coordinates."""
-    try:
-        fasta_file = pysam.FastaFile(fasta_file_path)
-    except ValueError:
-        raise FileNotFoundError(f"Missing index file matching path {fasta_file_path}.")
-    except IOError:
-        raise IOError(f"Error while reading fasta file or index matching path {fasta_file_path}.")
-    else:
-        seq = fasta_file.fetch(reference=seq_id, start=seq_start-1, end=seq_end)
-        fasta_file.close()
-
-        if seq_strand == '-':
-            seq = Seq.reverse_complement(seq)
-
-    return seq
-
-def chain_seq_region_seqs(seq_regions: list, seq_strand: str):
-    """
-    Chain multiple sequence region seq's together into one continuous sequence.
-    Regions are chained together in an order based on the 'start' position of each:
-     * Ascending order when positive strand
-     * Descending order when negative strand
-    """
-
-    sort_args = dict(key=lambda region: region['start'])
-
-    if seq_strand == '-':
-        sort_args['reverse'] = True
-
-    sorted_regions = seq_regions
-    seq_regions.sort(**sort_args)
-    chained_seq = ''.join(map(lambda region: region['seq'], sorted_regions))
-
-    return chained_seq
diff --git a/pipeline/seq_retrieval/src/seq_region/__init__.py b/pipeline/seq_retrieval/src/seq_region/__init__.py
new file mode 100644
index 00000000..75eecf7f
--- /dev/null
+++ b/pipeline/seq_retrieval/src/seq_region/__init__.py
@@ -0,0 +1,2 @@
+from .seq_region import SeqRegion
+from .seq_region import chain_seq_region_seqs
\ No newline at end of file
diff --git a/pipeline/seq_retrieval/src/seq_region/seq_region.py b/pipeline/seq_retrieval/src/seq_region/seq_region.py
new file mode 100644
index 00000000..4486fd9d
--- /dev/null
+++ b/pipeline/seq_retrieval/src/seq_region/seq_region.py
@@ -0,0 +1,95 @@
+"""
+Module containing the SeqRegion class and all methods to handle them.
+SeqRegion class is used to define and retrieve sequence regions.
+"""
+from Bio import Seq #Bio.Seq biopython submodule
+import pysam
+
+from data_mover import data_file_mover
+
+class SeqRegion():
+    seq_id = str
+    start = int
+    end = int
+    strand = str
+    fasta_file_path = str
+    sequence = str
+
+    def __init__(self, seq_id: str, start: int, end: int, strand: str, fasta_file_url: str, seq: str = None):
+        """
+        Uses 1-based, fully inclusive coordinates
+        """
+        self.seq_id = seq_id
+        self.strand = strand
+
+        #If strand is -, ensure start <= end (swap as required)
+        if strand == '-':
+            if end < start:
+                self.start = end
+                self.end = start
+            else:
+                self.start = start
+                self.end = end
+        #If strand is +, throw error when end < start (likely user error)
+        else:
+            if end < start:
+                raise ValueError(f"Unexpected position order: end {end} < start {start}.")
+            else:
+                self.start = start
+                self.end = end
+
+        # Fetch the file(s)
+        local_fasta_file_path = data_file_mover.fetch_file(fasta_file_url)
+
+        # Fetch additional faidx index files in addition to fasta file itself
+        # (to the same location)
+        index_files = [fasta_file_url+'.fai']
+        if fasta_file_url.endswith('.gz'):
+            index_files.append(fasta_file_url+'.gzi')
+
+        for index_file in index_files:
+            data_file_mover.fetch_file(index_file)
+
+        self.fasta_file_path = local_fasta_file_path
+
+        if seq != None:
+            self.sequence = seq
+
+
+    def fetch_seq(self):
+        """Fetch sequence found at `seq_id`:`start`-`end`:`strand`
+        by reading from faidx files at `fasta_file_path`.
+        Uses 1-based, fully inclusive coordinates."""
+        try:
+            fasta_file = pysam.FastaFile(self.fasta_file_path)
+        except ValueError:
+            raise FileNotFoundError(f"Missing index file matching path {self.fasta_file_path}.")
+        except IOError:
+            raise IOError(f"Error while reading fasta file or index matching path {self.fasta_file_path}.")
+        else:
+            seq = fasta_file.fetch(reference=self.seq_id, start=self.start-1, end=self.end)
+            fasta_file.close()
+
+            if self.strand == '-':
+                seq = Seq.reverse_complement(seq)
+
+        self.sequence = seq
+
+def chain_seq_region_seqs(seq_regions: list, seq_strand: str):
+    """
+    Chain multiple SeqRegions' sequenes together into one continuous sequence.
+    SeqRegions are chained together in an order based on the 'start' position of each:
+     * Ascending order when positive strand
+     * Descending order when negative strand
+    """
+
+    sort_args = dict(key=lambda region: region.start)
+
+    if seq_strand == '-':
+        sort_args['reverse'] = True
+
+    sorted_regions = seq_regions
+    seq_regions.sort(**sort_args)
+    chained_seq = ''.join(map(lambda region : region.sequence, sorted_regions))
+
+    return chained_seq

From bde10e2bd7880abfced5c725c08a38fc7ab78daa Mon Sep 17 00:00:00 2001
From: Manuel Luypaert <luypaert@ebi.ac.uk>
Date: Mon, 11 Mar 2024 19:41:07 +0000
Subject: [PATCH 14/32] Minor refactor to improve code readability

---
 pipeline/seq_retrieval/src/main.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pipeline/seq_retrieval/src/main.py b/pipeline/seq_retrieval/src/main.py
index e59fa976..e7f867ac 100644
--- a/pipeline/seq_retrieval/src/main.py
+++ b/pipeline/seq_retrieval/src/main.py
@@ -5,7 +5,7 @@
 import data_mover.data_file_mover as data_file_mover
 from seq_region import SeqRegion, chain_seq_region_seqs
 
-def validate_strand(ctx, param, value):
+def validate_strand_param(ctx, param, value):
     """Returns a normalised version of strings representing a strand.
     Negative strand is normalised to '-', positive strand to '+'.
     Throws a click.BadParameter exception if an unrecognised string was provided."""
@@ -18,7 +18,7 @@ def validate_strand(ctx, param, value):
     else:
         raise click.BadParameter(f"Must be one of {POS_CHOICES} for positive strand, or {NEG_CHOICES} for negative strand.")
 
-def process_seq_regions(ctx, param, value):
+def process_seq_regions_param(ctx, param, value):
     """Parse the seq_regions parameter value and validate it's structure.
     Value is expected to be a JSON-formatted list of sequence regions to retrieve.
     Each region should have:
@@ -50,9 +50,9 @@ def process_seq_regions(ctx, param, value):
 @click.command()
 @click.option("--seq_id", type=click.STRING, required=True,
               help="The sequence ID to retrieve sequences for.")
-@click.option("--seq_strand", type=click.STRING, default='+', callback=validate_strand,
+@click.option("--seq_strand", type=click.STRING, default='+', callback=validate_strand_param,
               help="The sequence strand to retrieve sequences for.")
-@click.option("--seq_regions", type=click.UNPROCESSED, required=True, callback=process_seq_regions,
+@click.option("--seq_regions", type=click.UNPROCESSED, required=True, callback=process_seq_regions_param,
               help="A list of sequence regions to retrieve sequences for.")
 @click.option("--fasta_file_url", type=click.STRING, required=True,
               help="""URL to (faidx-indexed) fasta file to retrieve sequences from.\

From 78d4c639396a3e811492bbb01503def8430bf4a2 Mon Sep 17 00:00:00 2001
From: Manuel Luypaert <luypaert@ebi.ac.uk>
Date: Tue, 12 Mar 2024 09:41:35 +0000
Subject: [PATCH 15/32] Unused import cleanup

---
 pipeline/seq_retrieval/src/main.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pipeline/seq_retrieval/src/main.py b/pipeline/seq_retrieval/src/main.py
index e7f867ac..00dcf118 100644
--- a/pipeline/seq_retrieval/src/main.py
+++ b/pipeline/seq_retrieval/src/main.py
@@ -1,6 +1,5 @@
 import click
 import json
-from os import path
 
 import data_mover.data_file_mover as data_file_mover
 from seq_region import SeqRegion, chain_seq_region_seqs

From fee42df8d13e4f004a34baf97a99c4d3d549057e Mon Sep 17 00:00:00 2001
From: Manuel Luypaert <luypaert@ebi.ac.uk>
Date: Tue, 12 Mar 2024 13:43:44 +0000
Subject: [PATCH 16/32] Added framework for PR validation through GH actions

---
 .github/workflows/PR-validation.yml | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)
 create mode 100644 .github/workflows/PR-validation.yml

diff --git a/.github/workflows/PR-validation.yml b/.github/workflows/PR-validation.yml
new file mode 100644
index 00000000..9d1f59fb
--- /dev/null
+++ b/.github/workflows/PR-validation.yml
@@ -0,0 +1,27 @@
+name: PR validation
+on:
+  pull_request:
+    types: [synchronize, opened, reopened, edited]
+    branches:
+      - main
+jobs:
+  pipeline-seq-retrieval-container-image-build:
+    name: pipeline/seq_retrieval container-image build
+    runs-on: ubuntu-20.04
+    defaults:
+      run:
+        shell: bash
+        working-directory: ./pipeline/seq_retrieval/
+    steps:
+      - name: Check out repository code
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+          sparse-checkout: |
+            pipeline/seq_retrieval/
+      - name: Build container image
+        run: |
+          make container-image
+  #TODO: add typing testing (mypy)
+  #TODO: add unit testing
+  #TODO: add integration testing

From b6286581858cb055393e58a333c16fa637faccb3 Mon Sep 17 00:00:00 2001
From: Manuel Luypaert <luypaert@ebi.ac.uk>
Date: Tue, 12 Mar 2024 14:15:15 +0000
Subject: [PATCH 17/32] Added python type testing + included in PR validation

---
 .github/workflows/PR-validation.yml           | 21 ++++++++++++++++++-
 pipeline/seq_retrieval/Makefile               | 13 +++++++++++-
 .../test/type-testing-requirements.txt        |  1 +
 3 files changed, 33 insertions(+), 2 deletions(-)
 create mode 100644 pipeline/seq_retrieval/test/type-testing-requirements.txt

diff --git a/.github/workflows/PR-validation.yml b/.github/workflows/PR-validation.yml
index 9d1f59fb..ebc1f3b0 100644
--- a/.github/workflows/PR-validation.yml
+++ b/.github/workflows/PR-validation.yml
@@ -22,6 +22,25 @@ jobs:
       - name: Build container image
         run: |
           make container-image
-  #TODO: add typing testing (mypy)
+  pipeline-seq-retrieval-check-python-typing:
+    name: pipeline/seq_retrieval check python typing
+    runs-on: ubuntu-20.04
+    defaults:
+      run:
+        shell: bash
+        working-directory: ./pipeline/seq_retrieval/
+    steps:
+      - name: Check out repository code
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+          sparse-checkout: |
+            pipeline/seq_retrieval/
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+      - name: Check python typing
+        run: |
+          make check-python-typing
   #TODO: add unit testing
   #TODO: add integration testing
diff --git a/pipeline/seq_retrieval/Makefile b/pipeline/seq_retrieval/Makefile
index 63937f80..8a5fc4fc 100644
--- a/pipeline/seq_retrieval/Makefile
+++ b/pipeline/seq_retrieval/Makefile
@@ -1,11 +1,18 @@
-.PHONY: check-venv-active
+.PHONY: check-venv-active check-python-typing
 
 container-image:
 	docker build -t agr_pavi/seq_retrieval .
 python-dependencies:
 	pip install -r requirements.txt
+
 python-dependencies-update:
 	pip install -U -r requirements.txt
+
+check-python-typing:
+	pip install -r test/type-testing-requirements.txt
+	mypy --install-types --non-interactive
+	mypy src/main.py
+
 check-venv-active:
 ifeq ($(VIRTUAL_ENV),)
 	@echo 'No active python virtual environment found.'\
@@ -15,5 +22,9 @@ ifeq ($(VIRTUAL_ENV),)
 else
 	@:
 endif
+
 python-dependencies-dev: check-venv-active python-dependencies
+
 python-dependencies-dev-update: check-venv-active python-dependencies-update
+
+check-python-typing-dev: check-venv-active check-python-typing
diff --git a/pipeline/seq_retrieval/test/type-testing-requirements.txt b/pipeline/seq_retrieval/test/type-testing-requirements.txt
new file mode 100644
index 00000000..fcea0984
--- /dev/null
+++ b/pipeline/seq_retrieval/test/type-testing-requirements.txt
@@ -0,0 +1 @@
+mypy==1.9.*
\ No newline at end of file

From d02d0ab48bed2c2cbafb3d617bdc0748a3c71535 Mon Sep 17 00:00:00 2001
From: Manuel Luypaert <luypaert@ebi.ac.uk>
Date: Tue, 12 Mar 2024 14:21:52 +0000
Subject: [PATCH 18/32] Python type checking bugfix

---
 pipeline/seq_retrieval/Makefile | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pipeline/seq_retrieval/Makefile b/pipeline/seq_retrieval/Makefile
index 8a5fc4fc..03927c5b 100644
--- a/pipeline/seq_retrieval/Makefile
+++ b/pipeline/seq_retrieval/Makefile
@@ -10,8 +10,7 @@ python-dependencies-update:
 
 check-python-typing:
 	pip install -r test/type-testing-requirements.txt
-	mypy --install-types --non-interactive
-	mypy src/main.py
+	mypy --install-types --non-interactive src/main.py
 
 check-venv-active:
 ifeq ($(VIRTUAL_ENV),)

From 51fa3aacb5243d12b7a5e4ff3a828b58c96a7613 Mon Sep 17 00:00:00 2001
From: Manuel Luypaert <luypaert@ebi.ac.uk>
Date: Tue, 12 Mar 2024 14:26:29 +0000
Subject: [PATCH 19/32] Added main app dependencies as dependency for type
 testing

---
 pipeline/seq_retrieval/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipeline/seq_retrieval/Makefile b/pipeline/seq_retrieval/Makefile
index 03927c5b..d371eefd 100644
--- a/pipeline/seq_retrieval/Makefile
+++ b/pipeline/seq_retrieval/Makefile
@@ -8,7 +8,7 @@ python-dependencies:
 python-dependencies-update:
 	pip install -U -r requirements.txt
 
-check-python-typing:
+check-python-typing: python-dependencies
 	pip install -r test/type-testing-requirements.txt
 	mypy --install-types --non-interactive src/main.py
 

From 4fae5a7b36cbc4ec83061fc167e280be58c683a1 Mon Sep 17 00:00:00 2001
From: Manuel Luypaert <luypaert@ebi.ac.uk>
Date: Tue, 12 Mar 2024 15:06:48 +0000
Subject: [PATCH 20/32] Fixed typing errors reported by mypy

---
 .../src/data_mover/data_file_mover.py         |  7 +++---
 .../src/seq_region/seq_region.py              | 22 ++++++++++---------
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/pipeline/seq_retrieval/src/data_mover/data_file_mover.py b/pipeline/seq_retrieval/src/data_mover/data_file_mover.py
index f8463814..33bcd98c 100644
--- a/pipeline/seq_retrieval/src/data_mover/data_file_mover.py
+++ b/pipeline/seq_retrieval/src/data_mover/data_file_mover.py
@@ -3,10 +3,11 @@
 """
 import os.path
 from pathlib import Path
+import typing
 import requests
 from urllib.parse import urlparse, unquote
 
-_stored_files = dict()
+_stored_files: typing.Dict[str, str] = dict()
 _DEFAULT_DIR = '/tmp/pavi/'
 _reuse_local_cache = False
 
@@ -27,7 +28,7 @@ def is_accessible_url(url: str):
     else:
         return False
 
-def fetch_file(url: str, dest_dir: str = _DEFAULT_DIR, reuse_local_cache: bool = None):
+def fetch_file(url: str, dest_dir: str = _DEFAULT_DIR, reuse_local_cache: typing.Optional[bool] = None) -> str:
     """
     Fetch file from URL, return its local path.
     """
@@ -59,7 +60,7 @@ def find_local_file(path: str):
         else:
             return Path(path).resolve()
 
-def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 * 1024, reuse_local_cache: bool = None):
+def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 * 1024, reuse_local_cache: typing.Optional[bool] = None):
     if reuse_local_cache == None:
         reuse_local_cache = _reuse_local_cache
 
diff --git a/pipeline/seq_retrieval/src/seq_region/seq_region.py b/pipeline/seq_retrieval/src/seq_region/seq_region.py
index 4486fd9d..37fccb1d 100644
--- a/pipeline/seq_retrieval/src/seq_region/seq_region.py
+++ b/pipeline/seq_retrieval/src/seq_region/seq_region.py
@@ -2,20 +2,22 @@
 Module containing the SeqRegion class and all methods to handle them.
 SeqRegion class is used to define and retrieve sequence regions.
 """
+import typing
+
 from Bio import Seq #Bio.Seq biopython submodule
 import pysam
 
 from data_mover import data_file_mover
 
 class SeqRegion():
-    seq_id = str
-    start = int
-    end = int
-    strand = str
-    fasta_file_path = str
-    sequence = str
-
-    def __init__(self, seq_id: str, start: int, end: int, strand: str, fasta_file_url: str, seq: str = None):
+    seq_id: str
+    start: int
+    end: int
+    strand: str
+    fasta_file_path: str
+    sequence: typing.Optional[str]
+
+    def __init__(self, seq_id: str, start: int, end: int, strand: str, fasta_file_url: str, seq: typing.Optional[str] = None):
         """
         Uses 1-based, fully inclusive coordinates
         """
@@ -56,7 +58,7 @@ def __init__(self, seq_id: str, start: int, end: int, strand: str, fasta_file_ur
             self.sequence = seq
 
 
-    def fetch_seq(self):
+    def fetch_seq(self) -> None:
         """Fetch sequence found at `seq_id`:`start`-`end`:`strand`
         by reading from faidx files at `fasta_file_path`.
         Uses 1-based, fully inclusive coordinates."""
@@ -83,7 +85,7 @@ def chain_seq_region_seqs(seq_regions: list, seq_strand: str):
      * Descending order when negative strand
     """
 
-    sort_args = dict(key=lambda region: region.start)
+    sort_args: typing.Dict[str, typing.Any] = dict(key=lambda region: region.start, reverse=False)
 
     if seq_strand == '-':
         sort_args['reverse'] = True

From 4e25e6f9b1872d8435da742cc557b29672d16b92 Mon Sep 17 00:00:00 2001
From: Manuel Luypaert <luypaert@ebi.ac.uk>
Date: Tue, 12 Mar 2024 15:13:00 +0000
Subject: [PATCH 21/32] Updated GH actions runner to current Ubuntu LTS (22.04)

---
 .github/workflows/PR-validation.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/PR-validation.yml b/.github/workflows/PR-validation.yml
index ebc1f3b0..5f1406d8 100644
--- a/.github/workflows/PR-validation.yml
+++ b/.github/workflows/PR-validation.yml
@@ -7,7 +7,7 @@ on:
 jobs:
   pipeline-seq-retrieval-container-image-build:
     name: pipeline/seq_retrieval container-image build
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-22.04
     defaults:
       run:
         shell: bash
@@ -24,7 +24,7 @@ jobs:
           make container-image
   pipeline-seq-retrieval-check-python-typing:
     name: pipeline/seq_retrieval check python typing
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-22.04
     defaults:
       run:
         shell: bash

From b5e4a988c03c562ef360ddcb4e3e5f753411a613 Mon Sep 17 00:00:00 2001
From: Manuel Luypaert <luypaert@ebi.ac.uk>
Date: Tue, 12 Mar 2024 15:13:41 +0000
Subject: [PATCH 22/32] GH actions checkout version bump (to deprecate node.js
 v16 usage)

---
 .github/workflows/PR-validation.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/PR-validation.yml b/.github/workflows/PR-validation.yml
index 5f1406d8..47fa9ce3 100644
--- a/.github/workflows/PR-validation.yml
+++ b/.github/workflows/PR-validation.yml
@@ -14,7 +14,7 @@ jobs:
         working-directory: ./pipeline/seq_retrieval/
     steps:
       - name: Check out repository code
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           fetch-depth: 0
           sparse-checkout: |
@@ -31,7 +31,7 @@ jobs:
         working-directory: ./pipeline/seq_retrieval/
     steps:
       - name: Check out repository code
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           fetch-depth: 0
           sparse-checkout: |

From 2ac053730ab2c5f64d66e82de78d8f2ddcdfdf21 Mon Sep 17 00:00:00 2001
From: Manuel Luypaert <luypaert@ebi.ac.uk>
Date: Tue, 12 Mar 2024 15:45:03 +0000
Subject: [PATCH 23/32] Added vscode configs to gitignore

---
 .gitignore | 64 +++++++++++++++++++++++++++++-------------------------
 1 file changed, 34 insertions(+), 30 deletions(-)

diff --git a/.gitignore b/.gitignore
index 68bc17f9..af90381a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,12 +1,13 @@
-# Byte-compiled / optimized / DLL files
+# Python files
+## Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
 *$py.class
 
-# C extensions
+## C extensions
 *.so
 
-# Distribution / packaging
+## Distribution / packaging
 .Python
 build/
 develop-eggs/
@@ -26,17 +27,17 @@ share/python-wheels/
 *.egg
 MANIFEST
 
-# PyInstaller
+## PyInstaller
 #  Usually these files are written by a python script from a template
 #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 *.manifest
 *.spec
 
-# Installer logs
+## package installer logs
 pip-log.txt
 pip-delete-this-directory.txt
 
-# Unit test / coverage reports
+## Unit test / coverage reports
 htmlcov/
 .tox/
 .nox/
@@ -51,57 +52,57 @@ coverage.xml
 .pytest_cache/
 cover/
 
-# Translations
+## Translations
 *.mo
 *.pot
 
-# Django stuff:
+## Django stuff:
 *.log
 local_settings.py
 db.sqlite3
 db.sqlite3-journal
 
-# Flask stuff:
+## Flask stuff:
 instance/
 .webassets-cache
 
-# Scrapy stuff:
+## Scrapy stuff:
 .scrapy
 
-# Sphinx documentation
+## Sphinx documentation
 docs/_build/
 
-# PyBuilder
+## PyBuilder
 .pybuilder/
 target/
 
-# Jupyter Notebook
+## Jupyter Notebook
 .ipynb_checkpoints
 
-# IPython
+## IPython
 profile_default/
 ipython_config.py
 
-# pyenv
+## pyenv
 #   For a library or package, you might want to ignore these files since the code is
 #   intended to run in multiple environments; otherwise, check them in:
 # .python-version
 
-# pipenv
+## pipenv
 #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 #   install all needed dependencies.
 #Pipfile.lock
 
-# poetry
+## poetry
 #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 #   This is especially recommended for binary packages to ensure reproducibility, and is more
 #   commonly ignored for libraries.
 #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
 #poetry.lock
 
-# pdm
+## pdm
 #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
 #pdm.lock
 #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
@@ -109,17 +110,17 @@ ipython_config.py
 #   https://pdm.fming.dev/#use-with-ide
 .pdm.toml
 
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+## PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
 __pypackages__/
 
-# Celery stuff
+## Celery stuff
 celerybeat-schedule
 celerybeat.pid
 
-# SageMath parsed files
+## SageMath parsed files
 *.sage.py
 
-# Environments
+## Environments
 .env
 .venv
 env/
@@ -128,33 +129,36 @@ ENV/
 env.bak/
 venv.bak/
 
-# Spyder project settings
+## Spyder project settings
 .spyderproject
 .spyproject
 
-# Rope project settings
+## Rope project settings
 .ropeproject
 
-# mkdocs documentation
+## mkdocs documentation
 /site
 
-# mypy
+## mypy
 .mypy_cache/
 .dmypy.json
 dmypy.json
 
-# Pyre type checker
+## Pyre type checker
 .pyre/
 
-# pytype static type analyzer
+## pytype static type analyzer
 .pytype/
 
-# Cython debug symbols
+## Cython debug symbols
 cython_debug/
 
-# PyCharm
+## PyCharm
 #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
 #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+
+# VS Code
+.vscode/

From 2acb5f992886e6e70dd6d1df09fb131a46998bf8 Mon Sep 17 00:00:00 2001
From: Manuel Luypaert <luypaert@ebi.ac.uk>
Date: Tue, 12 Mar 2024 16:28:05 +0000
Subject: [PATCH 24/32] Added output typing for all methods and functions

---
 .../src/data_mover/data_file_mover.py              | 14 +++++++-------
 .../seq_retrieval/src/seq_region/seq_region.py     |  8 ++++++--
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/pipeline/seq_retrieval/src/data_mover/data_file_mover.py b/pipeline/seq_retrieval/src/data_mover/data_file_mover.py
index 33bcd98c..c507ac6e 100644
--- a/pipeline/seq_retrieval/src/data_mover/data_file_mover.py
+++ b/pipeline/seq_retrieval/src/data_mover/data_file_mover.py
@@ -11,14 +11,14 @@
 _DEFAULT_DIR = '/tmp/pavi/'
 _reuse_local_cache = False
 
-def set_local_cache_reuse(reuse: bool):
+def set_local_cache_reuse(reuse: bool) -> None:
     """
     Set _reuse_local_cache (True or False, default False)
     """
     global _reuse_local_cache
     _reuse_local_cache = reuse
 
-def is_accessible_url(url: str):
+def is_accessible_url(url: str) -> bool:
     """
     Returns True when provided `url` is an accessible URL
     """
@@ -47,7 +47,7 @@ def fetch_file(url: str, dest_dir: str = _DEFAULT_DIR, reuse_local_cache: typing
 
     return local_path
 
-def find_local_file(path: str):
+def find_local_file(path: str) -> str:
     """
     Find a file locally based on path and return its absolute path.
     If no file was found at given path, throws Exception.
@@ -58,9 +58,9 @@ def find_local_file(path: str):
         if not os.path.isfile(path):
             raise FileNotFoundError(f"Specified path '{path}' exists but is not a file.")
         else:
-            return Path(path).resolve()
+            return str(Path(path).resolve())
 
-def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 * 1024, reuse_local_cache: typing.Optional[bool] = None):
+def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 * 1024, reuse_local_cache: typing.Optional[bool] = None) -> str:
     if reuse_local_cache == None:
         reuse_local_cache = _reuse_local_cache
 
@@ -78,7 +78,7 @@ def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 *
         if os.path.exists(local_file_path) and os.path.isfile(local_file_path):
             if reuse_local_cache == True:
                 #Return the local file path without downloading new content
-                return Path(local_file_path).resolve()
+                return str(Path(local_file_path).resolve())
             else:
                 os.remove(local_file_path)
 
@@ -92,7 +92,7 @@ def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 *
 
         os.rename(tmp_file_path, local_file_path)
 
-        return Path(local_file_path).resolve()
+        return str(Path(local_file_path).resolve())
     else:
         #Currently not supported
         raise ValueError(f"URL with scheme '{url_components.scheme}' is currently not supported.")
diff --git a/pipeline/seq_retrieval/src/seq_region/seq_region.py b/pipeline/seq_retrieval/src/seq_region/seq_region.py
index 37fccb1d..701c9cf7 100644
--- a/pipeline/seq_retrieval/src/seq_region/seq_region.py
+++ b/pipeline/seq_retrieval/src/seq_region/seq_region.py
@@ -77,7 +77,11 @@ def fetch_seq(self) -> None:
 
         self.sequence = seq
 
-def chain_seq_region_seqs(seq_regions: list, seq_strand: str):
+    def get_sequence(self) -> str:
+        """Return SeqRegion's sequence as a string (empty string if `None`)."""
+        return str(self.sequence)
+
+def chain_seq_region_seqs(seq_regions: typing.List[SeqRegion], seq_strand: str) -> str:
     """
     Chain multiple SeqRegions' sequenes together into one continuous sequence.
     SeqRegions are chained together in an order based on the 'start' position of each:
@@ -92,6 +96,6 @@ def chain_seq_region_seqs(seq_regions: list, seq_strand: str):
 
     sorted_regions = seq_regions
     seq_regions.sort(**sort_args)
-    chained_seq = ''.join(map(lambda region : region.sequence, sorted_regions))
+    chained_seq = ''.join(map(lambda region : region.get_sequence(), sorted_regions))
 
     return chained_seq

From 4b068192234c46f81d64924cfd9bf2da512ee102 Mon Sep 17 00:00:00 2001
From: Manuel Luypaert <luypaert@ebi.ac.uk>
Date: Tue, 12 Mar 2024 16:29:41 +0000
Subject: [PATCH 25/32] Bugfix for seqRegion sorting in chain_seq_region_seqs()
 function

---
 pipeline/seq_retrieval/src/seq_region/seq_region.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipeline/seq_retrieval/src/seq_region/seq_region.py b/pipeline/seq_retrieval/src/seq_region/seq_region.py
index 701c9cf7..06c5e009 100644
--- a/pipeline/seq_retrieval/src/seq_region/seq_region.py
+++ b/pipeline/seq_retrieval/src/seq_region/seq_region.py
@@ -95,7 +95,7 @@ def chain_seq_region_seqs(seq_regions: typing.List[SeqRegion], seq_strand: str)
         sort_args['reverse'] = True
 
     sorted_regions = seq_regions
-    seq_regions.sort(**sort_args)
+    sorted_regions.sort(**sort_args)
     chained_seq = ''.join(map(lambda region : region.get_sequence(), sorted_regions))
 
     return chained_seq

From 84a03dfb45c1cac4e72f1985729f124ec0cfaf29 Mon Sep 17 00:00:00 2001
From: Manuel Luypaert <luypaert@ebi.ac.uk>
Date: Tue, 12 Mar 2024 16:56:59 +0000
Subject: [PATCH 26/32] Additional main functions arguments and output typing

---
 pipeline/seq_retrieval/src/main.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/pipeline/seq_retrieval/src/main.py b/pipeline/seq_retrieval/src/main.py
index 00dcf118..1f544945 100644
--- a/pipeline/seq_retrieval/src/main.py
+++ b/pipeline/seq_retrieval/src/main.py
@@ -1,10 +1,11 @@
 import click
+import typing
 import json
 
 import data_mover.data_file_mover as data_file_mover
 from seq_region import SeqRegion, chain_seq_region_seqs
 
-def validate_strand_param(ctx, param, value):
+def validate_strand_param(ctx, param, value) -> typing.Literal['+', '-']:
     """Returns a normalised version of strings representing a strand.
     Negative strand is normalised to '-', positive strand to '+'.
     Throws a click.BadParameter exception if an unrecognised string was provided."""
@@ -17,7 +18,7 @@ def validate_strand_param(ctx, param, value):
     else:
         raise click.BadParameter(f"Must be one of {POS_CHOICES} for positive strand, or {NEG_CHOICES} for negative strand.")
 
-def process_seq_regions_param(ctx, param, value):
+def process_seq_regions_param(ctx, param, value) -> typing.List[typing.Dict[str,typing.Any]]:
     """Parse the seq_regions parameter value and validate it's structure.
     Value is expected to be a JSON-formatted list of sequence regions to retrieve.
     Each region should have:
@@ -61,7 +62,7 @@ def process_seq_regions_param(ctx, param, value):
 @click.option("--reuse_local_cache", is_flag=True,
               help="""When defined and using remote `fasta_file_url`, reused local files
               if file already exists at destination path, rather than re-downloading and overwritting.""")
-def main(seq_id, seq_strand, seq_regions, fasta_file_url: str, reuse_local_cache: bool):
+def main(seq_id: str, seq_strand: str, seq_regions: typing.List, fasta_file_url: str, reuse_local_cache: bool):
     """Main method for sequence retrieval from JBrowse faidx indexed fasta files.
     Returns a single (transcript) sequence made by concatenating all sequence regions requested
     (in positional order defined by specified seq_strand)."""

From 7cbd950728ad4647ef69379df4642c3a2b85d070 Mon Sep 17 00:00:00 2001
From: Manuel Luypaert <luypaert@ebi.ac.uk>
Date: Wed, 13 Mar 2024 11:15:54 +0000
Subject: [PATCH 27/32] Added typing guidelines to README

---
 pipeline/seq_retrieval/README.md | 87 ++++++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)

diff --git a/pipeline/seq_retrieval/README.md b/pipeline/seq_retrieval/README.md
index 242eeb1e..6f4ad2c1 100644
--- a/pipeline/seq_retrieval/README.md
+++ b/pipeline/seq_retrieval/README.md
@@ -2,6 +2,7 @@
 This subdirectory contains all code and configs for the PAVI sequence retrieval component.
 
 ## Development
+### Local dev environment
 In order to enable isolated local development that does not interfere with the global system python setup,
 a virtual environment is used to do code development and testing for this component.
 
@@ -30,3 +31,89 @@ Once ending development for this component, deactivate the virtual environment w
 ```bash
 deactivate
 ```
+
+### Code guidelines
+#### Input/output typing and checking
+**TL;DR**  
+Type hints are required wherever possible and `mypy` is used to enforce this on PR validation.  
+To check all code part of the seq_retrieval module in this folder (on local development), run:
+```shell
+make check-python-typing-dev
+```
+
+**Detailed explanation**  
+Altought the Python interpreter always uses dynamic typing, it is recommended to use type hints wherever possible,
+to ensure all functions and methods always receive the expected type as to not get caught by unexpected errors.
+
+As an example, the following untyped python function will define local cache usage behavior taking a boolean as toggle:
+```python
+def set_local_cache_reuse(reuse):
+    """
+    Define whether or not data_file_mover will reuse files in local cache where already available pre-execution.
+
+    Args:
+        reuse (bool): set to `True` to enable local cache reuse behavior (default `False`)
+    """
+    global _reuse_local_cache
+    _reuse_local_cache = reuse
+    if reuse:
+        print("Local cache reuse enabled.")
+    else:
+        print("Local cache reuse disabled.")
+```
+
+When called with boolean values, this function works just fine:
+```python
+>>> set_local_cache_reuse(True)
+Local cache reuse enabled.
+>>> set_local_cache_reuse(False)
+Local cache reuse disabled.
+```
+However when calling with a String instead of a boolean, you may get unexpected behaviour:
+```python
+>>> set_local_cache_reuse("False")
+Local cache reuse enabled.
+```
+This happens because Python dynamically types and converts types at runtime,
+and all strings except empty ones are converted to boolean value `True`.
+
+To prevent this, add type hints to your code:
+```python
+def set_local_cache_reuse(reuse: bool) -> None:
+    """
+    Define whether or not data_file_mover will reuse files in local cache where already available pre-execution.
+
+    Args:
+        reuse (bool): set to `True` to enable local cache reuse behavior (default `False`)
+    """
+    global _reuse_local_cache
+    _reuse_local_cache = reuse
+    if reuse:
+        print("Local cache reuse enabled.")
+    else:
+        print("Local cache reuse disabled.")
+set_local_cache_reuse("False")
+```
+
+Type hints themselves are not enforced at runtime, and will thus not stop the code from running (incorrectly),
+but using `myPy` those errors can be revealed before merging this code. Storing the above code snippet in a file
+called `set_local_cache_reuse.py` and running `myPy` on it gives the following result:
+```shell
+> mypy set_local_cache_reuse.py
+set_local_cache_reuse.py:9: error: Name "_reuse_local_cache" is not defined  [name-defined]
+set_local_cache_reuse.py:14: error: Argument 1 to "set_local_cache_reuse" has incompatible type "str"; expected "bool"  [arg-type]
+Found 2 errors in 1 file (checked 1 source file)
+```
+With the myPy output, we can now return to the code and fix the errors reported
+which would otherwise result in silent unexpected behavior and bugs.
+
+To run `mypy` on the seq_retrieval module in this folder in your local development environment,
+run the following shell command:
+```shell
+make check-python-typing-dev
+```
+
+`mypy` checks are automatically run and enforced as part of the PR validation
+and all reported errors must be fixed to enable merging each PR into `main`.  
+If the `pipeline/seq_retrieval check python typing` status check fails on a PR in github,
+click the details link and inspect the failing step output for hints on what to fix.

From c0fee617f24e2559a89a10e3f53c4f418aa66b07 Mon Sep 17 00:00:00 2001
From: Manuel Luypaert <luypaert@ebi.ac.uk>
Date: Wed, 13 Mar 2024 18:39:05 +0000
Subject: [PATCH 28/32] Comply with google-style docstring documentation

This enables relevant IDE tooltip hints in VS Code (and presumably other IDEs) and thus more efficient coding.
---
 pipeline/seq_retrieval/README.md              |  5 ++
 .../src/data_mover/data_file_mover.py         | 74 +++++++++++++++++--
 pipeline/seq_retrieval/src/main.py            | 42 ++++++++---
 .../src/seq_region/seq_region.py              | 57 +++++++++++---
 4 files changed, 154 insertions(+), 24 deletions(-)

diff --git a/pipeline/seq_retrieval/README.md b/pipeline/seq_retrieval/README.md
index 6f4ad2c1..add19607 100644
--- a/pipeline/seq_retrieval/README.md
+++ b/pipeline/seq_retrieval/README.md
@@ -117,3 +117,8 @@ make check-python-typing-dev
 and all reported errors must be fixed to enable merging each PR into `main`.  
 If the `pipeline/seq_retrieval check python typing` status check fails on a PR in github,
 click the details link and inspect the failing step output for hints on what to fix.
+
+#### Code documentation (docstrings)
+All modules, functions, classes and methods should have their input, attributes and output documented
+through docstrings to make the code easy to read and understand for anyone reading it.
+To ensure this is done in a uniform way accross all code, follow the [Google Python Style Guide on docstrings](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings).
diff --git a/pipeline/seq_retrieval/src/data_mover/data_file_mover.py b/pipeline/seq_retrieval/src/data_mover/data_file_mover.py
index c507ac6e..713a1ba4 100644
--- a/pipeline/seq_retrieval/src/data_mover/data_file_mover.py
+++ b/pipeline/seq_retrieval/src/data_mover/data_file_mover.py
@@ -1,5 +1,5 @@
 """
-Moving files to and from remote locations
+Module used to find, access and copy files at/to/from remote locations
 """
 import os.path
 from pathlib import Path
@@ -8,19 +8,43 @@
 from urllib.parse import urlparse, unquote
 
 _stored_files: typing.Dict[str, str] = dict()
+"""Module level memory cache of filepaths for all files accessed through this module."""
+
 _DEFAULT_DIR = '/tmp/pavi/'
+"""Module level default destination directory to search/download remote files in/to."""
+
 _reuse_local_cache = False
+"""
+Module level toggle defining local file cache reuse behaviour.
+ * When True, reuse identically named files existing pre-runtime at local destination location for remote files
+ * When False, delete identically named files existing pre-runtime at local destination location for remote files before downloading.
+
+Change the value through the `set_local_cache_reuse` function.
+"""
 
 def set_local_cache_reuse(reuse: bool) -> None:
     """
-    Set _reuse_local_cache (True or False, default False)
+    Define data_file_mover module-level behaviour on local file cache reuse.
+
+    data_file_mover will
+     * reuse identically named files at local target location when available pre-runtime when set to `True`.
+     * remove identically named files at local target location when present pre-runtime when set to `False`.
+
+    Args:
+        reuse (bool): set to `True` to enable local cache reuse behavior (default `False`)
     """
     global _reuse_local_cache
     _reuse_local_cache = reuse
 
 def is_accessible_url(url: str) -> bool:
     """
-    Returns True when provided `url` is an accessible URL
+    Check whether provided `url` is an accessible (remote) URL
+
+    Args:
+        url: URL to be checked for accessability
+
+    Returns:
+        `True` when provided `url` is an accessible URL, `False` otherwise
     """
     response = requests.head(url)
     if response.ok:
@@ -30,7 +54,19 @@ def is_accessible_url(url: str) -> bool:
 
 def fetch_file(url: str, dest_dir: str = _DEFAULT_DIR, reuse_local_cache: typing.Optional[bool] = None) -> str:
     """
-    Fetch file from URL, return its local path.
+    Fetch file from URL and return its local path.
+
+    Parses `url` to determine scheme and sends it to the appropriate data_file_mover function for retrieval.
+    Result is cached in the data_file_mover module's `_stored_files` memory cache, which is used to speed up
+    repeated retrieval.
+
+    Args:
+        url: URL to fetch local file for
+        dest_dir: Destination directory to search/download remote files in/to.
+        reuse_local_cache: Argument to override local cache reuse behavior defined at data_file_mover level.
+
+    Returns:
+        Absolute path to local file matching the requested URL (string).
     """
     global _stored_files
     local_path = None
@@ -50,7 +86,15 @@ def fetch_file(url: str, dest_dir: str = _DEFAULT_DIR, reuse_local_cache: typing
 def find_local_file(path: str) -> str:
     """
     Find a file locally based on path and return its absolute path.
-    If no file was found at given path, throws Exception.
+
+    Args:
+        path: path to local file to find, can be relative.
+
+    Returns:
+        Absolute path to file found at `path` (string).
+
+    Raises:
+        `FileNotFoundError`: If `path` is not found, or is not a file.
     """
     if not os.path.exists(path):
         raise FileNotFoundError(f"No file found at path '{path}'.")
@@ -61,6 +105,26 @@ def find_local_file(path: str) -> str:
             return str(Path(path).resolve())
 
 def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 * 1024, reuse_local_cache: typing.Optional[bool] = None) -> str:
+    """
+    Download file from remote URL and return its absolute local path.
+
+    Parses `url` to determine scheme and downloads the remote file to local filesystem as appropriate.
+    When local cache reuse is enabled, does not download but returns identically named files at `dest_dir` location when found.
+    When local cache reuse is disabled, removes identically named files at `dest_dir` location when found before initiating download.
+
+    Args:
+        url: URL to remote file to download
+        dest_dir: Destination directory to search/download remote file in/to.
+        chunk_size: Chunk size use while downloading
+        reuse_local_cache: Argument to override local cache reuse behavior defined at data_file_mover level.
+
+    Returns:
+        Absolute path to the found/downloaded file (string).
+
+    Raises:
+        `ValueError`: if `url` scheme is not supported or `url` is not accessible.
+    """
+
     if reuse_local_cache == None:
         reuse_local_cache = _reuse_local_cache
 
diff --git a/pipeline/seq_retrieval/src/main.py b/pipeline/seq_retrieval/src/main.py
index 1f544945..c2b65a3a 100644
--- a/pipeline/seq_retrieval/src/main.py
+++ b/pipeline/seq_retrieval/src/main.py
@@ -1,3 +1,8 @@
+"""
+Main module serving the CLI for PAVI sequence retrieval.
+
+Retrieves multiple sequence regions and returns them as one chained sequence.
+"""
 import click
 import typing
 import json
@@ -6,9 +11,15 @@
 from seq_region import SeqRegion, chain_seq_region_seqs
 
 def validate_strand_param(ctx, param, value) -> typing.Literal['+', '-']:
-    """Returns a normalised version of strings representing a strand.
-    Negative strand is normalised to '-', positive strand to '+'.
-    Throws a click.BadParameter exception if an unrecognised string was provided."""
+    """
+    Processes and normalises the value of click input argument `strand`.
+
+    Returns:
+        A normalised version of strings representing a strand: '-' or '+'
+
+    Raises:
+        click.BadParameter: If an unrecognised string was provided.
+    """
     POS_CHOICES = ['+', '+1', 'pos']
     NEG_CHOICES = ['-', '-1', 'neg']
     if   value in POS_CHOICES:
@@ -19,12 +30,20 @@ def validate_strand_param(ctx, param, value) -> typing.Literal['+', '-']:
         raise click.BadParameter(f"Must be one of {POS_CHOICES} for positive strand, or {NEG_CHOICES} for negative strand.")
 
 def process_seq_regions_param(ctx, param, value) -> typing.List[typing.Dict[str,typing.Any]]:
-    """Parse the seq_regions parameter value and validate it's structure.
+    """
+    Parse the value of click input parameter seq_regions and validate it's structure.
+
     Value is expected to be a JSON-formatted list of sequence regions to retrieve.
     Each region should have:
-     * a 'start' property indicating the region start (inclusive)
-     * a 'end' property indicating the region end (inclusive)
-    Throws a click.BadParameter exception if value could not be parsed as JSON or had an invalid structure."""
+     * a 'start' property indicating the region start (1-based, inclusive)
+     * a 'end' property indicating the region end (1-based, inclusive)
+
+    Returns:
+        List of dicts representing start and end of seq region
+
+    Raises:
+        click.BadParameter: If value could not be parsed as JSON or had an invalid structure or values.
+    """
     seq_regions = None
     try:
         seq_regions = json.loads(value)
@@ -63,9 +82,12 @@ def process_seq_regions_param(ctx, param, value) -> typing.List[typing.Dict[str,
               help="""When defined and using remote `fasta_file_url`, reused local files
               if file already exists at destination path, rather than re-downloading and overwritting.""")
 def main(seq_id: str, seq_strand: str, seq_regions: typing.List, fasta_file_url: str, reuse_local_cache: bool):
-    """Main method for sequence retrieval from JBrowse faidx indexed fasta files.
-    Returns a single (transcript) sequence made by concatenating all sequence regions requested
-    (in positional order defined by specified seq_strand)."""
+    """
+    Main method for sequence retrieval from JBrowse faidx indexed fasta files. Receives input args from click.
+
+    Prints a single (transcript) sequence obtained by concatenating the sequence of
+    all sequence regions requested (in positional order defined by specified seq_strand).
+    """
 
     click.echo(f"Received request to retrieve sequences for {seq_id}, strand {seq_strand}, seq_regions {seq_regions}!")
 
diff --git a/pipeline/seq_retrieval/src/seq_region/seq_region.py b/pipeline/seq_retrieval/src/seq_region/seq_region.py
index 06c5e009..6a36ffe6 100644
--- a/pipeline/seq_retrieval/src/seq_region/seq_region.py
+++ b/pipeline/seq_retrieval/src/seq_region/seq_region.py
@@ -1,6 +1,5 @@
 """
-Module containing the SeqRegion class and all methods to handle them.
-SeqRegion class is used to define and retrieve sequence regions.
+Module containing the SeqRegion class and all functions to handle SeqRegion entities.
 """
 import typing
 
@@ -10,16 +9,45 @@
 from data_mover import data_file_mover
 
 class SeqRegion():
+    """
+    Defines a DNA sequence region.
+    """
+
     seq_id: str
+    """The sequence identifier found in the fasta file on which the sequence region is located"""
+
     start: int
+    """The start position of the sequence region (1-based, inclusive). Asserted to be `start` < `end`."""
+
     end: int
+    """The end position of the sequence region (1-base, inclusive). Asserted to be `start` < `end`."""
+
     strand: str
+    """The (genomic) strand of the sequence region"""
+
     fasta_file_path: str
+    """Absolute path to (faidx indexed) FASTA file containing the sequences"""
+
     sequence: typing.Optional[str]
+    """the DNA sequence of a sequence region"""
 
     def __init__(self, seq_id: str, start: int, end: int, strand: str, fasta_file_url: str, seq: typing.Optional[str] = None):
         """
-        Uses 1-based, fully inclusive coordinates
+        Initializes a SeqRegion instance
+
+        Args:
+            seq_id: The sequence identifier found in the fasta file on which the sequence region is located
+            start: The start position of the sequence region (1-based, inclusive).\
+                   If negative strand, `start` and `end` are swapped if `end` < `start`.
+            end: The end position of the sequence region (1-base, inclusive).\
+                 If negative strand, `start` and `end` are swapped if `end` < `start`.
+            strand: the (genomic) strand of the sequence region
+            fasta_file_url: Path to local faidx-indexed FASTA file containing the sequences to retrieve (regions of).\
+                            Faidx-index files `fasta_file_url`.fai and `fasta_file_url`.gzi for compressed fasta file must be accessible URLs.
+            seq: optional DNA sequence of the sequence region
+
+        Raises:
+            ValueError: if value of `end` < `start` and `strand` is '+'
         """
         self.seq_id = seq_id
         self.strand = strand
@@ -59,9 +87,12 @@ def __init__(self, seq_id: str, start: int, end: int, strand: str, fasta_file_ur
 
 
     def fetch_seq(self) -> None:
-        """Fetch sequence found at `seq_id`:`start`-`end`:`strand`
+        """
+        Fetch sequence found at `seq_id`:`start`-`end`:`strand`
         by reading from faidx files at `fasta_file_path`.
-        Uses 1-based, fully inclusive coordinates."""
+
+        Stores resulting sequence in `sequence` attribute.
+        """
         try:
             fasta_file = pysam.FastaFile(self.fasta_file_path)
         except ValueError:
@@ -78,15 +109,23 @@ def fetch_seq(self) -> None:
         self.sequence = seq
 
     def get_sequence(self) -> str:
-        """Return SeqRegion's sequence as a string (empty string if `None`)."""
+        """Return `sequence` attribute as a string (empty string if `None`)."""
         return str(self.sequence)
 
 def chain_seq_region_seqs(seq_regions: typing.List[SeqRegion], seq_strand: str) -> str:
     """
     Chain multiple SeqRegions' sequenes together into one continuous sequence.
-    SeqRegions are chained together in an order based on the 'start' position of each:
-     * Ascending order when positive strand
-     * Descending order when negative strand
+
+    SeqRegions are chained together in an order based on the `start` attribute of each:
+     * Ascending order when `seq_strand` is positive strand
+     * Descending order when `seq_strand` is negative strand
+
+    Args:
+        seq_regions: list of SeqRegion objects to chain together
+        seq_strand: sequence strand which defines the chaining order
+
+    Returns:
+        String representing the chained sequence of all input SeqRegions
     """
 
     sort_args: typing.Dict[str, typing.Any] = dict(key=lambda region: region.start, reverse=False)

From 0129f4c3eef3ce747c7f27568f500a261b796baa Mon Sep 17 00:00:00 2001
From: Manuel Luypaert <luypaert@ebi.ac.uk>
Date: Wed, 13 Mar 2024 19:09:28 +0000
Subject: [PATCH 29/32] Added sequence length assertion to SeqRegion class

---
 .../src/seq_region/seq_region.py              | 22 ++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/pipeline/seq_retrieval/src/seq_region/seq_region.py b/pipeline/seq_retrieval/src/seq_region/seq_region.py
index 6a36ffe6..1546be41 100644
--- a/pipeline/seq_retrieval/src/seq_region/seq_region.py
+++ b/pipeline/seq_retrieval/src/seq_region/seq_region.py
@@ -106,7 +106,27 @@ def fetch_seq(self) -> None:
             if self.strand == '-':
                 seq = Seq.reverse_complement(seq)
 
-        self.sequence = seq
+        self.set_sequence(seq)
+
+    def set_sequence(self, sequence: str) -> None:
+        """
+        Set the `sequence` attribute.
+
+        Asserts the length of `sequence` matches the expected sequence length for this region.
+
+        Args:
+            sequence: DNA sequence (string)
+
+        Raises:
+            valueError: If the length of `sequence` provided does not match the region length
+        """
+
+        seq_len = len(sequence)
+        expected_len = self.end - self.start + 1
+        if seq_len != expected_len:
+            raise ValueError(f"Sequence length {seq_len} does not equal length expected on region positions {expected_len}.")
+        else:
+            self.sequence = sequence
 
     def get_sequence(self) -> str:
         """Return `sequence` attribute as a string (empty string if `None`)."""

From 48549c9ab090fe481b5db79cc2154b143951a296 Mon Sep 17 00:00:00 2001
From: Manuel Luypaert <luypaert@ebi.ac.uk>
Date: Thu, 14 Mar 2024 19:34:38 +0000
Subject: [PATCH 30/32] Improved readability of typing + add typing for click
 values

---
 .../seq_retrieval/src/data_mover/data_file_mover.py    |  8 ++++----
 pipeline/seq_retrieval/src/main.py                     |  8 ++++----
 pipeline/seq_retrieval/src/seq_region/seq_region.py    | 10 +++++-----
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/pipeline/seq_retrieval/src/data_mover/data_file_mover.py b/pipeline/seq_retrieval/src/data_mover/data_file_mover.py
index 713a1ba4..c9b2ac69 100644
--- a/pipeline/seq_retrieval/src/data_mover/data_file_mover.py
+++ b/pipeline/seq_retrieval/src/data_mover/data_file_mover.py
@@ -3,11 +3,11 @@
 """
 import os.path
 from pathlib import Path
-import typing
+from typing import Dict, Optional
 import requests
 from urllib.parse import urlparse, unquote
 
-_stored_files: typing.Dict[str, str] = dict()
+_stored_files: Dict[str, str] = dict()
 """Module level memory cache of filepaths for all files accessed through this module."""
 
 _DEFAULT_DIR = '/tmp/pavi/'
@@ -52,7 +52,7 @@ def is_accessible_url(url: str) -> bool:
     else:
         return False
 
-def fetch_file(url: str, dest_dir: str = _DEFAULT_DIR, reuse_local_cache: typing.Optional[bool] = None) -> str:
+def fetch_file(url: str, dest_dir: str = _DEFAULT_DIR, reuse_local_cache: Optional[bool] = None) -> str:
     """
     Fetch file from URL and return its local path.
 
@@ -104,7 +104,7 @@ def find_local_file(path: str) -> str:
         else:
             return str(Path(path).resolve())
 
-def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 * 1024, reuse_local_cache: typing.Optional[bool] = None) -> str:
+def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 * 1024, reuse_local_cache: Optional[bool] = None) -> str:
     """
     Download file from remote URL and return its absolute local path.
 
diff --git a/pipeline/seq_retrieval/src/main.py b/pipeline/seq_retrieval/src/main.py
index c2b65a3a..fcdcba7f 100644
--- a/pipeline/seq_retrieval/src/main.py
+++ b/pipeline/seq_retrieval/src/main.py
@@ -4,13 +4,13 @@
 Retrieves multiple sequence regions and returns them as one chained sequence.
 """
 import click
-import typing
+from typing import Any, Dict, List, Literal
 import json
 
 import data_mover.data_file_mover as data_file_mover
 from seq_region import SeqRegion, chain_seq_region_seqs
 
-def validate_strand_param(ctx, param, value) -> typing.Literal['+', '-']:
+def validate_strand_param(ctx, param, value: str) -> Literal['+', '-']:
     """
     Processes and normalises the value of click input argument `strand`.
 
@@ -29,7 +29,7 @@ def validate_strand_param(ctx, param, value) -> typing.Literal['+', '-']:
     else:
         raise click.BadParameter(f"Must be one of {POS_CHOICES} for positive strand, or {NEG_CHOICES} for negative strand.")
 
-def process_seq_regions_param(ctx, param, value) -> typing.List[typing.Dict[str,typing.Any]]:
+def process_seq_regions_param(ctx, param, value: str) -> List[Dict[str,Any]]:
     """
     Parse the value of click input parameter seq_regions and validate it's structure.
 
@@ -81,7 +81,7 @@ def process_seq_regions_param(ctx, param, value) -> typing.List[typing.Dict[str,
 @click.option("--reuse_local_cache", is_flag=True,
               help="""When defined and using remote `fasta_file_url`, reused local files
               if file already exists at destination path, rather than re-downloading and overwritting.""")
-def main(seq_id: str, seq_strand: str, seq_regions: typing.List, fasta_file_url: str, reuse_local_cache: bool):
+def main(seq_id: str, seq_strand: str, seq_regions: List, fasta_file_url: str, reuse_local_cache: bool):
     """
     Main method for sequence retrieval from JBrowse faidx indexed fasta files. Receives input args from click.
 
diff --git a/pipeline/seq_retrieval/src/seq_region/seq_region.py b/pipeline/seq_retrieval/src/seq_region/seq_region.py
index 1546be41..2668a044 100644
--- a/pipeline/seq_retrieval/src/seq_region/seq_region.py
+++ b/pipeline/seq_retrieval/src/seq_region/seq_region.py
@@ -1,7 +1,7 @@
 """
 Module containing the SeqRegion class and all functions to handle SeqRegion entities.
 """
-import typing
+from typing import Any, Dict, List, Optional
 
 from Bio import Seq #Bio.Seq biopython submodule
 import pysam
@@ -28,10 +28,10 @@ class SeqRegion():
     fasta_file_path: str
     """Absolute path to (faidx indexed) FASTA file containing the sequences"""
 
-    sequence: typing.Optional[str]
+    sequence: Optional[str]
     """the DNA sequence of a sequence region"""
 
-    def __init__(self, seq_id: str, start: int, end: int, strand: str, fasta_file_url: str, seq: typing.Optional[str] = None):
+    def __init__(self, seq_id: str, start: int, end: int, strand: str, fasta_file_url: str, seq: Optional[str] = None):
         """
         Initializes a SeqRegion instance
 
@@ -132,7 +132,7 @@ def get_sequence(self) -> str:
         """Return `sequence` attribute as a string (empty string if `None`)."""
         return str(self.sequence)
 
-def chain_seq_region_seqs(seq_regions: typing.List[SeqRegion], seq_strand: str) -> str:
+def chain_seq_region_seqs(seq_regions: List[SeqRegion], seq_strand: str) -> str:
     """
     Chain multiple SeqRegions' sequenes together into one continuous sequence.
 
@@ -148,7 +148,7 @@ def chain_seq_region_seqs(seq_regions: typing.List[SeqRegion], seq_strand: str)
         String representing the chained sequence of all input SeqRegions
     """
 
-    sort_args: typing.Dict[str, typing.Any] = dict(key=lambda region: region.start, reverse=False)
+    sort_args: Dict[str, Any] = dict(key=lambda region: region.start, reverse=False)
 
     if seq_strand == '-':
         sort_args['reverse'] = True

From 3b0dca6b8d359f5922401d1541c7daa086926814 Mon Sep 17 00:00:00 2001
From: Manuel Luypaert <luypaert@ebi.ac.uk>
Date: Thu, 14 Mar 2024 19:45:54 +0000
Subject: [PATCH 31/32] Moved tests files and configs to more standard path

---
 pipeline/seq_retrieval/Makefile                                 | 2 +-
 .../type-testing-requirements.txt => tests/requirements.txt}    | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename pipeline/seq_retrieval/{test/type-testing-requirements.txt => tests/requirements.txt} (100%)

diff --git a/pipeline/seq_retrieval/Makefile b/pipeline/seq_retrieval/Makefile
index d371eefd..7a788b0b 100644
--- a/pipeline/seq_retrieval/Makefile
+++ b/pipeline/seq_retrieval/Makefile
@@ -9,7 +9,7 @@ python-dependencies-update:
 	pip install -U -r requirements.txt
 
 check-python-typing: python-dependencies
-	pip install -r test/type-testing-requirements.txt
+	pip install -r tests/requirements.txt
 	mypy --install-types --non-interactive src/main.py
 
 check-venv-active:
diff --git a/pipeline/seq_retrieval/test/type-testing-requirements.txt b/pipeline/seq_retrieval/tests/requirements.txt
similarity index 100%
rename from pipeline/seq_retrieval/test/type-testing-requirements.txt
rename to pipeline/seq_retrieval/tests/requirements.txt

From 2207ad6a6c74fa3c14cc5f3eb1d914a05558f02c Mon Sep 17 00:00:00 2001
From: Manuel Luypaert <luypaert@ebi.ac.uk>
Date: Thu, 14 Mar 2024 21:09:57 +0000
Subject: [PATCH 32/32] Added flake8 style check and made code compliant

---
 .github/workflows/PR-validation.yml           | 24 +++++++++++++++++--
 pipeline/seq_retrieval/.flake8                |  6 +++++
 pipeline/seq_retrieval/Makefile               |  7 +++++-
 .../src/data_mover/data_file_mover.py         | 17 ++++++++-----
 pipeline/seq_retrieval/src/main.py            | 18 ++++++++------
 .../seq_retrieval/src/seq_region/__init__.py  |  2 +-
 .../src/seq_region/seq_region.py              | 17 ++++++-------
 pipeline/seq_retrieval/tests/requirements.txt |  3 ++-
 8 files changed, 68 insertions(+), 26 deletions(-)
 create mode 100644 pipeline/seq_retrieval/.flake8

diff --git a/.github/workflows/PR-validation.yml b/.github/workflows/PR-validation.yml
index 47fa9ce3..c39ae7d2 100644
--- a/.github/workflows/PR-validation.yml
+++ b/.github/workflows/PR-validation.yml
@@ -22,8 +22,8 @@ jobs:
       - name: Build container image
         run: |
           make container-image
-  pipeline-seq-retrieval-check-python-typing:
-    name: pipeline/seq_retrieval check python typing
+  pipeline-seq-retrieval-python-typing-check:
+    name: pipeline/seq_retrieval python typing check
     runs-on: ubuntu-22.04
     defaults:
       run:
@@ -42,5 +42,25 @@ jobs:
       - name: Check python typing
         run: |
           make check-python-typing
+  pipeline-seq-retrieval-python-style-check:
+    name: pipeline/seq_retrieval python style check
+    runs-on: ubuntu-22.04
+    defaults:
+      run:
+        shell: bash
+        working-directory: ./pipeline/seq_retrieval/
+    steps:
+      - name: Check out repository code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          sparse-checkout: |
+            pipeline/seq_retrieval/
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+      - name: Check python style
+        run: |
+          make check-python-style
   #TODO: add unit testing
   #TODO: add integration testing
diff --git a/pipeline/seq_retrieval/.flake8 b/pipeline/seq_retrieval/.flake8
new file mode 100644
index 00000000..02b9d28d
--- /dev/null
+++ b/pipeline/seq_retrieval/.flake8
@@ -0,0 +1,6 @@
+[flake8]
+ignore = E203, E266, E501, F401, W503, E501
+max-complexity = 18
+select = B,C,E,F,W,T4
+exclude = venv
+per-file-ignores = __init__.py:F401
diff --git a/pipeline/seq_retrieval/Makefile b/pipeline/seq_retrieval/Makefile
index 7a788b0b..6f2acd17 100644
--- a/pipeline/seq_retrieval/Makefile
+++ b/pipeline/seq_retrieval/Makefile
@@ -8,10 +8,15 @@ python-dependencies:
 python-dependencies-update:
 	pip install -U -r requirements.txt
 
-check-python-typing: python-dependencies
+python-test-dependencies:
 	pip install -r tests/requirements.txt
+
+check-python-typing: python-dependencies python-test-dependencies
 	mypy --install-types --non-interactive src/main.py
 
+check-python-style: python-dependencies python-test-dependencies
+	flake8 ./
+
 check-venv-active:
 ifeq ($(VIRTUAL_ENV),)
 	@echo 'No active python virtual environment found.'\
diff --git a/pipeline/seq_retrieval/src/data_mover/data_file_mover.py b/pipeline/seq_retrieval/src/data_mover/data_file_mover.py
index c9b2ac69..435c908d 100644
--- a/pipeline/seq_retrieval/src/data_mover/data_file_mover.py
+++ b/pipeline/seq_retrieval/src/data_mover/data_file_mover.py
@@ -22,6 +22,7 @@
 Change the value through the `set_local_cache_reuse` function.
 """
 
+
 def set_local_cache_reuse(reuse: bool) -> None:
     """
     Define data_file_mover module-level behaviour on local file cache reuse.
@@ -36,6 +37,7 @@ def set_local_cache_reuse(reuse: bool) -> None:
     global _reuse_local_cache
     _reuse_local_cache = reuse
 
+
 def is_accessible_url(url: str) -> bool:
     """
     Check whether provided `url` is an accessible (remote) URL
@@ -52,6 +54,7 @@ def is_accessible_url(url: str) -> bool:
     else:
         return False
 
+
 def fetch_file(url: str, dest_dir: str = _DEFAULT_DIR, reuse_local_cache: Optional[bool] = None) -> str:
     """
     Fetch file from URL and return its local path.
@@ -83,6 +86,7 @@ def fetch_file(url: str, dest_dir: str = _DEFAULT_DIR, reuse_local_cache: Option
 
     return local_path
 
+
 def find_local_file(path: str) -> str:
     """
     Find a file locally based on path and return its absolute path.
@@ -104,7 +108,8 @@ def find_local_file(path: str) -> str:
         else:
             return str(Path(path).resolve())
 
-def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 * 1024, reuse_local_cache: Optional[bool] = None) -> str:
+
+def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size: int = 10 * 1024, reuse_local_cache: Optional[bool] = None) -> str:
     """
     Download file from remote URL and return its absolute local path.
 
@@ -125,7 +130,7 @@ def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 *
         `ValueError`: if `url` scheme is not supported or `url` is not accessible.
     """
 
-    if reuse_local_cache == None:
+    if reuse_local_cache is None:
         reuse_local_cache = _reuse_local_cache
 
     url_components = urlparse(url)
@@ -140,13 +145,13 @@ def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 *
         local_file_path = os.path.join(dest_dir, filename)
 
         if os.path.exists(local_file_path) and os.path.isfile(local_file_path):
-            if reuse_local_cache == True:
-                #Return the local file path without downloading new content
+            if reuse_local_cache is True:
+                # Return the local file path without downloading new content
                 return str(Path(local_file_path).resolve())
             else:
                 os.remove(local_file_path)
 
-        #Download file through streaming to support large files
+        # Download file through streaming to support large files
         tmp_file_path = f"{local_file_path}.part"
         response = requests.get(url, stream=True)
 
@@ -158,5 +163,5 @@ def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 *
 
         return str(Path(local_file_path).resolve())
     else:
-        #Currently not supported
+        # Currently not supported
         raise ValueError(f"URL with scheme '{url_components.scheme}' is currently not supported.")
diff --git a/pipeline/seq_retrieval/src/main.py b/pipeline/seq_retrieval/src/main.py
index fcdcba7f..0e4b0e62 100644
--- a/pipeline/seq_retrieval/src/main.py
+++ b/pipeline/seq_retrieval/src/main.py
@@ -10,6 +10,7 @@
 import data_mover.data_file_mover as data_file_mover
 from seq_region import SeqRegion, chain_seq_region_seqs
 
+
 def validate_strand_param(ctx, param, value: str) -> Literal['+', '-']:
     """
     Processes and normalises the value of click input argument `strand`.
@@ -22,14 +23,15 @@ def validate_strand_param(ctx, param, value: str) -> Literal['+', '-']:
     """
     POS_CHOICES = ['+', '+1', 'pos']
     NEG_CHOICES = ['-', '-1', 'neg']
-    if   value in POS_CHOICES:
+    if value in POS_CHOICES:
         return '+'
     elif value in NEG_CHOICES:
         return '-'
     else:
         raise click.BadParameter(f"Must be one of {POS_CHOICES} for positive strand, or {NEG_CHOICES} for negative strand.")
 
-def process_seq_regions_param(ctx, param, value: str) -> List[Dict[str,Any]]:
+
+def process_seq_regions_param(ctx, param, value: str) -> List[Dict[str, Any]]:
     """
     Parse the value of click input parameter seq_regions and validate it's structure.
 
@@ -47,8 +49,8 @@ def process_seq_regions_param(ctx, param, value: str) -> List[Dict[str,Any]]:
     seq_regions = None
     try:
         seq_regions = json.loads(value)
-    except:
-        raise click.BadParameter(f"Must be a valid JSON-formatted string.")
+    except Exception:
+        raise click.BadParameter("Must be a valid JSON-formatted string.")
     else:
         if not isinstance(seq_regions, list):
             raise click.BadParameter("Must be a valid list (JSON-array) of sequence regions to retrieve.")
@@ -66,6 +68,7 @@ def process_seq_regions_param(ctx, param, value: str) -> List[Dict[str,Any]]:
 
         return seq_regions
 
+
 @click.command()
 @click.option("--seq_id", type=click.STRING, required=True,
               help="The sequence ID to retrieve sequences for.")
@@ -96,15 +99,16 @@ def main(seq_id: str, seq_strand: str, seq_regions: List, fasta_file_url: str, r
     seq_region_objs = []
     for region in seq_regions:
         seq_region_objs.append(SeqRegion(seq_id=seq_id, start=region['start'], end=region['end'], strand=seq_strand,
-                                          fasta_file_url=fasta_file_url))
+                                         fasta_file_url=fasta_file_url))
 
     for seq_region in seq_region_objs:
-        #Retrieve sequence for region
+        # Retrieve sequence for region
         seq_region.fetch_seq()
 
-    #Concatenate all regions into single sequence
+    # Concatenate all regions into single sequence
     seq_concat = chain_seq_region_seqs(seq_region_objs, seq_strand)
     click.echo(f"\nSeq concat: {seq_concat}")
 
+
 if __name__ == '__main__':
     main()
diff --git a/pipeline/seq_retrieval/src/seq_region/__init__.py b/pipeline/seq_retrieval/src/seq_region/__init__.py
index 75eecf7f..8a4ddd88 100644
--- a/pipeline/seq_retrieval/src/seq_region/__init__.py
+++ b/pipeline/seq_retrieval/src/seq_region/__init__.py
@@ -1,2 +1,2 @@
 from .seq_region import SeqRegion
-from .seq_region import chain_seq_region_seqs
\ No newline at end of file
+from .seq_region import chain_seq_region_seqs
diff --git a/pipeline/seq_retrieval/src/seq_region/seq_region.py b/pipeline/seq_retrieval/src/seq_region/seq_region.py
index 2668a044..c8d777a6 100644
--- a/pipeline/seq_retrieval/src/seq_region/seq_region.py
+++ b/pipeline/seq_retrieval/src/seq_region/seq_region.py
@@ -3,11 +3,12 @@
 """
 from typing import Any, Dict, List, Optional
 
-from Bio import Seq #Bio.Seq biopython submodule
+from Bio import Seq  # Bio.Seq biopython submodule
 import pysam
 
 from data_mover import data_file_mover
 
+
 class SeqRegion():
     """
     Defines a DNA sequence region.
@@ -52,7 +53,7 @@ def __init__(self, seq_id: str, start: int, end: int, strand: str, fasta_file_ur
         self.seq_id = seq_id
         self.strand = strand
 
-        #If strand is -, ensure start <= end (swap as required)
+        # If strand is -, ensure start <= end (swap as required)
         if strand == '-':
             if end < start:
                 self.start = end
@@ -60,7 +61,7 @@ def __init__(self, seq_id: str, start: int, end: int, strand: str, fasta_file_ur
             else:
                 self.start = start
                 self.end = end
-        #If strand is +, throw error when end < start (likely user error)
+        # If strand is +, throw error when end < start (likely user error)
         else:
             if end < start:
                 raise ValueError(f"Unexpected position order: end {end} < start {start}.")
@@ -73,19 +74,18 @@ def __init__(self, seq_id: str, start: int, end: int, strand: str, fasta_file_ur
 
         # Fetch additional faidx index files in addition to fasta file itself
         # (to the same location)
-        index_files = [fasta_file_url+'.fai']
+        index_files = [fasta_file_url + '.fai']
         if fasta_file_url.endswith('.gz'):
-            index_files.append(fasta_file_url+'.gzi')
+            index_files.append(fasta_file_url + '.gzi')
 
         for index_file in index_files:
             data_file_mover.fetch_file(index_file)
 
         self.fasta_file_path = local_fasta_file_path
 
-        if seq != None:
+        if seq is not None:
             self.sequence = seq
 
-
     def fetch_seq(self) -> None:
         """
         Fetch sequence found at `seq_id`:`start`-`end`:`strand`
@@ -100,7 +100,7 @@ def fetch_seq(self) -> None:
         except IOError:
             raise IOError(f"Error while reading fasta file or index matching path {self.fasta_file_path}.")
         else:
-            seq = fasta_file.fetch(reference=self.seq_id, start=self.start-1, end=self.end)
+            seq = fasta_file.fetch(reference=self.seq_id, start=(self.start - 1), end=self.end)
             fasta_file.close()
 
             if self.strand == '-':
@@ -132,6 +132,7 @@ def get_sequence(self) -> str:
         """Return `sequence` attribute as a string (empty string if `None`)."""
         return str(self.sequence)
 
+
 def chain_seq_region_seqs(seq_regions: List[SeqRegion], seq_strand: str) -> str:
     """
     Chain multiple SeqRegions' sequenes together into one continuous sequence.
diff --git a/pipeline/seq_retrieval/tests/requirements.txt b/pipeline/seq_retrieval/tests/requirements.txt
index fcea0984..d6d09df3 100644
--- a/pipeline/seq_retrieval/tests/requirements.txt
+++ b/pipeline/seq_retrieval/tests/requirements.txt
@@ -1 +1,2 @@
-mypy==1.9.*
\ No newline at end of file
+flake8==7.0.*
+mypy==1.9.*