From 6f699f78b4e502ac9c1fea2b8e8a09dae622adae Mon Sep 17 00:00:00 2001 From: Manuel Luypaert Date: Wed, 6 Mar 2024 10:51:54 +0000 Subject: [PATCH 01/32] Built basic framework for seq retrieval component --- pipeline/seq_retrieval/Dockerfile | 11 +++++++++ pipeline/seq_retrieval/Makefile | 2 ++ pipeline/seq_retrieval/README.md | 32 +++++++++++++++++++++++++ pipeline/seq_retrieval/requirements.txt | 1 + pipeline/seq_retrieval/src/main.py | 12 ++++++++++ 5 files changed, 58 insertions(+) create mode 100644 pipeline/seq_retrieval/Dockerfile create mode 100644 pipeline/seq_retrieval/Makefile create mode 100644 pipeline/seq_retrieval/README.md create mode 100644 pipeline/seq_retrieval/requirements.txt create mode 100644 pipeline/seq_retrieval/src/main.py diff --git a/pipeline/seq_retrieval/Dockerfile b/pipeline/seq_retrieval/Dockerfile new file mode 100644 index 00000000..949423a4 --- /dev/null +++ b/pipeline/seq_retrieval/Dockerfile @@ -0,0 +1,11 @@ +FROM python:3.12-alpine + +WORKDIR /usr/src/app + +COPY requirements.txt ./ +RUN pip install --no-cache-dir -r requirements.txt + +COPY src/ ./ + +ENTRYPOINT [ "python", "main.py"] +CMD [ "--help" ] diff --git a/pipeline/seq_retrieval/Makefile b/pipeline/seq_retrieval/Makefile new file mode 100644 index 00000000..d8a0daec --- /dev/null +++ b/pipeline/seq_retrieval/Makefile @@ -0,0 +1,2 @@ +container-image: + docker build -t agr_pavi/seq_retrieval . diff --git a/pipeline/seq_retrieval/README.md b/pipeline/seq_retrieval/README.md new file mode 100644 index 00000000..242eeb1e --- /dev/null +++ b/pipeline/seq_retrieval/README.md @@ -0,0 +1,32 @@ +# PAVI Sequence retrieval +This subdirectory contains all code and configs for the PAVI sequence retrieval component. + +## Development +In order to enable isolated local development that does not interfere with the global system python setup, +a virtual environment is used to do code development and testing for this component. + +To start developing on a new system, create a virtual environment using the following command +(having this directory as your working directory): +```bash +python3.12 -m venv ./venv +``` + +Then when developing this component, activated the virtual environment with: +```bash +source venv/bin/activate +``` +Once the virtual environment is activated, all packages will be installed in this isolated virtual environment. +To install all python package dependencies: +```bash +pip install -r requirements.txt +``` + +To upgrade all packages (already installed) to the latest available version matching the requirements: +```bash +pip install -U -r requirements.txt +``` + +Once ending development for this component, deactivate the virtual environment with: +```bash +deactivate +``` diff --git a/pipeline/seq_retrieval/requirements.txt b/pipeline/seq_retrieval/requirements.txt new file mode 100644 index 00000000..5a1f42a8 --- /dev/null +++ b/pipeline/seq_retrieval/requirements.txt @@ -0,0 +1 @@ +click==8.1.* diff --git a/pipeline/seq_retrieval/src/main.py b/pipeline/seq_retrieval/src/main.py new file mode 100644 index 00000000..2ece0e0f --- /dev/null +++ b/pipeline/seq_retrieval/src/main.py @@ -0,0 +1,12 @@ +import click + +@click.command() +@click.option("--seq_id", required=True, help="The sequence ID to retrieve sequences for.") +@click.option("--seq_strand", required=True, help="The sequence strand to retrieve sequences for.") +@click.option("--genes", required=True, help="A list of gene maps to retrieve sequences for.") +def main(seq_id, seq_strand, genes): + """Main method for sequence retrieval from JBrowse fastaidx indexed fasta files.""" + click.echo(f"Received request to retrieve sequences for {seq_id} strand {seq_strand}!") + +if __name__ == '__main__': + main() From 33396e2ba4effd05acc852f8ea43fbf27d6d2d8e Mon Sep 17 00:00:00 2001 From: Manuel Luypaert Date: Thu, 7 Mar 2024 11:47:54 +0000 Subject: [PATCH 02/32] Updated input params to match single sequence retrieval and added input validation --- pipeline/seq_retrieval/src/main.py | 60 +++++++++++++++++++++++++++--- 1 file changed, 54 insertions(+), 6 deletions(-) diff --git a/pipeline/seq_retrieval/src/main.py b/pipeline/seq_retrieval/src/main.py index 2ece0e0f..b98edd08 100644 --- a/pipeline/seq_retrieval/src/main.py +++ b/pipeline/seq_retrieval/src/main.py @@ -1,12 +1,60 @@ import click +import json + +def validate_strand(ctx, param, value): + """Returns a normalised version of strings representing a strand. + Negative strand is normalised to '-', positive strand to '+'. + Throws a click.BadParameter exception if an unrecognised string was provided.""" + POS_CHOICES = ['+', '+1', 'pos'] + NEG_CHOICES = ['-', '-1', 'neg'] + if value in POS_CHOICES: + return '+' + elif value in NEG_CHOICES: + return '-' + else: + raise click.BadParameter(f"Must be one of {POS_CHOICES} for positive strand, or {NEG_CHOICES} for negative strand.") + +def process_seq_regions(ctx, param, value): + """Parse the seq_regions parameter value and validate it's structure. + Value is expected to be a JSON-formatted list of sequence regions to retrieve. + Each region should have: + * a 'start' property indicating the region start (inclusive) + * a 'end' property indicating the region end (inclusive) + Throws a click.BadParameter exception if value could not be parsed as JSON or had an invalid structure.""" + seq_regions = None + try: + seq_regions = json.loads(value) + except: + raise click.BadParameter(f"Must be a valid JSON-formatted string.") + else: + if not isinstance(seq_regions, list): + raise click.BadParameter("Must be a valid list (JSON-array) of sequence regions to retrieve.") + for region in seq_regions: + if not isinstance(region, dict): + raise click.BadParameter(f"Region {region} is not a valid dict. All regions in seq_regions list must be valid dicts (JSON-objects).") + if 'start' not in region.keys(): + raise click.BadParameter(f"Region {region} does not have a 'start' property, which is a required property.") + if 'end' not in region.keys(): + raise click.BadParameter(f"Region {region} does not have a 'end' property, which is a required property.") + if not isinstance(region['start'], int): + raise click.BadParameter(f"'start' property of region {region} is not an integer. All positions must be integers.") + if not isinstance(region['end'], int): + raise click.BadParameter(f"'end' property of region {region} is not an integer. All positions must be integers.") + + return seq_regions @click.command() -@click.option("--seq_id", required=True, help="The sequence ID to retrieve sequences for.") -@click.option("--seq_strand", required=True, help="The sequence strand to retrieve sequences for.") -@click.option("--genes", required=True, help="A list of gene maps to retrieve sequences for.") -def main(seq_id, seq_strand, genes): - """Main method for sequence retrieval from JBrowse fastaidx indexed fasta files.""" - click.echo(f"Received request to retrieve sequences for {seq_id} strand {seq_strand}!") +@click.option("--seq_id", type=click.STRING, required=True, + help="The sequence ID to retrieve sequences for.") +@click.option("--seq_strand", type=click.STRING, default='+', callback=validate_strand, + help="The sequence strand to retrieve sequences for.") +@click.option("--seq_regions", type=click.UNPROCESSED, required=True, callback=process_seq_regions, + help="A list of sequence regions to retrieve sequences for.") +def main(seq_id, seq_strand, seq_regions): + """Main method for sequence retrieval from JBrowse fastaidx indexed fasta files. + Returns a single (transcript) sequence made by concatenating all sequence regions requested + (in specified order).""" + click.echo(f"Received request to retrieve sequences for {seq_id}, strand {seq_strand}, seq_regions {seq_regions}!") if __name__ == '__main__': main() From 14090ac23d48fe10d7b2dd0cd5a1cc9b55e8ee25 Mon Sep 17 00:00:00 2001 From: Manuel Luypaert Date: Thu, 7 Mar 2024 14:08:47 +0000 Subject: [PATCH 03/32] Added code for basic (+ strand) seq retrieval --- pipeline/seq_retrieval/requirements.txt | 1 + pipeline/seq_retrieval/src/faidx.py | 19 +++++++++++++++++++ pipeline/seq_retrieval/src/main.py | 22 ++++++++++++++++++++-- 3 files changed, 40 insertions(+), 2 deletions(-) create mode 100644 pipeline/seq_retrieval/src/faidx.py diff --git a/pipeline/seq_retrieval/requirements.txt b/pipeline/seq_retrieval/requirements.txt index 5a1f42a8..9dc491f0 100644 --- a/pipeline/seq_retrieval/requirements.txt +++ b/pipeline/seq_retrieval/requirements.txt @@ -1 +1,2 @@ click==8.1.* +pysam==0.22.* diff --git a/pipeline/seq_retrieval/src/faidx.py b/pipeline/seq_retrieval/src/faidx.py new file mode 100644 index 00000000..f44e4aca --- /dev/null +++ b/pipeline/seq_retrieval/src/faidx.py @@ -0,0 +1,19 @@ +""" +Module containing all functions used to handle and access faidx files. +""" +import pysam + +def get_seq(fasta_file_path, seq_id, seq_start, seq_end, seq_strand): + """Return sequence found at `seq_id`:`seq_start`-`seq_end`:`seq_strand` + by reading from faidx files at `file_path`. Use 1-based, fully inclusive coordinates.""" + try: + fasta_file = pysam.FastaFile(fasta_file_path) + except ValueError: + raise FileNotFoundError(f"Missing index file matching path {fasta_file_path}.") + except IOError: + raise IOError(f"Error while reading fasta file or index matching path {fasta_file_path}.") + else: + seq = fasta_file.fetch(reference=seq_id, start=seq_start-1, end=seq_end) + + fasta_file.close() + return seq diff --git a/pipeline/seq_retrieval/src/main.py b/pipeline/seq_retrieval/src/main.py index b98edd08..f40dd174 100644 --- a/pipeline/seq_retrieval/src/main.py +++ b/pipeline/seq_retrieval/src/main.py @@ -1,6 +1,8 @@ import click import json +import faidx + def validate_strand(ctx, param, value): """Returns a normalised version of strings representing a strand. Negative strand is normalised to '-', positive strand to '+'. @@ -50,11 +52,27 @@ def process_seq_regions(ctx, param, value): help="The sequence strand to retrieve sequences for.") @click.option("--seq_regions", type=click.UNPROCESSED, required=True, callback=process_seq_regions, help="A list of sequence regions to retrieve sequences for.") -def main(seq_id, seq_strand, seq_regions): - """Main method for sequence retrieval from JBrowse fastaidx indexed fasta files. +@click.option("--fasta_file_path", type=click.STRING, required=True, + help="""Path to (local) faidx-indexed fasta file.\ + Assumes the fasta file is compressed, + and an index file is found at `.gzi`.""") +def main(seq_id, seq_strand, seq_regions, fasta_file_path): + """Main method for sequence retrieval from JBrowse faidx indexed fasta files. Returns a single (transcript) sequence made by concatenating all sequence regions requested (in specified order).""" click.echo(f"Received request to retrieve sequences for {seq_id}, strand {seq_strand}, seq_regions {seq_regions}!") + #Retrieve sequence for each region + click.echo(f"\nRegion seqs:") + for region in seq_regions: + seq = faidx.get_seq(seq_id=seq_id, seq_start=region['start'], seq_end=region['end'], seq_strand=seq_strand, + fasta_file_path=fasta_file_path) + click.echo(seq) + region['seq'] = seq + + #Concatenate all regions into single sequence + seq_concat = ''.join(map(lambda region:region['seq'], seq_regions)) + click.echo(f"\nSeq concat: {seq_concat}") + if __name__ == '__main__': main() From 29c4dff2ab3f66cb377ec6607a4a786304514f65 Mon Sep 17 00:00:00 2001 From: Manuel Luypaert Date: Thu, 7 Mar 2024 16:27:01 +0000 Subject: [PATCH 04/32] Refactored faidx to enable more broad seq functions import --- pipeline/seq_retrieval/src/main.py | 4 ++-- pipeline/seq_retrieval/src/{faidx.py => seq_fns.py} | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) rename pipeline/seq_retrieval/src/{faidx.py => seq_fns.py} (88%) diff --git a/pipeline/seq_retrieval/src/main.py b/pipeline/seq_retrieval/src/main.py index f40dd174..86b68e23 100644 --- a/pipeline/seq_retrieval/src/main.py +++ b/pipeline/seq_retrieval/src/main.py @@ -1,7 +1,7 @@ import click import json -import faidx +import seq_fns def validate_strand(ctx, param, value): """Returns a normalised version of strings representing a strand. @@ -65,7 +65,7 @@ def main(seq_id, seq_strand, seq_regions, fasta_file_path): #Retrieve sequence for each region click.echo(f"\nRegion seqs:") for region in seq_regions: - seq = faidx.get_seq(seq_id=seq_id, seq_start=region['start'], seq_end=region['end'], seq_strand=seq_strand, + seq = seq_fns.get_seq(seq_id=seq_id, seq_start=region['start'], seq_end=region['end'], seq_strand=seq_strand, fasta_file_path=fasta_file_path) click.echo(seq) region['seq'] = seq diff --git a/pipeline/seq_retrieval/src/faidx.py b/pipeline/seq_retrieval/src/seq_fns.py similarity index 88% rename from pipeline/seq_retrieval/src/faidx.py rename to pipeline/seq_retrieval/src/seq_fns.py index f44e4aca..a3b3d130 100644 --- a/pipeline/seq_retrieval/src/faidx.py +++ b/pipeline/seq_retrieval/src/seq_fns.py @@ -1,5 +1,5 @@ """ -Module containing all functions used to handle and access faidx files. +Module containing all functions used to handle and access sequences and sequence files. """ import pysam From 048b9f14679dff36f3d47d609c4bde70eff1e8d0 Mon Sep 17 00:00:00 2001 From: Manuel Luypaert Date: Thu, 7 Mar 2024 19:19:09 +0000 Subject: [PATCH 05/32] Added support for correct negative strand sequence retrieval (and chaining) --- pipeline/seq_retrieval/requirements.txt | 1 + pipeline/seq_retrieval/src/main.py | 13 ++++++++++-- pipeline/seq_retrieval/src/seq_fns.py | 27 +++++++++++++++++++++++-- 3 files changed, 37 insertions(+), 4 deletions(-) diff --git a/pipeline/seq_retrieval/requirements.txt b/pipeline/seq_retrieval/requirements.txt index 9dc491f0..50fce435 100644 --- a/pipeline/seq_retrieval/requirements.txt +++ b/pipeline/seq_retrieval/requirements.txt @@ -1,2 +1,3 @@ click==8.1.* +biopython==1.83 pysam==0.22.* diff --git a/pipeline/seq_retrieval/src/main.py b/pipeline/seq_retrieval/src/main.py index 86b68e23..61e9ca62 100644 --- a/pipeline/seq_retrieval/src/main.py +++ b/pipeline/seq_retrieval/src/main.py @@ -60,18 +60,27 @@ def main(seq_id, seq_strand, seq_regions, fasta_file_path): """Main method for sequence retrieval from JBrowse faidx indexed fasta files. Returns a single (transcript) sequence made by concatenating all sequence regions requested (in specified order).""" + click.echo(f"Received request to retrieve sequences for {seq_id}, strand {seq_strand}, seq_regions {seq_regions}!") - #Retrieve sequence for each region click.echo(f"\nRegion seqs:") for region in seq_regions: + #If seq_strand is -, ensure seq_start < seq_end (swap as required) + if seq_strand == '-' and region['end'] < region['start']: + seq_start = region['end'] + seq_end = region['start'] + + region['start'] = seq_start + region['end'] = seq_end + + #Retrieve sequence for region seq = seq_fns.get_seq(seq_id=seq_id, seq_start=region['start'], seq_end=region['end'], seq_strand=seq_strand, fasta_file_path=fasta_file_path) click.echo(seq) region['seq'] = seq #Concatenate all regions into single sequence - seq_concat = ''.join(map(lambda region:region['seq'], seq_regions)) + seq_concat = seq_fns.chain_seq_region_seqs(seq_regions, seq_strand) click.echo(f"\nSeq concat: {seq_concat}") if __name__ == '__main__': diff --git a/pipeline/seq_retrieval/src/seq_fns.py b/pipeline/seq_retrieval/src/seq_fns.py index a3b3d130..cd9a5aac 100644 --- a/pipeline/seq_retrieval/src/seq_fns.py +++ b/pipeline/seq_retrieval/src/seq_fns.py @@ -1,6 +1,7 @@ """ Module containing all functions used to handle and access sequences and sequence files. """ +from Bio import Seq #Bio.Seq biopython submodule import pysam def get_seq(fasta_file_path, seq_id, seq_start, seq_end, seq_strand): @@ -14,6 +15,28 @@ def get_seq(fasta_file_path, seq_id, seq_start, seq_end, seq_strand): raise IOError(f"Error while reading fasta file or index matching path {fasta_file_path}.") else: seq = fasta_file.fetch(reference=seq_id, start=seq_start-1, end=seq_end) - fasta_file.close() - return seq + + if seq_strand == '-': + seq = Seq.reverse_complement(seq) + + return seq + +def chain_seq_region_seqs(seq_regions: list, seq_strand: str): + """ + Chain multiple sequence region seq's together into one continuous sequence. + Regions are chained together in an order based on the 'start' position of each: + * Ascending order when positive strand + * Descending order when negative strand + """ + + sort_args = dict(key=lambda region: region['start']) + + if seq_strand == '-': + sort_args['reverse'] = True + + sorted_regions = seq_regions + seq_regions.sort(**sort_args) + chained_seq = ''.join(map(lambda region: region['seq'], sorted_regions)) + + return chained_seq From 26fc87817fcef3321141a1854c8c2270e57b35f2 Mon Sep 17 00:00:00 2001 From: Manuel Luypaert Date: Thu, 7 Mar 2024 19:20:24 +0000 Subject: [PATCH 06/32] Added additional python pysam and biopython package installation dependencies --- pipeline/seq_retrieval/Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pipeline/seq_retrieval/Dockerfile b/pipeline/seq_retrieval/Dockerfile index 949423a4..5043e735 100644 --- a/pipeline/seq_retrieval/Dockerfile +++ b/pipeline/seq_retrieval/Dockerfile @@ -2,6 +2,8 @@ FROM python:3.12-alpine WORKDIR /usr/src/app +RUN apk add --no-cache build-base zlib-dev bzip2-dev xz-dev + COPY requirements.txt ./ RUN pip install --no-cache-dir -r requirements.txt From 3965c482e69dc05ed923356161a0dacfb53195f5 Mon Sep 17 00:00:00 2001 From: Manuel Luypaert Date: Fri, 8 Mar 2024 14:23:11 +0000 Subject: [PATCH 07/32] Updated fasta_file option to support remote URLs --- pipeline/seq_retrieval/requirements.txt | 1 + .../src/data_mover/data_file_mover.py | 75 +++++++++++++++++++ pipeline/seq_retrieval/src/main.py | 27 +++++-- 3 files changed, 97 insertions(+), 6 deletions(-) create mode 100644 pipeline/seq_retrieval/src/data_mover/data_file_mover.py diff --git a/pipeline/seq_retrieval/requirements.txt b/pipeline/seq_retrieval/requirements.txt index 50fce435..30887157 100644 --- a/pipeline/seq_retrieval/requirements.txt +++ b/pipeline/seq_retrieval/requirements.txt @@ -1,3 +1,4 @@ click==8.1.* biopython==1.83 pysam==0.22.* +requests==2.31.* diff --git a/pipeline/seq_retrieval/src/data_mover/data_file_mover.py b/pipeline/seq_retrieval/src/data_mover/data_file_mover.py new file mode 100644 index 00000000..36bf885a --- /dev/null +++ b/pipeline/seq_retrieval/src/data_mover/data_file_mover.py @@ -0,0 +1,75 @@ +""" +Moving files to and from remote locations +""" +import os.path +from pathlib import Path +import requests +from urllib.parse import urlparse, unquote + +_stored_files = dict() +_DEFAULT_DIR = '/tmp/pavi/' + +def is_accessible_url(url: str): + """ + Returns True when provided `url` is an accessible URL + """ + response = requests.head(url) + if response.ok: + return True + else: + return False + +def fetch_file(url: str, dest_dir: str = _DEFAULT_DIR): + """ + Fetch file from URL, return its local path. + """ + local_path = None + if url not in _stored_files.keys(): + url_components = urlparse(url) + if url_components.scheme == 'file': + filepath = url_components.netloc + url_components.path + local_path = find_local_file(filepath) + else: + local_path = download_from_url(url, dest_dir) + _stored_files[url] = local_path + else: + local_path = _stored_files[url] + + return local_path + +def find_local_file(path: str): + """ + Find a file locally based on path and return its absolute path. + If no file was found at given path, throws Exception. + """ + if not os.path.exists(path): + raise FileNotFoundError(f"No file found at path '{path}'.") + else: + if not os.path.isfile(path): + raise FileNotFoundError(f"Specified path '{path}' exists but is not a file.") + else: + return Path(path).resolve() + +def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 * 1024): + url_components = urlparse(url) + if url_components.scheme in ['http', 'https']: + + if not is_accessible_url(url): + raise ValueError(f"URL {url} is not accessible.") + + Path(dest_dir).mkdir(parents=True, exist_ok=True) + + filename = unquote(os.path.basename(url_components.path)) + local_file_path = os.path.join(dest_dir, filename) + + #Download file through streaming to support large files + response = requests.get(url, stream=True) + + with open(local_file_path, mode="wb") as local_file: + for chunk in response.iter_content(chunk_size=chunk_size): + local_file.write(chunk) + + return Path(local_file_path).resolve() + else: + #Currently not supported + raise ValueError(f"URL with scheme '{url_components.scheme}' is currently not supported.") diff --git a/pipeline/seq_retrieval/src/main.py b/pipeline/seq_retrieval/src/main.py index 61e9ca62..211d24d2 100644 --- a/pipeline/seq_retrieval/src/main.py +++ b/pipeline/seq_retrieval/src/main.py @@ -1,6 +1,8 @@ import click import json +from os import path +import data_mover.data_file_mover as data_file_mover import seq_fns def validate_strand(ctx, param, value): @@ -52,17 +54,30 @@ def process_seq_regions(ctx, param, value): help="The sequence strand to retrieve sequences for.") @click.option("--seq_regions", type=click.UNPROCESSED, required=True, callback=process_seq_regions, help="A list of sequence regions to retrieve sequences for.") -@click.option("--fasta_file_path", type=click.STRING, required=True, - help="""Path to (local) faidx-indexed fasta file.\ - Assumes the fasta file is compressed, - and an index file is found at `.gzi`.""") -def main(seq_id, seq_strand, seq_regions, fasta_file_path): +@click.option("--fasta_file_url", type=click.STRING, required=True, + help="""URL to (faidx-indexed) fasta file to retrieve sequences from.\ + Assumes additional index files can be found at `.fai`, + and at `.gzi` if the fastafile is compressed. + Use "file://*" URL for local file or "http(s)://*" for remote files.""") +def main(seq_id, seq_strand, seq_regions, fasta_file_url: str): """Main method for sequence retrieval from JBrowse faidx indexed fasta files. Returns a single (transcript) sequence made by concatenating all sequence regions requested (in specified order).""" click.echo(f"Received request to retrieve sequences for {seq_id}, strand {seq_strand}, seq_regions {seq_regions}!") + # Fetch the file(s) + local_fasta_file_path = data_file_mover.fetch_file(fasta_file_url) + + # Fetch additional faidx index files in addition to fasta file itself + # (to the same location) + index_files = [fasta_file_url+'.fai'] + if fasta_file_url.endswith('.gz'): + index_files.append(fasta_file_url+'.gzi') + + for index_file in index_files: + data_file_mover.fetch_file(index_file) + click.echo(f"\nRegion seqs:") for region in seq_regions: #If seq_strand is -, ensure seq_start < seq_end (swap as required) @@ -75,7 +90,7 @@ def main(seq_id, seq_strand, seq_regions, fasta_file_path): #Retrieve sequence for region seq = seq_fns.get_seq(seq_id=seq_id, seq_start=region['start'], seq_end=region['end'], seq_strand=seq_strand, - fasta_file_path=fasta_file_path) + fasta_file_path=local_fasta_file_path) click.echo(seq) region['seq'] = seq From 0d2400b248d1f01bcb4f7ae7c49fc2637ff6c1cd Mon Sep 17 00:00:00 2001 From: Manuel Luypaert Date: Fri, 8 Mar 2024 14:25:27 +0000 Subject: [PATCH 08/32] Added Make targets for easy python dependency installation --- pipeline/seq_retrieval/Makefile | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pipeline/seq_retrieval/Makefile b/pipeline/seq_retrieval/Makefile index d8a0daec..63937f80 100644 --- a/pipeline/seq_retrieval/Makefile +++ b/pipeline/seq_retrieval/Makefile @@ -1,2 +1,19 @@ +.PHONY: check-venv-active + container-image: docker build -t agr_pavi/seq_retrieval . +python-dependencies: + pip install -r requirements.txt +python-dependencies-update: + pip install -U -r requirements.txt +check-venv-active: +ifeq ($(VIRTUAL_ENV),) + @echo 'No active python virtual environment found.'\ + 'Please active the virtual environment first by running `source venv/bin/activate`,'\ + 'or read README.md for instructions on how to set up a new one.' + @exit 1 +else + @: +endif +python-dependencies-dev: check-venv-active python-dependencies +python-dependencies-dev-update: check-venv-active python-dependencies-update From a759e4d5f767c70b5ce53c8d7cde88a4417ffe55 Mon Sep 17 00:00:00 2001 From: Manuel Luypaert Date: Fri, 8 Mar 2024 14:25:54 +0000 Subject: [PATCH 09/32] Sorted python requirements alphabetically --- pipeline/seq_retrieval/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline/seq_retrieval/requirements.txt b/pipeline/seq_retrieval/requirements.txt index 30887157..ae3751d8 100644 --- a/pipeline/seq_retrieval/requirements.txt +++ b/pipeline/seq_retrieval/requirements.txt @@ -1,4 +1,4 @@ -click==8.1.* biopython==1.83 +click==8.1.* pysam==0.22.* requests==2.31.* From 47d62a9cc6d001ad731293663218bd171cb165ae Mon Sep 17 00:00:00 2001 From: Manuel Luypaert Date: Fri, 8 Mar 2024 14:31:13 +0000 Subject: [PATCH 10/32] Minor docstring update --- pipeline/seq_retrieval/src/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline/seq_retrieval/src/main.py b/pipeline/seq_retrieval/src/main.py index 211d24d2..69f047b8 100644 --- a/pipeline/seq_retrieval/src/main.py +++ b/pipeline/seq_retrieval/src/main.py @@ -62,7 +62,7 @@ def process_seq_regions(ctx, param, value): def main(seq_id, seq_strand, seq_regions, fasta_file_url: str): """Main method for sequence retrieval from JBrowse faidx indexed fasta files. Returns a single (transcript) sequence made by concatenating all sequence regions requested - (in specified order).""" + (in positional order defined by specified seq_strand).""" click.echo(f"Received request to retrieve sequences for {seq_id}, strand {seq_strand}, seq_regions {seq_regions}!") From 50ced5c34ccd110c2e582683d87a23805573c9a3 Mon Sep 17 00:00:00 2001 From: Manuel Luypaert Date: Fri, 8 Mar 2024 15:32:18 +0000 Subject: [PATCH 11/32] Added option to reuse local cache --- .../seq_retrieval/src/data_mover/data_file_mover.py | 13 ++++++++++--- pipeline/seq_retrieval/src/main.py | 9 ++++++--- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/pipeline/seq_retrieval/src/data_mover/data_file_mover.py b/pipeline/seq_retrieval/src/data_mover/data_file_mover.py index 36bf885a..8568bf21 100644 --- a/pipeline/seq_retrieval/src/data_mover/data_file_mover.py +++ b/pipeline/seq_retrieval/src/data_mover/data_file_mover.py @@ -19,7 +19,7 @@ def is_accessible_url(url: str): else: return False -def fetch_file(url: str, dest_dir: str = _DEFAULT_DIR): +def fetch_file(url: str, dest_dir: str = _DEFAULT_DIR, reuse_local_cache: bool = False): """ Fetch file from URL, return its local path. """ @@ -30,7 +30,7 @@ def fetch_file(url: str, dest_dir: str = _DEFAULT_DIR): filepath = url_components.netloc + url_components.path local_path = find_local_file(filepath) else: - local_path = download_from_url(url, dest_dir) + local_path = download_from_url(url, dest_dir, reuse_local_cache=reuse_local_cache) _stored_files[url] = local_path else: local_path = _stored_files[url] @@ -50,7 +50,7 @@ def find_local_file(path: str): else: return Path(path).resolve() -def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 * 1024): +def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 * 1024, reuse_local_cache: bool = False): url_components = urlparse(url) if url_components.scheme in ['http', 'https']: @@ -62,6 +62,13 @@ def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 * filename = unquote(os.path.basename(url_components.path)) local_file_path = os.path.join(dest_dir, filename) + if os.path.exists(local_file_path) and os.path.isfile(local_file_path): + if reuse_local_cache == True: + #Return the local file path without downloading new content + return Path(local_file_path).resolve() + else: + os.remove(local_file_path) + #Download file through streaming to support large files response = requests.get(url, stream=True) diff --git a/pipeline/seq_retrieval/src/main.py b/pipeline/seq_retrieval/src/main.py index 69f047b8..8d8baa5b 100644 --- a/pipeline/seq_retrieval/src/main.py +++ b/pipeline/seq_retrieval/src/main.py @@ -59,7 +59,10 @@ def process_seq_regions(ctx, param, value): Assumes additional index files can be found at `.fai`, and at `.gzi` if the fastafile is compressed. Use "file://*" URL for local file or "http(s)://*" for remote files.""") -def main(seq_id, seq_strand, seq_regions, fasta_file_url: str): +@click.option("--reuse_local_cache", is_flag=True, + help="""When defined and using remote `fasta_file_url`, reused local files + if file already exists at destination path, rather than re-downloading and overwritting.""") +def main(seq_id, seq_strand, seq_regions, fasta_file_url: str, reuse_local_cache: bool): """Main method for sequence retrieval from JBrowse faidx indexed fasta files. Returns a single (transcript) sequence made by concatenating all sequence regions requested (in positional order defined by specified seq_strand).""" @@ -67,7 +70,7 @@ def main(seq_id, seq_strand, seq_regions, fasta_file_url: str): click.echo(f"Received request to retrieve sequences for {seq_id}, strand {seq_strand}, seq_regions {seq_regions}!") # Fetch the file(s) - local_fasta_file_path = data_file_mover.fetch_file(fasta_file_url) + local_fasta_file_path = data_file_mover.fetch_file(fasta_file_url, reuse_local_cache=reuse_local_cache) # Fetch additional faidx index files in addition to fasta file itself # (to the same location) @@ -76,7 +79,7 @@ def main(seq_id, seq_strand, seq_regions, fasta_file_url: str): index_files.append(fasta_file_url+'.gzi') for index_file in index_files: - data_file_mover.fetch_file(index_file) + data_file_mover.fetch_file(index_file, reuse_local_cache=reuse_local_cache) click.echo(f"\nRegion seqs:") for region in seq_regions: From 44dbe05d88e6e6b326ef7f11f9be1bff47506699 Mon Sep 17 00:00:00 2001 From: Manuel Luypaert Date: Fri, 8 Mar 2024 15:54:12 +0000 Subject: [PATCH 12/32] Prevent reads of partial streamed file after interrupts --- pipeline/seq_retrieval/src/data_mover/data_file_mover.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pipeline/seq_retrieval/src/data_mover/data_file_mover.py b/pipeline/seq_retrieval/src/data_mover/data_file_mover.py index 8568bf21..fda72d68 100644 --- a/pipeline/seq_retrieval/src/data_mover/data_file_mover.py +++ b/pipeline/seq_retrieval/src/data_mover/data_file_mover.py @@ -70,12 +70,15 @@ def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 * os.remove(local_file_path) #Download file through streaming to support large files + tmp_file_path = f"{local_file_path}.part" response = requests.get(url, stream=True) - with open(local_file_path, mode="wb") as local_file: + with open(tmp_file_path, mode="wb") as local_file: for chunk in response.iter_content(chunk_size=chunk_size): local_file.write(chunk) + os.rename(tmp_file_path, local_file_path) + return Path(local_file_path).resolve() else: #Currently not supported From 3228d457c1aaaa39e59dc62a9f1e33e3928782ce Mon Sep 17 00:00:00 2001 From: Manuel Luypaert Date: Mon, 11 Mar 2024 19:37:30 +0000 Subject: [PATCH 13/32] Refactor to improve code readability --- .../src/data_mover/data_file_mover.py | 16 +++- pipeline/seq_retrieval/src/main.py | 33 ++----- pipeline/seq_retrieval/src/seq_fns.py | 42 -------- .../seq_retrieval/src/seq_region/__init__.py | 2 + .../src/seq_region/seq_region.py | 95 +++++++++++++++++++ 5 files changed, 119 insertions(+), 69 deletions(-) delete mode 100644 pipeline/seq_retrieval/src/seq_fns.py create mode 100644 pipeline/seq_retrieval/src/seq_region/__init__.py create mode 100644 pipeline/seq_retrieval/src/seq_region/seq_region.py diff --git a/pipeline/seq_retrieval/src/data_mover/data_file_mover.py b/pipeline/seq_retrieval/src/data_mover/data_file_mover.py index fda72d68..f8463814 100644 --- a/pipeline/seq_retrieval/src/data_mover/data_file_mover.py +++ b/pipeline/seq_retrieval/src/data_mover/data_file_mover.py @@ -8,6 +8,14 @@ _stored_files = dict() _DEFAULT_DIR = '/tmp/pavi/' +_reuse_local_cache = False + +def set_local_cache_reuse(reuse: bool): + """ + Set _reuse_local_cache (True or False, default False) + """ + global _reuse_local_cache + _reuse_local_cache = reuse def is_accessible_url(url: str): """ @@ -19,10 +27,11 @@ def is_accessible_url(url: str): else: return False -def fetch_file(url: str, dest_dir: str = _DEFAULT_DIR, reuse_local_cache: bool = False): +def fetch_file(url: str, dest_dir: str = _DEFAULT_DIR, reuse_local_cache: bool = None): """ Fetch file from URL, return its local path. """ + global _stored_files local_path = None if url not in _stored_files.keys(): url_components = urlparse(url) @@ -50,7 +59,10 @@ def find_local_file(path: str): else: return Path(path).resolve() -def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 * 1024, reuse_local_cache: bool = False): +def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 * 1024, reuse_local_cache: bool = None): + if reuse_local_cache == None: + reuse_local_cache = _reuse_local_cache + url_components = urlparse(url) if url_components.scheme in ['http', 'https']: diff --git a/pipeline/seq_retrieval/src/main.py b/pipeline/seq_retrieval/src/main.py index 8d8baa5b..e59fa976 100644 --- a/pipeline/seq_retrieval/src/main.py +++ b/pipeline/seq_retrieval/src/main.py @@ -3,7 +3,7 @@ from os import path import data_mover.data_file_mover as data_file_mover -import seq_fns +from seq_region import SeqRegion, chain_seq_region_seqs def validate_strand(ctx, param, value): """Returns a normalised version of strings representing a strand. @@ -69,36 +69,19 @@ def main(seq_id, seq_strand, seq_regions, fasta_file_url: str, reuse_local_cache click.echo(f"Received request to retrieve sequences for {seq_id}, strand {seq_strand}, seq_regions {seq_regions}!") - # Fetch the file(s) - local_fasta_file_path = data_file_mover.fetch_file(fasta_file_url, reuse_local_cache=reuse_local_cache) + data_file_mover.set_local_cache_reuse(reuse_local_cache) - # Fetch additional faidx index files in addition to fasta file itself - # (to the same location) - index_files = [fasta_file_url+'.fai'] - if fasta_file_url.endswith('.gz'): - index_files.append(fasta_file_url+'.gzi') - - for index_file in index_files: - data_file_mover.fetch_file(index_file, reuse_local_cache=reuse_local_cache) - - click.echo(f"\nRegion seqs:") + seq_region_objs = [] for region in seq_regions: - #If seq_strand is -, ensure seq_start < seq_end (swap as required) - if seq_strand == '-' and region['end'] < region['start']: - seq_start = region['end'] - seq_end = region['start'] - - region['start'] = seq_start - region['end'] = seq_end + seq_region_objs.append(SeqRegion(seq_id=seq_id, start=region['start'], end=region['end'], strand=seq_strand, + fasta_file_url=fasta_file_url)) + for seq_region in seq_region_objs: #Retrieve sequence for region - seq = seq_fns.get_seq(seq_id=seq_id, seq_start=region['start'], seq_end=region['end'], seq_strand=seq_strand, - fasta_file_path=local_fasta_file_path) - click.echo(seq) - region['seq'] = seq + seq_region.fetch_seq() #Concatenate all regions into single sequence - seq_concat = seq_fns.chain_seq_region_seqs(seq_regions, seq_strand) + seq_concat = chain_seq_region_seqs(seq_region_objs, seq_strand) click.echo(f"\nSeq concat: {seq_concat}") if __name__ == '__main__': diff --git a/pipeline/seq_retrieval/src/seq_fns.py b/pipeline/seq_retrieval/src/seq_fns.py deleted file mode 100644 index cd9a5aac..00000000 --- a/pipeline/seq_retrieval/src/seq_fns.py +++ /dev/null @@ -1,42 +0,0 @@ -""" -Module containing all functions used to handle and access sequences and sequence files. -""" -from Bio import Seq #Bio.Seq biopython submodule -import pysam - -def get_seq(fasta_file_path, seq_id, seq_start, seq_end, seq_strand): - """Return sequence found at `seq_id`:`seq_start`-`seq_end`:`seq_strand` - by reading from faidx files at `file_path`. Use 1-based, fully inclusive coordinates.""" - try: - fasta_file = pysam.FastaFile(fasta_file_path) - except ValueError: - raise FileNotFoundError(f"Missing index file matching path {fasta_file_path}.") - except IOError: - raise IOError(f"Error while reading fasta file or index matching path {fasta_file_path}.") - else: - seq = fasta_file.fetch(reference=seq_id, start=seq_start-1, end=seq_end) - fasta_file.close() - - if seq_strand == '-': - seq = Seq.reverse_complement(seq) - - return seq - -def chain_seq_region_seqs(seq_regions: list, seq_strand: str): - """ - Chain multiple sequence region seq's together into one continuous sequence. - Regions are chained together in an order based on the 'start' position of each: - * Ascending order when positive strand - * Descending order when negative strand - """ - - sort_args = dict(key=lambda region: region['start']) - - if seq_strand == '-': - sort_args['reverse'] = True - - sorted_regions = seq_regions - seq_regions.sort(**sort_args) - chained_seq = ''.join(map(lambda region: region['seq'], sorted_regions)) - - return chained_seq diff --git a/pipeline/seq_retrieval/src/seq_region/__init__.py b/pipeline/seq_retrieval/src/seq_region/__init__.py new file mode 100644 index 00000000..75eecf7f --- /dev/null +++ b/pipeline/seq_retrieval/src/seq_region/__init__.py @@ -0,0 +1,2 @@ +from .seq_region import SeqRegion +from .seq_region import chain_seq_region_seqs \ No newline at end of file diff --git a/pipeline/seq_retrieval/src/seq_region/seq_region.py b/pipeline/seq_retrieval/src/seq_region/seq_region.py new file mode 100644 index 00000000..4486fd9d --- /dev/null +++ b/pipeline/seq_retrieval/src/seq_region/seq_region.py @@ -0,0 +1,95 @@ +""" +Module containing the SeqRegion class and all methods to handle them. +SeqRegion class is used to define and retrieve sequence regions. +""" +from Bio import Seq #Bio.Seq biopython submodule +import pysam + +from data_mover import data_file_mover + +class SeqRegion(): + seq_id = str + start = int + end = int + strand = str + fasta_file_path = str + sequence = str + + def __init__(self, seq_id: str, start: int, end: int, strand: str, fasta_file_url: str, seq: str = None): + """ + Uses 1-based, fully inclusive coordinates + """ + self.seq_id = seq_id + self.strand = strand + + #If strand is -, ensure start <= end (swap as required) + if strand == '-': + if end < start: + self.start = end + self.end = start + else: + self.start = start + self.end = end + #If strand is +, throw error when end < start (likely user error) + else: + if end < start: + raise ValueError(f"Unexpected position order: end {end} < start {start}.") + else: + self.start = start + self.end = end + + # Fetch the file(s) + local_fasta_file_path = data_file_mover.fetch_file(fasta_file_url) + + # Fetch additional faidx index files in addition to fasta file itself + # (to the same location) + index_files = [fasta_file_url+'.fai'] + if fasta_file_url.endswith('.gz'): + index_files.append(fasta_file_url+'.gzi') + + for index_file in index_files: + data_file_mover.fetch_file(index_file) + + self.fasta_file_path = local_fasta_file_path + + if seq != None: + self.sequence = seq + + + def fetch_seq(self): + """Fetch sequence found at `seq_id`:`start`-`end`:`strand` + by reading from faidx files at `fasta_file_path`. + Uses 1-based, fully inclusive coordinates.""" + try: + fasta_file = pysam.FastaFile(self.fasta_file_path) + except ValueError: + raise FileNotFoundError(f"Missing index file matching path {self.fasta_file_path}.") + except IOError: + raise IOError(f"Error while reading fasta file or index matching path {self.fasta_file_path}.") + else: + seq = fasta_file.fetch(reference=self.seq_id, start=self.start-1, end=self.end) + fasta_file.close() + + if self.strand == '-': + seq = Seq.reverse_complement(seq) + + self.sequence = seq + +def chain_seq_region_seqs(seq_regions: list, seq_strand: str): + """ + Chain multiple SeqRegions' sequenes together into one continuous sequence. + SeqRegions are chained together in an order based on the 'start' position of each: + * Ascending order when positive strand + * Descending order when negative strand + """ + + sort_args = dict(key=lambda region: region.start) + + if seq_strand == '-': + sort_args['reverse'] = True + + sorted_regions = seq_regions + seq_regions.sort(**sort_args) + chained_seq = ''.join(map(lambda region : region.sequence, sorted_regions)) + + return chained_seq From bde10e2bd7880abfced5c725c08a38fc7ab78daa Mon Sep 17 00:00:00 2001 From: Manuel Luypaert Date: Mon, 11 Mar 2024 19:41:07 +0000 Subject: [PATCH 14/32] Minor refactor to improve code readability --- pipeline/seq_retrieval/src/main.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pipeline/seq_retrieval/src/main.py b/pipeline/seq_retrieval/src/main.py index e59fa976..e7f867ac 100644 --- a/pipeline/seq_retrieval/src/main.py +++ b/pipeline/seq_retrieval/src/main.py @@ -5,7 +5,7 @@ import data_mover.data_file_mover as data_file_mover from seq_region import SeqRegion, chain_seq_region_seqs -def validate_strand(ctx, param, value): +def validate_strand_param(ctx, param, value): """Returns a normalised version of strings representing a strand. Negative strand is normalised to '-', positive strand to '+'. Throws a click.BadParameter exception if an unrecognised string was provided.""" @@ -18,7 +18,7 @@ def validate_strand(ctx, param, value): else: raise click.BadParameter(f"Must be one of {POS_CHOICES} for positive strand, or {NEG_CHOICES} for negative strand.") -def process_seq_regions(ctx, param, value): +def process_seq_regions_param(ctx, param, value): """Parse the seq_regions parameter value and validate it's structure. Value is expected to be a JSON-formatted list of sequence regions to retrieve. Each region should have: @@ -50,9 +50,9 @@ def process_seq_regions(ctx, param, value): @click.command() @click.option("--seq_id", type=click.STRING, required=True, help="The sequence ID to retrieve sequences for.") -@click.option("--seq_strand", type=click.STRING, default='+', callback=validate_strand, +@click.option("--seq_strand", type=click.STRING, default='+', callback=validate_strand_param, help="The sequence strand to retrieve sequences for.") -@click.option("--seq_regions", type=click.UNPROCESSED, required=True, callback=process_seq_regions, +@click.option("--seq_regions", type=click.UNPROCESSED, required=True, callback=process_seq_regions_param, help="A list of sequence regions to retrieve sequences for.") @click.option("--fasta_file_url", type=click.STRING, required=True, help="""URL to (faidx-indexed) fasta file to retrieve sequences from.\ From 78d4c639396a3e811492bbb01503def8430bf4a2 Mon Sep 17 00:00:00 2001 From: Manuel Luypaert Date: Tue, 12 Mar 2024 09:41:35 +0000 Subject: [PATCH 15/32] Unused import cleanup --- pipeline/seq_retrieval/src/main.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pipeline/seq_retrieval/src/main.py b/pipeline/seq_retrieval/src/main.py index e7f867ac..00dcf118 100644 --- a/pipeline/seq_retrieval/src/main.py +++ b/pipeline/seq_retrieval/src/main.py @@ -1,6 +1,5 @@ import click import json -from os import path import data_mover.data_file_mover as data_file_mover from seq_region import SeqRegion, chain_seq_region_seqs From fee42df8d13e4f004a34baf97a99c4d3d549057e Mon Sep 17 00:00:00 2001 From: Manuel Luypaert Date: Tue, 12 Mar 2024 13:43:44 +0000 Subject: [PATCH 16/32] Added framework for PR validation through GH actions --- .github/workflows/PR-validation.yml | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 .github/workflows/PR-validation.yml diff --git a/.github/workflows/PR-validation.yml b/.github/workflows/PR-validation.yml new file mode 100644 index 00000000..9d1f59fb --- /dev/null +++ b/.github/workflows/PR-validation.yml @@ -0,0 +1,27 @@ +name: PR validation +on: + pull_request: + types: [synchronize, opened, reopened, edited] + branches: + - main +jobs: + pipeline-seq-retrieval-container-image-build: + name: pipeline/seq_retrieval container-image build + runs-on: ubuntu-20.04 + defaults: + run: + shell: bash + working-directory: ./pipeline/seq_retrieval/ + steps: + - name: Check out repository code + uses: actions/checkout@v3 + with: + fetch-depth: 0 + sparse-checkout: | + pipeline/seq_retrieval/ + - name: Build container image + run: | + make container-image + #TODO: add typing testing (mypy) + #TODO: add unit testing + #TODO: add integration testing From b6286581858cb055393e58a333c16fa637faccb3 Mon Sep 17 00:00:00 2001 From: Manuel Luypaert Date: Tue, 12 Mar 2024 14:15:15 +0000 Subject: [PATCH 17/32] Added python type testing + included in PR validation --- .github/workflows/PR-validation.yml | 21 ++++++++++++++++++- pipeline/seq_retrieval/Makefile | 13 +++++++++++- .../test/type-testing-requirements.txt | 1 + 3 files changed, 33 insertions(+), 2 deletions(-) create mode 100644 pipeline/seq_retrieval/test/type-testing-requirements.txt diff --git a/.github/workflows/PR-validation.yml b/.github/workflows/PR-validation.yml index 9d1f59fb..ebc1f3b0 100644 --- a/.github/workflows/PR-validation.yml +++ b/.github/workflows/PR-validation.yml @@ -22,6 +22,25 @@ jobs: - name: Build container image run: | make container-image - #TODO: add typing testing (mypy) + pipeline-seq-retrieval-check-python-typing: + name: pipeline/seq_retrieval check python typing + runs-on: ubuntu-20.04 + defaults: + run: + shell: bash + working-directory: ./pipeline/seq_retrieval/ + steps: + - name: Check out repository code + uses: actions/checkout@v3 + with: + fetch-depth: 0 + sparse-checkout: | + pipeline/seq_retrieval/ + - uses: actions/setup-python@v5 + with: + python-version: '3.12' + - name: Check python typing + run: | + make check-python-typing #TODO: add unit testing #TODO: add integration testing diff --git a/pipeline/seq_retrieval/Makefile b/pipeline/seq_retrieval/Makefile index 63937f80..8a5fc4fc 100644 --- a/pipeline/seq_retrieval/Makefile +++ b/pipeline/seq_retrieval/Makefile @@ -1,11 +1,18 @@ -.PHONY: check-venv-active +.PHONY: check-venv-active check-python-typing container-image: docker build -t agr_pavi/seq_retrieval . python-dependencies: pip install -r requirements.txt + python-dependencies-update: pip install -U -r requirements.txt + +check-python-typing: + pip install -r test/type-testing-requirements.txt + mypy --install-types --non-interactive + mypy src/main.py + check-venv-active: ifeq ($(VIRTUAL_ENV),) @echo 'No active python virtual environment found.'\ @@ -15,5 +22,9 @@ ifeq ($(VIRTUAL_ENV),) else @: endif + python-dependencies-dev: check-venv-active python-dependencies + python-dependencies-dev-update: check-venv-active python-dependencies-update + +check-python-typing-dev: check-venv-active check-python-typing diff --git a/pipeline/seq_retrieval/test/type-testing-requirements.txt b/pipeline/seq_retrieval/test/type-testing-requirements.txt new file mode 100644 index 00000000..fcea0984 --- /dev/null +++ b/pipeline/seq_retrieval/test/type-testing-requirements.txt @@ -0,0 +1 @@ +mypy==1.9.* \ No newline at end of file From d02d0ab48bed2c2cbafb3d617bdc0748a3c71535 Mon Sep 17 00:00:00 2001 From: Manuel Luypaert Date: Tue, 12 Mar 2024 14:21:52 +0000 Subject: [PATCH 18/32] Python type checking bugfix --- pipeline/seq_retrieval/Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pipeline/seq_retrieval/Makefile b/pipeline/seq_retrieval/Makefile index 8a5fc4fc..03927c5b 100644 --- a/pipeline/seq_retrieval/Makefile +++ b/pipeline/seq_retrieval/Makefile @@ -10,8 +10,7 @@ python-dependencies-update: check-python-typing: pip install -r test/type-testing-requirements.txt - mypy --install-types --non-interactive - mypy src/main.py + mypy --install-types --non-interactive src/main.py check-venv-active: ifeq ($(VIRTUAL_ENV),) From 51fa3aacb5243d12b7a5e4ff3a828b58c96a7613 Mon Sep 17 00:00:00 2001 From: Manuel Luypaert Date: Tue, 12 Mar 2024 14:26:29 +0000 Subject: [PATCH 19/32] Added main app dependencies as dependency for type testing --- pipeline/seq_retrieval/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline/seq_retrieval/Makefile b/pipeline/seq_retrieval/Makefile index 03927c5b..d371eefd 100644 --- a/pipeline/seq_retrieval/Makefile +++ b/pipeline/seq_retrieval/Makefile @@ -8,7 +8,7 @@ python-dependencies: python-dependencies-update: pip install -U -r requirements.txt -check-python-typing: +check-python-typing: python-dependencies pip install -r test/type-testing-requirements.txt mypy --install-types --non-interactive src/main.py From 4fae5a7b36cbc4ec83061fc167e280be58c683a1 Mon Sep 17 00:00:00 2001 From: Manuel Luypaert Date: Tue, 12 Mar 2024 15:06:48 +0000 Subject: [PATCH 20/32] Fixed typing errors reported by mypy --- .../src/data_mover/data_file_mover.py | 7 +++--- .../src/seq_region/seq_region.py | 22 ++++++++++--------- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/pipeline/seq_retrieval/src/data_mover/data_file_mover.py b/pipeline/seq_retrieval/src/data_mover/data_file_mover.py index f8463814..33bcd98c 100644 --- a/pipeline/seq_retrieval/src/data_mover/data_file_mover.py +++ b/pipeline/seq_retrieval/src/data_mover/data_file_mover.py @@ -3,10 +3,11 @@ """ import os.path from pathlib import Path +import typing import requests from urllib.parse import urlparse, unquote -_stored_files = dict() +_stored_files: typing.Dict[str, str] = dict() _DEFAULT_DIR = '/tmp/pavi/' _reuse_local_cache = False @@ -27,7 +28,7 @@ def is_accessible_url(url: str): else: return False -def fetch_file(url: str, dest_dir: str = _DEFAULT_DIR, reuse_local_cache: bool = None): +def fetch_file(url: str, dest_dir: str = _DEFAULT_DIR, reuse_local_cache: typing.Optional[bool] = None) -> str: """ Fetch file from URL, return its local path. """ @@ -59,7 +60,7 @@ def find_local_file(path: str): else: return Path(path).resolve() -def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 * 1024, reuse_local_cache: bool = None): +def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 * 1024, reuse_local_cache: typing.Optional[bool] = None): if reuse_local_cache == None: reuse_local_cache = _reuse_local_cache diff --git a/pipeline/seq_retrieval/src/seq_region/seq_region.py b/pipeline/seq_retrieval/src/seq_region/seq_region.py index 4486fd9d..37fccb1d 100644 --- a/pipeline/seq_retrieval/src/seq_region/seq_region.py +++ b/pipeline/seq_retrieval/src/seq_region/seq_region.py @@ -2,20 +2,22 @@ Module containing the SeqRegion class and all methods to handle them. SeqRegion class is used to define and retrieve sequence regions. """ +import typing + from Bio import Seq #Bio.Seq biopython submodule import pysam from data_mover import data_file_mover class SeqRegion(): - seq_id = str - start = int - end = int - strand = str - fasta_file_path = str - sequence = str - - def __init__(self, seq_id: str, start: int, end: int, strand: str, fasta_file_url: str, seq: str = None): + seq_id: str + start: int + end: int + strand: str + fasta_file_path: str + sequence: typing.Optional[str] + + def __init__(self, seq_id: str, start: int, end: int, strand: str, fasta_file_url: str, seq: typing.Optional[str] = None): """ Uses 1-based, fully inclusive coordinates """ @@ -56,7 +58,7 @@ def __init__(self, seq_id: str, start: int, end: int, strand: str, fasta_file_ur self.sequence = seq - def fetch_seq(self): + def fetch_seq(self) -> None: """Fetch sequence found at `seq_id`:`start`-`end`:`strand` by reading from faidx files at `fasta_file_path`. Uses 1-based, fully inclusive coordinates.""" @@ -83,7 +85,7 @@ def chain_seq_region_seqs(seq_regions: list, seq_strand: str): * Descending order when negative strand """ - sort_args = dict(key=lambda region: region.start) + sort_args: typing.Dict[str, typing.Any] = dict(key=lambda region: region.start, reverse=False) if seq_strand == '-': sort_args['reverse'] = True From 4e25e6f9b1872d8435da742cc557b29672d16b92 Mon Sep 17 00:00:00 2001 From: Manuel Luypaert Date: Tue, 12 Mar 2024 15:13:00 +0000 Subject: [PATCH 21/32] Updated GH actions runner to current Ubuntu LTS (22.04) --- .github/workflows/PR-validation.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/PR-validation.yml b/.github/workflows/PR-validation.yml index ebc1f3b0..5f1406d8 100644 --- a/.github/workflows/PR-validation.yml +++ b/.github/workflows/PR-validation.yml @@ -7,7 +7,7 @@ on: jobs: pipeline-seq-retrieval-container-image-build: name: pipeline/seq_retrieval container-image build - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 defaults: run: shell: bash @@ -24,7 +24,7 @@ jobs: make container-image pipeline-seq-retrieval-check-python-typing: name: pipeline/seq_retrieval check python typing - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 defaults: run: shell: bash From b5e4a988c03c562ef360ddcb4e3e5f753411a613 Mon Sep 17 00:00:00 2001 From: Manuel Luypaert Date: Tue, 12 Mar 2024 15:13:41 +0000 Subject: [PATCH 22/32] GH actions checkout version bump (to deprecate node.js v16 usage) --- .github/workflows/PR-validation.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/PR-validation.yml b/.github/workflows/PR-validation.yml index 5f1406d8..47fa9ce3 100644 --- a/.github/workflows/PR-validation.yml +++ b/.github/workflows/PR-validation.yml @@ -14,7 +14,7 @@ jobs: working-directory: ./pipeline/seq_retrieval/ steps: - name: Check out repository code - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 sparse-checkout: | @@ -31,7 +31,7 @@ jobs: working-directory: ./pipeline/seq_retrieval/ steps: - name: Check out repository code - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 sparse-checkout: | From 2ac053730ab2c5f64d66e82de78d8f2ddcdfdf21 Mon Sep 17 00:00:00 2001 From: Manuel Luypaert Date: Tue, 12 Mar 2024 15:45:03 +0000 Subject: [PATCH 23/32] Added vscode configs to gitignore --- .gitignore | 64 +++++++++++++++++++++++++++++------------------------- 1 file changed, 34 insertions(+), 30 deletions(-) diff --git a/.gitignore b/.gitignore index 68bc17f9..af90381a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,12 +1,13 @@ -# Byte-compiled / optimized / DLL files +# Python files +## Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class -# C extensions +## C extensions *.so -# Distribution / packaging +## Distribution / packaging .Python build/ develop-eggs/ @@ -26,17 +27,17 @@ share/python-wheels/ *.egg MANIFEST -# PyInstaller +## PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec -# Installer logs +## package installer logs pip-log.txt pip-delete-this-directory.txt -# Unit test / coverage reports +## Unit test / coverage reports htmlcov/ .tox/ .nox/ @@ -51,57 +52,57 @@ coverage.xml .pytest_cache/ cover/ -# Translations +## Translations *.mo *.pot -# Django stuff: +## Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal -# Flask stuff: +## Flask stuff: instance/ .webassets-cache -# Scrapy stuff: +## Scrapy stuff: .scrapy -# Sphinx documentation +## Sphinx documentation docs/_build/ -# PyBuilder +## PyBuilder .pybuilder/ target/ -# Jupyter Notebook +## Jupyter Notebook .ipynb_checkpoints -# IPython +## IPython profile_default/ ipython_config.py -# pyenv +## pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: # .python-version -# pipenv +## pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock -# poetry +## poetry # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control #poetry.lock -# pdm +## pdm # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. #pdm.lock # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it @@ -109,17 +110,17 @@ ipython_config.py # https://pdm.fming.dev/#use-with-ide .pdm.toml -# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +## PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm __pypackages__/ -# Celery stuff +## Celery stuff celerybeat-schedule celerybeat.pid -# SageMath parsed files +## SageMath parsed files *.sage.py -# Environments +## Environments .env .venv env/ @@ -128,33 +129,36 @@ ENV/ env.bak/ venv.bak/ -# Spyder project settings +## Spyder project settings .spyderproject .spyproject -# Rope project settings +## Rope project settings .ropeproject -# mkdocs documentation +## mkdocs documentation /site -# mypy +## mypy .mypy_cache/ .dmypy.json dmypy.json -# Pyre type checker +## Pyre type checker .pyre/ -# pytype static type analyzer +## pytype static type analyzer .pytype/ -# Cython debug symbols +## Cython debug symbols cython_debug/ -# PyCharm +## PyCharm # JetBrains specific template is maintained in a separate JetBrains.gitignore that can # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ + +# VS Code +.vscode/ From 2acb5f992886e6e70dd6d1df09fb131a46998bf8 Mon Sep 17 00:00:00 2001 From: Manuel Luypaert Date: Tue, 12 Mar 2024 16:28:05 +0000 Subject: [PATCH 24/32] Added output typing for all methods and functions --- .../src/data_mover/data_file_mover.py | 14 +++++++------- .../seq_retrieval/src/seq_region/seq_region.py | 8 ++++++-- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/pipeline/seq_retrieval/src/data_mover/data_file_mover.py b/pipeline/seq_retrieval/src/data_mover/data_file_mover.py index 33bcd98c..c507ac6e 100644 --- a/pipeline/seq_retrieval/src/data_mover/data_file_mover.py +++ b/pipeline/seq_retrieval/src/data_mover/data_file_mover.py @@ -11,14 +11,14 @@ _DEFAULT_DIR = '/tmp/pavi/' _reuse_local_cache = False -def set_local_cache_reuse(reuse: bool): +def set_local_cache_reuse(reuse: bool) -> None: """ Set _reuse_local_cache (True or False, default False) """ global _reuse_local_cache _reuse_local_cache = reuse -def is_accessible_url(url: str): +def is_accessible_url(url: str) -> bool: """ Returns True when provided `url` is an accessible URL """ @@ -47,7 +47,7 @@ def fetch_file(url: str, dest_dir: str = _DEFAULT_DIR, reuse_local_cache: typing return local_path -def find_local_file(path: str): +def find_local_file(path: str) -> str: """ Find a file locally based on path and return its absolute path. If no file was found at given path, throws Exception. @@ -58,9 +58,9 @@ def find_local_file(path: str): if not os.path.isfile(path): raise FileNotFoundError(f"Specified path '{path}' exists but is not a file.") else: - return Path(path).resolve() + return str(Path(path).resolve()) -def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 * 1024, reuse_local_cache: typing.Optional[bool] = None): +def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 * 1024, reuse_local_cache: typing.Optional[bool] = None) -> str: if reuse_local_cache == None: reuse_local_cache = _reuse_local_cache @@ -78,7 +78,7 @@ def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 * if os.path.exists(local_file_path) and os.path.isfile(local_file_path): if reuse_local_cache == True: #Return the local file path without downloading new content - return Path(local_file_path).resolve() + return str(Path(local_file_path).resolve()) else: os.remove(local_file_path) @@ -92,7 +92,7 @@ def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 * os.rename(tmp_file_path, local_file_path) - return Path(local_file_path).resolve() + return str(Path(local_file_path).resolve()) else: #Currently not supported raise ValueError(f"URL with scheme '{url_components.scheme}' is currently not supported.") diff --git a/pipeline/seq_retrieval/src/seq_region/seq_region.py b/pipeline/seq_retrieval/src/seq_region/seq_region.py index 37fccb1d..701c9cf7 100644 --- a/pipeline/seq_retrieval/src/seq_region/seq_region.py +++ b/pipeline/seq_retrieval/src/seq_region/seq_region.py @@ -77,7 +77,11 @@ def fetch_seq(self) -> None: self.sequence = seq -def chain_seq_region_seqs(seq_regions: list, seq_strand: str): + def get_sequence(self) -> str: + """Return SeqRegion's sequence as a string (empty string if `None`).""" + return str(self.sequence) + +def chain_seq_region_seqs(seq_regions: typing.List[SeqRegion], seq_strand: str) -> str: """ Chain multiple SeqRegions' sequenes together into one continuous sequence. SeqRegions are chained together in an order based on the 'start' position of each: @@ -92,6 +96,6 @@ def chain_seq_region_seqs(seq_regions: list, seq_strand: str): sorted_regions = seq_regions seq_regions.sort(**sort_args) - chained_seq = ''.join(map(lambda region : region.sequence, sorted_regions)) + chained_seq = ''.join(map(lambda region : region.get_sequence(), sorted_regions)) return chained_seq From 4b068192234c46f81d64924cfd9bf2da512ee102 Mon Sep 17 00:00:00 2001 From: Manuel Luypaert Date: Tue, 12 Mar 2024 16:29:41 +0000 Subject: [PATCH 25/32] Bugfix for seqRegion sorting in chain_seq_region_seqs() function --- pipeline/seq_retrieval/src/seq_region/seq_region.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline/seq_retrieval/src/seq_region/seq_region.py b/pipeline/seq_retrieval/src/seq_region/seq_region.py index 701c9cf7..06c5e009 100644 --- a/pipeline/seq_retrieval/src/seq_region/seq_region.py +++ b/pipeline/seq_retrieval/src/seq_region/seq_region.py @@ -95,7 +95,7 @@ def chain_seq_region_seqs(seq_regions: typing.List[SeqRegion], seq_strand: str) sort_args['reverse'] = True sorted_regions = seq_regions - seq_regions.sort(**sort_args) + sorted_regions.sort(**sort_args) chained_seq = ''.join(map(lambda region : region.get_sequence(), sorted_regions)) return chained_seq From 84a03dfb45c1cac4e72f1985729f124ec0cfaf29 Mon Sep 17 00:00:00 2001 From: Manuel Luypaert Date: Tue, 12 Mar 2024 16:56:59 +0000 Subject: [PATCH 26/32] Additional main functions arguments and output typing --- pipeline/seq_retrieval/src/main.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pipeline/seq_retrieval/src/main.py b/pipeline/seq_retrieval/src/main.py index 00dcf118..1f544945 100644 --- a/pipeline/seq_retrieval/src/main.py +++ b/pipeline/seq_retrieval/src/main.py @@ -1,10 +1,11 @@ import click +import typing import json import data_mover.data_file_mover as data_file_mover from seq_region import SeqRegion, chain_seq_region_seqs -def validate_strand_param(ctx, param, value): +def validate_strand_param(ctx, param, value) -> typing.Literal['+', '-']: """Returns a normalised version of strings representing a strand. Negative strand is normalised to '-', positive strand to '+'. Throws a click.BadParameter exception if an unrecognised string was provided.""" @@ -17,7 +18,7 @@ def validate_strand_param(ctx, param, value): else: raise click.BadParameter(f"Must be one of {POS_CHOICES} for positive strand, or {NEG_CHOICES} for negative strand.") -def process_seq_regions_param(ctx, param, value): +def process_seq_regions_param(ctx, param, value) -> typing.List[typing.Dict[str,typing.Any]]: """Parse the seq_regions parameter value and validate it's structure. Value is expected to be a JSON-formatted list of sequence regions to retrieve. Each region should have: @@ -61,7 +62,7 @@ def process_seq_regions_param(ctx, param, value): @click.option("--reuse_local_cache", is_flag=True, help="""When defined and using remote `fasta_file_url`, reused local files if file already exists at destination path, rather than re-downloading and overwritting.""") -def main(seq_id, seq_strand, seq_regions, fasta_file_url: str, reuse_local_cache: bool): +def main(seq_id: str, seq_strand: str, seq_regions: typing.List, fasta_file_url: str, reuse_local_cache: bool): """Main method for sequence retrieval from JBrowse faidx indexed fasta files. Returns a single (transcript) sequence made by concatenating all sequence regions requested (in positional order defined by specified seq_strand).""" From 7cbd950728ad4647ef69379df4642c3a2b85d070 Mon Sep 17 00:00:00 2001 From: Manuel Luypaert Date: Wed, 13 Mar 2024 11:15:54 +0000 Subject: [PATCH 27/32] Added typing guidelines to README --- pipeline/seq_retrieval/README.md | 87 ++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/pipeline/seq_retrieval/README.md b/pipeline/seq_retrieval/README.md index 242eeb1e..6f4ad2c1 100644 --- a/pipeline/seq_retrieval/README.md +++ b/pipeline/seq_retrieval/README.md @@ -2,6 +2,7 @@ This subdirectory contains all code and configs for the PAVI sequence retrieval component. ## Development +### Local dev environment In order to enable isolated local development that does not interfere with the global system python setup, a virtual environment is used to do code development and testing for this component. @@ -30,3 +31,89 @@ Once ending development for this component, deactivate the virtual environment w ```bash deactivate ``` + +### Code guidelines +#### Input/output typing and checking +**TL;DR** +Type hints are required wherever possible and `mypy` is used to enforce this on PR validation. +To check all code part of the seq_retrieval module in this folder (on local development), run: +```shell +make check-python-typing-dev +``` + +**Detailed explanation** +Altought the Python interpreter always uses dynamic typing, it is recommended to use type hints wherever possible, +to ensure all functions and methods always receive the expected type as to not get caught by unexpected errors. + +As an example, the following untyped python function will define local cache usage behavior taking a boolean as toggle: +```python +def set_local_cache_reuse(reuse): + """ + Define whether or not data_file_mover will reuse files in local cache where already available pre-execution. + + Args: + reuse (bool): set to `True` to enable local cache reuse behavior (default `False`) + """ + global _reuse_local_cache + _reuse_local_cache = reuse + if reuse: + print("Local cache reuse enabled.") + else: + print("Local cache reuse disabled.") +``` + +When called with boolean values, this function works just fine: +```python +>>> set_local_cache_reuse(True) +Local cache reuse enabled. +>>> set_local_cache_reuse(False) +Local cache reuse disabled. +``` +However when calling with a String instead of a boolean, you may get unexpected behaviour: +```python +>>> set_local_cache_reuse("False") +Local cache reuse enabled. +``` +This happens because Python dynamically types and converts types at runtime, +and all strings except empty ones are converted to boolean value `True`. + +To prevent this, add type hints to your code: +```python +def set_local_cache_reuse(reuse: bool) -> None: + """ + Define whether or not data_file_mover will reuse files in local cache where already available pre-execution. + + Args: + reuse (bool): set to `True` to enable local cache reuse behavior (default `False`) + """ + global _reuse_local_cache + _reuse_local_cache = reuse + if reuse: + print("Local cache reuse enabled.") + else: + print("Local cache reuse disabled.") +set_local_cache_reuse("False") +``` + +Type hints themselves are not enforced at runtime, and will thus not stop the code from running (incorrectly), +but using `myPy` those errors can be revealed before merging this code. Storing the above code snippet in a file +called `set_local_cache_reuse.py` and running `myPy` on it gives the following result: +```shell +> mypy set_local_cache_reuse.py +set_local_cache_reuse.py:9: error: Name "_reuse_local_cache" is not defined [name-defined] +set_local_cache_reuse.py:14: error: Argument 1 to "set_local_cache_reuse" has incompatible type "str"; expected "bool" [arg-type] +Found 2 errors in 1 file (checked 1 source file) +``` +With the myPy output, we can now return to the code and fix the errors reported +which would otherwise result in silent unexpected behavior and bugs. + +To run `mypy` on the seq_retrieval module in this folder in your local development environment, +run the following shell command: +```shell +make check-python-typing-dev +``` + +`mypy` checks are automatically run and enforced as part of the PR validation +and all reported errors must be fixed to enable merging each PR into `main`. +If the `pipeline/seq_retrieval check python typing` status check fails on a PR in github, +click the details link and inspect the failing step output for hints on what to fix. From c0fee617f24e2559a89a10e3f53c4f418aa66b07 Mon Sep 17 00:00:00 2001 From: Manuel Luypaert Date: Wed, 13 Mar 2024 18:39:05 +0000 Subject: [PATCH 28/32] Comply with google-style docstring documentation This enables relevant IDE tooltip hints in VS Code (and presumably other IDEs) and thus more efficient coding. --- pipeline/seq_retrieval/README.md | 5 ++ .../src/data_mover/data_file_mover.py | 74 +++++++++++++++++-- pipeline/seq_retrieval/src/main.py | 42 ++++++++--- .../src/seq_region/seq_region.py | 57 +++++++++++--- 4 files changed, 154 insertions(+), 24 deletions(-) diff --git a/pipeline/seq_retrieval/README.md b/pipeline/seq_retrieval/README.md index 6f4ad2c1..add19607 100644 --- a/pipeline/seq_retrieval/README.md +++ b/pipeline/seq_retrieval/README.md @@ -117,3 +117,8 @@ make check-python-typing-dev and all reported errors must be fixed to enable merging each PR into `main`. If the `pipeline/seq_retrieval check python typing` status check fails on a PR in github, click the details link and inspect the failing step output for hints on what to fix. + +#### Code documentation (docstrings) +All modules, functions, classes and methods should have their input, attributes and output documented +through docstrings to make the code easy to read and understand for anyone reading it. +To ensure this is done in a uniform way accross all code, follow the [Google Python Style Guide on docstrings](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings). diff --git a/pipeline/seq_retrieval/src/data_mover/data_file_mover.py b/pipeline/seq_retrieval/src/data_mover/data_file_mover.py index c507ac6e..713a1ba4 100644 --- a/pipeline/seq_retrieval/src/data_mover/data_file_mover.py +++ b/pipeline/seq_retrieval/src/data_mover/data_file_mover.py @@ -1,5 +1,5 @@ """ -Moving files to and from remote locations +Module used to find, access and copy files at/to/from remote locations """ import os.path from pathlib import Path @@ -8,19 +8,43 @@ from urllib.parse import urlparse, unquote _stored_files: typing.Dict[str, str] = dict() +"""Module level memory cache of filepaths for all files accessed through this module.""" + _DEFAULT_DIR = '/tmp/pavi/' +"""Module level default destination directory to search/download remote files in/to.""" + _reuse_local_cache = False +""" +Module level toggle defining local file cache reuse behaviour. + * When True, reuse identically named files existing pre-runtime at local destination location for remote files + * When False, delete identically named files existing pre-runtime at local destination location for remote files before downloading. + +Change the value through the `set_local_cache_reuse` function. +""" def set_local_cache_reuse(reuse: bool) -> None: """ - Set _reuse_local_cache (True or False, default False) + Define data_file_mover module-level behaviour on local file cache reuse. + + data_file_mover will + * reuse identically named files at local target location when available pre-runtime when set to `True`. + * remove identically named files at local target location when present pre-runtime when set to `False`. + + Args: + reuse (bool): set to `True` to enable local cache reuse behavior (default `False`) """ global _reuse_local_cache _reuse_local_cache = reuse def is_accessible_url(url: str) -> bool: """ - Returns True when provided `url` is an accessible URL + Check whether provided `url` is an accessible (remote) URL + + Args: + url: URL to be checked for accessability + + Returns: + `True` when provided `url` is an accessible URL, `False` otherwise """ response = requests.head(url) if response.ok: @@ -30,7 +54,19 @@ def is_accessible_url(url: str) -> bool: def fetch_file(url: str, dest_dir: str = _DEFAULT_DIR, reuse_local_cache: typing.Optional[bool] = None) -> str: """ - Fetch file from URL, return its local path. + Fetch file from URL and return its local path. + + Parses `url` to determine scheme and sends it to the appropriate data_file_mover function for retrieval. + Result is cached in the data_file_mover module's `_stored_files` memory cache, which is used to speed up + repeated retrieval. + + Args: + url: URL to fetch local file for + dest_dir: Destination directory to search/download remote files in/to. + reuse_local_cache: Argument to override local cache reuse behavior defined at data_file_mover level. + + Returns: + Absolute path to local file matching the requested URL (string). """ global _stored_files local_path = None @@ -50,7 +86,15 @@ def fetch_file(url: str, dest_dir: str = _DEFAULT_DIR, reuse_local_cache: typing def find_local_file(path: str) -> str: """ Find a file locally based on path and return its absolute path. - If no file was found at given path, throws Exception. + + Args: + path: path to local file to find, can be relative. + + Returns: + Absolute path to file found at `path` (string). + + Raises: + `FileNotFoundError`: If `path` is not found, or is not a file. """ if not os.path.exists(path): raise FileNotFoundError(f"No file found at path '{path}'.") @@ -61,6 +105,26 @@ def find_local_file(path: str) -> str: return str(Path(path).resolve()) def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 * 1024, reuse_local_cache: typing.Optional[bool] = None) -> str: + """ + Download file from remote URL and return its absolute local path. + + Parses `url` to determine scheme and downloads the remote file to local filesystem as appropriate. + When local cache reuse is enabled, does not download but returns identically named files at `dest_dir` location when found. + When local cache reuse is disabled, removes identically named files at `dest_dir` location when found before initiating download. + + Args: + url: URL to remote file to download + dest_dir: Destination directory to search/download remote file in/to. + chunk_size: Chunk size use while downloading + reuse_local_cache: Argument to override local cache reuse behavior defined at data_file_mover level. + + Returns: + Absolute path to the found/downloaded file (string). + + Raises: + `ValueError`: if `url` scheme is not supported or `url` is not accessible. + """ + if reuse_local_cache == None: reuse_local_cache = _reuse_local_cache diff --git a/pipeline/seq_retrieval/src/main.py b/pipeline/seq_retrieval/src/main.py index 1f544945..c2b65a3a 100644 --- a/pipeline/seq_retrieval/src/main.py +++ b/pipeline/seq_retrieval/src/main.py @@ -1,3 +1,8 @@ +""" +Main module serving the CLI for PAVI sequence retrieval. + +Retrieves multiple sequence regions and returns them as one chained sequence. +""" import click import typing import json @@ -6,9 +11,15 @@ from seq_region import SeqRegion, chain_seq_region_seqs def validate_strand_param(ctx, param, value) -> typing.Literal['+', '-']: - """Returns a normalised version of strings representing a strand. - Negative strand is normalised to '-', positive strand to '+'. - Throws a click.BadParameter exception if an unrecognised string was provided.""" + """ + Processes and normalises the value of click input argument `strand`. + + Returns: + A normalised version of strings representing a strand: '-' or '+' + + Raises: + click.BadParameter: If an unrecognised string was provided. + """ POS_CHOICES = ['+', '+1', 'pos'] NEG_CHOICES = ['-', '-1', 'neg'] if value in POS_CHOICES: @@ -19,12 +30,20 @@ def validate_strand_param(ctx, param, value) -> typing.Literal['+', '-']: raise click.BadParameter(f"Must be one of {POS_CHOICES} for positive strand, or {NEG_CHOICES} for negative strand.") def process_seq_regions_param(ctx, param, value) -> typing.List[typing.Dict[str,typing.Any]]: - """Parse the seq_regions parameter value and validate it's structure. + """ + Parse the value of click input parameter seq_regions and validate it's structure. + Value is expected to be a JSON-formatted list of sequence regions to retrieve. Each region should have: - * a 'start' property indicating the region start (inclusive) - * a 'end' property indicating the region end (inclusive) - Throws a click.BadParameter exception if value could not be parsed as JSON or had an invalid structure.""" + * a 'start' property indicating the region start (1-based, inclusive) + * a 'end' property indicating the region end (1-based, inclusive) + + Returns: + List of dicts representing start and end of seq region + + Raises: + click.BadParameter: If value could not be parsed as JSON or had an invalid structure or values. + """ seq_regions = None try: seq_regions = json.loads(value) @@ -63,9 +82,12 @@ def process_seq_regions_param(ctx, param, value) -> typing.List[typing.Dict[str, help="""When defined and using remote `fasta_file_url`, reused local files if file already exists at destination path, rather than re-downloading and overwritting.""") def main(seq_id: str, seq_strand: str, seq_regions: typing.List, fasta_file_url: str, reuse_local_cache: bool): - """Main method for sequence retrieval from JBrowse faidx indexed fasta files. - Returns a single (transcript) sequence made by concatenating all sequence regions requested - (in positional order defined by specified seq_strand).""" + """ + Main method for sequence retrieval from JBrowse faidx indexed fasta files. Receives input args from click. + + Prints a single (transcript) sequence obtained by concatenating the sequence of + all sequence regions requested (in positional order defined by specified seq_strand). + """ click.echo(f"Received request to retrieve sequences for {seq_id}, strand {seq_strand}, seq_regions {seq_regions}!") diff --git a/pipeline/seq_retrieval/src/seq_region/seq_region.py b/pipeline/seq_retrieval/src/seq_region/seq_region.py index 06c5e009..6a36ffe6 100644 --- a/pipeline/seq_retrieval/src/seq_region/seq_region.py +++ b/pipeline/seq_retrieval/src/seq_region/seq_region.py @@ -1,6 +1,5 @@ """ -Module containing the SeqRegion class and all methods to handle them. -SeqRegion class is used to define and retrieve sequence regions. +Module containing the SeqRegion class and all functions to handle SeqRegion entities. """ import typing @@ -10,16 +9,45 @@ from data_mover import data_file_mover class SeqRegion(): + """ + Defines a DNA sequence region. + """ + seq_id: str + """The sequence identifier found in the fasta file on which the sequence region is located""" + start: int + """The start position of the sequence region (1-based, inclusive). Asserted to be `start` < `end`.""" + end: int + """The end position of the sequence region (1-base, inclusive). Asserted to be `start` < `end`.""" + strand: str + """The (genomic) strand of the sequence region""" + fasta_file_path: str + """Absolute path to (faidx indexed) FASTA file containing the sequences""" + sequence: typing.Optional[str] + """the DNA sequence of a sequence region""" def __init__(self, seq_id: str, start: int, end: int, strand: str, fasta_file_url: str, seq: typing.Optional[str] = None): """ - Uses 1-based, fully inclusive coordinates + Initializes a SeqRegion instance + + Args: + seq_id: The sequence identifier found in the fasta file on which the sequence region is located + start: The start position of the sequence region (1-based, inclusive).\ + If negative strand, `start` and `end` are swapped if `end` < `start`. + end: The end position of the sequence region (1-base, inclusive).\ + If negative strand, `start` and `end` are swapped if `end` < `start`. + strand: the (genomic) strand of the sequence region + fasta_file_url: Path to local faidx-indexed FASTA file containing the sequences to retrieve (regions of).\ + Faidx-index files `fasta_file_url`.fai and `fasta_file_url`.gzi for compressed fasta file must be accessible URLs. + seq: optional DNA sequence of the sequence region + + Raises: + ValueError: if value of `end` < `start` and `strand` is '+' """ self.seq_id = seq_id self.strand = strand @@ -59,9 +87,12 @@ def __init__(self, seq_id: str, start: int, end: int, strand: str, fasta_file_ur def fetch_seq(self) -> None: - """Fetch sequence found at `seq_id`:`start`-`end`:`strand` + """ + Fetch sequence found at `seq_id`:`start`-`end`:`strand` by reading from faidx files at `fasta_file_path`. - Uses 1-based, fully inclusive coordinates.""" + + Stores resulting sequence in `sequence` attribute. + """ try: fasta_file = pysam.FastaFile(self.fasta_file_path) except ValueError: @@ -78,15 +109,23 @@ def fetch_seq(self) -> None: self.sequence = seq def get_sequence(self) -> str: - """Return SeqRegion's sequence as a string (empty string if `None`).""" + """Return `sequence` attribute as a string (empty string if `None`).""" return str(self.sequence) def chain_seq_region_seqs(seq_regions: typing.List[SeqRegion], seq_strand: str) -> str: """ Chain multiple SeqRegions' sequenes together into one continuous sequence. - SeqRegions are chained together in an order based on the 'start' position of each: - * Ascending order when positive strand - * Descending order when negative strand + + SeqRegions are chained together in an order based on the `start` attribute of each: + * Ascending order when `seq_strand` is positive strand + * Descending order when `seq_strand` is negative strand + + Args: + seq_regions: list of SeqRegion objects to chain together + seq_strand: sequence strand which defines the chaining order + + Returns: + String representing the chained sequence of all input SeqRegions """ sort_args: typing.Dict[str, typing.Any] = dict(key=lambda region: region.start, reverse=False) From 0129f4c3eef3ce747c7f27568f500a261b796baa Mon Sep 17 00:00:00 2001 From: Manuel Luypaert Date: Wed, 13 Mar 2024 19:09:28 +0000 Subject: [PATCH 29/32] Added sequence length assertion to SeqRegion class --- .../src/seq_region/seq_region.py | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/pipeline/seq_retrieval/src/seq_region/seq_region.py b/pipeline/seq_retrieval/src/seq_region/seq_region.py index 6a36ffe6..1546be41 100644 --- a/pipeline/seq_retrieval/src/seq_region/seq_region.py +++ b/pipeline/seq_retrieval/src/seq_region/seq_region.py @@ -106,7 +106,27 @@ def fetch_seq(self) -> None: if self.strand == '-': seq = Seq.reverse_complement(seq) - self.sequence = seq + self.set_sequence(seq) + + def set_sequence(self, sequence: str) -> None: + """ + Set the `sequence` attribute. + + Asserts the length of `sequence` matches the expected sequence length for this region. + + Args: + sequence: DNA sequence (string) + + Raises: + valueError: If the length of `sequence` provided does not match the region length + """ + + seq_len = len(sequence) + expected_len = self.end - self.start + 1 + if seq_len != expected_len: + raise ValueError(f"Sequence length {seq_len} does not equal length expected on region positions {expected_len}.") + else: + self.sequence = sequence def get_sequence(self) -> str: """Return `sequence` attribute as a string (empty string if `None`).""" From 48549c9ab090fe481b5db79cc2154b143951a296 Mon Sep 17 00:00:00 2001 From: Manuel Luypaert Date: Thu, 14 Mar 2024 19:34:38 +0000 Subject: [PATCH 30/32] Improved readability of typing + add typing for click values --- .../seq_retrieval/src/data_mover/data_file_mover.py | 8 ++++---- pipeline/seq_retrieval/src/main.py | 8 ++++---- pipeline/seq_retrieval/src/seq_region/seq_region.py | 10 +++++----- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/pipeline/seq_retrieval/src/data_mover/data_file_mover.py b/pipeline/seq_retrieval/src/data_mover/data_file_mover.py index 713a1ba4..c9b2ac69 100644 --- a/pipeline/seq_retrieval/src/data_mover/data_file_mover.py +++ b/pipeline/seq_retrieval/src/data_mover/data_file_mover.py @@ -3,11 +3,11 @@ """ import os.path from pathlib import Path -import typing +from typing import Dict, Optional import requests from urllib.parse import urlparse, unquote -_stored_files: typing.Dict[str, str] = dict() +_stored_files: Dict[str, str] = dict() """Module level memory cache of filepaths for all files accessed through this module.""" _DEFAULT_DIR = '/tmp/pavi/' @@ -52,7 +52,7 @@ def is_accessible_url(url: str) -> bool: else: return False -def fetch_file(url: str, dest_dir: str = _DEFAULT_DIR, reuse_local_cache: typing.Optional[bool] = None) -> str: +def fetch_file(url: str, dest_dir: str = _DEFAULT_DIR, reuse_local_cache: Optional[bool] = None) -> str: """ Fetch file from URL and return its local path. @@ -104,7 +104,7 @@ def find_local_file(path: str) -> str: else: return str(Path(path).resolve()) -def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 * 1024, reuse_local_cache: typing.Optional[bool] = None) -> str: +def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 * 1024, reuse_local_cache: Optional[bool] = None) -> str: """ Download file from remote URL and return its absolute local path. diff --git a/pipeline/seq_retrieval/src/main.py b/pipeline/seq_retrieval/src/main.py index c2b65a3a..fcdcba7f 100644 --- a/pipeline/seq_retrieval/src/main.py +++ b/pipeline/seq_retrieval/src/main.py @@ -4,13 +4,13 @@ Retrieves multiple sequence regions and returns them as one chained sequence. """ import click -import typing +from typing import Any, Dict, List, Literal import json import data_mover.data_file_mover as data_file_mover from seq_region import SeqRegion, chain_seq_region_seqs -def validate_strand_param(ctx, param, value) -> typing.Literal['+', '-']: +def validate_strand_param(ctx, param, value: str) -> Literal['+', '-']: """ Processes and normalises the value of click input argument `strand`. @@ -29,7 +29,7 @@ def validate_strand_param(ctx, param, value) -> typing.Literal['+', '-']: else: raise click.BadParameter(f"Must be one of {POS_CHOICES} for positive strand, or {NEG_CHOICES} for negative strand.") -def process_seq_regions_param(ctx, param, value) -> typing.List[typing.Dict[str,typing.Any]]: +def process_seq_regions_param(ctx, param, value: str) -> List[Dict[str,Any]]: """ Parse the value of click input parameter seq_regions and validate it's structure. @@ -81,7 +81,7 @@ def process_seq_regions_param(ctx, param, value) -> typing.List[typing.Dict[str, @click.option("--reuse_local_cache", is_flag=True, help="""When defined and using remote `fasta_file_url`, reused local files if file already exists at destination path, rather than re-downloading and overwritting.""") -def main(seq_id: str, seq_strand: str, seq_regions: typing.List, fasta_file_url: str, reuse_local_cache: bool): +def main(seq_id: str, seq_strand: str, seq_regions: List, fasta_file_url: str, reuse_local_cache: bool): """ Main method for sequence retrieval from JBrowse faidx indexed fasta files. Receives input args from click. diff --git a/pipeline/seq_retrieval/src/seq_region/seq_region.py b/pipeline/seq_retrieval/src/seq_region/seq_region.py index 1546be41..2668a044 100644 --- a/pipeline/seq_retrieval/src/seq_region/seq_region.py +++ b/pipeline/seq_retrieval/src/seq_region/seq_region.py @@ -1,7 +1,7 @@ """ Module containing the SeqRegion class and all functions to handle SeqRegion entities. """ -import typing +from typing import Any, Dict, List, Optional from Bio import Seq #Bio.Seq biopython submodule import pysam @@ -28,10 +28,10 @@ class SeqRegion(): fasta_file_path: str """Absolute path to (faidx indexed) FASTA file containing the sequences""" - sequence: typing.Optional[str] + sequence: Optional[str] """the DNA sequence of a sequence region""" - def __init__(self, seq_id: str, start: int, end: int, strand: str, fasta_file_url: str, seq: typing.Optional[str] = None): + def __init__(self, seq_id: str, start: int, end: int, strand: str, fasta_file_url: str, seq: Optional[str] = None): """ Initializes a SeqRegion instance @@ -132,7 +132,7 @@ def get_sequence(self) -> str: """Return `sequence` attribute as a string (empty string if `None`).""" return str(self.sequence) -def chain_seq_region_seqs(seq_regions: typing.List[SeqRegion], seq_strand: str) -> str: +def chain_seq_region_seqs(seq_regions: List[SeqRegion], seq_strand: str) -> str: """ Chain multiple SeqRegions' sequenes together into one continuous sequence. @@ -148,7 +148,7 @@ def chain_seq_region_seqs(seq_regions: typing.List[SeqRegion], seq_strand: str) String representing the chained sequence of all input SeqRegions """ - sort_args: typing.Dict[str, typing.Any] = dict(key=lambda region: region.start, reverse=False) + sort_args: Dict[str, Any] = dict(key=lambda region: region.start, reverse=False) if seq_strand == '-': sort_args['reverse'] = True From 3b0dca6b8d359f5922401d1541c7daa086926814 Mon Sep 17 00:00:00 2001 From: Manuel Luypaert Date: Thu, 14 Mar 2024 19:45:54 +0000 Subject: [PATCH 31/32] Moved tests files and configs to more standard path --- pipeline/seq_retrieval/Makefile | 2 +- .../type-testing-requirements.txt => tests/requirements.txt} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename pipeline/seq_retrieval/{test/type-testing-requirements.txt => tests/requirements.txt} (100%) diff --git a/pipeline/seq_retrieval/Makefile b/pipeline/seq_retrieval/Makefile index d371eefd..7a788b0b 100644 --- a/pipeline/seq_retrieval/Makefile +++ b/pipeline/seq_retrieval/Makefile @@ -9,7 +9,7 @@ python-dependencies-update: pip install -U -r requirements.txt check-python-typing: python-dependencies - pip install -r test/type-testing-requirements.txt + pip install -r tests/requirements.txt mypy --install-types --non-interactive src/main.py check-venv-active: diff --git a/pipeline/seq_retrieval/test/type-testing-requirements.txt b/pipeline/seq_retrieval/tests/requirements.txt similarity index 100% rename from pipeline/seq_retrieval/test/type-testing-requirements.txt rename to pipeline/seq_retrieval/tests/requirements.txt From 2207ad6a6c74fa3c14cc5f3eb1d914a05558f02c Mon Sep 17 00:00:00 2001 From: Manuel Luypaert Date: Thu, 14 Mar 2024 21:09:57 +0000 Subject: [PATCH 32/32] Added flake8 style check and made code compliant --- .github/workflows/PR-validation.yml | 24 +++++++++++++++++-- pipeline/seq_retrieval/.flake8 | 6 +++++ pipeline/seq_retrieval/Makefile | 7 +++++- .../src/data_mover/data_file_mover.py | 17 ++++++++----- pipeline/seq_retrieval/src/main.py | 18 ++++++++------ .../seq_retrieval/src/seq_region/__init__.py | 2 +- .../src/seq_region/seq_region.py | 17 ++++++------- pipeline/seq_retrieval/tests/requirements.txt | 3 ++- 8 files changed, 68 insertions(+), 26 deletions(-) create mode 100644 pipeline/seq_retrieval/.flake8 diff --git a/.github/workflows/PR-validation.yml b/.github/workflows/PR-validation.yml index 47fa9ce3..c39ae7d2 100644 --- a/.github/workflows/PR-validation.yml +++ b/.github/workflows/PR-validation.yml @@ -22,8 +22,8 @@ jobs: - name: Build container image run: | make container-image - pipeline-seq-retrieval-check-python-typing: - name: pipeline/seq_retrieval check python typing + pipeline-seq-retrieval-python-typing-check: + name: pipeline/seq_retrieval python typing check runs-on: ubuntu-22.04 defaults: run: @@ -42,5 +42,25 @@ jobs: - name: Check python typing run: | make check-python-typing + pipeline-seq-retrieval-python-style-check: + name: pipeline/seq_retrieval python style check + runs-on: ubuntu-22.04 + defaults: + run: + shell: bash + working-directory: ./pipeline/seq_retrieval/ + steps: + - name: Check out repository code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + sparse-checkout: | + pipeline/seq_retrieval/ + - uses: actions/setup-python@v5 + with: + python-version: '3.12' + - name: Check python style + run: | + make check-python-style #TODO: add unit testing #TODO: add integration testing diff --git a/pipeline/seq_retrieval/.flake8 b/pipeline/seq_retrieval/.flake8 new file mode 100644 index 00000000..02b9d28d --- /dev/null +++ b/pipeline/seq_retrieval/.flake8 @@ -0,0 +1,6 @@ +[flake8] +ignore = E203, E266, E501, F401, W503, E501 +max-complexity = 18 +select = B,C,E,F,W,T4 +exclude = venv +per-file-ignores = __init__.py:F401 diff --git a/pipeline/seq_retrieval/Makefile b/pipeline/seq_retrieval/Makefile index 7a788b0b..6f2acd17 100644 --- a/pipeline/seq_retrieval/Makefile +++ b/pipeline/seq_retrieval/Makefile @@ -8,10 +8,15 @@ python-dependencies: python-dependencies-update: pip install -U -r requirements.txt -check-python-typing: python-dependencies +python-test-dependencies: pip install -r tests/requirements.txt + +check-python-typing: python-dependencies python-test-dependencies mypy --install-types --non-interactive src/main.py +check-python-style: python-dependencies python-test-dependencies + flake8 ./ + check-venv-active: ifeq ($(VIRTUAL_ENV),) @echo 'No active python virtual environment found.'\ diff --git a/pipeline/seq_retrieval/src/data_mover/data_file_mover.py b/pipeline/seq_retrieval/src/data_mover/data_file_mover.py index c9b2ac69..435c908d 100644 --- a/pipeline/seq_retrieval/src/data_mover/data_file_mover.py +++ b/pipeline/seq_retrieval/src/data_mover/data_file_mover.py @@ -22,6 +22,7 @@ Change the value through the `set_local_cache_reuse` function. """ + def set_local_cache_reuse(reuse: bool) -> None: """ Define data_file_mover module-level behaviour on local file cache reuse. @@ -36,6 +37,7 @@ def set_local_cache_reuse(reuse: bool) -> None: global _reuse_local_cache _reuse_local_cache = reuse + def is_accessible_url(url: str) -> bool: """ Check whether provided `url` is an accessible (remote) URL @@ -52,6 +54,7 @@ def is_accessible_url(url: str) -> bool: else: return False + def fetch_file(url: str, dest_dir: str = _DEFAULT_DIR, reuse_local_cache: Optional[bool] = None) -> str: """ Fetch file from URL and return its local path. @@ -83,6 +86,7 @@ def fetch_file(url: str, dest_dir: str = _DEFAULT_DIR, reuse_local_cache: Option return local_path + def find_local_file(path: str) -> str: """ Find a file locally based on path and return its absolute path. @@ -104,7 +108,8 @@ def find_local_file(path: str) -> str: else: return str(Path(path).resolve()) -def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 * 1024, reuse_local_cache: Optional[bool] = None) -> str: + +def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size: int = 10 * 1024, reuse_local_cache: Optional[bool] = None) -> str: """ Download file from remote URL and return its absolute local path. @@ -125,7 +130,7 @@ def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 * `ValueError`: if `url` scheme is not supported or `url` is not accessible. """ - if reuse_local_cache == None: + if reuse_local_cache is None: reuse_local_cache = _reuse_local_cache url_components = urlparse(url) @@ -140,13 +145,13 @@ def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 * local_file_path = os.path.join(dest_dir, filename) if os.path.exists(local_file_path) and os.path.isfile(local_file_path): - if reuse_local_cache == True: - #Return the local file path without downloading new content + if reuse_local_cache is True: + # Return the local file path without downloading new content return str(Path(local_file_path).resolve()) else: os.remove(local_file_path) - #Download file through streaming to support large files + # Download file through streaming to support large files tmp_file_path = f"{local_file_path}.part" response = requests.get(url, stream=True) @@ -158,5 +163,5 @@ def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 * return str(Path(local_file_path).resolve()) else: - #Currently not supported + # Currently not supported raise ValueError(f"URL with scheme '{url_components.scheme}' is currently not supported.") diff --git a/pipeline/seq_retrieval/src/main.py b/pipeline/seq_retrieval/src/main.py index fcdcba7f..0e4b0e62 100644 --- a/pipeline/seq_retrieval/src/main.py +++ b/pipeline/seq_retrieval/src/main.py @@ -10,6 +10,7 @@ import data_mover.data_file_mover as data_file_mover from seq_region import SeqRegion, chain_seq_region_seqs + def validate_strand_param(ctx, param, value: str) -> Literal['+', '-']: """ Processes and normalises the value of click input argument `strand`. @@ -22,14 +23,15 @@ def validate_strand_param(ctx, param, value: str) -> Literal['+', '-']: """ POS_CHOICES = ['+', '+1', 'pos'] NEG_CHOICES = ['-', '-1', 'neg'] - if value in POS_CHOICES: + if value in POS_CHOICES: return '+' elif value in NEG_CHOICES: return '-' else: raise click.BadParameter(f"Must be one of {POS_CHOICES} for positive strand, or {NEG_CHOICES} for negative strand.") -def process_seq_regions_param(ctx, param, value: str) -> List[Dict[str,Any]]: + +def process_seq_regions_param(ctx, param, value: str) -> List[Dict[str, Any]]: """ Parse the value of click input parameter seq_regions and validate it's structure. @@ -47,8 +49,8 @@ def process_seq_regions_param(ctx, param, value: str) -> List[Dict[str,Any]]: seq_regions = None try: seq_regions = json.loads(value) - except: - raise click.BadParameter(f"Must be a valid JSON-formatted string.") + except Exception: + raise click.BadParameter("Must be a valid JSON-formatted string.") else: if not isinstance(seq_regions, list): raise click.BadParameter("Must be a valid list (JSON-array) of sequence regions to retrieve.") @@ -66,6 +68,7 @@ def process_seq_regions_param(ctx, param, value: str) -> List[Dict[str,Any]]: return seq_regions + @click.command() @click.option("--seq_id", type=click.STRING, required=True, help="The sequence ID to retrieve sequences for.") @@ -96,15 +99,16 @@ def main(seq_id: str, seq_strand: str, seq_regions: List, fasta_file_url: str, r seq_region_objs = [] for region in seq_regions: seq_region_objs.append(SeqRegion(seq_id=seq_id, start=region['start'], end=region['end'], strand=seq_strand, - fasta_file_url=fasta_file_url)) + fasta_file_url=fasta_file_url)) for seq_region in seq_region_objs: - #Retrieve sequence for region + # Retrieve sequence for region seq_region.fetch_seq() - #Concatenate all regions into single sequence + # Concatenate all regions into single sequence seq_concat = chain_seq_region_seqs(seq_region_objs, seq_strand) click.echo(f"\nSeq concat: {seq_concat}") + if __name__ == '__main__': main() diff --git a/pipeline/seq_retrieval/src/seq_region/__init__.py b/pipeline/seq_retrieval/src/seq_region/__init__.py index 75eecf7f..8a4ddd88 100644 --- a/pipeline/seq_retrieval/src/seq_region/__init__.py +++ b/pipeline/seq_retrieval/src/seq_region/__init__.py @@ -1,2 +1,2 @@ from .seq_region import SeqRegion -from .seq_region import chain_seq_region_seqs \ No newline at end of file +from .seq_region import chain_seq_region_seqs diff --git a/pipeline/seq_retrieval/src/seq_region/seq_region.py b/pipeline/seq_retrieval/src/seq_region/seq_region.py index 2668a044..c8d777a6 100644 --- a/pipeline/seq_retrieval/src/seq_region/seq_region.py +++ b/pipeline/seq_retrieval/src/seq_region/seq_region.py @@ -3,11 +3,12 @@ """ from typing import Any, Dict, List, Optional -from Bio import Seq #Bio.Seq biopython submodule +from Bio import Seq # Bio.Seq biopython submodule import pysam from data_mover import data_file_mover + class SeqRegion(): """ Defines a DNA sequence region. @@ -52,7 +53,7 @@ def __init__(self, seq_id: str, start: int, end: int, strand: str, fasta_file_ur self.seq_id = seq_id self.strand = strand - #If strand is -, ensure start <= end (swap as required) + # If strand is -, ensure start <= end (swap as required) if strand == '-': if end < start: self.start = end @@ -60,7 +61,7 @@ def __init__(self, seq_id: str, start: int, end: int, strand: str, fasta_file_ur else: self.start = start self.end = end - #If strand is +, throw error when end < start (likely user error) + # If strand is +, throw error when end < start (likely user error) else: if end < start: raise ValueError(f"Unexpected position order: end {end} < start {start}.") @@ -73,19 +74,18 @@ def __init__(self, seq_id: str, start: int, end: int, strand: str, fasta_file_ur # Fetch additional faidx index files in addition to fasta file itself # (to the same location) - index_files = [fasta_file_url+'.fai'] + index_files = [fasta_file_url + '.fai'] if fasta_file_url.endswith('.gz'): - index_files.append(fasta_file_url+'.gzi') + index_files.append(fasta_file_url + '.gzi') for index_file in index_files: data_file_mover.fetch_file(index_file) self.fasta_file_path = local_fasta_file_path - if seq != None: + if seq is not None: self.sequence = seq - def fetch_seq(self) -> None: """ Fetch sequence found at `seq_id`:`start`-`end`:`strand` @@ -100,7 +100,7 @@ def fetch_seq(self) -> None: except IOError: raise IOError(f"Error while reading fasta file or index matching path {self.fasta_file_path}.") else: - seq = fasta_file.fetch(reference=self.seq_id, start=self.start-1, end=self.end) + seq = fasta_file.fetch(reference=self.seq_id, start=(self.start - 1), end=self.end) fasta_file.close() if self.strand == '-': @@ -132,6 +132,7 @@ def get_sequence(self) -> str: """Return `sequence` attribute as a string (empty string if `None`).""" return str(self.sequence) + def chain_seq_region_seqs(seq_regions: List[SeqRegion], seq_strand: str) -> str: """ Chain multiple SeqRegions' sequenes together into one continuous sequence. diff --git a/pipeline/seq_retrieval/tests/requirements.txt b/pipeline/seq_retrieval/tests/requirements.txt index fcea0984..d6d09df3 100644 --- a/pipeline/seq_retrieval/tests/requirements.txt +++ b/pipeline/seq_retrieval/tests/requirements.txt @@ -1 +1,2 @@ -mypy==1.9.* \ No newline at end of file +flake8==7.0.* +mypy==1.9.*