diff --git a/.github/workflows/PR-validation.yml b/.github/workflows/PR-validation.yml index 47fa9ce3..c39ae7d2 100644 --- a/.github/workflows/PR-validation.yml +++ b/.github/workflows/PR-validation.yml @@ -22,8 +22,8 @@ jobs: - name: Build container image run: | make container-image - pipeline-seq-retrieval-check-python-typing: - name: pipeline/seq_retrieval check python typing + pipeline-seq-retrieval-python-typing-check: + name: pipeline/seq_retrieval python typing check runs-on: ubuntu-22.04 defaults: run: @@ -42,5 +42,25 @@ jobs: - name: Check python typing run: | make check-python-typing + pipeline-seq-retrieval-python-style-check: + name: pipeline/seq_retrieval python style check + runs-on: ubuntu-22.04 + defaults: + run: + shell: bash + working-directory: ./pipeline/seq_retrieval/ + steps: + - name: Check out repository code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + sparse-checkout: | + pipeline/seq_retrieval/ + - uses: actions/setup-python@v5 + with: + python-version: '3.12' + - name: Check python style + run: | + make check-python-style #TODO: add unit testing #TODO: add integration testing diff --git a/pipeline/seq_retrieval/.flake8 b/pipeline/seq_retrieval/.flake8 new file mode 100644 index 00000000..02b9d28d --- /dev/null +++ b/pipeline/seq_retrieval/.flake8 @@ -0,0 +1,6 @@ +[flake8] +ignore = E203, E266, E501, F401, W503, E501 +max-complexity = 18 +select = B,C,E,F,W,T4 +exclude = venv +per-file-ignores = __init__.py:F401 diff --git a/pipeline/seq_retrieval/Makefile b/pipeline/seq_retrieval/Makefile index 7a788b0b..6f2acd17 100644 --- a/pipeline/seq_retrieval/Makefile +++ b/pipeline/seq_retrieval/Makefile @@ -8,10 +8,15 @@ python-dependencies: python-dependencies-update: pip install -U -r requirements.txt -check-python-typing: python-dependencies +python-test-dependencies: pip install -r tests/requirements.txt + +check-python-typing: python-dependencies python-test-dependencies mypy --install-types --non-interactive src/main.py +check-python-style: python-dependencies python-test-dependencies + flake8 ./ + check-venv-active: ifeq ($(VIRTUAL_ENV),) @echo 'No active python virtual environment found.'\ diff --git a/pipeline/seq_retrieval/src/data_mover/data_file_mover.py b/pipeline/seq_retrieval/src/data_mover/data_file_mover.py index c9b2ac69..435c908d 100644 --- a/pipeline/seq_retrieval/src/data_mover/data_file_mover.py +++ b/pipeline/seq_retrieval/src/data_mover/data_file_mover.py @@ -22,6 +22,7 @@ Change the value through the `set_local_cache_reuse` function. """ + def set_local_cache_reuse(reuse: bool) -> None: """ Define data_file_mover module-level behaviour on local file cache reuse. @@ -36,6 +37,7 @@ def set_local_cache_reuse(reuse: bool) -> None: global _reuse_local_cache _reuse_local_cache = reuse + def is_accessible_url(url: str) -> bool: """ Check whether provided `url` is an accessible (remote) URL @@ -52,6 +54,7 @@ def is_accessible_url(url: str) -> bool: else: return False + def fetch_file(url: str, dest_dir: str = _DEFAULT_DIR, reuse_local_cache: Optional[bool] = None) -> str: """ Fetch file from URL and return its local path. @@ -83,6 +86,7 @@ def fetch_file(url: str, dest_dir: str = _DEFAULT_DIR, reuse_local_cache: Option return local_path + def find_local_file(path: str) -> str: """ Find a file locally based on path and return its absolute path. @@ -104,7 +108,8 @@ def find_local_file(path: str) -> str: else: return str(Path(path).resolve()) -def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 * 1024, reuse_local_cache: Optional[bool] = None) -> str: + +def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size: int = 10 * 1024, reuse_local_cache: Optional[bool] = None) -> str: """ Download file from remote URL and return its absolute local path. @@ -125,7 +130,7 @@ def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 * `ValueError`: if `url` scheme is not supported or `url` is not accessible. """ - if reuse_local_cache == None: + if reuse_local_cache is None: reuse_local_cache = _reuse_local_cache url_components = urlparse(url) @@ -140,13 +145,13 @@ def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 * local_file_path = os.path.join(dest_dir, filename) if os.path.exists(local_file_path) and os.path.isfile(local_file_path): - if reuse_local_cache == True: - #Return the local file path without downloading new content + if reuse_local_cache is True: + # Return the local file path without downloading new content return str(Path(local_file_path).resolve()) else: os.remove(local_file_path) - #Download file through streaming to support large files + # Download file through streaming to support large files tmp_file_path = f"{local_file_path}.part" response = requests.get(url, stream=True) @@ -158,5 +163,5 @@ def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 * return str(Path(local_file_path).resolve()) else: - #Currently not supported + # Currently not supported raise ValueError(f"URL with scheme '{url_components.scheme}' is currently not supported.") diff --git a/pipeline/seq_retrieval/src/main.py b/pipeline/seq_retrieval/src/main.py index fcdcba7f..0e4b0e62 100644 --- a/pipeline/seq_retrieval/src/main.py +++ b/pipeline/seq_retrieval/src/main.py @@ -10,6 +10,7 @@ import data_mover.data_file_mover as data_file_mover from seq_region import SeqRegion, chain_seq_region_seqs + def validate_strand_param(ctx, param, value: str) -> Literal['+', '-']: """ Processes and normalises the value of click input argument `strand`. @@ -22,14 +23,15 @@ def validate_strand_param(ctx, param, value: str) -> Literal['+', '-']: """ POS_CHOICES = ['+', '+1', 'pos'] NEG_CHOICES = ['-', '-1', 'neg'] - if value in POS_CHOICES: + if value in POS_CHOICES: return '+' elif value in NEG_CHOICES: return '-' else: raise click.BadParameter(f"Must be one of {POS_CHOICES} for positive strand, or {NEG_CHOICES} for negative strand.") -def process_seq_regions_param(ctx, param, value: str) -> List[Dict[str,Any]]: + +def process_seq_regions_param(ctx, param, value: str) -> List[Dict[str, Any]]: """ Parse the value of click input parameter seq_regions and validate it's structure. @@ -47,8 +49,8 @@ def process_seq_regions_param(ctx, param, value: str) -> List[Dict[str,Any]]: seq_regions = None try: seq_regions = json.loads(value) - except: - raise click.BadParameter(f"Must be a valid JSON-formatted string.") + except Exception: + raise click.BadParameter("Must be a valid JSON-formatted string.") else: if not isinstance(seq_regions, list): raise click.BadParameter("Must be a valid list (JSON-array) of sequence regions to retrieve.") @@ -66,6 +68,7 @@ def process_seq_regions_param(ctx, param, value: str) -> List[Dict[str,Any]]: return seq_regions + @click.command() @click.option("--seq_id", type=click.STRING, required=True, help="The sequence ID to retrieve sequences for.") @@ -96,15 +99,16 @@ def main(seq_id: str, seq_strand: str, seq_regions: List, fasta_file_url: str, r seq_region_objs = [] for region in seq_regions: seq_region_objs.append(SeqRegion(seq_id=seq_id, start=region['start'], end=region['end'], strand=seq_strand, - fasta_file_url=fasta_file_url)) + fasta_file_url=fasta_file_url)) for seq_region in seq_region_objs: - #Retrieve sequence for region + # Retrieve sequence for region seq_region.fetch_seq() - #Concatenate all regions into single sequence + # Concatenate all regions into single sequence seq_concat = chain_seq_region_seqs(seq_region_objs, seq_strand) click.echo(f"\nSeq concat: {seq_concat}") + if __name__ == '__main__': main() diff --git a/pipeline/seq_retrieval/src/seq_region/__init__.py b/pipeline/seq_retrieval/src/seq_region/__init__.py index 75eecf7f..8a4ddd88 100644 --- a/pipeline/seq_retrieval/src/seq_region/__init__.py +++ b/pipeline/seq_retrieval/src/seq_region/__init__.py @@ -1,2 +1,2 @@ from .seq_region import SeqRegion -from .seq_region import chain_seq_region_seqs \ No newline at end of file +from .seq_region import chain_seq_region_seqs diff --git a/pipeline/seq_retrieval/src/seq_region/seq_region.py b/pipeline/seq_retrieval/src/seq_region/seq_region.py index 2668a044..c8d777a6 100644 --- a/pipeline/seq_retrieval/src/seq_region/seq_region.py +++ b/pipeline/seq_retrieval/src/seq_region/seq_region.py @@ -3,11 +3,12 @@ """ from typing import Any, Dict, List, Optional -from Bio import Seq #Bio.Seq biopython submodule +from Bio import Seq # Bio.Seq biopython submodule import pysam from data_mover import data_file_mover + class SeqRegion(): """ Defines a DNA sequence region. @@ -52,7 +53,7 @@ def __init__(self, seq_id: str, start: int, end: int, strand: str, fasta_file_ur self.seq_id = seq_id self.strand = strand - #If strand is -, ensure start <= end (swap as required) + # If strand is -, ensure start <= end (swap as required) if strand == '-': if end < start: self.start = end @@ -60,7 +61,7 @@ def __init__(self, seq_id: str, start: int, end: int, strand: str, fasta_file_ur else: self.start = start self.end = end - #If strand is +, throw error when end < start (likely user error) + # If strand is +, throw error when end < start (likely user error) else: if end < start: raise ValueError(f"Unexpected position order: end {end} < start {start}.") @@ -73,19 +74,18 @@ def __init__(self, seq_id: str, start: int, end: int, strand: str, fasta_file_ur # Fetch additional faidx index files in addition to fasta file itself # (to the same location) - index_files = [fasta_file_url+'.fai'] + index_files = [fasta_file_url + '.fai'] if fasta_file_url.endswith('.gz'): - index_files.append(fasta_file_url+'.gzi') + index_files.append(fasta_file_url + '.gzi') for index_file in index_files: data_file_mover.fetch_file(index_file) self.fasta_file_path = local_fasta_file_path - if seq != None: + if seq is not None: self.sequence = seq - def fetch_seq(self) -> None: """ Fetch sequence found at `seq_id`:`start`-`end`:`strand` @@ -100,7 +100,7 @@ def fetch_seq(self) -> None: except IOError: raise IOError(f"Error while reading fasta file or index matching path {self.fasta_file_path}.") else: - seq = fasta_file.fetch(reference=self.seq_id, start=self.start-1, end=self.end) + seq = fasta_file.fetch(reference=self.seq_id, start=(self.start - 1), end=self.end) fasta_file.close() if self.strand == '-': @@ -132,6 +132,7 @@ def get_sequence(self) -> str: """Return `sequence` attribute as a string (empty string if `None`).""" return str(self.sequence) + def chain_seq_region_seqs(seq_regions: List[SeqRegion], seq_strand: str) -> str: """ Chain multiple SeqRegions' sequenes together into one continuous sequence. diff --git a/pipeline/seq_retrieval/tests/requirements.txt b/pipeline/seq_retrieval/tests/requirements.txt index fcea0984..d6d09df3 100644 --- a/pipeline/seq_retrieval/tests/requirements.txt +++ b/pipeline/seq_retrieval/tests/requirements.txt @@ -1 +1,2 @@ -mypy==1.9.* \ No newline at end of file +flake8==7.0.* +mypy==1.9.*