Added flake8 style check and made code compliant

alliance-genome · Mar 14, 2024 · 2207ad6 · 2207ad6
1 parent 3b0dca6
commit 2207ad6
Show file tree

Hide file tree

Showing 8 changed files with 68 additions and 26 deletions.
diff --git a/.github/workflows/PR-validation.yml b/.github/workflows/PR-validation.yml
@@ -22,8 +22,8 @@ jobs:
       - name: Build container image
         run: |
           make container-image
-  pipeline-seq-retrieval-check-python-typing:
-    name: pipeline/seq_retrieval check python typing
+  pipeline-seq-retrieval-python-typing-check:
+    name: pipeline/seq_retrieval python typing check
     runs-on: ubuntu-22.04
     defaults:
       run:
@@ -42,5 +42,25 @@ jobs:
       - name: Check python typing
         run: |
           make check-python-typing
+  pipeline-seq-retrieval-python-style-check:
+    name: pipeline/seq_retrieval python style check
+    runs-on: ubuntu-22.04
+    defaults:
+      run:
+        shell: bash
+        working-directory: ./pipeline/seq_retrieval/
+    steps:
+      - name: Check out repository code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          sparse-checkout: |
+            pipeline/seq_retrieval/
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+      - name: Check python style
+        run: |
+          make check-python-style
   #TODO: add unit testing
   #TODO: add integration testing
diff --git a/pipeline/seq_retrieval/.flake8 b/pipeline/seq_retrieval/.flake8
@@ -0,0 +1,6 @@
+[flake8]
+ignore = E203, E266, E501, F401, W503, E501
+max-complexity = 18
+select = B,C,E,F,W,T4
+exclude = venv
+per-file-ignores = __init__.py:F401
diff --git a/pipeline/seq_retrieval/Makefile b/pipeline/seq_retrieval/Makefile
@@ -8,10 +8,15 @@ python-dependencies:
 python-dependencies-update:
 	pip install -U -r requirements.txt
 
-check-python-typing: python-dependencies
+python-test-dependencies:
 	pip install -r tests/requirements.txt
+
+check-python-typing: python-dependencies python-test-dependencies
 	mypy --install-types --non-interactive src/main.py
 
+check-python-style: python-dependencies python-test-dependencies
+	flake8 ./
+
 check-venv-active:
 ifeq ($(VIRTUAL_ENV),)
 	@echo 'No active python virtual environment found.'\

diff --git a/pipeline/seq_retrieval/src/data_mover/data_file_mover.py b/pipeline/seq_retrieval/src/data_mover/data_file_mover.py
@@ -22,6 +22,7 @@
 Change the value through the `set_local_cache_reuse` function.
 """
 
+
 def set_local_cache_reuse(reuse: bool) -> None:
     """
     Define data_file_mover module-level behaviour on local file cache reuse.
@@ -36,6 +37,7 @@ def set_local_cache_reuse(reuse: bool) -> None:
     global _reuse_local_cache
     _reuse_local_cache = reuse
 
+
 def is_accessible_url(url: str) -> bool:
     """
     Check whether provided `url` is an accessible (remote) URL
@@ -52,6 +54,7 @@ def is_accessible_url(url: str) -> bool:
     else:
         return False
 
+
 def fetch_file(url: str, dest_dir: str = _DEFAULT_DIR, reuse_local_cache: Optional[bool] = None) -> str:
     """
     Fetch file from URL and return its local path.
@@ -83,6 +86,7 @@ def fetch_file(url: str, dest_dir: str = _DEFAULT_DIR, reuse_local_cache: Option
 
     return local_path
 
+
 def find_local_file(path: str) -> str:
     """
     Find a file locally based on path and return its absolute path.
@@ -104,7 +108,8 @@ def find_local_file(path: str) -> str:
         else:
             return str(Path(path).resolve())
 
-def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 * 1024, reuse_local_cache: Optional[bool] = None) -> str:
+
+def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size: int = 10 * 1024, reuse_local_cache: Optional[bool] = None) -> str:
     """
     Download file from remote URL and return its absolute local path.
 
@@ -125,7 +130,7 @@ def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 *
         `ValueError`: if `url` scheme is not supported or `url` is not accessible.
     """
 
-    if reuse_local_cache == None:
+    if reuse_local_cache is None:
         reuse_local_cache = _reuse_local_cache
 
     url_components = urlparse(url)
@@ -140,13 +145,13 @@ def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 *
         local_file_path = os.path.join(dest_dir, filename)
 
         if os.path.exists(local_file_path) and os.path.isfile(local_file_path):
-            if reuse_local_cache == True:
-                #Return the local file path without downloading new content
+            if reuse_local_cache is True:
+                # Return the local file path without downloading new content
                 return str(Path(local_file_path).resolve())
             else:
                 os.remove(local_file_path)
 
-        #Download file through streaming to support large files
+        # Download file through streaming to support large files
         tmp_file_path = f"{local_file_path}.part"
         response = requests.get(url, stream=True)
 
@@ -158,5 +163,5 @@ def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 *
 
         return str(Path(local_file_path).resolve())
     else:
-        #Currently not supported
+        # Currently not supported
         raise ValueError(f"URL with scheme '{url_components.scheme}' is currently not supported.")
diff --git a/pipeline/seq_retrieval/src/main.py b/pipeline/seq_retrieval/src/main.py
@@ -10,6 +10,7 @@
 import data_mover.data_file_mover as data_file_mover
 from seq_region import SeqRegion, chain_seq_region_seqs
 
+
 def validate_strand_param(ctx, param, value: str) -> Literal['+', '-']:
     """
     Processes and normalises the value of click input argument `strand`.
@@ -22,14 +23,15 @@ def validate_strand_param(ctx, param, value: str) -> Literal['+', '-']:
     """
     POS_CHOICES = ['+', '+1', 'pos']
     NEG_CHOICES = ['-', '-1', 'neg']
-    if   value in POS_CHOICES:
+    if value in POS_CHOICES:
         return '+'
     elif value in NEG_CHOICES:
         return '-'
     else:
         raise click.BadParameter(f"Must be one of {POS_CHOICES} for positive strand, or {NEG_CHOICES} for negative strand.")
 
-def process_seq_regions_param(ctx, param, value: str) -> List[Dict[str,Any]]:
+
+def process_seq_regions_param(ctx, param, value: str) -> List[Dict[str, Any]]:
     """
     Parse the value of click input parameter seq_regions and validate it's structure.
 
@@ -47,8 +49,8 @@ def process_seq_regions_param(ctx, param, value: str) -> List[Dict[str,Any]]:
     seq_regions = None
     try:
         seq_regions = json.loads(value)
-    except:
-        raise click.BadParameter(f"Must be a valid JSON-formatted string.")
+    except Exception:
+        raise click.BadParameter("Must be a valid JSON-formatted string.")
     else:
         if not isinstance(seq_regions, list):
             raise click.BadParameter("Must be a valid list (JSON-array) of sequence regions to retrieve.")
@@ -66,6 +68,7 @@ def process_seq_regions_param(ctx, param, value: str) -> List[Dict[str,Any]]:
 
         return seq_regions
 
+
 @click.command()
 @click.option("--seq_id", type=click.STRING, required=True,
               help="The sequence ID to retrieve sequences for.")
@@ -96,15 +99,16 @@ def main(seq_id: str, seq_strand: str, seq_regions: List, fasta_file_url: str, r
     seq_region_objs = []
     for region in seq_regions:
         seq_region_objs.append(SeqRegion(seq_id=seq_id, start=region['start'], end=region['end'], strand=seq_strand,
-                                          fasta_file_url=fasta_file_url))
+                                         fasta_file_url=fasta_file_url))
 
     for seq_region in seq_region_objs:
-        #Retrieve sequence for region
+        # Retrieve sequence for region
         seq_region.fetch_seq()
 
-    #Concatenate all regions into single sequence
+    # Concatenate all regions into single sequence
     seq_concat = chain_seq_region_seqs(seq_region_objs, seq_strand)
     click.echo(f"\nSeq concat: {seq_concat}")
 
+
 if __name__ == '__main__':
     main()
diff --git a/pipeline/seq_retrieval/src/seq_region/__init__.py b/pipeline/seq_retrieval/src/seq_region/__init__.py
@@ -1,2 +1,2 @@
 from .seq_region import SeqRegion
-from .seq_region import chain_seq_region_seqs
+from .seq_region import chain_seq_region_seqs
diff --git a/pipeline/seq_retrieval/src/seq_region/seq_region.py b/pipeline/seq_retrieval/src/seq_region/seq_region.py
@@ -3,11 +3,12 @@
 """
 from typing import Any, Dict, List, Optional
 
-from Bio import Seq #Bio.Seq biopython submodule
+from Bio import Seq  # Bio.Seq biopython submodule
 import pysam
 
 from data_mover import data_file_mover
 
+
 class SeqRegion():
     """
     Defines a DNA sequence region.
@@ -52,15 +53,15 @@ def __init__(self, seq_id: str, start: int, end: int, strand: str, fasta_file_ur
         self.seq_id = seq_id
         self.strand = strand
 
-        #If strand is -, ensure start <= end (swap as required)
+        # If strand is -, ensure start <= end (swap as required)
         if strand == '-':
             if end < start:
                 self.start = end
                 self.end = start
             else:
                 self.start = start
                 self.end = end
-        #If strand is +, throw error when end < start (likely user error)
+        # If strand is +, throw error when end < start (likely user error)
         else:
             if end < start:
                 raise ValueError(f"Unexpected position order: end {end} < start {start}.")
@@ -73,19 +74,18 @@ def __init__(self, seq_id: str, start: int, end: int, strand: str, fasta_file_ur
 
         # Fetch additional faidx index files in addition to fasta file itself
         # (to the same location)
-        index_files = [fasta_file_url+'.fai']
+        index_files = [fasta_file_url + '.fai']
         if fasta_file_url.endswith('.gz'):
-            index_files.append(fasta_file_url+'.gzi')
+            index_files.append(fasta_file_url + '.gzi')
 
         for index_file in index_files:
             data_file_mover.fetch_file(index_file)
 
         self.fasta_file_path = local_fasta_file_path
 
-        if seq != None:
+        if seq is not None:
             self.sequence = seq
 
-
     def fetch_seq(self) -> None:
         """
         Fetch sequence found at `seq_id`:`start`-`end`:`strand`
@@ -100,7 +100,7 @@ def fetch_seq(self) -> None:
         except IOError:
             raise IOError(f"Error while reading fasta file or index matching path {self.fasta_file_path}.")
         else:
-            seq = fasta_file.fetch(reference=self.seq_id, start=self.start-1, end=self.end)
+            seq = fasta_file.fetch(reference=self.seq_id, start=(self.start - 1), end=self.end)
             fasta_file.close()
 
             if self.strand == '-':
@@ -132,6 +132,7 @@ def get_sequence(self) -> str:
         """Return `sequence` attribute as a string (empty string if `None`)."""
         return str(self.sequence)
 
+
 def chain_seq_region_seqs(seq_regions: List[SeqRegion], seq_strand: str) -> str:
     """
     Chain multiple SeqRegions' sequenes together into one continuous sequence.

diff --git a/pipeline/seq_retrieval/tests/requirements.txt b/pipeline/seq_retrieval/tests/requirements.txt
@@ -1 +1,2 @@
-mypy==1.9.*
+flake8==7.0.*
+mypy==1.9.*