alliance-genome · mluypaert · Mar 15, 2024 · Mar 14, 2024 · Mar 15, 2024 · Mar 15, 2024
@@ -62,5 +62,24 @@ jobs:
       - name: Check python style
         run: |
           make check-python-style
-  #TODO: add unit testing
+  pipeline-seq-retrieval-unit-tests:
+    name: pipeline/seq_retrieval unit tests
+    runs-on: ubuntu-22.04
+    defaults:
+      run:
+        shell: bash
+        working-directory: ./pipeline/seq_retrieval/
+    steps:
+      - name: Check out repository code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          sparse-checkout: |
+            pipeline/seq_retrieval/
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+      - name: Run unit tests
+        run: |
+          make run-unit-tests
   #TODO: add integration testing
diff --git a/.gitignore b/.gitignore
@@ -162,3 +162,6 @@ cython_debug/
 
 # VS Code
 .vscode/
+
+# tmp directories used for downloading and storing local testing resources
+tmp/
diff --git a/pipeline/seq_retrieval/Makefile b/pipeline/seq_retrieval/Makefile
@@ -17,6 +17,9 @@ check-python-typing: python-dependencies python-test-dependencies
 check-python-style: python-dependencies python-test-dependencies
 	flake8 ./
 
+run-unit-tests: python-dependencies python-test-dependencies
+	python -m pytest
+
 check-venv-active:
 ifeq ($(VIRTUAL_ENV),)
 	@echo 'No active python virtual environment found.'\

diff --git a/pipeline/seq_retrieval/README.md b/pipeline/seq_retrieval/README.md
@@ -115,10 +115,39 @@ make check-python-typing-dev
 
 `mypy` checks are automatically run and enforced as part of the PR validation
 and all reported errors must be fixed to enable merging each PR into `main`.  
-If the `pipeline/seq_retrieval check python typing` status check fails on a PR in github,
+If the `pipeline/seq_retrieval python typing check` status check fails on a PR in github,
 click the details link and inspect the failing step output for hints on what to fix.
 
 #### Code documentation (docstrings)
 All modules, functions, classes and methods should have their input, attributes and output documented
 through docstrings to make the code easy to read and understand for anyone reading it.
 To ensure this is done in a uniform way accross all code, follow the [Google Python Style Guide on docstrings](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings).
+
+#### Style
+[Flake8](https://flake8.pycqa.org/en/latest/) is being used for style Guide Enforcement.
+For detailed list of all rules available through flake8, see https://www.flake8rules.com/.
+
+To check if your code complies with all style rules set by flake8, run the following command:
+```shell
+make check-python-style
+```
+With the flake8 output, we can now return to the code and fix the errors reported
+which would otherwise result in inconsistent code style and reduced readability.
+
+These style checks are automatically run and enforced as part of the PR validation
+and all reported errors must be fixed to enable merging each PR into `main`.  
+If the `pipeline/seq_retrieval python style check` status check fails on a PR in github,
+click the details link and inspect the failing step output for hints on what to fix.
+
+#### Unit testing
+[Pytest](https://pytest.org/) is being used for unit testing.
+
+To run unit testing locally, run the following command:
+```shell
+make run-unit-tests
+```
+
+All unit tests are automatically run and enforced as part of the PR validation
+and all reported errors must be fixed to enable merging each PR into `main`.  
+If the `pipeline/seq_retrieval unit tests` status check fails on a PR in github,
+click the details link and inspect the failing step output for hints on what to fix.
diff --git a/pipeline/seq_retrieval/pyproject.toml b/pipeline/seq_retrieval/pyproject.toml
@@ -0,0 +1,5 @@
+[tool.pytest.ini_options]
+addopts = [
+    "--import-mode=importlib",
+]
+pythonpath = "src"
diff --git a/pipeline/seq_retrieval/src/main.py b/pipeline/seq_retrieval/src/main.py
@@ -84,19 +84,19 @@ def process_seq_regions_param(ctx, param, value: str) -> List[Dict[str, Any]]:
 @click.option("--reuse_local_cache", is_flag=True,
               help="""When defined and using remote `fasta_file_url`, reused local files
               if file already exists at destination path, rather than re-downloading and overwritting.""")
-def main(seq_id: str, seq_strand: str, seq_regions: List, fasta_file_url: str, reuse_local_cache: bool):
+@click.option("--unmasked", is_flag=True,
+              help="""When defined, return unmasked sequences (undo soft masking present in reference files).""")
+def main(seq_id: str, seq_strand: str, seq_regions: List, fasta_file_url: str, reuse_local_cache: bool, unmasked: bool):
     """
     Main method for sequence retrieval from JBrowse faidx indexed fasta files. Receives input args from click.
 
     Prints a single (transcript) sequence obtained by concatenating the sequence of
     all sequence regions requested (in positional order defined by specified seq_strand).
     """
 
-    click.echo(f"Received request to retrieve sequences for {seq_id}, strand {seq_strand}, seq_regions {seq_regions}!")
-
     data_file_mover.set_local_cache_reuse(reuse_local_cache)
 
-    seq_region_objs = []
+    seq_region_objs: List[SeqRegion] = []
     for region in seq_regions:
         seq_region_objs.append(SeqRegion(seq_id=seq_id, start=region['start'], end=region['end'], strand=seq_strand,
                                          fasta_file_url=fasta_file_url))
@@ -106,8 +106,8 @@ def main(seq_id: str, seq_strand: str, seq_regions: List, fasta_file_url: str, r
         seq_region.fetch_seq()
 
     # Concatenate all regions into single sequence
-    seq_concat = chain_seq_region_seqs(seq_region_objs, seq_strand)
-    click.echo(f"\nSeq concat: {seq_concat}")
+    seq_concat = chain_seq_region_seqs(seq_region_objs, seq_strand, unmasked=unmasked)
+    click.echo(seq_concat)
 
 
 if __name__ == '__main__':

diff --git a/pipeline/seq_retrieval/src/seq_region/seq_region.py b/pipeline/seq_retrieval/src/seq_region/seq_region.py
@@ -128,12 +128,25 @@ def set_sequence(self, sequence: str) -> None:
         else:
             self.sequence = sequence
 
-    def get_sequence(self) -> str:
-        """Return `sequence` attribute as a string (empty string if `None`)."""
-        return str(self.sequence)
+    def get_sequence(self, unmasked: bool = False) -> str:
+        """
+        Method to return `sequence` attribute as a string (optionally with modifications).
+
+        Args:
+            unmasked: Flag to remove soft masking (lowercase letters) \
+                      and return unmasked sequence instead (uppercase). Default `False`.
+        Returns:
+            The sequence of a seq region as a string (empty string if `None`).
+        """
+
+        seq = str(self.sequence)
+        if unmasked:
+            seq = seq.upper()
+
+        return seq
 
 
-def chain_seq_region_seqs(seq_regions: List[SeqRegion], seq_strand: str) -> str:
+def chain_seq_region_seqs(seq_regions: List[SeqRegion], seq_strand: str, unmasked: bool = False) -> str:
     """
     Chain multiple SeqRegions' sequenes together into one continuous sequence.
 
@@ -144,6 +157,7 @@ def chain_seq_region_seqs(seq_regions: List[SeqRegion], seq_strand: str) -> str:
     Args:
         seq_regions: list of SeqRegion objects to chain together
         seq_strand: sequence strand which defines the chaining order
+        unmasked: Return unmasked sequence (undo any soft masking present in source fasta file)
 
     Returns:
         String representing the chained sequence of all input SeqRegions
@@ -156,6 +170,6 @@ def chain_seq_region_seqs(seq_regions: List[SeqRegion], seq_strand: str) -> str:
 
     sorted_regions = seq_regions
     sorted_regions.sort(**sort_args)
-    chained_seq = ''.join(map(lambda region : region.get_sequence(), sorted_regions))
+    chained_seq = ''.join(map(lambda region : region.get_sequence(unmasked=unmasked), sorted_regions))
 
     return chained_seq
diff --git a/pipeline/seq_retrieval/tests/requirements.txt b/pipeline/seq_retrieval/tests/requirements.txt
@@ -1,2 +1,3 @@
 flake8==7.0.*
 mypy==1.9.*
+pytest==8.0.*
diff --git a/pipeline/seq_retrieval/tests/resources/GCF_000002985.6_WBcel235_genomic_X.fna.gz b/pipeline/seq_retrieval/tests/resources/GCF_000002985.6_WBcel235_genomic_X.fna.gz
diff --git a/pipeline/seq_retrieval/tests/resources/GCF_000002985.6_WBcel235_genomic_X.fna.gz.fai b/pipeline/seq_retrieval/tests/resources/GCF_000002985.6_WBcel235_genomic_X.fna.gz.fai
@@ -0,0 +1 @@
+X	17718942	3	80	81
diff --git a/pipeline/seq_retrieval/tests/resources/GCF_000002985.6_WBcel235_genomic_X.fna.gz.gzi b/pipeline/seq_retrieval/tests/resources/GCF_000002985.6_WBcel235_genomic_X.fna.gz.gzi
diff --git a/pipeline/seq_retrieval/tests/unit/data_mover/test_data_file_mover.py b/pipeline/seq_retrieval/tests/unit/data_mover/test_data_file_mover.py
@@ -0,0 +1,40 @@
+"""
+Unit testing for data_file_mover module
+"""
+
+from pathlib import Path
+import os.path
+
+from data_mover.data_file_mover import is_accessible_url, download_from_url, find_local_file, fetch_file
+
+
+FASTA_URL = 'https://s3.amazonaws.com/agrjbrowse/fasta/GCF_000146045.2_R64_genomic.fna.gz'
+DOWNLOAD_DIR = 'tests/tmp/'
+
+
+def test_is_accessible_url():
+    response = is_accessible_url(FASTA_URL)
+
+    assert isinstance(response, bool)
+    assert response is True
+
+
+def test_download_from_url():
+    downloaded_file_path = download_from_url(url=FASTA_URL, dest_dir=DOWNLOAD_DIR, reuse_local_cache=False)
+
+    expected_rel_file_path = os.path.join(DOWNLOAD_DIR, 'GCF_000146045.2_R64_genomic.fna.gz')
+    expected_abs_file_path = str(Path(expected_rel_file_path).resolve())
+
+    assert isinstance(downloaded_file_path, str)
+    assert downloaded_file_path == expected_abs_file_path
+
+    assert find_local_file(expected_rel_file_path) == expected_abs_file_path
+
+    # Test cached retrieval
+    download_last_modified = os.path.getmtime(filename=downloaded_file_path)
+
+    fetched_file_path = fetch_file(url=FASTA_URL, dest_dir=DOWNLOAD_DIR, reuse_local_cache=True)
+    fetch_last_modified = os.path.getmtime(filename=fetched_file_path)
+
+    # Assert fetched file has not been redownloaded but is the previously downloaded file
+    assert download_last_modified == fetch_last_modified
diff --git a/pipeline/seq_retrieval/tests/unit/seq_region/test_seq_region.py b/pipeline/seq_retrieval/tests/unit/seq_region/test_seq_region.py
@@ -0,0 +1,51 @@
+"""
+Unit testing for SeqRegion class and related functions
+"""
+
+from seq_region import SeqRegion, chain_seq_region_seqs
+
+
+FASTA_FILE_URL = 'file://tests/resources/GCF_000002985.6_WBcel235_genomic_X.fna.gz'
+
+
+def test_seq_region_class():
+
+    # WBGene00000149 Transcript:C42D8.8b.1 Exon 1 (mRNA start)
+    exon_1: SeqRegion = SeqRegion(seq_id='X', start=5116799, end=5116864, strand='-',
+                                  fasta_file_url=FASTA_FILE_URL)
+
+    assert isinstance(exon_1, SeqRegion)
+
+    exon_1.fetch_seq()
+    exon_1_seq: str = exon_1.get_sequence()
+
+    assert isinstance(exon_1_seq, str)
+    assert exon_1_seq == 'atgACGGTGGGTAAACTAATGATTGGCTTACTTATACCGATTCTTGTCGCCACAGTTTACGCAGAG'
+
+
+def test_seq_region_chaining():
+
+    # WBGene00000149 Transcript:C42D8.8b.1 Exon 1 (mRNA start)
+    exon_1: SeqRegion = SeqRegion(seq_id='X', start=5116799, end=5116864, strand='-',
+                                  fasta_file_url=FASTA_FILE_URL)
+    EXON_1_SEQ = 'atgACGGTGGGTAAACTAATGATTGGCTTACTTATACCGATTCTTGTCGCCACAGTTTACGCAGAG'
+
+    # WBGene00000149 Transcript:C42D8.8b.1 Exon 2
+    exon_2: SeqRegion = SeqRegion(seq_id='X', start=5116171, end=5116338, strand='-',
+                                  fasta_file_url=FASTA_FILE_URL)
+    EXON_2_SEQ = 'ggTTCCCCAGCAGGCAGCAAGCGACATGAGAAGTTCATTCCAATGGTCGCATTTTCATGTGGATACCGCAACCAGTATATGACCGAAGAGGGATCATGGAAGACTGATGATGAACGATATGCCACCTGCTTCTCTGGCAAACTTGACATCCTCAAGTACTGCCGCAAG'
+
+    # WBGene00000149 Transcript:C42D8.8b.1 Exon 3
+    exon_3: SeqRegion = SeqRegion(seq_id='X', start=5115556, end=5115682, strand='-',
+                                  fasta_file_url=FASTA_FILE_URL)
+    EXON_3_SEQ = 'GCTTATCCATCCATGAACATCACCAACATCGTCGAATACTCGCACGAAGTGAGCATCTCCGACTGGTGCCGCGAGGAAGGATCGCCATGCAAGTGGACTCACAGTGTCAGACCGTACCATTGCATTG'
+
+    seq_region_list = [exon_1, exon_2, exon_3]
+
+    for exon in seq_region_list:
+        exon.fetch_seq()
+
+    chained_seq = chain_seq_region_seqs(seq_region_list, '-')
+
+    assert isinstance(chained_seq, str)
+    assert chained_seq == EXON_1_SEQ + EXON_2_SEQ + EXON_3_SEQ