diff --git a/.github/workflows/PR-validation.yml b/.github/workflows/PR-validation.yml index c39ae7d2..775fa4a0 100644 --- a/.github/workflows/PR-validation.yml +++ b/.github/workflows/PR-validation.yml @@ -62,5 +62,24 @@ jobs: - name: Check python style run: | make check-python-style - #TODO: add unit testing + pipeline-seq-retrieval-unit-tests: + name: pipeline/seq_retrieval unit tests + runs-on: ubuntu-22.04 + defaults: + run: + shell: bash + working-directory: ./pipeline/seq_retrieval/ + steps: + - name: Check out repository code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + sparse-checkout: | + pipeline/seq_retrieval/ + - uses: actions/setup-python@v5 + with: + python-version: '3.12' + - name: Run unit tests + run: | + make run-unit-tests #TODO: add integration testing diff --git a/.gitignore b/.gitignore index af90381a..94a59c24 100644 --- a/.gitignore +++ b/.gitignore @@ -162,3 +162,6 @@ cython_debug/ # VS Code .vscode/ + +# tmp directories used for downloading and storing local testing resources +tmp/ diff --git a/pipeline/seq_retrieval/Makefile b/pipeline/seq_retrieval/Makefile index 6f2acd17..136bdbe1 100644 --- a/pipeline/seq_retrieval/Makefile +++ b/pipeline/seq_retrieval/Makefile @@ -17,6 +17,9 @@ check-python-typing: python-dependencies python-test-dependencies check-python-style: python-dependencies python-test-dependencies flake8 ./ +run-unit-tests: python-dependencies python-test-dependencies + python -m pytest + check-venv-active: ifeq ($(VIRTUAL_ENV),) @echo 'No active python virtual environment found.'\ diff --git a/pipeline/seq_retrieval/README.md b/pipeline/seq_retrieval/README.md index add19607..d28dd118 100644 --- a/pipeline/seq_retrieval/README.md +++ b/pipeline/seq_retrieval/README.md @@ -115,10 +115,39 @@ make check-python-typing-dev `mypy` checks are automatically run and enforced as part of the PR validation and all reported errors must be fixed to enable merging each PR into `main`. -If the `pipeline/seq_retrieval check python typing` status check fails on a PR in github, +If the `pipeline/seq_retrieval python typing check` status check fails on a PR in github, click the details link and inspect the failing step output for hints on what to fix. #### Code documentation (docstrings) All modules, functions, classes and methods should have their input, attributes and output documented through docstrings to make the code easy to read and understand for anyone reading it. To ensure this is done in a uniform way accross all code, follow the [Google Python Style Guide on docstrings](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings). + +#### Style +[Flake8](https://flake8.pycqa.org/en/latest/) is being used for style Guide Enforcement. +For detailed list of all rules available through flake8, see https://www.flake8rules.com/. + +To check if your code complies with all style rules set by flake8, run the following command: +```shell +make check-python-style +``` +With the flake8 output, we can now return to the code and fix the errors reported +which would otherwise result in inconsistent code style and reduced readability. + +These style checks are automatically run and enforced as part of the PR validation +and all reported errors must be fixed to enable merging each PR into `main`. +If the `pipeline/seq_retrieval python style check` status check fails on a PR in github, +click the details link and inspect the failing step output for hints on what to fix. + +#### Unit testing +[Pytest](https://pytest.org/) is being used for unit testing. + +To run unit testing locally, run the following command: +```shell +make run-unit-tests +``` + +All unit tests are automatically run and enforced as part of the PR validation +and all reported errors must be fixed to enable merging each PR into `main`. +If the `pipeline/seq_retrieval unit tests` status check fails on a PR in github, +click the details link and inspect the failing step output for hints on what to fix. diff --git a/pipeline/seq_retrieval/pyproject.toml b/pipeline/seq_retrieval/pyproject.toml new file mode 100644 index 00000000..bbb6979b --- /dev/null +++ b/pipeline/seq_retrieval/pyproject.toml @@ -0,0 +1,5 @@ +[tool.pytest.ini_options] +addopts = [ + "--import-mode=importlib", +] +pythonpath = "src" diff --git a/pipeline/seq_retrieval/src/main.py b/pipeline/seq_retrieval/src/main.py index 0e4b0e62..47e1891f 100644 --- a/pipeline/seq_retrieval/src/main.py +++ b/pipeline/seq_retrieval/src/main.py @@ -84,7 +84,9 @@ def process_seq_regions_param(ctx, param, value: str) -> List[Dict[str, Any]]: @click.option("--reuse_local_cache", is_flag=True, help="""When defined and using remote `fasta_file_url`, reused local files if file already exists at destination path, rather than re-downloading and overwritting.""") -def main(seq_id: str, seq_strand: str, seq_regions: List, fasta_file_url: str, reuse_local_cache: bool): +@click.option("--unmasked", is_flag=True, + help="""When defined, return unmasked sequences (undo soft masking present in reference files).""") +def main(seq_id: str, seq_strand: str, seq_regions: List, fasta_file_url: str, reuse_local_cache: bool, unmasked: bool): """ Main method for sequence retrieval from JBrowse faidx indexed fasta files. Receives input args from click. @@ -92,11 +94,9 @@ def main(seq_id: str, seq_strand: str, seq_regions: List, fasta_file_url: str, r all sequence regions requested (in positional order defined by specified seq_strand). """ - click.echo(f"Received request to retrieve sequences for {seq_id}, strand {seq_strand}, seq_regions {seq_regions}!") - data_file_mover.set_local_cache_reuse(reuse_local_cache) - seq_region_objs = [] + seq_region_objs: List[SeqRegion] = [] for region in seq_regions: seq_region_objs.append(SeqRegion(seq_id=seq_id, start=region['start'], end=region['end'], strand=seq_strand, fasta_file_url=fasta_file_url)) @@ -106,8 +106,8 @@ def main(seq_id: str, seq_strand: str, seq_regions: List, fasta_file_url: str, r seq_region.fetch_seq() # Concatenate all regions into single sequence - seq_concat = chain_seq_region_seqs(seq_region_objs, seq_strand) - click.echo(f"\nSeq concat: {seq_concat}") + seq_concat = chain_seq_region_seqs(seq_region_objs, seq_strand, unmasked=unmasked) + click.echo(seq_concat) if __name__ == '__main__': diff --git a/pipeline/seq_retrieval/src/seq_region/seq_region.py b/pipeline/seq_retrieval/src/seq_region/seq_region.py index c8d777a6..475e122c 100644 --- a/pipeline/seq_retrieval/src/seq_region/seq_region.py +++ b/pipeline/seq_retrieval/src/seq_region/seq_region.py @@ -128,12 +128,25 @@ def set_sequence(self, sequence: str) -> None: else: self.sequence = sequence - def get_sequence(self) -> str: - """Return `sequence` attribute as a string (empty string if `None`).""" - return str(self.sequence) + def get_sequence(self, unmasked: bool = False) -> str: + """ + Method to return `sequence` attribute as a string (optionally with modifications). + + Args: + unmasked: Flag to remove soft masking (lowercase letters) \ + and return unmasked sequence instead (uppercase). Default `False`. + Returns: + The sequence of a seq region as a string (empty string if `None`). + """ + + seq = str(self.sequence) + if unmasked: + seq = seq.upper() + + return seq -def chain_seq_region_seqs(seq_regions: List[SeqRegion], seq_strand: str) -> str: +def chain_seq_region_seqs(seq_regions: List[SeqRegion], seq_strand: str, unmasked: bool = False) -> str: """ Chain multiple SeqRegions' sequenes together into one continuous sequence. @@ -144,6 +157,7 @@ def chain_seq_region_seqs(seq_regions: List[SeqRegion], seq_strand: str) -> str: Args: seq_regions: list of SeqRegion objects to chain together seq_strand: sequence strand which defines the chaining order + unmasked: Return unmasked sequence (undo any soft masking present in source fasta file) Returns: String representing the chained sequence of all input SeqRegions @@ -156,6 +170,6 @@ def chain_seq_region_seqs(seq_regions: List[SeqRegion], seq_strand: str) -> str: sorted_regions = seq_regions sorted_regions.sort(**sort_args) - chained_seq = ''.join(map(lambda region : region.get_sequence(), sorted_regions)) + chained_seq = ''.join(map(lambda region : region.get_sequence(unmasked=unmasked), sorted_regions)) return chained_seq diff --git a/pipeline/seq_retrieval/tests/requirements.txt b/pipeline/seq_retrieval/tests/requirements.txt index d6d09df3..7576ac0a 100644 --- a/pipeline/seq_retrieval/tests/requirements.txt +++ b/pipeline/seq_retrieval/tests/requirements.txt @@ -1,2 +1,3 @@ flake8==7.0.* mypy==1.9.* +pytest==8.0.* diff --git a/pipeline/seq_retrieval/tests/resources/GCF_000002985.6_WBcel235_genomic_X.fna.gz b/pipeline/seq_retrieval/tests/resources/GCF_000002985.6_WBcel235_genomic_X.fna.gz new file mode 100644 index 00000000..f497b023 Binary files /dev/null and b/pipeline/seq_retrieval/tests/resources/GCF_000002985.6_WBcel235_genomic_X.fna.gz differ diff --git a/pipeline/seq_retrieval/tests/resources/GCF_000002985.6_WBcel235_genomic_X.fna.gz.fai b/pipeline/seq_retrieval/tests/resources/GCF_000002985.6_WBcel235_genomic_X.fna.gz.fai new file mode 100644 index 00000000..c96f82f3 --- /dev/null +++ b/pipeline/seq_retrieval/tests/resources/GCF_000002985.6_WBcel235_genomic_X.fna.gz.fai @@ -0,0 +1 @@ +X 17718942 3 80 81 diff --git a/pipeline/seq_retrieval/tests/resources/GCF_000002985.6_WBcel235_genomic_X.fna.gz.gzi b/pipeline/seq_retrieval/tests/resources/GCF_000002985.6_WBcel235_genomic_X.fna.gz.gzi new file mode 100644 index 00000000..cc354d14 Binary files /dev/null and b/pipeline/seq_retrieval/tests/resources/GCF_000002985.6_WBcel235_genomic_X.fna.gz.gzi differ diff --git a/pipeline/seq_retrieval/tests/unit/data_mover/test_data_file_mover.py b/pipeline/seq_retrieval/tests/unit/data_mover/test_data_file_mover.py new file mode 100644 index 00000000..fd08fbae --- /dev/null +++ b/pipeline/seq_retrieval/tests/unit/data_mover/test_data_file_mover.py @@ -0,0 +1,40 @@ +""" +Unit testing for data_file_mover module +""" + +from pathlib import Path +import os.path + +from data_mover.data_file_mover import is_accessible_url, download_from_url, find_local_file, fetch_file + + +FASTA_URL = 'https://s3.amazonaws.com/agrjbrowse/fasta/GCF_000146045.2_R64_genomic.fna.gz' +DOWNLOAD_DIR = 'tests/tmp/' + + +def test_is_accessible_url(): + response = is_accessible_url(FASTA_URL) + + assert isinstance(response, bool) + assert response is True + + +def test_download_from_url(): + downloaded_file_path = download_from_url(url=FASTA_URL, dest_dir=DOWNLOAD_DIR, reuse_local_cache=False) + + expected_rel_file_path = os.path.join(DOWNLOAD_DIR, 'GCF_000146045.2_R64_genomic.fna.gz') + expected_abs_file_path = str(Path(expected_rel_file_path).resolve()) + + assert isinstance(downloaded_file_path, str) + assert downloaded_file_path == expected_abs_file_path + + assert find_local_file(expected_rel_file_path) == expected_abs_file_path + + # Test cached retrieval + download_last_modified = os.path.getmtime(filename=downloaded_file_path) + + fetched_file_path = fetch_file(url=FASTA_URL, dest_dir=DOWNLOAD_DIR, reuse_local_cache=True) + fetch_last_modified = os.path.getmtime(filename=fetched_file_path) + + # Assert fetched file has not been redownloaded but is the previously downloaded file + assert download_last_modified == fetch_last_modified diff --git a/pipeline/seq_retrieval/tests/unit/seq_region/test_seq_region.py b/pipeline/seq_retrieval/tests/unit/seq_region/test_seq_region.py new file mode 100644 index 00000000..0f7672d2 --- /dev/null +++ b/pipeline/seq_retrieval/tests/unit/seq_region/test_seq_region.py @@ -0,0 +1,51 @@ +""" +Unit testing for SeqRegion class and related functions +""" + +from seq_region import SeqRegion, chain_seq_region_seqs + + +FASTA_FILE_URL = 'file://tests/resources/GCF_000002985.6_WBcel235_genomic_X.fna.gz' + + +def test_seq_region_class(): + + # WBGene00000149 Transcript:C42D8.8b.1 Exon 1 (mRNA start) + exon_1: SeqRegion = SeqRegion(seq_id='X', start=5116799, end=5116864, strand='-', + fasta_file_url=FASTA_FILE_URL) + + assert isinstance(exon_1, SeqRegion) + + exon_1.fetch_seq() + exon_1_seq: str = exon_1.get_sequence() + + assert isinstance(exon_1_seq, str) + assert exon_1_seq == 'atgACGGTGGGTAAACTAATGATTGGCTTACTTATACCGATTCTTGTCGCCACAGTTTACGCAGAG' + + +def test_seq_region_chaining(): + + # WBGene00000149 Transcript:C42D8.8b.1 Exon 1 (mRNA start) + exon_1: SeqRegion = SeqRegion(seq_id='X', start=5116799, end=5116864, strand='-', + fasta_file_url=FASTA_FILE_URL) + EXON_1_SEQ = 'atgACGGTGGGTAAACTAATGATTGGCTTACTTATACCGATTCTTGTCGCCACAGTTTACGCAGAG' + + # WBGene00000149 Transcript:C42D8.8b.1 Exon 2 + exon_2: SeqRegion = SeqRegion(seq_id='X', start=5116171, end=5116338, strand='-', + fasta_file_url=FASTA_FILE_URL) + EXON_2_SEQ = 'ggTTCCCCAGCAGGCAGCAAGCGACATGAGAAGTTCATTCCAATGGTCGCATTTTCATGTGGATACCGCAACCAGTATATGACCGAAGAGGGATCATGGAAGACTGATGATGAACGATATGCCACCTGCTTCTCTGGCAAACTTGACATCCTCAAGTACTGCCGCAAG' + + # WBGene00000149 Transcript:C42D8.8b.1 Exon 3 + exon_3: SeqRegion = SeqRegion(seq_id='X', start=5115556, end=5115682, strand='-', + fasta_file_url=FASTA_FILE_URL) + EXON_3_SEQ = 'GCTTATCCATCCATGAACATCACCAACATCGTCGAATACTCGCACGAAGTGAGCATCTCCGACTGGTGCCGCGAGGAAGGATCGCCATGCAAGTGGACTCACAGTGTCAGACCGTACCATTGCATTG' + + seq_region_list = [exon_1, exon_2, exon_3] + + for exon in seq_region_list: + exon.fetch_seq() + + chained_seq = chain_seq_region_seqs(seq_region_list, '-') + + assert isinstance(chained_seq, str) + assert chained_seq == EXON_1_SEQ + EXON_2_SEQ + EXON_3_SEQ