diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index ab7432fc..56e1483c 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -232,7 +232,7 @@ jobs: run: harpy qc -x "--low_complexity_filter" --quiet test/fastq - name: harpy qc all options shell: micromamba-shell {0} - run: harpy qc -a -d -c 21,40,3,0 --quiet test/fastq + run: harpy qc -a auto -d -c 21,40,3,0 --quiet test/fastq deconvolve: needs: [changes, pkgbuild] if: ${{ needs.changes.outputs.deconvolve == 'true' && needs.pkgbuild.result == 'success' }} diff --git a/harpy/_validations.py b/harpy/_validations.py index 5fb039a0..13e10c98 100644 --- a/harpy/_validations.py +++ b/harpy/_validations.py @@ -351,7 +351,11 @@ def validate_regions(regioninput, genome): def check_fasta(genofile): """perform validations on fasta file for extensions and file contents""" # validate fasta file contents - opener = gzip.open if is_gzip(genofile) else open + try: + opener = gzip.open if is_gzip(genofile) else open + except: + print_error("incorrect file format", f"The file must be plain-text or b/gzipped, but failed to be recognized as either. Please check that [blue]{genofile}[/blue] is indeed a fasta file.") + sys.exit(1) mode = "rt" if is_gzip(genofile) else "r" line_num = 0 seq_id = 0 diff --git a/harpy/qc.py b/harpy/qc.py index 704feebe..a9da7138 100644 --- a/harpy/qc.py +++ b/harpy/qc.py @@ -12,6 +12,7 @@ from ._cli_types_generic import HPCProfile, IntList, SnakemakeParams from ._cli_types_params import FastpParams from ._parsers import parse_fastq_inputs +from ._validations import check_fasta docstring = { "harpy qc": [ @@ -34,7 +35,7 @@ @click.option('-n', '--min-length', default = 30, show_default = True, type=int, help = 'Discard reads shorter than this length') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "QC", show_default=True, help = 'Output directory name') @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max_open = True), help = 'Number of threads to use') -@click.option('-a', '--trim-adapters', is_flag = True, default = False, help = 'Detect and trim adapters') +@click.option('-a', '--trim-adapters', type = str, help = 'Detect and trim adapters') @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--setup-only', is_flag = True, hidden = True, show_default = True, default = False, help = 'Setup the workflow and exit') @click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @@ -49,11 +50,13 @@ def qc(inputs, output_dir, min_length, max_length, trim_adapters, deduplicate, d Provide the input fastq files and/or directories at the end of the command as individual files/folders, using shell wildcards (e.g. `data/acronotus*.fq`), or both. - The input reads will be quality trimmed using: + The input reads can be quality trimmed using: - a sliding window from front to tail - poly-G tail removal - - `-a` automatically detects and remove adapters - - `-d` finds and remove PCR duplicates + - `-a` remove adapters + - accepts `auto` to automatically detect adapters + - accepts a FASTA file of adapters to remove + - `-d` finds and removes optical PCR duplicates - `-c` resolves barcodes clashing between unrelated sequences - off by default, activated with [4 integers](https://github.com/RolandFaure/QuickDeconvolution?tab=readme-ov-file#usage), separated by commas - use `21,40,3,0` for QuickDeconvolution defaults (or adjust as needed) @@ -71,6 +74,14 @@ def qc(inputs, output_dir, min_length, max_length, trim_adapters, deduplicate, d command += snakemake fqlist, sample_count = parse_fastq_inputs(inputs) + if trim_adapters: + if trim_adapters != "auto": + if not os.path.exists(trim_adapters): + raise click.BadParameter(f"--trim-adapters was given {trim_adapters}, but that file does not exist. Please check the spelling or verify the location of the file.") + if not os.access(trim_adapters, os.R_OK): + raise click.BadParameter(f"--trim-adapters was given {trim_adapters}, but that file does not have read permissions. Please modify the persmissions of the file to grant read access.") + check_fasta(trim_adapters) + os.makedirs(workflowdir, exist_ok=True) fetch_rule(workflowdir, "qc.smk") fetch_report(workflowdir, "bx_count.Rmd") diff --git a/harpy/snakefiles/qc.smk b/harpy/snakefiles/qc.smk index 8752cfbd..f450656b 100644 --- a/harpy/snakefiles/qc.smk +++ b/harpy/snakefiles/qc.smk @@ -19,7 +19,7 @@ envdir = os.path.join(os.getcwd(), outdir, "workflow", "envs") min_len = config["min_len"] max_len = config["max_len"] extra = config.get("extra", "") -trimadapters = config["trim_adapters"] +trim_adapters = config.get("trim_adapters", None) dedup = config["deduplicate"] deconvolve = config.get("deconvolve", False) if deconvolve: @@ -30,6 +30,10 @@ if deconvolve: skip_reports = config["reports"]["skip"] bn_r = r"([_\.][12]|[_\.][FR]|[_\.]R[12](?:\_00[0-9])*)?\.((fastq|fq)(\.gz)?)$" samplenames = {re.sub(bn_r, "", os.path.basename(i), flags = re.IGNORECASE) for i in fqlist} +if trim_adapters: + trim_arg = "--detect_adapter_for_pe" if trim_adapters == "auto" else f"--adapter_fasta {trim_adapters}" +else: + trim_arg = "--disable_adapter_trimming" def get_fq1(wildcards): # returns a list of fastq files for read 1 based on *wildcards.sample* e.g. @@ -56,7 +60,7 @@ if not deconvolve: params: minlen = f"--length_required {min_len}", maxlen = f"--max_len1 {max_len}", - trim_adapters = "--detect_adapter_for_pe" if trimadapters else "--disable_adapter_trimming" , + trim_adapters = trim_arg, dedup = "-D" if dedup else "", extra = extra threads: @@ -81,7 +85,7 @@ else: params: minlen = f"--length_required {min_len}", maxlen = f"--max_len1 {max_len}", - trim_adapters = "--detect_adapter_for_pe" if trimadapters else "--disable_adapter_trimming" , + trim_adapters = trim_arg, dedup = "-D" if dedup else "", extra = extra threads: @@ -177,7 +181,7 @@ rule workflow_summary: params: minlen = f"--length_required {min_len}", maxlen = f"--max_len1 {max_len}", - trim_adapters = "--detect_adapter_for_pe" if trimadapters else "--disable_adapter_trimming" , + trim_adapters = trim_arg, dedup = "-D" if dedup else "", extra = extra run: