From a7c4f165757e611bdf27f49aab125deca721cc35 Mon Sep 17 00:00:00 2001 From: pdimens Date: Thu, 31 Oct 2024 16:52:49 -0400 Subject: [PATCH 01/27] add harpytype --- harpy/_cli_types.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/harpy/_cli_types.py b/harpy/_cli_types.py index 1bd7d190a..559605923 100644 --- a/harpy/_cli_types.py +++ b/harpy/_cli_types.py @@ -52,3 +52,21 @@ def convert(self, value, param, ctx): else: return [i.strip() for i in value.split(',')] +class HarpyFile(click.ParamType): + """A class for a click type that verifies that a file exists and that it has an expected extension""" + def __init__(self, extensions): + super().__init__() + self.extensions = extensions + + def convert(self, value, param, ctx): + if not os.path.exists(value): + self.fail(f"{value} does not exist. Please check the spelling and try again.", param, ctx) + if os.path.isdir(value): + self.fail(f"{value} is a directory, but input should be a file.", param, ctx) + valid = False + lowercase = value.lower() + extensions = [extensions] if not isinstance(extensions, list) else extensions + for ext in extensions: + valid = True if lowercase.endswith(ext) else valid + if not valid: + self.fail(f"{value} does not end with one of the expected extensions [" + ", ".join(extensions) + "]. Please verify this is the correct file type and rename the extension for compatibility.", param, ctx) \ No newline at end of file From 00a5ddf409f12fdbe1f4c4ab91442e255460faa1 Mon Sep 17 00:00:00 2001 From: pdimens Date: Fri, 1 Nov 2024 09:45:41 -0400 Subject: [PATCH 02/27] improve genome input --- harpy/_cli_types.py | 18 ++++++++++++------ harpy/align.py | 8 ++++---- harpy/phase.py | 4 ++-- harpy/snp.py | 5 +++-- harpy/sv.py | 6 +++--- 5 files changed, 24 insertions(+), 17 deletions(-) diff --git a/harpy/_cli_types.py b/harpy/_cli_types.py index 559605923..c68d63df6 100644 --- a/harpy/_cli_types.py +++ b/harpy/_cli_types.py @@ -52,21 +52,27 @@ def convert(self, value, param, ctx): else: return [i.strip() for i in value.split(',')] -class HarpyFile(click.ParamType): +class InputFile(click.ParamType): """A class for a click type that verifies that a file exists and that it has an expected extension""" - def __init__(self, extensions): + def __init__(self, extensions, gzip_ok): super().__init__() self.extensions = extensions - + self.gzip_ok = gzip_ok def convert(self, value, param, ctx): if not os.path.exists(value): self.fail(f"{value} does not exist. Please check the spelling and try again.", param, ctx) + elif not os.access(value, os.R_OK): + self.fail(f"{value} is not readable. Please check file/directory permissions and try again", param, ctx) if os.path.isdir(value): self.fail(f"{value} is a directory, but input should be a file.", param, ctx) valid = False lowercase = value.lower() - extensions = [extensions] if not isinstance(extensions, list) else extensions + extensions = [self.extensions] if not isinstance(self.extensions, list) else self.extensions for ext in extensions: valid = True if lowercase.endswith(ext) else valid - if not valid: - self.fail(f"{value} does not end with one of the expected extensions [" + ", ".join(extensions) + "]. Please verify this is the correct file type and rename the extension for compatibility.", param, ctx) \ No newline at end of file + if self.gzip_ok: + valid = True if lowercase.endswith(ext + ".gz") else valid + if not valid and not self.gzip_ok: + self.fail(f"{value} does not end with one of the expected extensions [" + ", ".join(extensions) + "]. Please verify this is the correct file type and rename the extension for compatibility.", param, ctx) + if not valid and self.gzip_ok: + self.fail(f"{value} does not end with one of the expected extensions [" + ", ".join(extensions) + "]. Please verify this is the correct file type and rename the extension for compatibility. Gzip compression (ending in .gz) is allowed.", param, ctx) diff --git a/harpy/align.py b/harpy/align.py index 8872db79a..4f77b9a5d 100644 --- a/harpy/align.py +++ b/harpy/align.py @@ -10,7 +10,7 @@ import rich_click as click from ._conda import create_conda_recipes from ._misc import fetch_report, fetch_rule, snakemake_log -from ._cli_types import ContigList +from ._cli_types import ContigList, InputFile from ._launch import launch_snakemake from ._parsers import parse_fastq_inputs from ._printing import print_error, print_solution, print_notice @@ -65,7 +65,7 @@ def align(): } @click.command(no_args_is_help = True, epilog= "Documentation: https://pdimens.github.io/harpy/workflows/align/bwa/") -@click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False, readable=True), required = True, help = 'Genome assembly for read mapping') +@click.option('-g', '--genome', type=InputFile([".gz", ".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), required = True, help = 'Genome assembly for read mapping') @click.option('-w', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats') @click.option('-x', '--extra-params', type = str, help = 'Additional bwa mem parameters, in quotes') @click.option('-u', '--keep-unmapped', is_flag = True, default = False, help = 'Retain unmapped sequences in the output') @@ -152,7 +152,7 @@ def bwa(inputs, output_dir, genome, depth_window, threads, keep_unmapped, extra_ @click.option('-x', '--extra-params', type = str, help = 'Additional ema align parameters, in quotes') @click.option('-w', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats') @click.option('-b', '--ema-bins', default = 500, show_default = True, type = click.IntRange(1,1000), help="Number of barcode bins") -@click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False, readable=True), required = True, help = 'Genome assembly for read mapping') +@click.option('-g', '--genome', type=InputFile([".gz", ".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), required = True, help = 'Genome assembly for read mapping') @click.option('-u', '--keep-unmapped', is_flag = True, default = False, help = 'Retain unmapped sequences in the output') @click.option('-q', '--min-quality', default = 30, show_default = True, type = click.IntRange(min = 0, max = 40), help = 'Minimum mapping quality to pass filtering') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Align/ema", show_default=True, help = 'Output directory name') @@ -254,7 +254,7 @@ def ema(inputs, output_dir, platform, barcode_list, genome, depth_window, keep_u launch_snakemake(command, "align_ema", start_text, output_dir, sm_log, quiet, "workflow/align.ema.summary") @click.command(no_args_is_help = True, epilog= "Documentation: https://pdimens.github.io/harpy/workflows/align/strobe/") -@click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False, readable=True), required = True, help = 'Genome assembly for read mapping') +@click.option('-g', '--genome', type=InputFile([".gz", ".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), required = True, help = 'Genome assembly for read mapping') @click.option('-w', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats') @click.option('-x', '--extra-params', type = str, help = 'Additional aligner parameters, in quotes') @click.option('-u', '--keep-unmapped', is_flag = True, default = False, help = 'Retain unmapped sequences in the output') diff --git a/harpy/phase.py b/harpy/phase.py index c5e650bd4..4e33ed7f5 100644 --- a/harpy/phase.py +++ b/harpy/phase.py @@ -9,7 +9,7 @@ from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, fetch_report, snakemake_log -from ._cli_types import ContigList +from ._cli_types import ContigList, InputFile from ._parsers import parse_alignment_inputs from ._validations import check_fasta, vcf_sample_match, validate_bam_RG, validate_input_by_ext, vcf_contig_match @@ -28,7 +28,7 @@ @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/phase") @click.option('-x', '--extra-params', type = str, help = 'Additional HapCut2 parameters, in quotes') -@click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False, readable=True), help = 'Path to genome assembly if wanting to also extract reads spanning indels') +@click.option('-g', '--genome', type=InputFile([".gz", ".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), help = 'Path to genome assembly if wanting to also extract reads spanning indels') @click.option('-b', '--ignore-bx', is_flag = True, show_default = True, default = False, help = 'Ignore barcodes when phasing') @click.option('-d', '--molecule-distance', default = 100000, show_default = True, type = int, help = 'Distance cutoff to split molecules (bp)') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Phase", show_default=True, help = 'Output directory name') diff --git a/harpy/snp.py b/harpy/snp.py index becb7498c..9903f1730 100644 --- a/harpy/snp.py +++ b/harpy/snp.py @@ -7,6 +7,7 @@ from rich import box from rich.table import Table import rich_click as click +from ._cli_types import InputFile from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, fetch_report, snakemake_log @@ -48,7 +49,7 @@ def snp(): @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/snp") @click.option('-x', '--extra-params', type = str, help = 'Additional variant caller parameters, in quotes') -@click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False, readable=True), required = True, help = 'Genome assembly for variant calling') +@click.option('-g', '--genome', type=InputFile([".gz", ".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), required = True, help = 'Genome assembly for variant calling') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "SNP/mpileup", show_default=True, help = 'Output directory name') @click.option('-n', '--ploidy', default = 2, show_default = True, type=click.IntRange(min = 1, max = 2), help = 'Ploidy of samples') @click.option('-p', '--populations', type=click.Path(exists = True, dir_okay=False, readable=True), help = "Tab-delimited file of sample\population") @@ -153,7 +154,7 @@ def mpileup(inputs, output_dir, regions, genome, threads, populations, ploidy, e @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/snp") @click.option('-x', '--extra-params', type = str, help = 'Additional variant caller parameters, in quotes') -@click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False, readable=True), required = True, help = 'Genome assembly for variant calling') +@click.option('-g', '--genome', type=InputFile([".gz", ".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), required = True, help = 'Genome assembly for variant calling') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "SNP/freebayes", show_default=True, help = 'Output directory name') @click.option('-n', '--ploidy', default = 2, show_default = True, type=click.IntRange(min=1, max_open=True), help = 'Ploidy of samples') @click.option('-p', '--populations', type=click.Path(exists = True, dir_okay=False, readable=True), help = "Tab-delimited file of sample\population") diff --git a/harpy/sv.py b/harpy/sv.py index 40c2b5eca..1c84b170c 100644 --- a/harpy/sv.py +++ b/harpy/sv.py @@ -7,10 +7,10 @@ from rich import box from rich.table import Table import rich_click as click +from ._cli_types import ContigList, InputFile from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, fetch_report, snakemake_log -from ._cli_types import ContigList from ._parsers import parse_alignment_inputs from ._validations import check_fasta, check_phase_vcf from ._validations import validate_popfile, validate_popsamples, fasta_contig_match @@ -53,7 +53,7 @@ def sv(): @click.command(no_args_is_help = True, epilog= "Documentation: https://pdimens.github.io/harpy/workflows/sv/leviathan/") @click.option('-x', '--extra-params', type = str, help = 'Additional variant caller parameters, in quotes') -@click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False, readable=True), required = True, help = 'Genome assembly for variant calling') +@click.option('-g', '--genome', type=InputFile([".gz", ".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), required = True, help = 'Genome assembly for variant calling') @click.option('-i', '--iterations', show_default = True, default=50, type = click.IntRange(min = 10, max_open = True), help = 'Number of iterations to perform through index (reduces memory)') @click.option('-s', '--min-sv', type = click.IntRange(min = 10, max_open = True), default = 1000, show_default=True, help = 'Minimum size of SV to detect') @click.option('-b', '--min-barcodes', show_default = True, default=2, type = click.IntRange(min = 1, max_open = True), help = 'Minimum number of barcode overlaps supporting candidate SV') @@ -142,7 +142,7 @@ def leviathan(inputs, output_dir, genome, min_sv, min_barcodes, iterations, thre @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/sv/naibr/") @click.option('-x', '--extra-params', type = str, help = 'Additional variant caller parameters, in quotes') -@click.option('-g', '--genome', required = True, type=click.Path(exists=True, dir_okay=False, readable=True), help = 'Genome assembly') +@click.option('-g', '--genome', type=InputFile([".gz", ".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), required = True, help = 'Genome assembly for calling variants') @click.option('-b', '--min-barcodes', show_default = True, default=2, type = click.IntRange(min = 1, max_open = True), help = 'Minimum number of barcode overlaps supporting candidate SV') @click.option('-q', '--min-quality', show_default = True, default=30, type = click.IntRange(min = 0, max = 40), help = 'Minimum mapping quality of reads to use') @click.option('-s', '--min-sv', type = click.IntRange(min = 10, max_open = True), default = 1000, show_default=True, help = 'Minimum size of SV to detect') From 7b6d367a7ddb118348fab81c504080d10803d202 Mon Sep 17 00:00:00 2001 From: pdimens Date: Fri, 1 Nov 2024 09:48:22 -0400 Subject: [PATCH 03/27] fix fasta checks for new logic --- harpy/_validations.py | 10 +--------- harpy/align.py | 6 +++--- harpy/phase.py | 2 +- harpy/simulate.py | 8 ++++---- harpy/snp.py | 4 ++-- harpy/sv.py | 4 ++-- 6 files changed, 13 insertions(+), 21 deletions(-) diff --git a/harpy/_validations.py b/harpy/_validations.py index 5c85af42d..2e1ae35ca 100644 --- a/harpy/_validations.py +++ b/harpy/_validations.py @@ -425,16 +425,8 @@ def validate_regions(regioninput, genome): sys.exit(1) return "file" -def check_fasta(genofile, quiet): +def check_fasta(genofile): """perform validations on fasta file for extensions and file contents""" - ext_options = [".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".mpfa", ".frn"] - ext_correct = 0 - for i in ext_options: - if genofile.lower().endswith(i) or genofile.lower().endswith(i + ".gz"): - ext_correct += 1 - if ext_correct == 0 and not quiet: - print_notice(f"[blue]{genofile}[/blue] has an unfamiliar FASTA file extension. Common FASTA file extensions are: [green]" + ", ".join(ext_options) + "[/green] and may also be gzipped.") - # validate fasta file contents if is_gzip(genofile): fasta = gzip.open(genofile, 'rt') diff --git a/harpy/align.py b/harpy/align.py index 4f77b9a5d..08612cce1 100644 --- a/harpy/align.py +++ b/harpy/align.py @@ -105,7 +105,7 @@ def bwa(inputs, output_dir, genome, depth_window, threads, keep_unmapped, extra_ os.makedirs(f"{workflowdir}/", exist_ok= True) fqlist, sample_count = parse_fastq_inputs(inputs) - check_fasta(genome, quiet) + check_fasta(genome) if contigs: fasta_contig_match(contigs, genome) fetch_rule(workflowdir, "align_bwa.smk") @@ -206,7 +206,7 @@ def ema(inputs, output_dir, platform, barcode_list, genome, depth_window, keep_u os.makedirs(f"{workflowdir}/", exist_ok= True) fqlist, sample_count = parse_fastq_inputs(inputs) - check_fasta(genome, quiet) + check_fasta(genome) if contigs: fasta_contig_match(contigs, genome) fetch_rule(workflowdir, "align_ema.smk") @@ -298,7 +298,7 @@ def strobe(inputs, output_dir, genome, read_length, keep_unmapped, depth_window, os.makedirs(f"{workflowdir}/", exist_ok= True) fqlist, sample_count = parse_fastq_inputs(inputs) - check_fasta(genome, quiet) + check_fasta(genome) if contigs: fasta_contig_match(contigs, genome) fetch_rule(workflowdir, "align_strobealign.smk") diff --git a/harpy/phase.py b/harpy/phase.py index 4e33ed7f5..a43c6d297 100644 --- a/harpy/phase.py +++ b/harpy/phase.py @@ -73,7 +73,7 @@ def phase(inputs, output_dir, vcf, threads, molecule_distance, prune_threshold, validate_input_by_ext(vcf, "--vcf", ["vcf", "bcf", "vcf.gz"]) validate_bam_RG(bamlist, threads, quiet) if genome: - check_fasta(genome, quiet) + check_fasta(genome) if contigs: vcf_contig_match(contigs, vcf) fetch_rule(workflowdir, "phase.smk") diff --git a/harpy/simulate.py b/harpy/simulate.py index b31124024..3fa0d4f9e 100644 --- a/harpy/simulate.py +++ b/harpy/simulate.py @@ -271,7 +271,7 @@ def snpindel(genome, snp_vcf, indel_vcf, only_vcf, output_dir, prefix, snp_count # instantiate workflow directory # move necessary files to workflow dir os.makedirs(f"{workflowdir}/input/", exist_ok= True) - check_fasta(genome, quiet) + check_fasta(genome) start_text.add_row("Input Genome:", os.path.basename(genome)) if snp_vcf: validate_input_by_ext(snp_vcf, "--snp-vcf", ["vcf","vcf.gz","bcf"]) @@ -387,7 +387,7 @@ def inversion(genome, vcf, only_vcf, prefix, output_dir, count, min_size, max_si # instantiate workflow directory # move necessary files to workflow dir os.makedirs(f"{workflowdir}/input/", exist_ok= True) - check_fasta(genome, quiet) + check_fasta(genome) start_text = Table(show_header=False,pad_edge=False, show_edge=False, padding = (0,0), box=box.SIMPLE) start_text.add_column("detail", justify="left", style="light_steel_blue", no_wrap=True) start_text.add_column(header="value", justify="left") @@ -505,7 +505,7 @@ def cnv(genome, output_dir, vcf, only_vcf, prefix, count, min_size, max_size, du # instantiate workflow directory # move necessary files to workflow dir os.makedirs(f"{workflowdir}/input/", exist_ok= True) - check_fasta(genome, quiet) + check_fasta(genome) start_text = Table(show_header=False,pad_edge=False, show_edge=False, padding = (0,0), box=box.SIMPLE) start_text.add_column("detail", justify="left", style="light_steel_blue", no_wrap=True) start_text.add_column(header="value", justify="left") @@ -613,7 +613,7 @@ def translocation(genome, output_dir, prefix, vcf, only_vcf, count, centromeres, # instantiate workflow directory # move necessary files to workflow dir os.makedirs(f"{workflowdir}/input/", exist_ok= True) - check_fasta(genome, quiet) + check_fasta(genome) start_text = Table(show_header=False,pad_edge=False, show_edge=False, padding = (0,0), box=box.SIMPLE) start_text.add_column("detail", justify="left", style="light_steel_blue", no_wrap=True) start_text.add_column(header="value", justify="left") diff --git a/harpy/snp.py b/harpy/snp.py index 9903f1730..09c2ae677 100644 --- a/harpy/snp.py +++ b/harpy/snp.py @@ -94,7 +94,7 @@ def mpileup(inputs, output_dir, regions, genome, threads, populations, ploidy, e os.makedirs(f"{workflowdir}/", exist_ok= True) bamlist, n = parse_alignment_inputs(inputs) validate_bam_RG(bamlist, threads, quiet) - check_fasta(genome, quiet) + check_fasta(genome) # setup regions checks regtype = validate_regions(regions, genome) @@ -199,7 +199,7 @@ def freebayes(inputs, output_dir, genome, threads, populations, ploidy, regions, os.makedirs(f"{workflowdir}/", exist_ok= True) bamlist, n = parse_alignment_inputs(inputs) validate_bam_RG(bamlist, threads, quiet) - check_fasta(genome, quiet) + check_fasta(genome) # setup regions checks regtype = validate_regions(regions, genome) diff --git a/harpy/sv.py b/harpy/sv.py index 1c84b170c..72c939992 100644 --- a/harpy/sv.py +++ b/harpy/sv.py @@ -93,7 +93,7 @@ def leviathan(inputs, output_dir, genome, min_sv, min_barcodes, iterations, thre os.makedirs(f"{workflowdir}/", exist_ok= True) bamlist, n = parse_alignment_inputs(inputs) - check_fasta(genome, quiet) + check_fasta(genome) if contigs: fasta_contig_match(contigs, genome) if populations: @@ -191,7 +191,7 @@ def naibr(inputs, output_dir, genome, vcf, min_sv, min_barcodes, min_quality, th os.makedirs(f"{workflowdir}/", exist_ok= True) bamlist, n = parse_alignment_inputs(inputs) - check_fasta(genome, quiet) + check_fasta(genome) if contigs: fasta_contig_match(contigs, genome) if populations: From a2af4eea63151877eeda616ae2b79419db82fef5 Mon Sep 17 00:00:00 2001 From: pdimens Date: Fri, 1 Nov 2024 10:03:31 -0400 Subject: [PATCH 04/27] rm validate by ext --- harpy/_validations.py | 13 ----------- harpy/align.py | 6 +++--- harpy/impute.py | 8 +++---- harpy/phase.py | 7 +++--- harpy/simulate.py | 50 ++++++++++++++++--------------------------- harpy/snp.py | 4 ++-- harpy/sv.py | 4 ++-- 7 files changed, 33 insertions(+), 59 deletions(-) diff --git a/harpy/_validations.py b/harpy/_validations.py index 2e1ae35ca..728304954 100644 --- a/harpy/_validations.py +++ b/harpy/_validations.py @@ -58,19 +58,6 @@ def check_envdir(dirpath): rprint(errtable, file = sys.stderr) sys.exit(1) -def validate_input_by_ext(inputfile, option, ext): - """Check that the input file for a given option has read permissions and matches the acceptable extensions """ - if isinstance(ext, list): - test = [not(inputfile.lower().endswith(i.lower())) for i in ext] - if all(test): - ext_text = " | ".join(ext) - print_error("invalid file extension", f"The input file for [bold]{option}[/bold] must end in one of:\n[green bold]{ext_text}[/green bold]") - sys.exit(1) - else: - if not inputfile.lower().endswith(ext.lower()): - print_error("invalid file extension", f"The input file for [bold]{option}[/bold] must end in [green bold]{ext}[/green bold]") - sys.exit(1) - def check_impute_params(parameters): """Validate the STITCH parameter file for column names, order, types, missing values, etc.""" with open(parameters, "r", encoding="utf-8") as paramfile: diff --git a/harpy/align.py b/harpy/align.py index 08612cce1..856ef930c 100644 --- a/harpy/align.py +++ b/harpy/align.py @@ -65,7 +65,7 @@ def align(): } @click.command(no_args_is_help = True, epilog= "Documentation: https://pdimens.github.io/harpy/workflows/align/bwa/") -@click.option('-g', '--genome', type=InputFile([".gz", ".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), required = True, help = 'Genome assembly for read mapping') +@click.option('-g', '--genome', type=InputFile([".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), required = True, help = 'Genome assembly for read mapping') @click.option('-w', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats') @click.option('-x', '--extra-params', type = str, help = 'Additional bwa mem parameters, in quotes') @click.option('-u', '--keep-unmapped', is_flag = True, default = False, help = 'Retain unmapped sequences in the output') @@ -152,7 +152,7 @@ def bwa(inputs, output_dir, genome, depth_window, threads, keep_unmapped, extra_ @click.option('-x', '--extra-params', type = str, help = 'Additional ema align parameters, in quotes') @click.option('-w', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats') @click.option('-b', '--ema-bins', default = 500, show_default = True, type = click.IntRange(1,1000), help="Number of barcode bins") -@click.option('-g', '--genome', type=InputFile([".gz", ".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), required = True, help = 'Genome assembly for read mapping') +@click.option('-g', '--genome', type=InputFile([".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), required = True, help = 'Genome assembly for read mapping') @click.option('-u', '--keep-unmapped', is_flag = True, default = False, help = 'Retain unmapped sequences in the output') @click.option('-q', '--min-quality', default = 30, show_default = True, type = click.IntRange(min = 0, max = 40), help = 'Minimum mapping quality to pass filtering') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Align/ema", show_default=True, help = 'Output directory name') @@ -254,7 +254,7 @@ def ema(inputs, output_dir, platform, barcode_list, genome, depth_window, keep_u launch_snakemake(command, "align_ema", start_text, output_dir, sm_log, quiet, "workflow/align.ema.summary") @click.command(no_args_is_help = True, epilog= "Documentation: https://pdimens.github.io/harpy/workflows/align/strobe/") -@click.option('-g', '--genome', type=InputFile([".gz", ".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), required = True, help = 'Genome assembly for read mapping') +@click.option('-g', '--genome', type=InputFile([".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), required = True, help = 'Genome assembly for read mapping') @click.option('-w', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats') @click.option('-x', '--extra-params', type = str, help = 'Additional aligner parameters, in quotes') @click.option('-u', '--keep-unmapped', is_flag = True, default = False, help = 'Retain unmapped sequences in the output') diff --git a/harpy/impute.py b/harpy/impute.py index f70e72bc1..fb588a578 100644 --- a/harpy/impute.py +++ b/harpy/impute.py @@ -7,11 +7,12 @@ from rich import box from rich.table import Table import rich_click as click +from ._cli_types import InputFile from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, fetch_report, fetch_script, snakemake_log from ._parsers import parse_alignment_inputs, biallelic_contigs -from ._validations import validate_input_by_ext, vcf_sample_match, check_impute_params, validate_bam_RG +from ._validations import vcf_sample_match, check_impute_params, validate_bam_RG docstring = { "harpy impute": [ @@ -29,9 +30,9 @@ @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/impute/") @click.option('-x', '--extra-params', type = str, help = 'Additional STITCH parameters, in quotes') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Impute", show_default=True, help = 'Output directory name') -@click.option('-p', '--parameters', required = True, type=click.Path(exists=True, dir_okay=False), help = 'STITCH parameter file (tab-delimited)') +@click.option('-p', '--parameters', required = True, type=click.Path(exists=True, dir_okay=False, readable=True), help = 'STITCH parameter file (tab-delimited)') @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max_open = True), help = 'Number of threads to use') -@click.option('-v', '--vcf', required = True, type=click.Path(exists=True, dir_okay=False, readable=True), help = 'Path to BCF/VCF file') +@click.option('-v', '--vcf', required = True, type = InputFile(["vcf", "bcf", "vcf.gz"], gzip_ok = False), help = 'Path to BCF/VCF file') @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') @click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') @@ -68,7 +69,6 @@ def impute(inputs, output_dir, parameters, threads, vcf, vcf_samples, extra_para command += snakemake os.makedirs(f"{workflowdir}/", exist_ok= True) - validate_input_by_ext(vcf, "--vcf", ["vcf", "bcf", "vcf.gz"]) params = check_impute_params(parameters) bamlist, n = parse_alignment_inputs(inputs) validate_bam_RG(bamlist, threads, quiet) diff --git a/harpy/phase.py b/harpy/phase.py index a43c6d297..bb0c80b0f 100644 --- a/harpy/phase.py +++ b/harpy/phase.py @@ -11,7 +11,7 @@ from ._misc import fetch_rule, fetch_report, snakemake_log from ._cli_types import ContigList, InputFile from ._parsers import parse_alignment_inputs -from ._validations import check_fasta, vcf_sample_match, validate_bam_RG, validate_input_by_ext, vcf_contig_match +from ._validations import check_fasta, vcf_sample_match, validate_bam_RG, vcf_contig_match docstring = { "harpy phase": [ @@ -28,13 +28,13 @@ @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/phase") @click.option('-x', '--extra-params', type = str, help = 'Additional HapCut2 parameters, in quotes') -@click.option('-g', '--genome', type=InputFile([".gz", ".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), help = 'Path to genome assembly if wanting to also extract reads spanning indels') +@click.option('-g', '--genome', type=InputFile([".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), help = 'Path to genome assembly if wanting to also extract reads spanning indels') @click.option('-b', '--ignore-bx', is_flag = True, show_default = True, default = False, help = 'Ignore barcodes when phasing') @click.option('-d', '--molecule-distance', default = 100000, show_default = True, type = int, help = 'Distance cutoff to split molecules (bp)') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Phase", show_default=True, help = 'Output directory name') @click.option('-p', '--prune-threshold', default = 7, show_default = True, type = click.IntRange(0,100), help = 'PHRED-scale threshold (%) for pruning low-confidence SNPs (larger prunes more.)') @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 2, max_open = True), help = 'Number of threads to use') -@click.option('-v', '--vcf', required = True, type=click.Path(exists=True, dir_okay=False, readable=True), help = 'Path to BCF/VCF file') +@click.option('-v', '--vcf', required = True, type = InputFile(["vcf", "bcf", "vcf.gz"], gzip_ok = False), help = 'Path to BCF/VCF file') @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--contigs', type = ContigList(), help = 'File or list of contigs to plot') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') @@ -70,7 +70,6 @@ def phase(inputs, output_dir, vcf, threads, molecule_distance, prune_threshold, os.makedirs(f"{workflowdir}/input", exist_ok= True) bamlist, n = parse_alignment_inputs(inputs) samplenames = vcf_sample_match(vcf, bamlist, vcf_samples) - validate_input_by_ext(vcf, "--vcf", ["vcf", "bcf", "vcf.gz"]) validate_bam_RG(bamlist, threads, quiet) if genome: check_fasta(genome) diff --git a/harpy/simulate.py b/harpy/simulate.py index 3fa0d4f9e..ed7bbee62 100644 --- a/harpy/simulate.py +++ b/harpy/simulate.py @@ -6,11 +6,12 @@ from rich import box from rich.table import Table import rich_click as click +from ._cli_types import InputFile from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, fetch_script, snakemake_log from ._printing import print_error -from ._validations import validate_input_by_ext, check_fasta +from ._validations import check_fasta @click.group(options_metavar='', context_settings={"help_option_names" : ["-h", "--help"]}) def simulate(): @@ -123,7 +124,7 @@ def simulate(): } @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/simulate/simulate-linkedreads") -@click.option('-b', '--barcodes', type = click.Path(exists=True, dir_okay=False), help = "File of linked-read barcodes to add to reads") +@click.option('-b', '--barcodes', type = click.Path(exists=True, dir_okay=False, readable=True), help = "File of linked-read barcodes to add to reads") @click.option('-s', '--distance-sd', type = click.IntRange(min = 1), default = 15, show_default=True, help = "Standard deviation of read-pair distance") @click.option('-m', '--molecules-per', type = click.IntRange(min = 1), default = 10, show_default=True, help = "Average number of molecules per partition") @click.option('-l', '--molecule-length', type = click.IntRange(min = 2), default = 100, show_default=True, help = "Mean molecule length (kbp)") @@ -138,8 +139,8 @@ def simulate(): @click.option('--setup-only', is_flag = True, hidden = True, show_default = True, default = False, help = 'Setup the workflow and exit') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') -@click.argument('genome_hap1', required=True, type=click.Path(exists=True, dir_okay=False, readable=True), nargs=1) -@click.argument('genome_hap2', required=True, type=click.Path(exists=True, dir_okay=False, readable=True), nargs=1) +@click.argument('genome_hap1', required=True, type = InputFile([".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), nargs=1) +@click.argument('genome_hap2', required=True, type = InputFile([".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), nargs=1) def linkedreads(genome_hap1, genome_hap2, output_dir, outer_distance, mutation_rate, distance_sd, barcodes, read_pairs, molecule_length, partitions, molecules_per, threads, snakemake, quiet, hpc, conda, setup_only): """ Create linked reads from a genome @@ -207,17 +208,17 @@ def linkedreads(genome_hap1, genome_hap2, output_dir, outer_distance, mutation_r launch_snakemake(command, "simulate_linkedreads", start_text, output_dir, sm_log, quiet, "workflow/simulate.reads.summary") @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "This workflow can be quite technical, please read the docs for more information: https://pdimens.github.io/harpy/workflows/simulate/simulate-variants") -@click.option('-s', '--snp-vcf', type=click.Path(exists=True, dir_okay=False, readable=True), help = 'VCF file of known snps to simulate') -@click.option('-i', '--indel-vcf', type=click.Path(exists=True, dir_okay=False, readable=True), help = 'VCF file of known indels to simulate') +@click.option('-s', '--snp-vcf', type=InputFile(["vcf", "bcf", "vcf.gz"], gzip_ok = False), help = 'VCF file of known snps to simulate') +@click.option('-i', '--indel-vcf', type=InputFile(["vcf", "bcf", "vcf.gz"], gzip_ok = False), help = 'VCF file of known indels to simulate') @click.option('-n', '--snp-count', type = click.IntRange(min = 0), default=0, show_default=False, help = "Number of random snps to simluate") @click.option('-m', '--indel-count', type = click.IntRange(min = 0), default=0, show_default=False, help = "Number of random indels to simluate") @click.option('-r', '--titv-ratio', type = click.FloatRange(min=0), default= 0.5, show_choices=True, show_default=True, help = "Transition/Transversion ratio for snps") @click.option('-d', '--indel-ratio', type = click.FloatRange(min=0), default= 1, show_choices=True, show_default=True, help = "Insertion/Deletion ratio for indels") @click.option('-a', '--indel-size-alpha', type = click.FloatRange(min=0), hidden = True, default = 2.0, help = "Exponent Alpha for power-law-fitted size distribution") @click.option('-l', '--indel-size-constant', type = click.FloatRange(min=0), default = 0.5, hidden = True, help = "Exponent constant for power-law-fitted size distribution") -@click.option('-c', '--centromeres', type = click.Path(exists=True, dir_okay=False, readable=True), help = "GFF3 file of centromeres to avoid") +@click.option('-c', '--centromeres', type = InputFile([".gff",".gff3"], gzip_ok = True), help = "GFF3 file of centromeres to avoid") @click.option('-y', '--snp-gene-constraints', type = click.Choice(["noncoding", "coding", "2d", "4d"]), help = "How to constrain randomly simulated SNPs {`noncoding`,`coding`,`2d`,`4d`}") -@click.option('-g', '--genes', type = click.Path(exists=True, readable=True), help = "GFF3 file of genes to use with `--snp-gene-constraints`") +@click.option('-g', '--genes', type = InputFile([".gff",".gff3"], gzip_ok = True), help = "GFF3 file of genes to avoid when simulating") @click.option('-z', '--heterozygosity', type = click.FloatRange(0,1), default = 0, show_default=True, help = 'heterozygosity to simulate diploid variants') @click.option('--only-vcf', is_flag = True, default = False, help = 'If setting heterozygosity, only create the vcf rather than the fasta files') @click.option('-e', '--exclude-chr', type = click.Path(exists=True, dir_okay=False, readable=True), help = "Text file of chromosomes to avoid") @@ -229,7 +230,7 @@ def linkedreads(genome_hap1, genome_hap2, output_dir, outer_distance, mutation_r @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--randomseed', type = click.IntRange(min = 1), help = "Random seed for simulation") @click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') -@click.argument('genome', required=True, type=click.Path(exists=True, dir_okay=False, readable=True), nargs=1) +@click.argument('genome', required=True, type=InputFile([".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), nargs=1) def snpindel(genome, snp_vcf, indel_vcf, only_vcf, output_dir, prefix, snp_count, indel_count, titv_ratio, indel_ratio, indel_size_alpha, indel_size_constant, centromeres, genes, snp_gene_constraints, heterozygosity, exclude_chr, randomseed, snakemake, quiet, hpc, conda, setup_only): """ Introduce snps and/or indels into a genome @@ -274,20 +275,16 @@ def snpindel(genome, snp_vcf, indel_vcf, only_vcf, output_dir, prefix, snp_count check_fasta(genome) start_text.add_row("Input Genome:", os.path.basename(genome)) if snp_vcf: - validate_input_by_ext(snp_vcf, "--snp-vcf", ["vcf","vcf.gz","bcf"]) start_text.add_row("SNP File:", os.path.basename(snp_vcf)) elif snp_count > 0: start_text.add_row("Random SNPs:", f"{snp_count}") if indel_vcf: - validate_input_by_ext(indel_vcf, "--indel-vcf", ["vcf","vcf.gz","bcf"]) start_text.add_row("Indel File:", os.path.basename(indel_vcf)) elif indel_count > 0: start_text.add_row("Random Indels:", f"{indel_count}") if centromeres: - validate_input_by_ext(centromeres, "--centromeres", [".gff",".gff3",".gff.gz", ".gff3.gz"]) start_text.add_row("Centromere GFF:", os.path.basename(centromeres)) if genes: - validate_input_by_ext(genes, "--genes", [".gff",".gff3",".gff.gz", ".gff3.gz"]) start_text.add_row("Genes GFF:", os.path.basename(genes)) if exclude_chr: start_text.add_row("Excluded Chromosomes:", os.path.basename(exclude_chr)) @@ -345,8 +342,8 @@ def snpindel(genome, snp_vcf, indel_vcf, only_vcf, output_dir, prefix, snp_count @click.option('-n', '--count', type = click.IntRange(min = 0), default=0, show_default=False, help = "Number of random inversions to simluate") @click.option('-m', '--min-size', type = click.IntRange(min = 1), default = 1000, show_default= True, help = "Minimum inversion size (bp)") @click.option('-x', '--max-size', type = click.IntRange(min = 1), default = 100000, show_default= True, help = "Maximum inversion size (bp)") -@click.option('-c', '--centromeres', type = click.Path(exists=True, dir_okay=False, readable=True), help = "GFF3 file of centromeres to avoid") -@click.option('-g', '--genes', type = click.Path(exists=True, dir_okay=False, readable=True), help = "GFF3 file of genes to avoid when simulating") +@click.option('-c', '--centromeres', type = InputFile([".gff",".gff3"], gzip_ok = True), help = "GFF3 file of centromeres to avoid") +@click.option('-g', '--genes', type = InputFile([".gff",".gff3"], gzip_ok = True), help = "GFF3 file of genes to avoid when simulating") @click.option('-z', '--heterozygosity', type = click.FloatRange(0,1), default = 0, show_default=True, help = 'heterozygosity to simulate diploid variants') @click.option('-e', '--exclude-chr', type = click.Path(exists=True, dir_okay=False, readable=True), help = "Text file of chromosomes to avoid") @click.option('-p', '--prefix', type = str, default= "sim", show_default=True, help = "Naming prefix for output files") @@ -358,7 +355,7 @@ def snpindel(genome, snp_vcf, indel_vcf, only_vcf, output_dir, prefix, snp_count @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--randomseed', type = click.IntRange(min = 1), help = "Random seed for simulation") @click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') -@click.argument('genome', required=True, type=click.Path(exists=True, dir_okay=False, readable=True), nargs=1) +@click.argument('genome', required=True, type=InputFile([".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), nargs=1) def inversion(genome, vcf, only_vcf, prefix, output_dir, count, min_size, max_size, centromeres, genes, heterozygosity, exclude_chr, randomseed, snakemake, quiet, hpc, conda, setup_only): """ Introduce inversions into a genome @@ -394,15 +391,12 @@ def inversion(genome, vcf, only_vcf, prefix, output_dir, count, min_size, max_si start_text.add_row("Input Genome:", os.path.basename(genome)) if vcf: - validate_input_by_ext(vcf, "--vcf", ["vcf","vcf.gz","bcf"]) start_text.add_row("Inversion File:", os.path.basename(vcf)) else: start_text.add_row("Random Inversions:", f"{count}") if centromeres: - validate_input_by_ext(centromeres, "--centromeres", [".gff",".gff3",".gff.gz", ".gff3.gz"]) start_text.add_row("Centromere GFF:", os.path.basename(centromeres)) if genes: - validate_input_by_ext(genes, "--genes", [".gff",".gff3",".gff.gz", ".gff3.gz"]) start_text.add_row("Genes GFF:", os.path.basename(genes)) if exclude_chr: start_text.add_row("Excluded Chromosomes:", os.path.basename(exclude_chr)) @@ -457,8 +451,8 @@ def inversion(genome, vcf, only_vcf, prefix, output_dir, count, min_size, max_si @click.option('-d', '--dup-ratio', type = click.FloatRange(min=0), default = 1, show_choices=True, show_default=True, help = "Ratio for the selected variant") @click.option('-l', '--gain-ratio', type = click.FloatRange(min=0), default=1, show_default=True, help = " Relative ratio of DNA gain over DNA loss") @click.option('-y', '--max-copy', type = click.IntRange(min = 1), default=10, show_default=True, help = "Maximum number of copies") -@click.option('-c', '--centromeres', type = click.Path(exists=True, dir_okay=False, readable=True), help = "GFF3 file of centromeres to avoid") -@click.option('-g', '--genes', type = click.Path(exists=True, dir_okay=False, readable=True), help = "GFF3 file of genes to avoid when simulating (requires `--snp-coding-partition` for SNPs)") +@click.option('-c', '--centromeres', type = InputFile([".gff",".gff3"], gzip_ok = True), help = "GFF3 file of centromeres to avoid") +@click.option('-g', '--genes', type = InputFile([".gff",".gff3"], gzip_ok = True), help = "GFF3 file of genes to avoid when simulating") @click.option('-z', '--heterozygosity', type = click.FloatRange(0,1), default = 0, show_default=True, help = 'heterozygosity to simulate diploid variants') @click.option('--only-vcf', is_flag = True, default = False, help = 'If setting heterozygosity, only create the vcf rather than the fasta files') @click.option('-e', '--exclude-chr', type = click.Path(exists=True, dir_okay=False, readable=True), help = "Text file of chromosomes to avoid") @@ -470,7 +464,7 @@ def inversion(genome, vcf, only_vcf, prefix, output_dir, count, min_size, max_si @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--randomseed', type = click.IntRange(min = 1), help = "Random seed for simulation") @click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') -@click.argument('genome', required=True, type=click.Path(exists=True, dir_okay=False, readable=True), nargs=1) +@click.argument('genome', required=True, type=InputFile([".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), nargs=1) def cnv(genome, output_dir, vcf, only_vcf, prefix, count, min_size, max_size, dup_ratio, max_copy, gain_ratio, centromeres, genes, heterozygosity, exclude_chr, randomseed, snakemake, quiet, hpc, conda, setup_only): """ Introduce copy number variants into a genome @@ -512,15 +506,12 @@ def cnv(genome, output_dir, vcf, only_vcf, prefix, count, min_size, max_size, du start_text.add_row("Input Genome:", os.path.basename(genome)) if vcf: - validate_input_by_ext(vcf, "--vcf", ["vcf","vcf.gz","bcf"]) start_text.add_row("CNV File:", os.path.basename(vcf)) else: start_text.add_row("Random CNVs:", f"{count}") if centromeres: - validate_input_by_ext(centromeres, "--centromeres", [".gff",".gff3",".gff.gz", ".gff3.gz"]) start_text.add_row("Centromere GFF:", os.path.basename(centromeres)) if genes: - validate_input_by_ext(genes, "--genes", [".gff",".gff3",".gff.gz", ".gff3.gz"]) start_text.add_row("Genes GFF:", os.path.basename(genes)) if exclude_chr: start_text.add_row("Excluded Chromosomes:", os.path.basename(exclude_chr)) @@ -572,8 +563,8 @@ def cnv(genome, output_dir, vcf, only_vcf, prefix, count, min_size, max_size, du @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Please Documentation: https://pdimens.github.io/harpy/workflows/simulate/simulate-variants") @click.option('-v', '--vcf', type=click.Path(exists=True, dir_okay=False, readable=True), help = 'VCF file of known translocations to simulate') @click.option('-n', '--count', type = click.IntRange(min = 0), default=0, show_default=False, help = "Number of random translocations to simluate") -@click.option('-c', '--centromeres', type = click.Path(exists=True, dir_okay=False, readable=True), help = "GFF3 file of centromeres to avoid") -@click.option('-g', '--genes', type = click.Path(exists=True, dir_okay=False, readable=True), help = "GFF3 file of genes to avoid when simulating") +@click.option('-c', '--centromeres', type = InputFile([".gff",".gff3"], gzip_ok = True), help = "GFF3 file of centromeres to avoid") +@click.option('-g', '--genes', type = InputFile([".gff",".gff3"], gzip_ok = True), help = "GFF3 file of genes to avoid when simulating") @click.option('-z', '--heterozygosity', type = click.FloatRange(0,1), default = 0, show_default=True, help = 'heterozygosity to simulate diploid variants') @click.option('--only-vcf', is_flag = True, default = False, help = 'If setting heterozygosity, only create the vcf rather than the fasta files') @click.option('-e', '--exclude-chr', type = click.Path(exists=True, dir_okay=False, readable=True), help = "Text file of chromosomes to avoid") @@ -585,7 +576,7 @@ def cnv(genome, output_dir, vcf, only_vcf, prefix, count, min_size, max_size, du @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--randomseed', type = click.IntRange(min = 1), help = "Random seed for simulation") @click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') -@click.argument('genome', required=True, type=click.Path(exists=True, dir_okay=False, readable=True), nargs=1) +@click.argument('genome', required=True, type=InputFile([".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), nargs=1) def translocation(genome, output_dir, prefix, vcf, only_vcf, count, centromeres, genes, heterozygosity, exclude_chr, randomseed, snakemake, quiet, hpc, conda, setup_only): """ Introduce translocations into a genome @@ -620,15 +611,12 @@ def translocation(genome, output_dir, prefix, vcf, only_vcf, count, centromeres, start_text.add_row("Input Genome:", os.path.basename(genome)) if vcf: - validate_input_by_ext(vcf, "--vcf", ["vcf","vcf.gz","bcf"]) start_text.add_row("Translocation File:", os.path.basename(vcf)) else: start_text.add_row("Random Translocations", f"{count} random translocations") if centromeres: - validate_input_by_ext(centromeres, "--centromeres", [".gff",".gff3",".gff.gz", ".gff3.gz"]) start_text.add_row("Centromere GFF:", os.path.basename(centromeres)) if genes: - validate_input_by_ext(genes, "--genes", [".gff",".gff3",".gff.gz", ".gff3.gz"]) start_text.add_row("Genes GFF:", os.path.basename(genes)) if exclude_chr: start_text.add_row("Excluded Chromosomes:", os.path.basename(exclude_chr)) diff --git a/harpy/snp.py b/harpy/snp.py index 09c2ae677..430675699 100644 --- a/harpy/snp.py +++ b/harpy/snp.py @@ -49,7 +49,7 @@ def snp(): @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/snp") @click.option('-x', '--extra-params', type = str, help = 'Additional variant caller parameters, in quotes') -@click.option('-g', '--genome', type=InputFile([".gz", ".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), required = True, help = 'Genome assembly for variant calling') +@click.option('-g', '--genome', type=InputFile([".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), required = True, help = 'Genome assembly for variant calling') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "SNP/mpileup", show_default=True, help = 'Output directory name') @click.option('-n', '--ploidy', default = 2, show_default = True, type=click.IntRange(min = 1, max = 2), help = 'Ploidy of samples') @click.option('-p', '--populations', type=click.Path(exists = True, dir_okay=False, readable=True), help = "Tab-delimited file of sample\population") @@ -154,7 +154,7 @@ def mpileup(inputs, output_dir, regions, genome, threads, populations, ploidy, e @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/snp") @click.option('-x', '--extra-params', type = str, help = 'Additional variant caller parameters, in quotes') -@click.option('-g', '--genome', type=InputFile([".gz", ".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), required = True, help = 'Genome assembly for variant calling') +@click.option('-g', '--genome', type=InputFile([".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), required = True, help = 'Genome assembly for variant calling') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "SNP/freebayes", show_default=True, help = 'Output directory name') @click.option('-n', '--ploidy', default = 2, show_default = True, type=click.IntRange(min=1, max_open=True), help = 'Ploidy of samples') @click.option('-p', '--populations', type=click.Path(exists = True, dir_okay=False, readable=True), help = "Tab-delimited file of sample\population") diff --git a/harpy/sv.py b/harpy/sv.py index 72c939992..a9b1aaf55 100644 --- a/harpy/sv.py +++ b/harpy/sv.py @@ -53,7 +53,7 @@ def sv(): @click.command(no_args_is_help = True, epilog= "Documentation: https://pdimens.github.io/harpy/workflows/sv/leviathan/") @click.option('-x', '--extra-params', type = str, help = 'Additional variant caller parameters, in quotes') -@click.option('-g', '--genome', type=InputFile([".gz", ".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), required = True, help = 'Genome assembly for variant calling') +@click.option('-g', '--genome', type=InputFile([".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), required = True, help = 'Genome assembly for variant calling') @click.option('-i', '--iterations', show_default = True, default=50, type = click.IntRange(min = 10, max_open = True), help = 'Number of iterations to perform through index (reduces memory)') @click.option('-s', '--min-sv', type = click.IntRange(min = 10, max_open = True), default = 1000, show_default=True, help = 'Minimum size of SV to detect') @click.option('-b', '--min-barcodes', show_default = True, default=2, type = click.IntRange(min = 1, max_open = True), help = 'Minimum number of barcode overlaps supporting candidate SV') @@ -142,7 +142,7 @@ def leviathan(inputs, output_dir, genome, min_sv, min_barcodes, iterations, thre @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/sv/naibr/") @click.option('-x', '--extra-params', type = str, help = 'Additional variant caller parameters, in quotes') -@click.option('-g', '--genome', type=InputFile([".gz", ".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), required = True, help = 'Genome assembly for calling variants') +@click.option('-g', '--genome', type=InputFile([".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), required = True, help = 'Genome assembly for calling variants') @click.option('-b', '--min-barcodes', show_default = True, default=2, type = click.IntRange(min = 1, max_open = True), help = 'Minimum number of barcode overlaps supporting candidate SV') @click.option('-q', '--min-quality', show_default = True, default=30, type = click.IntRange(min = 0, max = 40), help = 'Minimum mapping quality of reads to use') @click.option('-s', '--min-sv', type = click.IntRange(min = 10, max_open = True), default = 1000, show_default=True, help = 'Minimum size of SV to detect') From fe2cf0be9f0ed45fecf4db632f67d6e97038ee7a Mon Sep 17 00:00:00 2001 From: pdimens Date: Fri, 1 Nov 2024 10:47:36 -0400 Subject: [PATCH 05/27] add snakemake validation --- harpy/_cli_types.py | 15 +++++++++++++++ harpy/align.py | 8 ++++---- harpy/assembly.py | 4 ++-- harpy/deconvolve.py | 3 ++- harpy/demultiplex.py | 3 ++- harpy/impute.py | 4 ++-- harpy/metassembly.py | 4 ++-- harpy/phase.py | 4 ++-- harpy/preflight.py | 5 +++-- harpy/qc.py | 4 ++-- harpy/simulate.py | 12 ++++++------ harpy/snp.py | 6 +++--- harpy/sv.py | 6 +++--- 13 files changed, 48 insertions(+), 30 deletions(-) diff --git a/harpy/_cli_types.py b/harpy/_cli_types.py index c68d63df6..f718c69ef 100644 --- a/harpy/_cli_types.py +++ b/harpy/_cli_types.py @@ -76,3 +76,18 @@ def convert(self, value, param, ctx): self.fail(f"{value} does not end with one of the expected extensions [" + ", ".join(extensions) + "]. Please verify this is the correct file type and rename the extension for compatibility.", param, ctx) if not valid and self.gzip_ok: self.fail(f"{value} does not end with one of the expected extensions [" + ", ".join(extensions) + "]. Please verify this is the correct file type and rename the extension for compatibility. Gzip compression (ending in .gz) is allowed.", param, ctx) + return value + +class SnakemakeParams(click.ParamType): + """A class for a click type which accepts snakemake parameters. Does validations to make sure there isn't doubling up.""" + name = "snakemake_params" + def convert(self, value, param, ctx): + forbidden = "--rerun-incomplete --ri --show-failed-logs --rerun-triggers --nolock --software-deployment-method --smd --deployment --deployment-method --conda-prefix --cores -c --directory -d --snakefile -s --configfile --configfiles".split() + available = "--dry-run --dryrun -n --profile --cache --jobs -j --local-cores --resources --res --set-threads --max-threads --set-resources --set-scatter --set-resource-scopes --default-resources --default-res --preemptible-rules --preemptible-retries --envvars --touch -t --keep-going -k --force -f --executor -e --forceall -F --forcerun -R --prioritize -P --batch --until -U --omit-from -O --shadow-prefixDIR --scheduler --wms-monitor --wms-monitor-arg --scheduler-ilp-solver --conda-base-path --no-subworkflows --nosw --precommand --groups --group-components --report --report-stylesheet --reporterPLUGIN --draft-notebook --edit-notebook --notebook-listen --lint --generate-unit-tests --containerize --export-cwl --list-rules --list -l --list-target-rules --lt --dag --rulegraph --filegraph --d3dag --summary -S --detailed-summary -D --archive --cleanup-metadata --cmFILE --cleanup-shadow --skip-script-cleanup --unlock --list-changes --lc --list-input-changes --li --list-params-changes --lp --list-untracked --lu --delete-all-output --delete-temp-output --keep-incomplete --drop-metadata --version -v --printshellcmds -p --debug-dag --nocolor --quiet -q --print-compilation --verbose --force-use-threads --allow-ambiguity -a --ignore-incomplete --ii --max-inventory-time --latency-wait --output-wait -w --wait-for-files --wait-for-files-file --queue-input-wait-time --notemp --nt --all-temp --unneeded-temp-files --keep-storage-local-copies --target-files-omit-workdir-adjustment --allowed-rules --max-jobs-per-timespan --max-jobs-per-second --max-status-checks-per-second --seconds-between-status-checks --retries --restart-times -T --wrapper-prefix --default-storage-provider --default-storage-prefix --local-storage-prefix --remote-job-local-storage-prefix --shared-fs-usage --scheduler-greediness --greediness --no-hooks --debug --runtime-profile --local-groupid --attempt --log-handler-script --log-service --job-deploy-sources --benchmark-extended --container-image --immediate-submit --is --jobscript --js --jobname --jn --flux --container-cleanup-images --use-conda --conda-not-block-search-path-envvars --list-conda-envs --conda-cleanup-envs --conda-cleanup-pkgs --conda-create-envs-only --conda-frontend --use-apptainer --use-singularity --apptainer-prefix --singularity-prefix --apptainer-args --singularity-args --use-envmodules --scheduler-solver-path --deploy-sources --target-jobs --mode --report-html-path --report-html-stylesheet-path".split() + for i in value.split(): + if i.startswith("-"): + if i in forbidden: + self.fail(f"{i} is a forbidden option because it is already used by Harpy to call Snakemake.", param, ctx) + if i not in available: + self.fail(f"{i} is not a valid Snakemake option. Run \'snakemake --help\' for a list of all Snakemake command line options.", param, ctx) + return value \ No newline at end of file diff --git a/harpy/align.py b/harpy/align.py index 856ef930c..530e50e08 100644 --- a/harpy/align.py +++ b/harpy/align.py @@ -10,7 +10,7 @@ import rich_click as click from ._conda import create_conda_recipes from ._misc import fetch_report, fetch_rule, snakemake_log -from ._cli_types import ContigList, InputFile +from ._cli_types import ContigList, InputFile, SnakemakeParams from ._launch import launch_snakemake from ._parsers import parse_fastq_inputs from ._printing import print_error, print_solution, print_notice @@ -79,7 +79,7 @@ def align(): @click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) def bwa(inputs, output_dir, genome, depth_window, threads, keep_unmapped, extra_params, min_quality, molecule_distance, snakemake, skip_reports, quiet, hpc, conda, contigs, setup_only): """ @@ -165,7 +165,7 @@ def bwa(inputs, output_dir, genome, depth_window, threads, keep_unmapped, extra_ @click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) def ema(inputs, output_dir, platform, barcode_list, genome, depth_window, keep_unmapped, threads, ema_bins, skip_reports, extra_params, min_quality, snakemake, quiet, hpc, conda, contigs, setup_only): """ @@ -269,7 +269,7 @@ def ema(inputs, output_dir, platform, barcode_list, genome, depth_window, keep_u @click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) def strobe(inputs, output_dir, genome, read_length, keep_unmapped, depth_window, threads, extra_params, min_quality, molecule_distance, snakemake, skip_reports, quiet, hpc, conda, contigs, setup_only): """ diff --git a/harpy/assembly.py b/harpy/assembly.py index f2949eb66..3d30c15fb 100644 --- a/harpy/assembly.py +++ b/harpy/assembly.py @@ -6,10 +6,10 @@ from rich import box from rich.table import Table import rich_click as click +from ._cli_types import KParam, SnakemakeParams from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, snakemake_log -from ._cli_types import KParam from ._validations import validate_fastq_bx docstring = { @@ -55,7 +55,7 @@ @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('fastq_r1', required=True, type=click.Path(exists=True, readable=True), nargs=1) @click.argument('fastq_r2', required=True, type=click.Path(exists=True, readable=True), nargs=1) def assembly(fastq_r1, fastq_r2, bx_tag, kmer_length, max_memory, output_dir, extra_params,arcs_extra,contig_length,links,min_quality,min_aligned,mismatch,molecule_distance,molecule_length,seq_identity,span, organism_type, conda, threads, snakemake, quiet, hpc, setup_only, skip_reports): diff --git a/harpy/deconvolve.py b/harpy/deconvolve.py index d86588d7c..8b5d674b4 100644 --- a/harpy/deconvolve.py +++ b/harpy/deconvolve.py @@ -6,6 +6,7 @@ from rich import box from rich.table import Table import rich_click as click +from ._cli_types import SnakemakeParams from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, snakemake_log @@ -35,7 +36,7 @@ @click.option('--setup-only', is_flag = True, hidden = True, show_default = True, default = False, help = 'Setup the workflow and exit') @click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) def deconvolve(inputs, output_dir, kmer_length, window_size, density, dropout, threads, snakemake, quiet, hpc, conda, setup_only): """ diff --git a/harpy/demultiplex.py b/harpy/demultiplex.py index 8adf78842..2700c520e 100644 --- a/harpy/demultiplex.py +++ b/harpy/demultiplex.py @@ -7,6 +7,7 @@ from rich import box from rich.table import Table import rich_click as click +from ._cli_types import SnakemakeParams from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, snakemake_log @@ -46,7 +47,7 @@ def demultiplex(): @click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('R1_FQ', required=True, type=click.Path(exists=True, dir_okay=False, readable=True)) @click.argument('R2_FQ', required=True, type=click.Path(exists=True, dir_okay=False, readable=True)) @click.argument('I1_FQ', required=True, type=click.Path(exists=True, dir_okay=False, readable=True)) diff --git a/harpy/impute.py b/harpy/impute.py index fb588a578..6279ec13b 100644 --- a/harpy/impute.py +++ b/harpy/impute.py @@ -7,7 +7,7 @@ from rich import box from rich.table import Table import rich_click as click -from ._cli_types import InputFile +from ._cli_types import InputFile, SnakemakeParams from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, fetch_report, fetch_script, snakemake_log @@ -38,7 +38,7 @@ @click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.option('--vcf-samples', is_flag = True, show_default = True, default = False, help = 'Use samples present in vcf file for imputation rather than those found the inputs') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) def impute(inputs, output_dir, parameters, threads, vcf, vcf_samples, extra_params, snakemake, skip_reports, quiet, hpc, conda, setup_only): diff --git a/harpy/metassembly.py b/harpy/metassembly.py index a32454c2c..1541c0ba4 100644 --- a/harpy/metassembly.py +++ b/harpy/metassembly.py @@ -9,7 +9,7 @@ from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, snakemake_log -from ._cli_types import KParam +from ._cli_types import KParam, SnakemakeParams from ._validations import validate_fastq_bx docstring = { @@ -41,7 +41,7 @@ @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('fastq_r1', required=True, type=click.Path(exists=True, readable=True), nargs=1) @click.argument('fastq_r2', required=True, type=click.Path(exists=True, readable=True), nargs=1) def metassembly(fastq_r1, fastq_r2, bx_tag, kmer_length, max_memory, ignore_bx, output_dir, extra_params, conda, threads, snakemake, quiet, hpc, organism_type, setup_only, skip_reports): diff --git a/harpy/phase.py b/harpy/phase.py index bb0c80b0f..564fdcfd9 100644 --- a/harpy/phase.py +++ b/harpy/phase.py @@ -9,7 +9,7 @@ from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, fetch_report, snakemake_log -from ._cli_types import ContigList, InputFile +from ._cli_types import ContigList, InputFile, SnakemakeParams from ._parsers import parse_alignment_inputs from ._validations import check_fasta, vcf_sample_match, validate_bam_RG, vcf_contig_match @@ -41,7 +41,7 @@ @click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.option('--vcf-samples', is_flag = True, show_default = True, default = False, help = 'Use samples present in vcf file for phasing rather than those found the inputs') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) def phase(inputs, output_dir, vcf, threads, molecule_distance, prune_threshold, vcf_samples, genome, snakemake, extra_params, ignore_bx, skip_reports, quiet, hpc, conda, contigs, setup_only): diff --git a/harpy/preflight.py b/harpy/preflight.py index ee30f0a43..68a1372e5 100755 --- a/harpy/preflight.py +++ b/harpy/preflight.py @@ -6,6 +6,7 @@ from rich import box from rich.table import Table import rich_click as click +from ._cli_types import SnakemakeParams from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, fetch_report, snakemake_log @@ -43,7 +44,7 @@ def preflight(): @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') @click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True), nargs=-1) def fastq(inputs, output_dir, threads, snakemake, quiet, hpc, conda, setup_only): """ @@ -101,7 +102,7 @@ def fastq(inputs, output_dir, threads, snakemake, quiet, hpc, conda, setup_only) @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max_open = True), help = 'Number of threads to use') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Preflight/bam", show_default=True, help = 'Output directory name') -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') diff --git a/harpy/qc.py b/harpy/qc.py index 7eb95da4a..9817c67ad 100644 --- a/harpy/qc.py +++ b/harpy/qc.py @@ -9,7 +9,7 @@ from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_report, fetch_rule, snakemake_log -from ._cli_types import IntList +from ._cli_types import IntList, SnakemakeParams from ._parsers import parse_fastq_inputs docstring = { @@ -39,7 +39,7 @@ @click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, default = False, help = 'Don\'t show output text while running') @click.option('--skip-reports', is_flag = True, default = False, help = 'Don\'t generate HTML reports') -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) def qc(inputs, output_dir, min_length, max_length, trim_adapters, deduplicate, deconvolve, extra_params, threads, snakemake, skip_reports, quiet, hpc, conda, setup_only): """ diff --git a/harpy/simulate.py b/harpy/simulate.py index ed7bbee62..b45562f4c 100644 --- a/harpy/simulate.py +++ b/harpy/simulate.py @@ -6,7 +6,7 @@ from rich import box from rich.table import Table import rich_click as click -from ._cli_types import InputFile +from ._cli_types import InputFile, SnakemakeParams from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, fetch_script, snakemake_log @@ -138,7 +138,7 @@ def simulate(): @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--setup-only', is_flag = True, hidden = True, show_default = True, default = False, help = 'Setup the workflow and exit') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('genome_hap1', required=True, type = InputFile([".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), nargs=1) @click.argument('genome_hap2', required=True, type = InputFile([".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), nargs=1) def linkedreads(genome_hap1, genome_hap2, output_dir, outer_distance, mutation_rate, distance_sd, barcodes, read_pairs, molecule_length, partitions, molecules_per, threads, snakemake, quiet, hpc, conda, setup_only): @@ -229,7 +229,7 @@ def linkedreads(genome_hap1, genome_hap2, output_dir, outer_distance, mutation_r @click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--randomseed', type = click.IntRange(min = 1), help = "Random seed for simulation") -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('genome', required=True, type=InputFile([".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), nargs=1) def snpindel(genome, snp_vcf, indel_vcf, only_vcf, output_dir, prefix, snp_count, indel_count, titv_ratio, indel_ratio, indel_size_alpha, indel_size_constant, centromeres, genes, snp_gene_constraints, heterozygosity, exclude_chr, randomseed, snakemake, quiet, hpc, conda, setup_only): """ @@ -354,7 +354,7 @@ def snpindel(genome, snp_vcf, indel_vcf, only_vcf, output_dir, prefix, snp_count @click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--randomseed', type = click.IntRange(min = 1), help = "Random seed for simulation") -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('genome', required=True, type=InputFile([".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), nargs=1) def inversion(genome, vcf, only_vcf, prefix, output_dir, count, min_size, max_size, centromeres, genes, heterozygosity, exclude_chr, randomseed, snakemake, quiet, hpc, conda, setup_only): """ @@ -463,7 +463,7 @@ def inversion(genome, vcf, only_vcf, prefix, output_dir, count, min_size, max_si @click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--randomseed', type = click.IntRange(min = 1), help = "Random seed for simulation") -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('genome', required=True, type=InputFile([".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), nargs=1) def cnv(genome, output_dir, vcf, only_vcf, prefix, count, min_size, max_size, dup_ratio, max_copy, gain_ratio, centromeres, genes, heterozygosity, exclude_chr, randomseed, snakemake, quiet, hpc, conda, setup_only): """ @@ -575,7 +575,7 @@ def cnv(genome, output_dir, vcf, only_vcf, prefix, count, min_size, max_size, du @click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--randomseed', type = click.IntRange(min = 1), help = "Random seed for simulation") -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('genome', required=True, type=InputFile([".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), nargs=1) def translocation(genome, output_dir, prefix, vcf, only_vcf, count, centromeres, genes, heterozygosity, exclude_chr, randomseed, snakemake, quiet, hpc, conda, setup_only): """ diff --git a/harpy/snp.py b/harpy/snp.py index 430675699..27d4c8e63 100644 --- a/harpy/snp.py +++ b/harpy/snp.py @@ -7,7 +7,7 @@ from rich import box from rich.table import Table import rich_click as click -from ._cli_types import InputFile +from ._cli_types import InputFile, SnakemakeParams from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, fetch_report, snakemake_log @@ -60,7 +60,7 @@ def snp(): @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) def mpileup(inputs, output_dir, regions, genome, threads, populations, ploidy, extra_params, snakemake, skip_reports, quiet, hpc, conda, setup_only): """ @@ -165,7 +165,7 @@ def mpileup(inputs, output_dir, regions, genome, threads, populations, ploidy, e @click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) def freebayes(inputs, output_dir, genome, threads, populations, ploidy, regions, extra_params, snakemake, skip_reports, quiet, hpc, conda, setup_only): """ diff --git a/harpy/sv.py b/harpy/sv.py index a9b1aaf55..133c3ed5c 100644 --- a/harpy/sv.py +++ b/harpy/sv.py @@ -7,7 +7,7 @@ from rich import box from rich.table import Table import rich_click as click -from ._cli_types import ContigList, InputFile +from ._cli_types import ContigList, InputFile, SnakemakeParams from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, fetch_report, snakemake_log @@ -66,7 +66,7 @@ def sv(): @click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) def leviathan(inputs, output_dir, genome, min_sv, min_barcodes, iterations, threads, populations, extra_params, snakemake, skip_reports, quiet, hpc, conda, contigs, setup_only): """ @@ -157,7 +157,7 @@ def leviathan(inputs, output_dir, genome, min_sv, min_barcodes, iterations, thre @click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) def naibr(inputs, output_dir, genome, vcf, min_sv, min_barcodes, min_quality, threads, populations, molecule_distance, extra_params, snakemake, skip_reports, quiet, hpc, conda, contigs, setup_only): """ From ed39b0d8221fe0fd6a00d284b011b3a68748f869 Mon Sep 17 00:00:00 2001 From: pdimens Date: Fri, 1 Nov 2024 10:54:31 -0400 Subject: [PATCH 06/27] swap to conda --- harpy/hpc.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/harpy/hpc.py b/harpy/hpc.py index c8deb1965..68293a000 100644 --- a/harpy/hpc.py +++ b/harpy/hpc.py @@ -37,7 +37,7 @@ def generic(): Using a scheduler requires installing an additional Snakemake plugin. If you haven't already, install the `generic-cluster` plugin with: ```bash -mamba install bioconda::snakemake-executor-plugin-cluster-generic +conda install bioconda::snakemake-executor-plugin-cluster-generic ``` """)) with open(outfile, "w", encoding = "utf-8") as yml: @@ -79,7 +79,7 @@ def lsf(): Using a scheduler requires installing an additional Snakemake plugin. If you haven't already, install the `lsf` plugin with: ```bash -mamba install bioconda::snakemake-executor-plugin-lsf +conda install bioconda::snakemake-executor-plugin-lsf ``` """)) with open(outfile, "w", encoding = "utf-8") as yml: @@ -115,7 +115,7 @@ def htcondor(): Using a scheduler requires installing an additional Snakemake plugin. If you haven't already, install the `htcondor` plugin with: ```bash -mamba install bioconda::snakemake-executor-plugin-htcondor +conda install bioconda::snakemake-executor-plugin-htcondor ``` """)) with open(outfile, "w", encoding = "utf-8") as yml: @@ -150,7 +150,7 @@ def slurm(): Using a scheduler requires installing an additional Snakemake plugin. If you haven't already, install the `slurm` plugin with: ```bash -mamba install bioconda::snakemake-executor-plugin-slurm +conda install bioconda::snakemake-executor-plugin-slurm ``` """)) with open(outfile, "w", encoding = "utf-8") as yml: @@ -185,7 +185,7 @@ def googlebatch(): Using a scheduler requires installing an additional Snakemake plugin. If you haven't already, install the `googlebatch` plugin with: ```bash -mamba install bioconda::snakemake-executor-plugin-googlebatch +conda install bioconda::snakemake-executor-plugin-googlebatch ``` """)) with open(outfile, "w", encoding = "utf-8") as yml: From 75a046d50ea5b3875df8a41cfe7ef58eed2db414 Mon Sep 17 00:00:00 2001 From: pdimens Date: Fri, 1 Nov 2024 11:04:55 -0400 Subject: [PATCH 07/27] add HPC cli type and validation --- harpy/_cli_types.py | 18 +++++++++++++++++- harpy/align.py | 8 ++++---- harpy/assembly.py | 4 ++-- harpy/deconvolve.py | 4 ++-- harpy/demultiplex.py | 4 ++-- harpy/impute.py | 4 ++-- harpy/metassembly.py | 4 ++-- harpy/phase.py | 4 ++-- harpy/preflight.py | 6 +++--- harpy/qc.py | 4 ++-- harpy/simulate.py | 12 ++++++------ harpy/snp.py | 6 +++--- harpy/sv.py | 10 +++++----- 13 files changed, 52 insertions(+), 36 deletions(-) diff --git a/harpy/_cli_types.py b/harpy/_cli_types.py index f718c69ef..b8514ed9e 100644 --- a/harpy/_cli_types.py +++ b/harpy/_cli_types.py @@ -90,4 +90,20 @@ def convert(self, value, param, ctx): self.fail(f"{i} is a forbidden option because it is already used by Harpy to call Snakemake.", param, ctx) if i not in available: self.fail(f"{i} is not a valid Snakemake option. Run \'snakemake --help\' for a list of all Snakemake command line options.", param, ctx) - return value \ No newline at end of file + return value + +class HPCProfile(click.ParamType): + """A class for a click type which accepts a directory with a snakemake HPC profile. Does validations to make sure the config file is there.""" + name = "hpc_profile" + def convert(self, value, param, ctx): + if not os.path.exists(value): + self.fail(f"{value} does not exist. Please check the spelling and try again.", param, ctx) + elif not os.access(value, os.R_OK): + self.fail(f"{value} is not readable. Please check file/directory permissions and try again", param, ctx) + if os.path.isfile(value): + self.fail(f"{value} is a file, but input should be a directory.", param, ctx) + if not os.path.exists(f"{value}/config.yaml"): + self.fail(f"{value} does not contain the necessary config.yaml file.", param, ctx) + elif not os.access(f"{value}/config.yaml", os.R_OK): + self.fail(f"{value}/config.yaml does not have read access. Please check the file permissions and try again.", param, ctx) + return value diff --git a/harpy/align.py b/harpy/align.py index 530e50e08..a99aaa00b 100644 --- a/harpy/align.py +++ b/harpy/align.py @@ -10,7 +10,7 @@ import rich_click as click from ._conda import create_conda_recipes from ._misc import fetch_report, fetch_rule, snakemake_log -from ._cli_types import ContigList, InputFile, SnakemakeParams +from ._cli_types import ContigList, InputFile, HPCProfile, SnakemakeParams from ._launch import launch_snakemake from ._parsers import parse_fastq_inputs from ._printing import print_error, print_solution, print_notice @@ -76,7 +76,7 @@ def align(): @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--contigs', type = ContigList(), help = 'File or list of contigs to plot') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @@ -162,7 +162,7 @@ def bwa(inputs, output_dir, genome, depth_window, threads, keep_unmapped, extra_ @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') @click.option('--contigs', type = ContigList(), help = 'File or list of contigs to plot') @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @@ -266,7 +266,7 @@ def ema(inputs, output_dir, platform, barcode_list, genome, depth_window, keep_u @click.option('--contigs', type = ContigList(), help = 'File or list of contigs to plot') @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') diff --git a/harpy/assembly.py b/harpy/assembly.py index 3d30c15fb..dd3098a25 100644 --- a/harpy/assembly.py +++ b/harpy/assembly.py @@ -6,7 +6,7 @@ from rich import box from rich.table import Table import rich_click as click -from ._cli_types import KParam, SnakemakeParams +from ._cli_types import KParam, HPCProfile, SnakemakeParams from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, snakemake_log @@ -51,7 +51,7 @@ @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max_open = True), help = 'Number of threads to use') @click.option('-u', '--organism-type', type = click.Choice(['prokaryote', 'eukaryote', 'fungus'], case_sensitive=False), default = "eukaryote", show_default=True, help = "Organism type for assembly report [`eukaryote`,`prokaryote`,`fungus`]") @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') diff --git a/harpy/deconvolve.py b/harpy/deconvolve.py index 8b5d674b4..3b1c6f3f3 100644 --- a/harpy/deconvolve.py +++ b/harpy/deconvolve.py @@ -6,7 +6,7 @@ from rich import box from rich.table import Table import rich_click as click -from ._cli_types import SnakemakeParams +from ._cli_types import HPCProfile, SnakemakeParams from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, snakemake_log @@ -34,7 +34,7 @@ @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Deconvolve", show_default=True, help = 'Output directory name') @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--setup-only', is_flag = True, hidden = True, show_default = True, default = False, help = 'Setup the workflow and exit') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) diff --git a/harpy/demultiplex.py b/harpy/demultiplex.py index 2700c520e..91624d409 100644 --- a/harpy/demultiplex.py +++ b/harpy/demultiplex.py @@ -7,7 +7,7 @@ from rich import box from rich.table import Table import rich_click as click -from ._cli_types import SnakemakeParams +from ._cli_types import HPCProfile, SnakemakeParams from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, snakemake_log @@ -44,7 +44,7 @@ def demultiplex(): @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Demultiplex", show_default=True, help = 'Output directory name') @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') diff --git a/harpy/impute.py b/harpy/impute.py index 6279ec13b..3ee64901e 100644 --- a/harpy/impute.py +++ b/harpy/impute.py @@ -7,7 +7,7 @@ from rich import box from rich.table import Table import rich_click as click -from ._cli_types import InputFile, SnakemakeParams +from ._cli_types import HPCProfile, InputFile, SnakemakeParams from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, fetch_report, fetch_script, snakemake_log @@ -35,7 +35,7 @@ @click.option('-v', '--vcf', required = True, type = InputFile(["vcf", "bcf", "vcf.gz"], gzip_ok = False), help = 'Path to BCF/VCF file') @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') diff --git a/harpy/metassembly.py b/harpy/metassembly.py index 1541c0ba4..a2be5fbce 100644 --- a/harpy/metassembly.py +++ b/harpy/metassembly.py @@ -9,7 +9,7 @@ from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, snakemake_log -from ._cli_types import KParam, SnakemakeParams +from ._cli_types import HPCProfile, KParam, SnakemakeParams from ._validations import validate_fastq_bx docstring = { @@ -37,7 +37,7 @@ @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max_open = True), help = 'Number of threads to use') @click.option('-u', '--organism-type', type = click.Choice(['prokaryote', 'eukaryote', 'fungus'], case_sensitive=False), default = "eukaryote", show_default=True, help = "Organism type for assembly report [`eukaryote`,`prokaryote`,`fungus`]") @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') diff --git a/harpy/phase.py b/harpy/phase.py index 564fdcfd9..43b0c214d 100644 --- a/harpy/phase.py +++ b/harpy/phase.py @@ -9,7 +9,7 @@ from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, fetch_report, snakemake_log -from ._cli_types import ContigList, InputFile, SnakemakeParams +from ._cli_types import ContigList, HPCProfile, InputFile, SnakemakeParams from ._parsers import parse_alignment_inputs from ._validations import check_fasta, vcf_sample_match, validate_bam_RG, vcf_contig_match @@ -38,7 +38,7 @@ @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--contigs', type = ContigList(), help = 'File or list of contigs to plot') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') diff --git a/harpy/preflight.py b/harpy/preflight.py index 68a1372e5..245db0ba6 100755 --- a/harpy/preflight.py +++ b/harpy/preflight.py @@ -6,7 +6,7 @@ from rich import box from rich.table import Table import rich_click as click -from ._cli_types import SnakemakeParams +from ._cli_types import HPCProfile, SnakemakeParams from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, fetch_report, snakemake_log @@ -42,7 +42,7 @@ def preflight(): @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max_open = True), help = 'Number of threads to use') @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True), nargs=-1) @@ -103,7 +103,7 @@ def fastq(inputs, output_dir, threads, snakemake, quiet, hpc, conda, setup_only) @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Preflight/bam", show_default=True, help = 'Output directory name') @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) diff --git a/harpy/qc.py b/harpy/qc.py index 9817c67ad..cb49a184d 100644 --- a/harpy/qc.py +++ b/harpy/qc.py @@ -9,7 +9,7 @@ from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_report, fetch_rule, snakemake_log -from ._cli_types import IntList, SnakemakeParams +from ._cli_types import HPCProfile, IntList, SnakemakeParams from ._parsers import parse_fastq_inputs docstring = { @@ -36,7 +36,7 @@ @click.option('-a', '--trim-adapters', is_flag = True, default = False, help = 'Detect and trim adapters') @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--setup-only', is_flag = True, hidden = True, show_default = True, default = False, help = 'Setup the workflow and exit') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, default = False, help = 'Don\'t show output text while running') @click.option('--skip-reports', is_flag = True, default = False, help = 'Don\'t generate HTML reports') @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') diff --git a/harpy/simulate.py b/harpy/simulate.py index b45562f4c..0861ae288 100644 --- a/harpy/simulate.py +++ b/harpy/simulate.py @@ -6,7 +6,7 @@ from rich import box from rich.table import Table import rich_click as click -from ._cli_types import InputFile, SnakemakeParams +from ._cli_types import HPCProfile, InputFile, SnakemakeParams from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, fetch_script, snakemake_log @@ -134,7 +134,7 @@ def simulate(): @click.option('-p', '--partitions', type = click.IntRange(min = 1), default=1500, show_default=True, help = "Number (in thousands) of partitions/beads to generate") @click.option('-n', '--read-pairs', type = click.FloatRange(min = 0.001), default = 600, show_default=True, help = "Number (in millions) of read pairs to simulate") @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max_open = True), help = 'Number of threads to use') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--setup-only', is_flag = True, hidden = True, show_default = True, default = False, help = 'Setup the workflow and exit') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @@ -226,7 +226,7 @@ def linkedreads(genome_hap1, genome_hap2, output_dir, outer_distance, mutation_r @click.option('-p', '--prefix', type = str, default= "sim", show_default=True, help = "Naming prefix for output files") @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--setup-only', is_flag = True, hidden = True, show_default = True, default = False, help = 'Setup the workflow and exit') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--randomseed', type = click.IntRange(min = 1), help = "Random seed for simulation") @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @@ -351,7 +351,7 @@ def snpindel(genome, snp_vcf, indel_vcf, only_vcf, output_dir, prefix, snp_count @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Simulate/inversion", show_default=True, help = 'Output directory name') @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--setup-only', is_flag = True, hidden = True, show_default = True, default = False, help = 'Setup the workflow and exit') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--randomseed', type = click.IntRange(min = 1), help = "Random seed for simulation") @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @@ -460,7 +460,7 @@ def inversion(genome, vcf, only_vcf, prefix, output_dir, count, min_size, max_si @click.option('-p', '--prefix', type = str, default= "sim", show_default=True, help = "Naming prefix for output files") @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--setup-only', is_flag = True, hidden = True, show_default = True, default = False, help = 'Setup the workflow and exit') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--randomseed', type = click.IntRange(min = 1), help = "Random seed for simulation") @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @@ -572,7 +572,7 @@ def cnv(genome, output_dir, vcf, only_vcf, prefix, count, min_size, max_size, du @click.option('-p', '--prefix', type = str, default= "sim", show_default=True, help = "Naming prefix for output files") @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--setup-only', is_flag = True, hidden = True, show_default = True, default = False, help = 'Setup the workflow and exit') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--randomseed', type = click.IntRange(min = 1), help = "Random seed for simulation") @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') diff --git a/harpy/snp.py b/harpy/snp.py index 27d4c8e63..6a08d8bad 100644 --- a/harpy/snp.py +++ b/harpy/snp.py @@ -7,7 +7,7 @@ from rich import box from rich.table import Table import rich_click as click -from ._cli_types import InputFile, SnakemakeParams +from ._cli_types import HPCProfile, InputFile, SnakemakeParams from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, fetch_report, snakemake_log @@ -55,7 +55,7 @@ def snp(): @click.option('-p', '--populations', type=click.Path(exists = True, dir_okay=False, readable=True), help = "Tab-delimited file of sample\population") @click.option('-r', '--regions', type=str, default=50000, show_default=True, help = "Regions where to call variants") @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max_open = True), help = 'Number of threads to use') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @@ -162,7 +162,7 @@ def mpileup(inputs, output_dir, regions, genome, threads, populations, ploidy, e @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max_open = True), help = 'Number of threads to use') @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') diff --git a/harpy/sv.py b/harpy/sv.py index 133c3ed5c..4a0322d7a 100644 --- a/harpy/sv.py +++ b/harpy/sv.py @@ -7,7 +7,7 @@ from rich import box from rich.table import Table import rich_click as click -from ._cli_types import ContigList, InputFile, SnakemakeParams +from ._cli_types import ContigList, HPCProfile, InputFile, SnakemakeParams from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, fetch_report, snakemake_log @@ -52,7 +52,7 @@ def sv(): } @click.command(no_args_is_help = True, epilog= "Documentation: https://pdimens.github.io/harpy/workflows/sv/leviathan/") -@click.option('-x', '--extra-params', type = str, help = 'Additional variant caller parameters, in quotes') +@click.option('-x', '--extra-params', type = str, help = 'Additional leviathan parameters, in quotes') @click.option('-g', '--genome', type=InputFile([".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), required = True, help = 'Genome assembly for variant calling') @click.option('-i', '--iterations', show_default = True, default=50, type = click.IntRange(min = 10, max_open = True), help = 'Number of iterations to perform through index (reduces memory)') @click.option('-s', '--min-sv', type = click.IntRange(min = 10, max_open = True), default = 1000, show_default=True, help = 'Minimum size of SV to detect') @@ -63,7 +63,7 @@ def sv(): @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--contigs', type = ContigList(), help = 'File or list of contigs to plot') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @@ -141,7 +141,7 @@ def leviathan(inputs, output_dir, genome, min_sv, min_barcodes, iterations, thre launch_snakemake(command, "sv_leviathan", start_text, output_dir, sm_log, quiet, "workflow/sv.leviathan.summary") @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/sv/naibr/") -@click.option('-x', '--extra-params', type = str, help = 'Additional variant caller parameters, in quotes') +@click.option('-x', '--extra-params', type = str, help = 'Additional naibr parameters, in quotes') @click.option('-g', '--genome', type=InputFile([".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), required = True, help = 'Genome assembly for calling variants') @click.option('-b', '--min-barcodes', show_default = True, default=2, type = click.IntRange(min = 1, max_open = True), help = 'Minimum number of barcode overlaps supporting candidate SV') @click.option('-q', '--min-quality', show_default = True, default=30, type = click.IntRange(min = 0, max = 40), help = 'Minimum mapping quality of reads to use') @@ -154,7 +154,7 @@ def leviathan(inputs, output_dir, genome, min_sv, min_barcodes, iterations, thre @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--contigs', type = ContigList(), help = 'File or list of contigs to plot') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') From 1bd937065f4fe348e20297b7e2efe9b14a84cd93 Mon Sep 17 00:00:00 2001 From: pdimens Date: Fri, 1 Nov 2024 11:09:30 -0400 Subject: [PATCH 08/27] rename to generic to add program-specific file --- .github/filters.yml | 2 +- .../{_cli_types.py => _cli_types_generic.py} | 4 +++- harpy/_cli_types_params.py | 21 +++++++++++++++++++ harpy/align.py | 2 +- harpy/assembly.py | 2 +- harpy/deconvolve.py | 2 +- harpy/demultiplex.py | 2 +- harpy/impute.py | 2 +- harpy/metassembly.py | 2 +- harpy/phase.py | 2 +- harpy/preflight.py | 2 +- harpy/qc.py | 2 +- harpy/simulate.py | 2 +- harpy/snp.py | 2 +- harpy/sv.py | 2 +- 15 files changed, 37 insertions(+), 14 deletions(-) rename harpy/{_cli_types.py => _cli_types_generic.py} (98%) create mode 100644 harpy/_cli_types_params.py diff --git a/.github/filters.yml b/.github/filters.yml index e1e3b6a91..54dccac03 100644 --- a/.github/filters.yml +++ b/.github/filters.yml @@ -1,7 +1,7 @@ common: &common - 'harpy/_launch.py' - 'harpy/_misc.py' - - 'harpy/_cli_types.py' + - 'harpy/_cli_types_generic.py' - 'harpy/_parsers.py' - 'harpy/_printing.py' - 'harpy/_validations.py' diff --git a/harpy/_cli_types.py b/harpy/_cli_types_generic.py similarity index 98% rename from harpy/_cli_types.py rename to harpy/_cli_types_generic.py index b8514ed9e..def96a33a 100644 --- a/harpy/_cli_types.py +++ b/harpy/_cli_types_generic.py @@ -1,4 +1,4 @@ -"""Module with helper function to set up Harpy workflows""" +"""Module with python-click types for command-line level validations of inputs""" import os import click @@ -107,3 +107,5 @@ def convert(self, value, param, ctx): elif not os.access(f"{value}/config.yaml", os.R_OK): self.fail(f"{value}/config.yaml does not have read access. Please check the file permissions and try again.", param, ctx) return value + +### program-specific extra-params types diff --git a/harpy/_cli_types_params.py b/harpy/_cli_types_params.py new file mode 100644 index 000000000..762075762 --- /dev/null +++ b/harpy/_cli_types_params.py @@ -0,0 +1,21 @@ +"""Module with python-click types for command-line level validations of program-specific extra-params inputs""" + +import os +import click + +class HPCProfile(click.ParamType): + """A class for a click type which accepts a directory with a snakemake HPC profile. Does validations to make sure the config file is there.""" + name = "hpc_profile" + def convert(self, value, param, ctx): + if not os.path.exists(value): + self.fail(f"{value} does not exist. Please check the spelling and try again.", param, ctx) + elif not os.access(value, os.R_OK): + self.fail(f"{value} is not readable. Please check file/directory permissions and try again", param, ctx) + if os.path.isfile(value): + self.fail(f"{value} is a file, but input should be a directory.", param, ctx) + if not os.path.exists(f"{value}/config.yaml"): + self.fail(f"{value} does not contain the necessary config.yaml file.", param, ctx) + elif not os.access(f"{value}/config.yaml", os.R_OK): + self.fail(f"{value}/config.yaml does not have read access. Please check the file permissions and try again.", param, ctx) + return value + diff --git a/harpy/align.py b/harpy/align.py index a99aaa00b..d252d314f 100644 --- a/harpy/align.py +++ b/harpy/align.py @@ -10,7 +10,7 @@ import rich_click as click from ._conda import create_conda_recipes from ._misc import fetch_report, fetch_rule, snakemake_log -from ._cli_types import ContigList, InputFile, HPCProfile, SnakemakeParams +from ._cli_types_generic import ContigList, InputFile, HPCProfile, SnakemakeParams from ._launch import launch_snakemake from ._parsers import parse_fastq_inputs from ._printing import print_error, print_solution, print_notice diff --git a/harpy/assembly.py b/harpy/assembly.py index dd3098a25..f11feb18f 100644 --- a/harpy/assembly.py +++ b/harpy/assembly.py @@ -6,7 +6,7 @@ from rich import box from rich.table import Table import rich_click as click -from ._cli_types import KParam, HPCProfile, SnakemakeParams +from ._cli_types_generic import KParam, HPCProfile, SnakemakeParams from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, snakemake_log diff --git a/harpy/deconvolve.py b/harpy/deconvolve.py index 3b1c6f3f3..b2fe3be49 100644 --- a/harpy/deconvolve.py +++ b/harpy/deconvolve.py @@ -6,7 +6,7 @@ from rich import box from rich.table import Table import rich_click as click -from ._cli_types import HPCProfile, SnakemakeParams +from ._cli_types_generic import HPCProfile, SnakemakeParams from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, snakemake_log diff --git a/harpy/demultiplex.py b/harpy/demultiplex.py index 91624d409..7a0b82402 100644 --- a/harpy/demultiplex.py +++ b/harpy/demultiplex.py @@ -7,7 +7,7 @@ from rich import box from rich.table import Table import rich_click as click -from ._cli_types import HPCProfile, SnakemakeParams +from ._cli_types_generic import HPCProfile, SnakemakeParams from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, snakemake_log diff --git a/harpy/impute.py b/harpy/impute.py index 3ee64901e..4dbb40532 100644 --- a/harpy/impute.py +++ b/harpy/impute.py @@ -7,7 +7,7 @@ from rich import box from rich.table import Table import rich_click as click -from ._cli_types import HPCProfile, InputFile, SnakemakeParams +from ._cli_types_generic import HPCProfile, InputFile, SnakemakeParams from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, fetch_report, fetch_script, snakemake_log diff --git a/harpy/metassembly.py b/harpy/metassembly.py index a2be5fbce..c9ffc5b88 100644 --- a/harpy/metassembly.py +++ b/harpy/metassembly.py @@ -9,7 +9,7 @@ from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, snakemake_log -from ._cli_types import HPCProfile, KParam, SnakemakeParams +from ._cli_types_generic import HPCProfile, KParam, SnakemakeParams from ._validations import validate_fastq_bx docstring = { diff --git a/harpy/phase.py b/harpy/phase.py index 43b0c214d..3c541dc87 100644 --- a/harpy/phase.py +++ b/harpy/phase.py @@ -9,7 +9,7 @@ from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, fetch_report, snakemake_log -from ._cli_types import ContigList, HPCProfile, InputFile, SnakemakeParams +from ._cli_types_generic import ContigList, HPCProfile, InputFile, SnakemakeParams from ._parsers import parse_alignment_inputs from ._validations import check_fasta, vcf_sample_match, validate_bam_RG, vcf_contig_match diff --git a/harpy/preflight.py b/harpy/preflight.py index 245db0ba6..5880cad23 100755 --- a/harpy/preflight.py +++ b/harpy/preflight.py @@ -6,7 +6,7 @@ from rich import box from rich.table import Table import rich_click as click -from ._cli_types import HPCProfile, SnakemakeParams +from ._cli_types_generic import HPCProfile, SnakemakeParams from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, fetch_report, snakemake_log diff --git a/harpy/qc.py b/harpy/qc.py index cb49a184d..95c07b26e 100644 --- a/harpy/qc.py +++ b/harpy/qc.py @@ -9,7 +9,7 @@ from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_report, fetch_rule, snakemake_log -from ._cli_types import HPCProfile, IntList, SnakemakeParams +from ._cli_types_generic import HPCProfile, IntList, SnakemakeParams from ._parsers import parse_fastq_inputs docstring = { diff --git a/harpy/simulate.py b/harpy/simulate.py index 0861ae288..9650e82b8 100644 --- a/harpy/simulate.py +++ b/harpy/simulate.py @@ -6,7 +6,7 @@ from rich import box from rich.table import Table import rich_click as click -from ._cli_types import HPCProfile, InputFile, SnakemakeParams +from ._cli_types_generic import HPCProfile, InputFile, SnakemakeParams from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, fetch_script, snakemake_log diff --git a/harpy/snp.py b/harpy/snp.py index 6a08d8bad..a55c2ca22 100644 --- a/harpy/snp.py +++ b/harpy/snp.py @@ -7,7 +7,7 @@ from rich import box from rich.table import Table import rich_click as click -from ._cli_types import HPCProfile, InputFile, SnakemakeParams +from ._cli_types_generic import HPCProfile, InputFile, SnakemakeParams from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, fetch_report, snakemake_log diff --git a/harpy/sv.py b/harpy/sv.py index 4a0322d7a..effc00545 100644 --- a/harpy/sv.py +++ b/harpy/sv.py @@ -7,7 +7,7 @@ from rich import box from rich.table import Table import rich_click as click -from ._cli_types import ContigList, HPCProfile, InputFile, SnakemakeParams +from ._cli_types_generic import ContigList, HPCProfile, InputFile, SnakemakeParams from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, fetch_report, snakemake_log From e492be33c0538681f3e35158beea2f4137531f15 Mon Sep 17 00:00:00 2001 From: pdimens Date: Fri, 1 Nov 2024 11:53:15 -0400 Subject: [PATCH 09/27] add fastp param validation --- harpy/_cli_types_params.py | 32 ++++++++++++++++---------------- harpy/qc.py | 3 ++- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/harpy/_cli_types_params.py b/harpy/_cli_types_params.py index 762075762..03f9768d2 100644 --- a/harpy/_cli_types_params.py +++ b/harpy/_cli_types_params.py @@ -1,21 +1,21 @@ """Module with python-click types for command-line level validations of program-specific extra-params inputs""" -import os import click -class HPCProfile(click.ParamType): - """A class for a click type which accepts a directory with a snakemake HPC profile. Does validations to make sure the config file is there.""" - name = "hpc_profile" +class FastpParams(click.ParamType): + """A class for a click type that validates fastp extra-params.""" + name = "fastp_params" def convert(self, value, param, ctx): - if not os.path.exists(value): - self.fail(f"{value} does not exist. Please check the spelling and try again.", param, ctx) - elif not os.access(value, os.R_OK): - self.fail(f"{value} is not readable. Please check file/directory permissions and try again", param, ctx) - if os.path.isfile(value): - self.fail(f"{value} is a file, but input should be a directory.", param, ctx) - if not os.path.exists(f"{value}/config.yaml"): - self.fail(f"{value} does not contain the necessary config.yaml file.", param, ctx) - elif not os.access(f"{value}/config.yaml", os.R_OK): - self.fail(f"{value}/config.yaml does not have read access. Please check the file permissions and try again.", param, ctx) - return value - + harpy_fastp = "--length_required -l --max_len1 -b --detect_adapter_for_pe --disable_adapter_trimming -A --dedup -D".split() + faspt_options = " --failed_out -6 --phred64 --reads_to_process --fix_mgi_id -a --adapter_sequence --adapter_sequence_r2 --adapter_fasta -f --trim_front1 -t --trim_tail1 -b --max_len1 -F --trim_front2 -T --trim_tail2 -B --max_len2 --dup_calc_accuracy --dont_eval_duplication --poly_g_min_len -x --trim_poly_x --poly_x_min_len -5 --cut_front -3 --cut_tail -W --cut_window_size -M --cut_mean_quality --cut_front_window_size --cut_front_mean_quality --cut_tail_window_size --cut_tail_mean_quality --cut_right_window_size --cut_right_mean_quality -Q --disable_quality_filtering -q --qualified_quality_phred -u --unqualified_percent_limit -n --n_base_limit -e --average_qual -L --disable_length_filtering -l --length_required --length_limit -y --low_complexity_filter -Y --complexity_threshold --filter_by_index1 --filter_by_index2 --filter_by_index_threshold -c --correction --overlap_len_require --overlap_diff_limit --overlap_diff_percent_limit -U --umi --umi_loc --umi_len --umi_prefix --umi_skip -p --overrepresentation_analysis -P --overrepresentation_sampling -s --split -S --split_by_lines -d --split_prefix_digits".split() + opts = 0 + for i in value.split(): + if i.startswith("-"): + opts += 1 + if i in harpy_fastp: + self.fail(f"{i} is already used by Harpy when calling fastp.") + if i not in faspt_options: + self.fail(f"{i} is not a valid fastp option. See the fastp documentation for a list of available fastp options: https://github.com/OpenGene/fastp.") + if opts < 1: + self.fail("No valid options recognized. All fastp options begin with one or two dashes (e.g. --phred64 or -a). See the fastp documentation for a list of available fastp options: https://github.com/OpenGene/fastp.") + return value \ No newline at end of file diff --git a/harpy/qc.py b/harpy/qc.py index 95c07b26e..52ea8cd26 100644 --- a/harpy/qc.py +++ b/harpy/qc.py @@ -10,6 +10,7 @@ from ._launch import launch_snakemake from ._misc import fetch_report, fetch_rule, snakemake_log from ._cli_types_generic import HPCProfile, IntList, SnakemakeParams +from ._cli_types_params import FastpParams from ._parsers import parse_fastq_inputs docstring = { @@ -28,7 +29,7 @@ @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/qc") @click.option('-c', '--deconvolve', type = IntList(4), default = "0,0,0,0", help = 'Accepts the QuickDeconvolution parameters for `k`,`w`,`d`,`a` (in that order)') @click.option('-d', '--deduplicate', is_flag = True, default = False, help = 'Identify and remove PCR duplicates') -@click.option('-x', '--extra-params', type = str, help = 'Additional Fastp parameters, in quotes') +@click.option('-x', '--extra-params', type = FastpParams(), help = 'Additional Fastp parameters, in quotes') @click.option('-m', '--max-length', default = 150, show_default = True, type=int, help = 'Maximum length to trim sequences down to') @click.option('-n', '--min-length', default = 30, show_default = True, type=int, help = 'Discard reads shorter than this length') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "QC", show_default=True, help = 'Output directory name') From 964365d2b964d568e815df1bdeedfcc6dac702ca Mon Sep 17 00:00:00 2001 From: pdimens Date: Fri, 1 Nov 2024 12:14:25 -0400 Subject: [PATCH 10/27] add frag density as an option --- harpy/align.py | 6 ++++-- harpy/snakefiles/align_ema.smk | 9 ++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/harpy/align.py b/harpy/align.py index d252d314f..d99eec893 100644 --- a/harpy/align.py +++ b/harpy/align.py @@ -45,7 +45,7 @@ def align(): "harpy align ema": [ { "name": "Parameters", - "options": ["--ema-bins", "--extra-params", "--genome", "--keep-unmapped", "--platform", "--min-quality", "--barcode-list"], + "options": ["--ema-bins", "--extra-params", "--fragment-density", "--genome", "--keep-unmapped", "--platform", "--min-quality", "--barcode-list"], }, { "name": "Workflow Controls", @@ -150,6 +150,7 @@ def bwa(inputs, output_dir, genome, depth_window, threads, keep_unmapped, extra_ @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/align/ema") @click.option('-x', '--extra-params', type = str, help = 'Additional ema align parameters, in quotes') +@click.option('-d', '--fragment-density', is_flag = True, show_default = True, default = False, help = 'Perform read fragment density optimization') @click.option('-w', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats') @click.option('-b', '--ema-bins', default = 500, show_default = True, type = click.IntRange(1,1000), help="Number of barcode bins") @click.option('-g', '--genome', type=InputFile([".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), required = True, help = 'Genome assembly for read mapping') @@ -167,7 +168,7 @@ def bwa(inputs, output_dir, genome, depth_window, threads, keep_unmapped, extra_ @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) -def ema(inputs, output_dir, platform, barcode_list, genome, depth_window, keep_unmapped, threads, ema_bins, skip_reports, extra_params, min_quality, snakemake, quiet, hpc, conda, contigs, setup_only): +def ema(inputs, output_dir, platform, barcode_list, fragment_density, genome, depth_window, keep_unmapped, threads, ema_bins, skip_reports, extra_params, min_quality, snakemake, quiet, hpc, conda, contigs, setup_only): """ Align sequences to genome using `EMA` @@ -220,6 +221,7 @@ def ema(inputs, output_dir, platform, barcode_list, genome, depth_window, keep_u "output_directory" : output_dir, "alignment_quality" : min_quality, "keep_unmapped" : keep_unmapped, + "fragement_density_optimization": fragment_density, "depth_windowsize" : depth_window, "platform" : platform, "EMA_bins" : ema_bins, diff --git a/harpy/snakefiles/align_ema.smk b/harpy/snakefiles/align_ema.smk index f693a83fc..01a26891d 100644 --- a/harpy/snakefiles/align_ema.smk +++ b/harpy/snakefiles/align_ema.smk @@ -19,6 +19,7 @@ binrange = ["%03d" % i for i in range(nbins)] fqlist = config["inputs"]["fastq"] genomefile = config["inputs"]["genome"] platform = config["platform"] +frag_opt = config["fragment_density_optimization"] barcode_list = config["inputs"].get("barcode_list", "") extra = config.get("extra", "") bn = os.path.basename(genomefile) @@ -149,6 +150,7 @@ rule align_ema: RG_tag = lambda wc: "\"@RG\\tID:" + wc.get("sample") + "\\tSM:" + wc.get("sample") + "\"", bxtype = f"-p {platform}", tmpdir = lambda wc: outdir + "/." + d[wc.sample], + frag_opt = "-d" if frag_opt else "", quality = config["alignment_quality"], unmapped = "" if keep_unmapped else "-F 4", extra = extra @@ -158,7 +160,7 @@ rule align_ema: f"{envdir}/align.yaml" shell: """ - ema align -t {threads} {params.extra} -d {params.bxtype} -r {input.genome} -R {params.RG_tag} -x {input.readbin} 2> {log.ema} | + ema align -t {threads} {params.extra} {params.frag_opt} {params.bxtype} -r {input.genome} -R {params.RG_tag} -x {input.readbin} 2> {log.ema} | samtools view -h {params.unmapped} -q {params.quality} | samtools sort -T {params.tmpdir} --reference {input.genome} -O bam --write-index -m {resources.mem_mb}M -o {output.aln}##idx##{output.idx} - 2> {log.sort} rm -rf {params.tmpdir} @@ -353,7 +355,8 @@ rule workflow_summary: bx_report = outdir + "/reports/barcodes.summary.html" if (not skip_reports or len(samplenames) == 1) else [] params: beadtech = "-p" if platform == "haplotag" else f"-w {barcode_list}", - unmapped = "" if keep_unmapped else "-F 4" + unmapped = "" if keep_unmapped else "-F 4", + frag_opt = "-d" if frag_opt else "" run: summary = ["The harpy align ema workflow ran using these parameters:"] summary.append(f"The provided genome: {genomefile}") @@ -364,7 +367,7 @@ rule workflow_summary: bins += f"\tseqtk mergepe forward.fq.gz reverse.fq.gz | ema preproc {params.beadtech} -n {nbins}" summary.append(bins) ema_align = "Barcoded bins were aligned with ema align using:\n" - ema_align += f'\tema align {extra} -d -p {platform} -R "@RG\\tID:SAMPLE\\tSM:SAMPLE" |\n' + ema_align += f'\tema align {extra} {params.frag_opt} -p {platform} -R "@RG\\tID:SAMPLE\\tSM:SAMPLE" |\n' ema_align += f"\tsamtools view -h {params.unmapped} -q {config["alignment_quality"]} - |\n" ema_align += "\tsamtools sort --reference genome" summary.append(ema_align) From ba807e48f5bc970cf4a78e6f09be31e74b1565e2 Mon Sep 17 00:00:00 2001 From: pdimens Date: Fri, 1 Nov 2024 12:25:41 -0400 Subject: [PATCH 11/27] add extra params validations --- harpy/_cli_types_params.py | 68 ++++++++++++++++++++++++++++++++++---- harpy/align.py | 11 +++--- 2 files changed, 66 insertions(+), 13 deletions(-) diff --git a/harpy/_cli_types_params.py b/harpy/_cli_types_params.py index 03f9768d2..d506fb45c 100644 --- a/harpy/_cli_types_params.py +++ b/harpy/_cli_types_params.py @@ -6,16 +6,70 @@ class FastpParams(click.ParamType): """A class for a click type that validates fastp extra-params.""" name = "fastp_params" def convert(self, value, param, ctx): - harpy_fastp = "--length_required -l --max_len1 -b --detect_adapter_for_pe --disable_adapter_trimming -A --dedup -D".split() - faspt_options = " --failed_out -6 --phred64 --reads_to_process --fix_mgi_id -a --adapter_sequence --adapter_sequence_r2 --adapter_fasta -f --trim_front1 -t --trim_tail1 -b --max_len1 -F --trim_front2 -T --trim_tail2 -B --max_len2 --dup_calc_accuracy --dont_eval_duplication --poly_g_min_len -x --trim_poly_x --poly_x_min_len -5 --cut_front -3 --cut_tail -W --cut_window_size -M --cut_mean_quality --cut_front_window_size --cut_front_mean_quality --cut_tail_window_size --cut_tail_mean_quality --cut_right_window_size --cut_right_mean_quality -Q --disable_quality_filtering -q --qualified_quality_phred -u --unqualified_percent_limit -n --n_base_limit -e --average_qual -L --disable_length_filtering -l --length_required --length_limit -y --low_complexity_filter -Y --complexity_threshold --filter_by_index1 --filter_by_index2 --filter_by_index_threshold -c --correction --overlap_len_require --overlap_diff_limit --overlap_diff_percent_limit -U --umi --umi_loc --umi_len --umi_prefix --umi_skip -p --overrepresentation_analysis -P --overrepresentation_sampling -s --split -S --split_by_lines -d --split_prefix_digits".split() + harpy_options = "--length_required -l --max_len1 -b --detect_adapter_for_pe --disable_adapter_trimming -A --dedup -D".split() + valid_options = "--failed_out -6 --phred64 --reads_to_process --fix_mgi_id -a --adapter_sequence --adapter_sequence_r2 --adapter_fasta -f --trim_front1 -t --trim_tail1 -b --max_len1 -F --trim_front2 -T --trim_tail2 -B --max_len2 --dup_calc_accuracy --dont_eval_duplication --poly_g_min_len -x --trim_poly_x --poly_x_min_len -5 --cut_front -3 --cut_tail -W --cut_window_size -M --cut_mean_quality --cut_front_window_size --cut_front_mean_quality --cut_tail_window_size --cut_tail_mean_quality --cut_right_window_size --cut_right_mean_quality -Q --disable_quality_filtering -q --qualified_quality_phred -u --unqualified_percent_limit -n --n_base_limit -e --average_qual -L --disable_length_filtering -l --length_required --length_limit -y --low_complexity_filter -Y --complexity_threshold --filter_by_index1 --filter_by_index2 --filter_by_index_threshold -c --correction --overlap_len_require --overlap_diff_limit --overlap_diff_percent_limit -U --umi --umi_loc --umi_len --umi_prefix --umi_skip -p --overrepresentation_analysis -P --overrepresentation_sampling -s --split -S --split_by_lines -d --split_prefix_digits".split() opts = 0 for i in value.split(): if i.startswith("-"): opts += 1 - if i in harpy_fastp: - self.fail(f"{i} is already used by Harpy when calling fastp.") - if i not in faspt_options: - self.fail(f"{i} is not a valid fastp option. See the fastp documentation for a list of available fastp options: https://github.com/OpenGene/fastp.") + if i in harpy_options: + self.fail(f"{i} is already used by Harpy when calling fastp.", param, ctx) + if i not in valid_options: + self.fail(f"{i} is not a valid fastp option. See the fastp documentation for a list of available options: https://github.com/OpenGene/fastp.", param, ctx) if opts < 1: - self.fail("No valid options recognized. All fastp options begin with one or two dashes (e.g. --phred64 or -a). See the fastp documentation for a list of available fastp options: https://github.com/OpenGene/fastp.") + self.fail("No valid options recognized. All fastp options begin with one or two dashes (e.g. --phred64 or -a). See the fastp documentation for a list of available options: https://github.com/OpenGene/fastp.", param, ctx) + return value + +class BwaParams(click.ParamType): + """A class for a click type that validates bwa extra-params.""" + name = "bwa_params" + def convert(self, value, param, ctx): + harpy_options = "-C -v -t -R".split() + valid_options = "-k -w -d -r -c -P -A -B -O -E -L -U -p -T -a -H -M ".split() + opts = 0 + for i in value.split(): + if i.startswith("-"): + opts += 1 + if i in harpy_options: + self.fail(f"{i} is already used by Harpy when calling bwa mem.", param, ctx) + if i not in valid_options: + self.fail(f"{i} is not a valid bwa mem option. See the bwa documentation for a list of available options: https://bio-bwa.sourceforge.net/bwa.shtml.", param, ctx) + if opts < 1: + self.fail("No valid options recognized. All bwa options begin with one dash (e.g. -M). See the bwa documentation for a list of available options: https://bio-bwa.sourceforge.net/bwa.shtml.", param, ctx) + return value + +class EmaParams(click.ParamType): + """A class for a click type that validates ema extra-params.""" + name = "ema_params" + def convert(self, value, param, ctx): + harpy_options = "-t -p -d -r -R -x".split() + valid_options = "-i".split() + opts = 0 + for i in value.split(): + if i.startswith("-"): + opts += 1 + if i in harpy_options: + self.fail(f"{i} is already used by Harpy when calling ema.", param, ctx) + if i not in valid_options: + self.fail(f"{i} is not a valid ema option. See the fastp documentation for a list of available options: https://github.com/arshajii/ema.", param, ctx) + if opts < 1: + self.fail("No valid options recognized. All ema options begin with one dash (e.g. -i). See the ema documentation for a list of available options: https://github.com/arshajii/ema.", param, ctx) + return value + +class StrobeAlignParams(click.ParamType): + """A class for a click type that validates strobealign extra-params.""" + name = "strobealign_params" + def convert(self, value, param, ctx): + harpy_options = "--use-index -i --create-index -r -N -t -U -C --rg-id --rg".split() + valid_options = "-x -A -B -O -E -L -f -S -M -R -m -k -l -u -c -s -b --aux-len --aemb --eqx --no-PG --details".split() + opts = 0 + for i in value.split(): + if i.startswith("-"): + opts += 1 + if i in harpy_options: + self.fail(f"{i} is already used by Harpy when calling strobealign.", param, ctx) + if i not in valid_options: + self.fail(f"{i} is not a valid strobealign option. See the strobealign documentation for a list of available options: https://github.com/ksahlin/strobealign.", param, ctx) + if opts < 1: + self.fail("No valid options recognized. All strobealign options begin with one or two dashes (e.g. --phred64 or -a). See the strobealign documentation for a list of available options: https://github.com/ksahlin/strobealign.", param, ctx) return value \ No newline at end of file diff --git a/harpy/align.py b/harpy/align.py index d99eec893..2b63152bc 100644 --- a/harpy/align.py +++ b/harpy/align.py @@ -3,7 +3,6 @@ import os import sys import yaml -from time import sleep from pathlib import Path from rich import box from rich.table import Table @@ -11,6 +10,7 @@ from ._conda import create_conda_recipes from ._misc import fetch_report, fetch_rule, snakemake_log from ._cli_types_generic import ContigList, InputFile, HPCProfile, SnakemakeParams +from ._cli_types_params import BwaParams, EmaParams, StrobeAlignParams from ._launch import launch_snakemake from ._parsers import parse_fastq_inputs from ._printing import print_error, print_solution, print_notice @@ -67,7 +67,7 @@ def align(): @click.command(no_args_is_help = True, epilog= "Documentation: https://pdimens.github.io/harpy/workflows/align/bwa/") @click.option('-g', '--genome', type=InputFile([".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), required = True, help = 'Genome assembly for read mapping') @click.option('-w', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats') -@click.option('-x', '--extra-params', type = str, help = 'Additional bwa mem parameters, in quotes') +@click.option('-x', '--extra-params', type = BwaParams(), help = 'Additional bwa mem parameters, in quotes') @click.option('-u', '--keep-unmapped', is_flag = True, default = False, help = 'Retain unmapped sequences in the output') @click.option('-q', '--min-quality', default = 30, show_default = True, type = click.IntRange(min = 0, max = 40), help = 'Minimum mapping quality to pass filtering') @click.option('-d', '--molecule-distance', default = 100000, show_default = True, type = int, help = 'Distance cutoff to split molecules (bp)') @@ -149,7 +149,7 @@ def bwa(inputs, output_dir, genome, depth_window, threads, keep_unmapped, extra_ launch_snakemake(command, "align_bwa", start_text, output_dir, sm_log, quiet, "workflow/align.bwa.summary") @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/align/ema") -@click.option('-x', '--extra-params', type = str, help = 'Additional ema align parameters, in quotes') +@click.option('-x', '--extra-params', type = EmaParams(), help = 'Additional ema align parameters, in quotes') @click.option('-d', '--fragment-density', is_flag = True, show_default = True, default = False, help = 'Perform read fragment density optimization') @click.option('-w', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats') @click.option('-b', '--ema-bins', default = 500, show_default = True, type = click.IntRange(1,1000), help="Number of barcode bins") @@ -201,9 +201,8 @@ def ema(inputs, output_dir, platform, barcode_list, fragment_density, genome, de else: print_solution("Running EMA requires TELLseq barcodes provided to [green]--barcode-list[/green]. They can be acquired from the TELL-read software [dim]https://www.illumina.com/products/by-type/informatics-products/basespace-sequence-hub/apps/universal-sequencing-tell-seq-data-analysis-pipeline.html[/dim]") sys.exit(1) - if platform == "haplotag" and barcode_list: + if platform == "haplotag" and barcode_list and not quiet: print_notice("Haplotag data does not require a barcode list and the file provided to [green]--barcode-list[/green] will be ignored.") - sleep(3) os.makedirs(f"{workflowdir}/", exist_ok= True) fqlist, sample_count = parse_fastq_inputs(inputs) @@ -258,7 +257,7 @@ def ema(inputs, output_dir, platform, barcode_list, fragment_density, genome, de @click.command(no_args_is_help = True, epilog= "Documentation: https://pdimens.github.io/harpy/workflows/align/strobe/") @click.option('-g', '--genome', type=InputFile([".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), required = True, help = 'Genome assembly for read mapping') @click.option('-w', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats') -@click.option('-x', '--extra-params', type = str, help = 'Additional aligner parameters, in quotes') +@click.option('-x', '--extra-params', type = StrobeAlignParams(), help = 'Additional aligner parameters, in quotes') @click.option('-u', '--keep-unmapped', is_flag = True, default = False, help = 'Retain unmapped sequences in the output') @click.option('-q', '--min-quality', default = 30, show_default = True, type = click.IntRange(min = 0, max = 40), help = 'Minimum mapping quality to pass filtering') @click.option('-d', '--molecule-distance', default = 100000, show_default = True, type = int, help = 'Distance cutoff to split molecules (bp)') From e2f3a93355ce32d034a66aed8a122b738462b71b Mon Sep 17 00:00:00 2001 From: pdimens Date: Fri, 1 Nov 2024 12:26:09 -0400 Subject: [PATCH 12/27] realistic example --- harpy/_cli_types_params.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/harpy/_cli_types_params.py b/harpy/_cli_types_params.py index d506fb45c..33939093e 100644 --- a/harpy/_cli_types_params.py +++ b/harpy/_cli_types_params.py @@ -71,5 +71,5 @@ def convert(self, value, param, ctx): if i not in valid_options: self.fail(f"{i} is not a valid strobealign option. See the strobealign documentation for a list of available options: https://github.com/ksahlin/strobealign.", param, ctx) if opts < 1: - self.fail("No valid options recognized. All strobealign options begin with one or two dashes (e.g. --phred64 or -a). See the strobealign documentation for a list of available options: https://github.com/ksahlin/strobealign.", param, ctx) + self.fail("No valid options recognized. All strobealign options begin with one or two dashes (e.g. --eqx or -L). See the strobealign documentation for a list of available options: https://github.com/ksahlin/strobealign.", param, ctx) return value \ No newline at end of file From d159fb3e76102f09c3a8ee152fa557fe41611e29 Mon Sep 17 00:00:00 2001 From: pdimens Date: Fri, 1 Nov 2024 14:59:18 -0400 Subject: [PATCH 13/27] add spades param clu --- harpy/_cli_types_params.py | 60 ++++++++++++++++++++++++++++++++++- harpy/assembly.py | 3 +- harpy/metassembly.py | 3 +- harpy/snakefiles/assembly.smk | 4 +-- 4 files changed, 64 insertions(+), 6 deletions(-) diff --git a/harpy/_cli_types_params.py b/harpy/_cli_types_params.py index 33939093e..64f217d73 100644 --- a/harpy/_cli_types_params.py +++ b/harpy/_cli_types_params.py @@ -72,4 +72,62 @@ def convert(self, value, param, ctx): self.fail(f"{i} is not a valid strobealign option. See the strobealign documentation for a list of available options: https://github.com/ksahlin/strobealign.", param, ctx) if opts < 1: self.fail("No valid options recognized. All strobealign options begin with one or two dashes (e.g. --eqx or -L). See the strobealign documentation for a list of available options: https://github.com/ksahlin/strobealign.", param, ctx) - return value \ No newline at end of file + return value + +class SpadesParams(click.ParamType): + """A class for a click type that validates spades extra-params.""" + name = "spades_params" + def convert(self, value, param, ctx): + harpy_options = "-t -m -k --gemcode1-1 --gemcode1-2 -o --isolate --pe1-1 --pe1-2".split() + valid_options = "--dataset --pacbio --nanopore --sanger --trusted-contigs --untrusted-contigs --assembly-graph --cov-cutoff --phred-offset --custom-hmms --gfa11".split() + valid_options += [f"--mp{x}-{orient}" for x in range(1,10) for orient in ["1","2","12", "fr", "rf", "ff"]] + valid_options += [f"--hqmp{x}-{orient}" for x in range(1,10) for orient in ["1","2","12", "s", "fr", "rf", "ff"]] + opts = 0 + for i in value.split(): + if i.startswith("-"): + opts += 1 + if i in harpy_options: + self.fail(f"{i} is already used by Harpy when calling spades.", param, ctx) + if i not in valid_options: + self.fail(f"{i} is not a valid spades option. See the spades documentation for a list of available options: http://ablab.github.io/spades/running.html.", param, ctx) + if opts < 1: + self.fail("No valid options recognized. All spades options begin with one or two dashes (e.g. --cov-cutoff). See the spades documentation for a list of available options: http://ablab.github.io/spades/running.html.", param, ctx) + return value + +class XXXParams(click.ParamType): + """A class for a click type that validates XXX extra-params.""" + name = "XXX_params" + def convert(self, value, param, ctx): + harpy_options = "".split() + valid_options = "".split() + opts = 0 + for i in value.split(): + if i.startswith("-"): + opts += 1 + if i in harpy_options: + self.fail(f"{i} is already used by Harpy when calling XXX.", param, ctx) + if i not in valid_options: + self.fail(f"{i} is not a valid XXX option. See the XXX documentation for a list of available options: XXXX.", param, ctx) + if opts < 1: + self.fail("No valid options recognized. All XXX options begin with one or two dashes (e.g. --eqx or -L). See the XXX documentation for a list of available options: XXXX.", param, ctx) + return value + +class XXXParams(click.ParamType): + """A class for a click type that validates XXX extra-params.""" + name = "XXX_params" + def convert(self, value, param, ctx): + harpy_options = "".split() + valid_options = "".split() + opts = 0 + for i in value.split(): + if i.startswith("-"): + opts += 1 + if i in harpy_options: + self.fail(f"{i} is already used by Harpy when calling XXX.", param, ctx) + if i not in valid_options: + self.fail(f"{i} is not a valid XXX option. See the XXX documentation for a list of available options: XXXX.", param, ctx) + if opts < 1: + self.fail("No valid options recognized. All XXX options begin with two dashes (e.g. --eqx or -L). See the XXX documentation for a list of available options: XXXX.", param, ctx) + return value + + diff --git a/harpy/assembly.py b/harpy/assembly.py index f11feb18f..f9a2ced95 100644 --- a/harpy/assembly.py +++ b/harpy/assembly.py @@ -7,6 +7,7 @@ from rich.table import Table import rich_click as click from ._cli_types_generic import KParam, HPCProfile, SnakemakeParams +from ._cli_types_params import SpadesParams from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, snakemake_log @@ -34,7 +35,7 @@ @click.option('-b', '--bx-tag', type = click.Choice(['BX', 'BC'], case_sensitive=False), default = "BX", show_default=True, help = "The header tag with the barcode [`BX`,`BC`]") @click.option('-k', '--kmer-length', type = KParam(), show_default = True, default = "auto", help = 'K values to use for assembly (`odd` and `<128`)') @click.option('-r', '--max-memory', type = click.IntRange(min = 1000, max_open = True), show_default = True, default = 10000, help = 'Maximum memory for spades to use, in megabytes') -@click.option('-x', '--extra-params', type = str, help = 'Additional spades parameters, in quotes') +@click.option('-x', '--extra-params', type = SpadesParams(), help = 'Additional spades parameters, in quotes') # TIGMINT/ARCS/LINKS @click.option('-y', '--arcs-extra', type = str, help = 'Additional ARCS parameters, in quotes') @click.option("-c","--contig-length", type = int, default = 500, show_default = True, help = "Minimum contig length") diff --git a/harpy/metassembly.py b/harpy/metassembly.py index c9ffc5b88..1fb8ddb81 100644 --- a/harpy/metassembly.py +++ b/harpy/metassembly.py @@ -10,6 +10,7 @@ from ._launch import launch_snakemake from ._misc import fetch_rule, snakemake_log from ._cli_types_generic import HPCProfile, KParam, SnakemakeParams +from ._cli_types_params import SpadesParams from ._validations import validate_fastq_bx docstring = { @@ -31,7 +32,7 @@ @click.option('-k', '--kmer-length', type = KParam(), show_default = True, default = "auto", help = 'K values to use for assembly (`odd` and `<128`)') @click.option('-r', '--max-memory', type = click.IntRange(min = 1000, max_open = True), show_default = True, default = 10000, help = 'Maximum memory for spades to use, in megabytes') @click.option('--ignore-bx', is_flag = True, show_default = True, default = False, help = 'Ignore linked-read info for initial spades assembly') -@click.option('-x', '--extra-params', type = str, help = 'Additional spades parameters, in quotes') +@click.option('-x', '--extra-params', type = SpadesParams(), help = 'Additional spades parameters, in quotes') # Common Workflow @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Metassembly", show_default=True, help = 'Output directory name') @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max_open = True), help = 'Number of threads to use') diff --git a/harpy/snakefiles/assembly.smk b/harpy/snakefiles/assembly.smk index f8246229c..0ee14767d 100644 --- a/harpy/snakefiles/assembly.smk +++ b/harpy/snakefiles/assembly.smk @@ -155,9 +155,7 @@ rule BUSCO_analysis: conda: f"{envdir}/assembly.yaml" shell: - """ - ( busco -f -i {input} -c {threads} -m genome --out_path {params} > {log} 2>&1 ) || touch {output} - """ + "( busco -f -i {input} -c {threads} -m genome --out_path {params} > {log} 2>&1 ) || touch {output}" rule build_report: input: From 493dfaae9252e4ad414f35d879fece29736ce369 Mon Sep 17 00:00:00 2001 From: pdimens Date: Fri, 1 Nov 2024 15:44:35 -0400 Subject: [PATCH 14/27] add Arcs param validation --- harpy/_cli_types_params.py | 33 ++++++++++++++++++--------------- harpy/assembly.py | 4 ++-- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/harpy/_cli_types_params.py b/harpy/_cli_types_params.py index 64f217d73..cdb8dd956 100644 --- a/harpy/_cli_types_params.py +++ b/harpy/_cli_types_params.py @@ -17,7 +17,7 @@ def convert(self, value, param, ctx): if i not in valid_options: self.fail(f"{i} is not a valid fastp option. See the fastp documentation for a list of available options: https://github.com/OpenGene/fastp.", param, ctx) if opts < 1: - self.fail("No valid options recognized. All fastp options begin with one or two dashes (e.g. --phred64 or -a). See the fastp documentation for a list of available options: https://github.com/OpenGene/fastp.", param, ctx) + self.fail("No valid options recognized. Available fastp options begin with one or two dashes (e.g. --phred64 or -a). See the fastp documentation for a list of available options: https://github.com/OpenGene/fastp.", param, ctx) return value class BwaParams(click.ParamType): @@ -35,7 +35,7 @@ def convert(self, value, param, ctx): if i not in valid_options: self.fail(f"{i} is not a valid bwa mem option. See the bwa documentation for a list of available options: https://bio-bwa.sourceforge.net/bwa.shtml.", param, ctx) if opts < 1: - self.fail("No valid options recognized. All bwa options begin with one dash (e.g. -M). See the bwa documentation for a list of available options: https://bio-bwa.sourceforge.net/bwa.shtml.", param, ctx) + self.fail("No valid options recognized. Available bwa options begin with one dash (e.g. -M). See the bwa documentation for a list of available options: https://bio-bwa.sourceforge.net/bwa.shtml.", param, ctx) return value class EmaParams(click.ParamType): @@ -53,7 +53,7 @@ def convert(self, value, param, ctx): if i not in valid_options: self.fail(f"{i} is not a valid ema option. See the fastp documentation for a list of available options: https://github.com/arshajii/ema.", param, ctx) if opts < 1: - self.fail("No valid options recognized. All ema options begin with one dash (e.g. -i). See the ema documentation for a list of available options: https://github.com/arshajii/ema.", param, ctx) + self.fail("No valid options recognized. Available ema options begin with one dash (e.g. -i). See the ema documentation for a list of available options: https://github.com/arshajii/ema.", param, ctx) return value class StrobeAlignParams(click.ParamType): @@ -71,7 +71,7 @@ def convert(self, value, param, ctx): if i not in valid_options: self.fail(f"{i} is not a valid strobealign option. See the strobealign documentation for a list of available options: https://github.com/ksahlin/strobealign.", param, ctx) if opts < 1: - self.fail("No valid options recognized. All strobealign options begin with one or two dashes (e.g. --eqx or -L). See the strobealign documentation for a list of available options: https://github.com/ksahlin/strobealign.", param, ctx) + self.fail("No valid options recognized. Available strobealign options begin with one or two dashes (e.g. --eqx or -L). See the strobealign documentation for a list of available options: https://github.com/ksahlin/strobealign.", param, ctx) return value class SpadesParams(click.ParamType): @@ -91,25 +91,28 @@ def convert(self, value, param, ctx): if i not in valid_options: self.fail(f"{i} is not a valid spades option. See the spades documentation for a list of available options: http://ablab.github.io/spades/running.html.", param, ctx) if opts < 1: - self.fail("No valid options recognized. All spades options begin with one or two dashes (e.g. --cov-cutoff). See the spades documentation for a list of available options: http://ablab.github.io/spades/running.html.", param, ctx) + self.fail("No valid options recognized. Available spades options begin with two dashes (e.g. --cov-cutoff). See the spades documentation for a list of available options: http://ablab.github.io/spades/running.html.", param, ctx) return value -class XXXParams(click.ParamType): - """A class for a click type that validates XXX extra-params.""" - name = "XXX_params" +class ArcsParams(click.ParamType): + """A class for a click type that validates ARCS extra-params.""" + name = "arcs_params" def convert(self, value, param, ctx): - harpy_options = "".split() - valid_options = "".split() + harpy_options = "draft reads t mapq nm dist minsize span c z s l base_name".split() + valid_options = "G cut longmap window as trim ac u multfile g graph gap tsv barcodecounts m index_multiplicity d max_degree e end_length r error_percent k k_value j j_index B bin_sizeN D dist_est no_dist_est dist_median dist_upper dist_tsv samples_tsv P pair f d k o e a b r p x".split() opts = 0 for i in value.split(): if i.startswith("-"): + self.fail(f"{i} begins with a dash, which would be interpreted as an argument to arcs-make rather than arcs. To avoid unexpected errors, arguments to arcs-make are disallowed. If this was inteded to be an argument to arcs, try using " + i.lstrip("-") + "=VAL instead", param, ctx) + if "=" in i: opts += 1 - if i in harpy_options: - self.fail(f"{i} is already used by Harpy when calling XXX.", param, ctx) - if i not in valid_options: - self.fail(f"{i} is not a valid XXX option. See the XXX documentation for a list of available options: XXXX.", param, ctx) + arg = i.split("=")[0].strip() + if arg in harpy_options: + self.fail(f"{arg} is already used by Harpy when calling arcs.", param, ctx) + if arg not in valid_options: + self.fail(f"{arg} is not a valid arcs option. See the documentation for a list of available options.\nTigmint: https://github.com/bcgsc/tigmint\nARCS: https://github.com/bcgsc/arcs\nLINKS: https://github.com/bcgsc/links", param, ctx) if opts < 1: - self.fail("No valid options recognized. All XXX options begin with one or two dashes (e.g. --eqx or -L). See the XXX documentation for a list of available options: XXXX.", param, ctx) + self.fail("No valid options recognized. All arcs options begin with one or two dashes (e.g. --eqx or -L). See the documentation for a list of available options.\nTigmint: https://github.com/bcgsc/tigmint\nARCS: https://github.com/bcgsc/arcs\nLINKS: https://github.com/bcgsc/links", param, ctx) return value class XXXParams(click.ParamType): diff --git a/harpy/assembly.py b/harpy/assembly.py index f9a2ced95..3a678221c 100644 --- a/harpy/assembly.py +++ b/harpy/assembly.py @@ -7,7 +7,7 @@ from rich.table import Table import rich_click as click from ._cli_types_generic import KParam, HPCProfile, SnakemakeParams -from ._cli_types_params import SpadesParams +from ._cli_types_params import SpadesParams, ArcsParams from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, snakemake_log @@ -37,7 +37,7 @@ @click.option('-r', '--max-memory', type = click.IntRange(min = 1000, max_open = True), show_default = True, default = 10000, help = 'Maximum memory for spades to use, in megabytes') @click.option('-x', '--extra-params', type = SpadesParams(), help = 'Additional spades parameters, in quotes') # TIGMINT/ARCS/LINKS -@click.option('-y', '--arcs-extra', type = str, help = 'Additional ARCS parameters, in quotes') +@click.option('-y', '--arcs-extra', type = ArcsParams(), help = 'Additional ARCS parameters, in quotes (`option=arg` format)') @click.option("-c","--contig-length", type = int, default = 500, show_default = True, help = "Minimum contig length") @click.option("-n", "--links", type = int, default = 5, show_default = True, help = "Minimum number of links to compute scaffold") @click.option("-a", "--min-aligned", type = int, default = 5, show_default = True, help = "Minimum aligned read pairs per barcode") From 996e24bb0907da02e63bb0df6396fcaf65b62dc8 Mon Sep 17 00:00:00 2001 From: pdimens Date: Fri, 1 Nov 2024 16:14:30 -0400 Subject: [PATCH 15/27] simplify the cli and with a dict --- harpy/_cli_types_generic.py | 16 ++++++++++------ harpy/align.py | 6 +++--- harpy/impute.py | 5 +++-- harpy/phase.py | 4 ++-- harpy/simulate.py | 32 ++++++++++++++++---------------- harpy/snp.py | 4 ++-- harpy/sv.py | 4 ++-- 7 files changed, 38 insertions(+), 33 deletions(-) diff --git a/harpy/_cli_types_generic.py b/harpy/_cli_types_generic.py index def96a33a..441890699 100644 --- a/harpy/_cli_types_generic.py +++ b/harpy/_cli_types_generic.py @@ -54,11 +54,16 @@ def convert(self, value, param, ctx): class InputFile(click.ParamType): """A class for a click type that verifies that a file exists and that it has an expected extension""" - def __init__(self, extensions, gzip_ok): + def __init__(self, filetype, gzip_ok): super().__init__() - self.extensions = extensions + self.filetype = filetype self.gzip_ok = gzip_ok def convert(self, value, param, ctx): + filedict = { + "fasta": [".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], + "vcf": ["vcf", "bcf", "vcf.gz"], + "gff": [".gff",".gff3"] + } if not os.path.exists(value): self.fail(f"{value} does not exist. Please check the spelling and try again.", param, ctx) elif not os.access(value, os.R_OK): @@ -67,15 +72,14 @@ def convert(self, value, param, ctx): self.fail(f"{value} is a directory, but input should be a file.", param, ctx) valid = False lowercase = value.lower() - extensions = [self.extensions] if not isinstance(self.extensions, list) else self.extensions - for ext in extensions: + for ext in filedict[self.filetype]: valid = True if lowercase.endswith(ext) else valid if self.gzip_ok: valid = True if lowercase.endswith(ext + ".gz") else valid if not valid and not self.gzip_ok: - self.fail(f"{value} does not end with one of the expected extensions [" + ", ".join(extensions) + "]. Please verify this is the correct file type and rename the extension for compatibility.", param, ctx) + self.fail(f"{value} does not end with one of the expected extensions [" + ", ".join(filedict[self.filetype]) + "]. Please verify this is the correct file type and rename the extension for compatibility.", param, ctx) if not valid and self.gzip_ok: - self.fail(f"{value} does not end with one of the expected extensions [" + ", ".join(extensions) + "]. Please verify this is the correct file type and rename the extension for compatibility. Gzip compression (ending in .gz) is allowed.", param, ctx) + self.fail(f"{value} does not end with one of the expected extensions [" + ", ".join(filedict[self.filetype]) + "]. Please verify this is the correct file type and rename the extension for compatibility. Gzip compression (ending in .gz) is allowed.", param, ctx) return value class SnakemakeParams(click.ParamType): diff --git a/harpy/align.py b/harpy/align.py index 2b63152bc..498b6c79f 100644 --- a/harpy/align.py +++ b/harpy/align.py @@ -65,7 +65,7 @@ def align(): } @click.command(no_args_is_help = True, epilog= "Documentation: https://pdimens.github.io/harpy/workflows/align/bwa/") -@click.option('-g', '--genome', type=InputFile([".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), required = True, help = 'Genome assembly for read mapping') +@click.option('-g', '--genome', type=InputFile("fasta", gzip_ok = True), required = True, help = 'Genome assembly for read mapping') @click.option('-w', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats') @click.option('-x', '--extra-params', type = BwaParams(), help = 'Additional bwa mem parameters, in quotes') @click.option('-u', '--keep-unmapped', is_flag = True, default = False, help = 'Retain unmapped sequences in the output') @@ -153,7 +153,7 @@ def bwa(inputs, output_dir, genome, depth_window, threads, keep_unmapped, extra_ @click.option('-d', '--fragment-density', is_flag = True, show_default = True, default = False, help = 'Perform read fragment density optimization') @click.option('-w', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats') @click.option('-b', '--ema-bins', default = 500, show_default = True, type = click.IntRange(1,1000), help="Number of barcode bins") -@click.option('-g', '--genome', type=InputFile([".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), required = True, help = 'Genome assembly for read mapping') +@click.option('-g', '--genome', type=InputFile("fasta", gzip_ok = True), required = True, help = 'Genome assembly for read mapping') @click.option('-u', '--keep-unmapped', is_flag = True, default = False, help = 'Retain unmapped sequences in the output') @click.option('-q', '--min-quality', default = 30, show_default = True, type = click.IntRange(min = 0, max = 40), help = 'Minimum mapping quality to pass filtering') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Align/ema", show_default=True, help = 'Output directory name') @@ -255,7 +255,7 @@ def ema(inputs, output_dir, platform, barcode_list, fragment_density, genome, de launch_snakemake(command, "align_ema", start_text, output_dir, sm_log, quiet, "workflow/align.ema.summary") @click.command(no_args_is_help = True, epilog= "Documentation: https://pdimens.github.io/harpy/workflows/align/strobe/") -@click.option('-g', '--genome', type=InputFile([".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), required = True, help = 'Genome assembly for read mapping') +@click.option('-g', '--genome', type=InputFile("fasta", gzip_ok = True), required = True, help = 'Genome assembly for read mapping') @click.option('-w', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats') @click.option('-x', '--extra-params', type = StrobeAlignParams(), help = 'Additional aligner parameters, in quotes') @click.option('-u', '--keep-unmapped', is_flag = True, default = False, help = 'Retain unmapped sequences in the output') diff --git a/harpy/impute.py b/harpy/impute.py index 4dbb40532..751d1f2f8 100644 --- a/harpy/impute.py +++ b/harpy/impute.py @@ -8,6 +8,7 @@ from rich.table import Table import rich_click as click from ._cli_types_generic import HPCProfile, InputFile, SnakemakeParams +from ._cli_types_params import StitchParams from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, fetch_report, fetch_script, snakemake_log @@ -28,11 +29,11 @@ } @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/impute/") -@click.option('-x', '--extra-params', type = str, help = 'Additional STITCH parameters, in quotes') +@click.option('-x', '--extra-params', type = StitchParams(), help = 'Additional STITCH parameters, in quotes') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Impute", show_default=True, help = 'Output directory name') @click.option('-p', '--parameters', required = True, type=click.Path(exists=True, dir_okay=False, readable=True), help = 'STITCH parameter file (tab-delimited)') @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max_open = True), help = 'Number of threads to use') -@click.option('-v', '--vcf', required = True, type = InputFile(["vcf", "bcf", "vcf.gz"], gzip_ok = False), help = 'Path to BCF/VCF file') +@click.option('-v', '--vcf', required = True, type = InputFile("vcf", gzip_ok = False), help = 'Path to BCF/VCF file') @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') @click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') diff --git a/harpy/phase.py b/harpy/phase.py index 3c541dc87..e11729b82 100644 --- a/harpy/phase.py +++ b/harpy/phase.py @@ -28,13 +28,13 @@ @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/phase") @click.option('-x', '--extra-params', type = str, help = 'Additional HapCut2 parameters, in quotes') -@click.option('-g', '--genome', type=InputFile([".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), help = 'Path to genome assembly if wanting to also extract reads spanning indels') +@click.option('-g', '--genome', type=InputFile("fasta", gzip_ok = True), help = 'Path to genome assembly if wanting to also extract reads spanning indels') @click.option('-b', '--ignore-bx', is_flag = True, show_default = True, default = False, help = 'Ignore barcodes when phasing') @click.option('-d', '--molecule-distance', default = 100000, show_default = True, type = int, help = 'Distance cutoff to split molecules (bp)') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Phase", show_default=True, help = 'Output directory name') @click.option('-p', '--prune-threshold', default = 7, show_default = True, type = click.IntRange(0,100), help = 'PHRED-scale threshold (%) for pruning low-confidence SNPs (larger prunes more.)') @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 2, max_open = True), help = 'Number of threads to use') -@click.option('-v', '--vcf', required = True, type = InputFile(["vcf", "bcf", "vcf.gz"], gzip_ok = False), help = 'Path to BCF/VCF file') +@click.option('-v', '--vcf', required = True, type = InputFile("vcf", gzip_ok = False), help = 'Path to BCF/VCF file') @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--contigs', type = ContigList(), help = 'File or list of contigs to plot') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') diff --git a/harpy/simulate.py b/harpy/simulate.py index 9650e82b8..eb0ed959c 100644 --- a/harpy/simulate.py +++ b/harpy/simulate.py @@ -139,8 +139,8 @@ def simulate(): @click.option('--setup-only', is_flag = True, hidden = True, show_default = True, default = False, help = 'Setup the workflow and exit') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') -@click.argument('genome_hap1', required=True, type = InputFile([".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), nargs=1) -@click.argument('genome_hap2', required=True, type = InputFile([".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), nargs=1) +@click.argument('genome_hap1', required=True, type = InputFile("fasta", gzip_ok = True), nargs=1) +@click.argument('genome_hap2', required=True, type = InputFile("fasta", gzip_ok = True), nargs=1) def linkedreads(genome_hap1, genome_hap2, output_dir, outer_distance, mutation_rate, distance_sd, barcodes, read_pairs, molecule_length, partitions, molecules_per, threads, snakemake, quiet, hpc, conda, setup_only): """ Create linked reads from a genome @@ -208,17 +208,17 @@ def linkedreads(genome_hap1, genome_hap2, output_dir, outer_distance, mutation_r launch_snakemake(command, "simulate_linkedreads", start_text, output_dir, sm_log, quiet, "workflow/simulate.reads.summary") @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "This workflow can be quite technical, please read the docs for more information: https://pdimens.github.io/harpy/workflows/simulate/simulate-variants") -@click.option('-s', '--snp-vcf', type=InputFile(["vcf", "bcf", "vcf.gz"], gzip_ok = False), help = 'VCF file of known snps to simulate') -@click.option('-i', '--indel-vcf', type=InputFile(["vcf", "bcf", "vcf.gz"], gzip_ok = False), help = 'VCF file of known indels to simulate') +@click.option('-s', '--snp-vcf', type=InputFile("vcf", gzip_ok = False), help = 'VCF file of known snps to simulate') +@click.option('-i', '--indel-vcf', type=InputFile("vcf", gzip_ok = False), help = 'VCF file of known indels to simulate') @click.option('-n', '--snp-count', type = click.IntRange(min = 0), default=0, show_default=False, help = "Number of random snps to simluate") @click.option('-m', '--indel-count', type = click.IntRange(min = 0), default=0, show_default=False, help = "Number of random indels to simluate") @click.option('-r', '--titv-ratio', type = click.FloatRange(min=0), default= 0.5, show_choices=True, show_default=True, help = "Transition/Transversion ratio for snps") @click.option('-d', '--indel-ratio', type = click.FloatRange(min=0), default= 1, show_choices=True, show_default=True, help = "Insertion/Deletion ratio for indels") @click.option('-a', '--indel-size-alpha', type = click.FloatRange(min=0), hidden = True, default = 2.0, help = "Exponent Alpha for power-law-fitted size distribution") @click.option('-l', '--indel-size-constant', type = click.FloatRange(min=0), default = 0.5, hidden = True, help = "Exponent constant for power-law-fitted size distribution") -@click.option('-c', '--centromeres', type = InputFile([".gff",".gff3"], gzip_ok = True), help = "GFF3 file of centromeres to avoid") +@click.option('-c', '--centromeres', type = InputFile("gff", gzip_ok = True), help = "GFF3 file of centromeres to avoid") @click.option('-y', '--snp-gene-constraints', type = click.Choice(["noncoding", "coding", "2d", "4d"]), help = "How to constrain randomly simulated SNPs {`noncoding`,`coding`,`2d`,`4d`}") -@click.option('-g', '--genes', type = InputFile([".gff",".gff3"], gzip_ok = True), help = "GFF3 file of genes to avoid when simulating") +@click.option('-g', '--genes', type = InputFile("gff", gzip_ok = True), help = "GFF3 file of genes to avoid when simulating") @click.option('-z', '--heterozygosity', type = click.FloatRange(0,1), default = 0, show_default=True, help = 'heterozygosity to simulate diploid variants') @click.option('--only-vcf', is_flag = True, default = False, help = 'If setting heterozygosity, only create the vcf rather than the fasta files') @click.option('-e', '--exclude-chr', type = click.Path(exists=True, dir_okay=False, readable=True), help = "Text file of chromosomes to avoid") @@ -230,7 +230,7 @@ def linkedreads(genome_hap1, genome_hap2, output_dir, outer_distance, mutation_r @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--randomseed', type = click.IntRange(min = 1), help = "Random seed for simulation") @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') -@click.argument('genome', required=True, type=InputFile([".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), nargs=1) +@click.argument('genome', required=True, type=InputFile("fasta", gzip_ok = True), nargs=1) def snpindel(genome, snp_vcf, indel_vcf, only_vcf, output_dir, prefix, snp_count, indel_count, titv_ratio, indel_ratio, indel_size_alpha, indel_size_constant, centromeres, genes, snp_gene_constraints, heterozygosity, exclude_chr, randomseed, snakemake, quiet, hpc, conda, setup_only): """ Introduce snps and/or indels into a genome @@ -342,8 +342,8 @@ def snpindel(genome, snp_vcf, indel_vcf, only_vcf, output_dir, prefix, snp_count @click.option('-n', '--count', type = click.IntRange(min = 0), default=0, show_default=False, help = "Number of random inversions to simluate") @click.option('-m', '--min-size', type = click.IntRange(min = 1), default = 1000, show_default= True, help = "Minimum inversion size (bp)") @click.option('-x', '--max-size', type = click.IntRange(min = 1), default = 100000, show_default= True, help = "Maximum inversion size (bp)") -@click.option('-c', '--centromeres', type = InputFile([".gff",".gff3"], gzip_ok = True), help = "GFF3 file of centromeres to avoid") -@click.option('-g', '--genes', type = InputFile([".gff",".gff3"], gzip_ok = True), help = "GFF3 file of genes to avoid when simulating") +@click.option('-c', '--centromeres', type = InputFile("gff", gzip_ok = True), help = "GFF3 file of centromeres to avoid") +@click.option('-g', '--genes', type = InputFile("gff", gzip_ok = True), help = "GFF3 file of genes to avoid when simulating") @click.option('-z', '--heterozygosity', type = click.FloatRange(0,1), default = 0, show_default=True, help = 'heterozygosity to simulate diploid variants') @click.option('-e', '--exclude-chr', type = click.Path(exists=True, dir_okay=False, readable=True), help = "Text file of chromosomes to avoid") @click.option('-p', '--prefix', type = str, default= "sim", show_default=True, help = "Naming prefix for output files") @@ -355,7 +355,7 @@ def snpindel(genome, snp_vcf, indel_vcf, only_vcf, output_dir, prefix, snp_count @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--randomseed', type = click.IntRange(min = 1), help = "Random seed for simulation") @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') -@click.argument('genome', required=True, type=InputFile([".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), nargs=1) +@click.argument('genome', required=True, type=InputFile("fasta", gzip_ok = True), nargs=1) def inversion(genome, vcf, only_vcf, prefix, output_dir, count, min_size, max_size, centromeres, genes, heterozygosity, exclude_chr, randomseed, snakemake, quiet, hpc, conda, setup_only): """ Introduce inversions into a genome @@ -451,8 +451,8 @@ def inversion(genome, vcf, only_vcf, prefix, output_dir, count, min_size, max_si @click.option('-d', '--dup-ratio', type = click.FloatRange(min=0), default = 1, show_choices=True, show_default=True, help = "Ratio for the selected variant") @click.option('-l', '--gain-ratio', type = click.FloatRange(min=0), default=1, show_default=True, help = " Relative ratio of DNA gain over DNA loss") @click.option('-y', '--max-copy', type = click.IntRange(min = 1), default=10, show_default=True, help = "Maximum number of copies") -@click.option('-c', '--centromeres', type = InputFile([".gff",".gff3"], gzip_ok = True), help = "GFF3 file of centromeres to avoid") -@click.option('-g', '--genes', type = InputFile([".gff",".gff3"], gzip_ok = True), help = "GFF3 file of genes to avoid when simulating") +@click.option('-c', '--centromeres', type = InputFile("gff", gzip_ok = True), help = "GFF3 file of centromeres to avoid") +@click.option('-g', '--genes', type = InputFile("gff", gzip_ok = True), help = "GFF3 file of genes to avoid when simulating") @click.option('-z', '--heterozygosity', type = click.FloatRange(0,1), default = 0, show_default=True, help = 'heterozygosity to simulate diploid variants') @click.option('--only-vcf', is_flag = True, default = False, help = 'If setting heterozygosity, only create the vcf rather than the fasta files') @click.option('-e', '--exclude-chr', type = click.Path(exists=True, dir_okay=False, readable=True), help = "Text file of chromosomes to avoid") @@ -464,7 +464,7 @@ def inversion(genome, vcf, only_vcf, prefix, output_dir, count, min_size, max_si @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--randomseed', type = click.IntRange(min = 1), help = "Random seed for simulation") @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') -@click.argument('genome', required=True, type=InputFile([".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), nargs=1) +@click.argument('genome', required=True, type=InputFile("fasta", gzip_ok = True), nargs=1) def cnv(genome, output_dir, vcf, only_vcf, prefix, count, min_size, max_size, dup_ratio, max_copy, gain_ratio, centromeres, genes, heterozygosity, exclude_chr, randomseed, snakemake, quiet, hpc, conda, setup_only): """ Introduce copy number variants into a genome @@ -563,8 +563,8 @@ def cnv(genome, output_dir, vcf, only_vcf, prefix, count, min_size, max_size, du @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Please Documentation: https://pdimens.github.io/harpy/workflows/simulate/simulate-variants") @click.option('-v', '--vcf', type=click.Path(exists=True, dir_okay=False, readable=True), help = 'VCF file of known translocations to simulate') @click.option('-n', '--count', type = click.IntRange(min = 0), default=0, show_default=False, help = "Number of random translocations to simluate") -@click.option('-c', '--centromeres', type = InputFile([".gff",".gff3"], gzip_ok = True), help = "GFF3 file of centromeres to avoid") -@click.option('-g', '--genes', type = InputFile([".gff",".gff3"], gzip_ok = True), help = "GFF3 file of genes to avoid when simulating") +@click.option('-c', '--centromeres', type = InputFile("gff", gzip_ok = True), help = "GFF3 file of centromeres to avoid") +@click.option('-g', '--genes', type = InputFile("gff", gzip_ok = True), help = "GFF3 file of genes to avoid when simulating") @click.option('-z', '--heterozygosity', type = click.FloatRange(0,1), default = 0, show_default=True, help = 'heterozygosity to simulate diploid variants') @click.option('--only-vcf', is_flag = True, default = False, help = 'If setting heterozygosity, only create the vcf rather than the fasta files') @click.option('-e', '--exclude-chr', type = click.Path(exists=True, dir_okay=False, readable=True), help = "Text file of chromosomes to avoid") @@ -576,7 +576,7 @@ def cnv(genome, output_dir, vcf, only_vcf, prefix, count, min_size, max_size, du @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--randomseed', type = click.IntRange(min = 1), help = "Random seed for simulation") @click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') -@click.argument('genome', required=True, type=InputFile([".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), nargs=1) +@click.argument('genome', required=True, type=InputFile("fasta", gzip_ok = True), nargs=1) def translocation(genome, output_dir, prefix, vcf, only_vcf, count, centromeres, genes, heterozygosity, exclude_chr, randomseed, snakemake, quiet, hpc, conda, setup_only): """ Introduce translocations into a genome diff --git a/harpy/snp.py b/harpy/snp.py index a55c2ca22..10d07343a 100644 --- a/harpy/snp.py +++ b/harpy/snp.py @@ -49,7 +49,7 @@ def snp(): @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/snp") @click.option('-x', '--extra-params', type = str, help = 'Additional variant caller parameters, in quotes') -@click.option('-g', '--genome', type=InputFile([".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), required = True, help = 'Genome assembly for variant calling') +@click.option('-g', '--genome', type=InputFile("fasta", gzip_ok = True), required = True, help = 'Genome assembly for variant calling') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "SNP/mpileup", show_default=True, help = 'Output directory name') @click.option('-n', '--ploidy', default = 2, show_default = True, type=click.IntRange(min = 1, max = 2), help = 'Ploidy of samples') @click.option('-p', '--populations', type=click.Path(exists = True, dir_okay=False, readable=True), help = "Tab-delimited file of sample\population") @@ -154,7 +154,7 @@ def mpileup(inputs, output_dir, regions, genome, threads, populations, ploidy, e @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/snp") @click.option('-x', '--extra-params', type = str, help = 'Additional variant caller parameters, in quotes') -@click.option('-g', '--genome', type=InputFile([".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), required = True, help = 'Genome assembly for variant calling') +@click.option('-g', '--genome', type=InputFile("fasta", gzip_ok = True), required = True, help = 'Genome assembly for variant calling') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "SNP/freebayes", show_default=True, help = 'Output directory name') @click.option('-n', '--ploidy', default = 2, show_default = True, type=click.IntRange(min=1, max_open=True), help = 'Ploidy of samples') @click.option('-p', '--populations', type=click.Path(exists = True, dir_okay=False, readable=True), help = "Tab-delimited file of sample\population") diff --git a/harpy/sv.py b/harpy/sv.py index effc00545..69b408db5 100644 --- a/harpy/sv.py +++ b/harpy/sv.py @@ -53,7 +53,7 @@ def sv(): @click.command(no_args_is_help = True, epilog= "Documentation: https://pdimens.github.io/harpy/workflows/sv/leviathan/") @click.option('-x', '--extra-params', type = str, help = 'Additional leviathan parameters, in quotes') -@click.option('-g', '--genome', type=InputFile([".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), required = True, help = 'Genome assembly for variant calling') +@click.option('-g', '--genome', type=InputFile("fasta", gzip_ok = True), required = True, help = 'Genome assembly for variant calling') @click.option('-i', '--iterations', show_default = True, default=50, type = click.IntRange(min = 10, max_open = True), help = 'Number of iterations to perform through index (reduces memory)') @click.option('-s', '--min-sv', type = click.IntRange(min = 10, max_open = True), default = 1000, show_default=True, help = 'Minimum size of SV to detect') @click.option('-b', '--min-barcodes', show_default = True, default=2, type = click.IntRange(min = 1, max_open = True), help = 'Minimum number of barcode overlaps supporting candidate SV') @@ -142,7 +142,7 @@ def leviathan(inputs, output_dir, genome, min_sv, min_barcodes, iterations, thre @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/sv/naibr/") @click.option('-x', '--extra-params', type = str, help = 'Additional naibr parameters, in quotes') -@click.option('-g', '--genome', type=InputFile([".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], gzip_ok = True), required = True, help = 'Genome assembly for calling variants') +@click.option('-g', '--genome', type=InputFile("fasta", gzip_ok = True), required = True, help = 'Genome assembly for calling variants') @click.option('-b', '--min-barcodes', show_default = True, default=2, type = click.IntRange(min = 1, max_open = True), help = 'Minimum number of barcode overlaps supporting candidate SV') @click.option('-q', '--min-quality', show_default = True, default=30, type = click.IntRange(min = 0, max = 40), help = 'Minimum mapping quality of reads to use') @click.option('-s', '--min-sv', type = click.IntRange(min = 10, max_open = True), default = 1000, show_default=True, help = 'Minimum size of SV to detect') From 49f1712773519215e73426d1bba50e1635ee4e07 Mon Sep 17 00:00:00 2001 From: pdimens Date: Fri, 1 Nov 2024 16:14:40 -0400 Subject: [PATCH 16/27] add stitch param validation --- harpy/_cli_types_params.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/harpy/_cli_types_params.py b/harpy/_cli_types_params.py index cdb8dd956..dc69b4276 100644 --- a/harpy/_cli_types_params.py +++ b/harpy/_cli_types_params.py @@ -112,7 +112,28 @@ def convert(self, value, param, ctx): if arg not in valid_options: self.fail(f"{arg} is not a valid arcs option. See the documentation for a list of available options.\nTigmint: https://github.com/bcgsc/tigmint\nARCS: https://github.com/bcgsc/arcs\nLINKS: https://github.com/bcgsc/links", param, ctx) if opts < 1: - self.fail("No valid options recognized. All arcs options begin with one or two dashes (e.g. --eqx or -L). See the documentation for a list of available options.\nTigmint: https://github.com/bcgsc/tigmint\nARCS: https://github.com/bcgsc/arcs\nLINKS: https://github.com/bcgsc/links", param, ctx) + self.fail("No valid options recognized. All arcs options begin without dashes and must be in the form ARG=VAL (e.g. k=15). See the documentation for a list of available options.\nTigmint: https://github.com/bcgsc/tigmint\nARCS: https://github.com/bcgsc/arcs\nLINKS: https://github.com/bcgsc/links", param, ctx) + return value + +class StitchParams(click.ParamType): + """A class for a click type that validates stitch extra-params.""" + name = "stitch_params" + def convert(self, value, param, ctx): + harpy_options = "method posfile bamlist nCores nGen chr K S use_bx_tag bxTagUpperLimit outputdir output_filename tempdir".split() + valid_options = "nStarts sampleNames_file genfile B_bit_prob outputInputInVCFFormat downsampleToCov downsampleFraction readAware chrStart chrEnd regionStart regionEnd buffer maxDifferenceBetweenReads maxEmissionMatrixDifference alphaMatThreshold emissionThreshold iSizeUpperLimit bqFilter niterations shuffleHaplotypeIterations splitReadIterations expRate maxRate minRate Jmax regenerateInput originalRegionName keepInterimFiles keepTempDir outputHaplotypeProbabilities switchModelIteration generateInputOnly restartIterations refillIterations downsampleSamples downsampleSamplesKeepList subsetSNPsfile useSoftClippedBases outputBlockSize outputSNPBlockSize inputBundleBlockSize genetic_map_file reference_haplotype_file reference_legend_file reference_sample_file reference_populations reference_phred reference_iterations reference_shuffleHaplotypeIterations initial_min_hapProb initial_max_hapProb regenerateInputWithDefaultValues plotHapSumDuringIterations plot_shuffle_haplotype_attempts plotAfterImputation save_sampleReadsInfo gridWindowSize shuffle_bin_nSNPs shuffle_bin_radius keepSampleReadsInRAM useTempdirWhileWriting output_haplotype_dosages".split() + opts = 0 + for i in value.split(): + if i.startswith("-"): + self.fail(f"{i} begins with a dash, which is the wrong format. Try using " + i.lstrip("-") + "=VAL instead", param, ctx) + if "=" in i: + opts += 1 + arg = i.split("=")[0].strip() + if arg in harpy_options: + self.fail(f"{arg} is already used by Harpy when calling stitch.", param, ctx) + if arg not in valid_options: + self.fail(f"{arg} is not a valid stitch option. See the stitch documentation for a list of available options: https://github.com/rwdavies/STITCH/blob/master/Options.md", param, ctx) + if opts < 1: + self.fail("No valid options recognized. All stitch options begin without dashes and must be in the form ARG=VAL (e.g. downsampleFraction=0.5). See the stitch documentation for a list of available options: https://github.com/rwdavies/STITCH/blob/master/Options.md.", param, ctx) return value class XXXParams(click.ParamType): @@ -132,5 +153,4 @@ def convert(self, value, param, ctx): if opts < 1: self.fail("No valid options recognized. All XXX options begin with two dashes (e.g. --eqx or -L). See the XXX documentation for a list of available options: XXXX.", param, ctx) return value - - + From 910cdfaa049601e225f013a70ec804de5df12c2b Mon Sep 17 00:00:00 2001 From: pdimens Date: Fri, 1 Nov 2024 16:16:29 -0400 Subject: [PATCH 17/27] clarify text --- harpy/hpc.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/harpy/hpc.py b/harpy/hpc.py index 68293a000..b9c5d00bd 100644 --- a/harpy/hpc.py +++ b/harpy/hpc.py @@ -13,8 +13,9 @@ def hpc(): If running Harpy on an HPC system, you can leverage Snakemake to handle all the job submissions on your behalf. This command creates templates for common HPC schedulers so you can run Harpy on a cluster with minimal friction. - The subcommands create a `config.yaml` in an `hpc/` directory. You will also need - to install the associated Snakemake executor plugin for HPC job submission to work. + The subcommands create a `config.yaml` in an `hpc/system` directory (where `system` + is one of the options below). You will also need to install the associated Snakemake + executor plugin for HPC job submission to work. """ docstring = { From 055f618be8368fa8ac8ea09356b79287b5fd598a Mon Sep 17 00:00:00 2001 From: pdimens Date: Fri, 1 Nov 2024 16:24:03 -0400 Subject: [PATCH 18/27] add hapcut extra validatio --- harpy/_cli_types_params.py | 19 +++++++++++++++++++ harpy/phase.py | 3 ++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/harpy/_cli_types_params.py b/harpy/_cli_types_params.py index dc69b4276..3381f9222 100644 --- a/harpy/_cli_types_params.py +++ b/harpy/_cli_types_params.py @@ -136,6 +136,24 @@ def convert(self, value, param, ctx): self.fail("No valid options recognized. All stitch options begin without dashes and must be in the form ARG=VAL (e.g. downsampleFraction=0.5). See the stitch documentation for a list of available options: https://github.com/rwdavies/STITCH/blob/master/Options.md.", param, ctx) return value +class HapCutParams(click.ParamType): + """A class for a click type that validates hapcut2 extra-params.""" + name = "hapcut2_params" + def convert(self, value, param, ctx): + harpy_options = "--fragments --vcf --out --nf --error_analysis_mode --call_homozygous --outvcf --threshold --no_prune".split() + valid_options = "--skip_prune --sp --discrete_pruning --dp --max_iter --mi --maxcut_iter --mc".split() + opts = 0 + for i in value.split(): + if i.startswith("-"): + opts += 1 + if i in harpy_options: + self.fail(f"{i} is already used by Harpy when calling hapcut2.", param, ctx) + if i not in valid_options: + self.fail(f"{i} is not a valid hapcut2 option. See the hapcut2 documentation for a list of available options: https://github.com/vibansal/HapCUT2.", param, ctx) + if opts < 1: + self.fail("No valid options recognized. All hapcut2 options begin with two dashes (e.g. --dp). See the hapcut2 documentation for a list of available options: https://github.com/vibansal/HapCUT2.", param, ctx) + return value + class XXXParams(click.ParamType): """A class for a click type that validates XXX extra-params.""" name = "XXX_params" @@ -154,3 +172,4 @@ def convert(self, value, param, ctx): self.fail("No valid options recognized. All XXX options begin with two dashes (e.g. --eqx or -L). See the XXX documentation for a list of available options: XXXX.", param, ctx) return value + diff --git a/harpy/phase.py b/harpy/phase.py index e11729b82..240f93e7f 100644 --- a/harpy/phase.py +++ b/harpy/phase.py @@ -10,6 +10,7 @@ from ._launch import launch_snakemake from ._misc import fetch_rule, fetch_report, snakemake_log from ._cli_types_generic import ContigList, HPCProfile, InputFile, SnakemakeParams +from ._cli_types_params import HapCutParams from ._parsers import parse_alignment_inputs from ._validations import check_fasta, vcf_sample_match, validate_bam_RG, vcf_contig_match @@ -27,7 +28,7 @@ } @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/phase") -@click.option('-x', '--extra-params', type = str, help = 'Additional HapCut2 parameters, in quotes') +@click.option('-x', '--extra-params', type = HapCutParams(), help = 'Additional HapCut2 parameters, in quotes') @click.option('-g', '--genome', type=InputFile("fasta", gzip_ok = True), help = 'Path to genome assembly if wanting to also extract reads spanning indels') @click.option('-b', '--ignore-bx', is_flag = True, show_default = True, default = False, help = 'Ignore barcodes when phasing') @click.option('-d', '--molecule-distance', default = 100000, show_default = True, type = int, help = 'Distance cutoff to split molecules (bp)') From 313991c14843bd6c6dc210b162688d4702e65751 Mon Sep 17 00:00:00 2001 From: pdimens Date: Fri, 1 Nov 2024 16:58:56 -0400 Subject: [PATCH 19/27] stage snp params validations --- harpy/_cli_types_params.py | 77 +++++++++++++++++++++++++++++++++----- harpy/snp.py | 1 + harpy/sv.py | 1 + 3 files changed, 70 insertions(+), 9 deletions(-) diff --git a/harpy/_cli_types_params.py b/harpy/_cli_types_params.py index 3381f9222..60e40b21c 100644 --- a/harpy/_cli_types_params.py +++ b/harpy/_cli_types_params.py @@ -112,7 +112,7 @@ def convert(self, value, param, ctx): if arg not in valid_options: self.fail(f"{arg} is not a valid arcs option. See the documentation for a list of available options.\nTigmint: https://github.com/bcgsc/tigmint\nARCS: https://github.com/bcgsc/arcs\nLINKS: https://github.com/bcgsc/links", param, ctx) if opts < 1: - self.fail("No valid options recognized. All arcs options begin without dashes and must be in the form ARG=VAL (e.g. k=15). See the documentation for a list of available options.\nTigmint: https://github.com/bcgsc/tigmint\nARCS: https://github.com/bcgsc/arcs\nLINKS: https://github.com/bcgsc/links", param, ctx) + self.fail("No valid options recognized. Available arcs options begin without dashes and must be in the form ARG=VAL, without spaces (e.g. k=15). See the documentation for a list of available options.\nTigmint: https://github.com/bcgsc/tigmint\nARCS: https://github.com/bcgsc/arcs\nLINKS: https://github.com/bcgsc/links", param, ctx) return value class StitchParams(click.ParamType): @@ -133,7 +133,7 @@ def convert(self, value, param, ctx): if arg not in valid_options: self.fail(f"{arg} is not a valid stitch option. See the stitch documentation for a list of available options: https://github.com/rwdavies/STITCH/blob/master/Options.md", param, ctx) if opts < 1: - self.fail("No valid options recognized. All stitch options begin without dashes and must be in the form ARG=VAL (e.g. downsampleFraction=0.5). See the stitch documentation for a list of available options: https://github.com/rwdavies/STITCH/blob/master/Options.md.", param, ctx) + self.fail("No valid options recognized. Available stitch options begin without dashes and must be in the form ARG=VAL (e.g. downsampleFraction=0.5). See the stitch documentation for a list of available options: https://github.com/rwdavies/STITCH/blob/master/Options.md.", param, ctx) return value class HapCutParams(click.ParamType): @@ -151,12 +151,51 @@ def convert(self, value, param, ctx): if i not in valid_options: self.fail(f"{i} is not a valid hapcut2 option. See the hapcut2 documentation for a list of available options: https://github.com/vibansal/HapCUT2.", param, ctx) if opts < 1: - self.fail("No valid options recognized. All hapcut2 options begin with two dashes (e.g. --dp). See the hapcut2 documentation for a list of available options: https://github.com/vibansal/HapCUT2.", param, ctx) + self.fail("No valid options recognized. Available hapcut2 options begin with two dashes (e.g. --dp). See the hapcut2 documentation for a list of available options: https://github.com/vibansal/HapCUT2.", param, ctx) + return value + +class LeviathanParams(click.ParamType): + """A class for a click type that validates leviathan extra-params.""" + name = "leviathan_params" + def convert(self, value, param, ctx): + harpy_options = "-b -i -g -o -v --minVariantSize -c --minBarcodes -B --nbBins -t --threads -C --candidates".split() + valid_options = "-r --regionSize -n --maxLinks -M --mediumSize -L --largeSize -s --smallRate -m --mediumRate -l --largeRate -d --duplicates -s --skipTranslocations -p --poolSize".split() + opts = 0 + for i in value.split(): + if i.startswith("-"): + opts += 1 + if i in harpy_options: + self.fail(f"{i} is already used by Harpy when calling leviathan.", param, ctx) + if i not in valid_options: + self.fail(f"{i} is not a valid leviathan option. See the leviathan documentation for a list of available options: https://github.com/morispi/LEVIATHAN.", param, ctx) + if opts < 1: + self.fail("No valid options recognized. Available leviathan options begin with one or two dashes (e.g. --mediumRate or -m). See the leviathan documentation for a list of available options: https://github.com/morispi/LEVIATHAN.", param, ctx) + return value + +class NaibrParams(click.ParamType): + """A class for a click type that validates naibr extra-params.""" + name = "naibr_params" + def convert(self, value, param, ctx): + harpy_options = "bam_file prefix outdir threads min_mapq d min_sv k".split() + valid_options = "blacklist candidates".split() + opts = 0 + for idx,i in enumerate(value.split()): + if i.startswith("-"): + self.fail(f"{i} begins with a dash, which is the wrong format. Try using " + i.lstrip("-") + " VAL instead", param, ctx) + # if it's an odd index, it's the first (arg) of an arg val pair + if idx % 2 != 0: + opts += 1 + if i in harpy_options: + self.fail(f"{i} is already used by Harpy when calling naibr.", param, ctx) + if i not in valid_options: + self.fail(f"{i} is not a valid naibr option. See the naibr documentation for a list of available options: https://github.com/pontushojer/NAIBR.", param, ctx) + if opts < 1: + self.fail("No valid options recognized. Available naibr options begin without dashes in the form of ARG VAL (e.g. blacklist inversions.ignore). See the naibr documentation for a list of available options: https://github.com/pontushojer/NAIBR.", param, ctx) return value -class XXXParams(click.ParamType): - """A class for a click type that validates XXX extra-params.""" - name = "XXX_params" +class MpileupParams(click.ParamType): + """A class for a click type that validates mpileup extra-params.""" + name = "mpileup_params" def convert(self, value, param, ctx): harpy_options = "".split() valid_options = "".split() @@ -165,11 +204,31 @@ def convert(self, value, param, ctx): if i.startswith("-"): opts += 1 if i in harpy_options: - self.fail(f"{i} is already used by Harpy when calling XXX.", param, ctx) + self.fail(f"{i} is already used by Harpy when calling mpileup.", param, ctx) if i not in valid_options: - self.fail(f"{i} is not a valid XXX option. See the XXX documentation for a list of available options: XXXX.", param, ctx) + self.fail(f"{i} is not a valid mpileup option. See the mpileup documentation for a list of available options: XXXX.", param, ctx) if opts < 1: - self.fail("No valid options recognized. All XXX options begin with two dashes (e.g. --eqx or -L). See the XXX documentation for a list of available options: XXXX.", param, ctx) + self.fail("No valid options recognized. Available mpileup options begin with two dashes (e.g. --eqx or -L). See the mpileup documentation for a list of available options: XXXX.", param, ctx) return value +class FreebayesParams(click.ParamType): + """A class for a click type that validates freebayes extra-params.""" + name = "freebayes_params" + def convert(self, value, param, ctx): + harpy_options = "".split() + valid_options = "".split() + opts = 0 + for i in value.split(): + if i.startswith("-"): + opts += 1 + if i in harpy_options: + self.fail(f"{i} is already used by Harpy when calling freebayes.", param, ctx) + if i not in valid_options: + self.fail(f"{i} is not a valid freebayes option. See the freebayes documentation for a list of available options: XXXX.", param, ctx) + if opts < 1: + self.fail("No valid options recognized. Available freebayes options begin with two dashes (e.g. --eqx or -L). See the freebayes documentation for a list of available options: XXXX.", param, ctx) + return value + + + diff --git a/harpy/snp.py b/harpy/snp.py index 10d07343a..ca9c1964a 100644 --- a/harpy/snp.py +++ b/harpy/snp.py @@ -8,6 +8,7 @@ from rich.table import Table import rich_click as click from ._cli_types_generic import HPCProfile, InputFile, SnakemakeParams +from ._cli_types_params import MpileupParams, FreebayesParams from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, fetch_report, snakemake_log diff --git a/harpy/sv.py b/harpy/sv.py index 69b408db5..e8c25f809 100644 --- a/harpy/sv.py +++ b/harpy/sv.py @@ -8,6 +8,7 @@ from rich.table import Table import rich_click as click from ._cli_types_generic import ContigList, HPCProfile, InputFile, SnakemakeParams +from ._cli_types_params import LeviathanParams, NaibrParams from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, fetch_report, snakemake_log From ad035c7ce39e0d4453d540ed2bb3a8397b042cad Mon Sep 17 00:00:00 2001 From: pdimens Date: Fri, 1 Nov 2024 16:59:50 -0400 Subject: [PATCH 20/27] add urls --- harpy/_cli_types_params.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/harpy/_cli_types_params.py b/harpy/_cli_types_params.py index 60e40b21c..9b3be454f 100644 --- a/harpy/_cli_types_params.py +++ b/harpy/_cli_types_params.py @@ -206,9 +206,9 @@ def convert(self, value, param, ctx): if i in harpy_options: self.fail(f"{i} is already used by Harpy when calling mpileup.", param, ctx) if i not in valid_options: - self.fail(f"{i} is not a valid mpileup option. See the mpileup documentation for a list of available options: XXXX.", param, ctx) + self.fail(f"{i} is not a valid mpileup option. See the mpileup documentation for a list of available options: https://samtools.github.io/bcftools/bcftools.html#mpileup.", param, ctx) if opts < 1: - self.fail("No valid options recognized. Available mpileup options begin with two dashes (e.g. --eqx or -L). See the mpileup documentation for a list of available options: XXXX.", param, ctx) + self.fail("No valid options recognized. Available mpileup options begin with two dashes (e.g. --eqx or -L). See the mpileup documentation for a list of available options: https://samtools.github.io/bcftools/bcftools.html#mpileup.", param, ctx) return value class FreebayesParams(click.ParamType): @@ -224,9 +224,9 @@ def convert(self, value, param, ctx): if i in harpy_options: self.fail(f"{i} is already used by Harpy when calling freebayes.", param, ctx) if i not in valid_options: - self.fail(f"{i} is not a valid freebayes option. See the freebayes documentation for a list of available options: XXXX.", param, ctx) + self.fail(f"{i} is not a valid freebayes option. See the freebayes documentation for a list of available options: https://github.com/freebayes/freebayes.", param, ctx) if opts < 1: - self.fail("No valid options recognized. Available freebayes options begin with two dashes (e.g. --eqx or -L). See the freebayes documentation for a list of available options: XXXX.", param, ctx) + self.fail("No valid options recognized. Available freebayes options begin with two dashes (e.g. --eqx or -L). See the freebayes documentation for a list of available options: https://github.com/freebayes/freebayes.", param, ctx) return value From 07500c07bce462f0caec35730a45e49fc527a19e Mon Sep 17 00:00:00 2001 From: pdimens Date: Mon, 4 Nov 2024 10:43:23 -0500 Subject: [PATCH 21/27] add snp params --- harpy/_cli_types_params.py | 17 +++++++++-------- harpy/snp.py | 4 ++-- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/harpy/_cli_types_params.py b/harpy/_cli_types_params.py index 9b3be454f..e5b7f8175 100644 --- a/harpy/_cli_types_params.py +++ b/harpy/_cli_types_params.py @@ -169,7 +169,7 @@ def convert(self, value, param, ctx): if i not in valid_options: self.fail(f"{i} is not a valid leviathan option. See the leviathan documentation for a list of available options: https://github.com/morispi/LEVIATHAN.", param, ctx) if opts < 1: - self.fail("No valid options recognized. Available leviathan options begin with one or two dashes (e.g. --mediumRate or -m). See the leviathan documentation for a list of available options: https://github.com/morispi/LEVIATHAN.", param, ctx) + self.fail("No valid options recognized. Available leviathan options begin with one or two dashes (e.g. -m or -mediumRate). See the leviathan documentation for a list of available options: https://github.com/morispi/LEVIATHAN.", param, ctx) return value class NaibrParams(click.ParamType): @@ -190,15 +190,15 @@ def convert(self, value, param, ctx): if i not in valid_options: self.fail(f"{i} is not a valid naibr option. See the naibr documentation for a list of available options: https://github.com/pontushojer/NAIBR.", param, ctx) if opts < 1: - self.fail("No valid options recognized. Available naibr options begin without dashes in the form of ARG VAL (e.g. blacklist inversions.ignore). See the naibr documentation for a list of available options: https://github.com/pontushojer/NAIBR.", param, ctx) + self.fail("No valid options recognized. Available naibr options begin without dashes in the form of ARGVAL (e.g. blacklist inversions.ignore). See the naibr documentation for a list of available options: https://github.com/pontushojer/NAIBR.", param, ctx) return value class MpileupParams(click.ParamType): """A class for a click type that validates mpileup extra-params.""" name = "mpileup_params" def convert(self, value, param, ctx): - harpy_options = "".split() - valid_options = "".split() + harpy_options = "--fasta-ref -f --bam-list -b --annotate -a --output-type -O -r --regions".split() + valid_options = "-6 --illumina1.3+ -A --count-orphans -B --no-BAQ -C --adjust-MQ -D --full-BAQ -d --max-depth -E --redo-BAQ -G --read-groups -q --min-MQ -Q --min-BQ --max-BQ INT --delta-BQ INT --ignore-RG --ls --skip-all-set --ns --skip-any-set --lu --skip-all-unset --nu --skip-any-unset -s --samples -S --samples-file -t --targets -T --targets-file -x --ignore-overlaps -g --gvcf -o -X --config -e --ext-prob -F --gap-frac -h --tandem-qual -I --skip-indels -L --max-idepth -m --min-ireads -M --max-read-len -o --open-prob -p --per-sample-mF -P --platforms --ar --ambig-reads --indel-bias --del-bias --score-vs-ref --indel-size --indels-2.0 --indels-cns --seqq-offset --no-indels-cns --poly-mqual".split() opts = 0 for i in value.split(): if i.startswith("-"): @@ -208,15 +208,15 @@ def convert(self, value, param, ctx): if i not in valid_options: self.fail(f"{i} is not a valid mpileup option. See the mpileup documentation for a list of available options: https://samtools.github.io/bcftools/bcftools.html#mpileup.", param, ctx) if opts < 1: - self.fail("No valid options recognized. Available mpileup options begin with two dashes (e.g. --eqx or -L). See the mpileup documentation for a list of available options: https://samtools.github.io/bcftools/bcftools.html#mpileup.", param, ctx) + self.fail("No valid options recognized. Available mpileup options begin one or with two dashes (e.g. -d or --max-depth). See the mpileup documentation for a list of available options: https://samtools.github.io/bcftools/bcftools.html#mpileup.", param, ctx) return value class FreebayesParams(click.ParamType): """A class for a click type that validates freebayes extra-params.""" name = "freebayes_params" def convert(self, value, param, ctx): - harpy_options = "".split() - valid_options = "".split() + harpy_options = "-r --region -p --ploidy -f --fasta-reference -L --bam-list --populations".split() + valid_options = "-t --targets -s --samples -A --cnv-map -v --vcf --gvcf --gvcf-chunkUM -& --gvcf-dont-use-chunk -@ --variant-input -l --only-use-input-alleles --haplotype-basis-alleles --report-all-haplotype-alleles --report-monomorphic -P --pvar --strict-vcf -T --theta -J --pooled-discrete -K --pooled-continuous -Z --use-reference-allele --reference-quality -n --use-best-n-alleles -E --max-complex-gap --haplotype-length --min-repeat-size --min-repeat-entropy --no-partial-observations -O --dont-left-align-indels -4 --use-duplicate-reads -m --min-mapping-quality -q --min-base-quality -R --min-supporting-allele-qsum -Y --min-supporting-mapping-qsum -Q --mismatch-base-quality-threshold -U --read-mismatch-limit -z --read-max-mismatch-fraction -$ --read-snp-limit -e --read-indel-limit -0 --standard-filters -F --min-alternate-fraction -C --min-alternate-count -3 --min-alternate-qsum -G --min-alternate-total --min-coverage --limit-coverage -g --skip-coverage --trim-complex-tail -k --no-population-priors -w --hwe-priors-off -V --binomial-obs-priors-off -a --allele-balance-priors-off --observation-bias --base-quality-cap --prob-contamination --legacy-gls --contamination-estimates --report-genotype-likelihood-max -B --genotyping-max-iterations --genotyping-max-banddepth -W --posterior-integration-limits,M -N --exclude-unobserved-genotypes -S --genotype-variant-threshold -j --use-mapping-quality -H --harmonic-indel-quality -D --read-dependence-factor -= --genotype-qualities -d --debug".split() opts = 0 for i in value.split(): if i.startswith("-"): @@ -226,9 +226,10 @@ def convert(self, value, param, ctx): if i not in valid_options: self.fail(f"{i} is not a valid freebayes option. See the freebayes documentation for a list of available options: https://github.com/freebayes/freebayes.", param, ctx) if opts < 1: - self.fail("No valid options recognized. Available freebayes options begin with two dashes (e.g. --eqx or -L). See the freebayes documentation for a list of available options: https://github.com/freebayes/freebayes.", param, ctx) + self.fail("No valid options recognized. Available freebayes options begin with one or two dashes (e.g. -t or --targets). See the freebayes documentation for a list of available options: https://github.com/freebayes/freebayes.", param, ctx) return value + diff --git a/harpy/snp.py b/harpy/snp.py index ca9c1964a..c7830b786 100644 --- a/harpy/snp.py +++ b/harpy/snp.py @@ -49,7 +49,7 @@ def snp(): } @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/snp") -@click.option('-x', '--extra-params', type = str, help = 'Additional variant caller parameters, in quotes') +@click.option('-x', '--extra-params', type = MpileupParams(), help = 'Additional mpileup parameters, in quotes') @click.option('-g', '--genome', type=InputFile("fasta", gzip_ok = True), required = True, help = 'Genome assembly for variant calling') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "SNP/mpileup", show_default=True, help = 'Output directory name') @click.option('-n', '--ploidy', default = 2, show_default = True, type=click.IntRange(min = 1, max = 2), help = 'Ploidy of samples') @@ -154,7 +154,7 @@ def mpileup(inputs, output_dir, regions, genome, threads, populations, ploidy, e launch_snakemake(command, "snp_mpileup", start_text, output_dir, sm_log, quiet, "workflow/snp.mpileup.summary") @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/snp") -@click.option('-x', '--extra-params', type = str, help = 'Additional variant caller parameters, in quotes') +@click.option('-x', '--extra-params', type = FreebayesParams(), help = 'Additional freebayes parameters, in quotes') @click.option('-g', '--genome', type=InputFile("fasta", gzip_ok = True), required = True, help = 'Genome assembly for variant calling') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "SNP/freebayes", show_default=True, help = 'Output directory name') @click.option('-n', '--ploidy', default = 2, show_default = True, type=click.IntRange(min=1, max_open=True), help = 'Ploidy of samples') From 1ff9d39b073bda79eb491bcc2bd1e04f09f0275f Mon Sep 17 00:00:00 2001 From: pdimens Date: Mon, 4 Nov 2024 14:25:55 -0500 Subject: [PATCH 22/27] fixes --- .github/workflows/tests.yml | 4 ++-- harpy/_cli_types_generic.py | 2 ++ harpy/_cli_types_params.py | 8 ++++---- harpy/align.py | 2 +- harpy/simulate.py | 4 ++-- 5 files changed, 11 insertions(+), 9 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 80ee4bed3..82935dbcf 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -229,7 +229,7 @@ jobs: path: .snakemake/singularity - name: harpy qc shell: micromamba-shell {0} - run: harpy qc -x "--trim_poly_g" --quiet test/fastq + run: harpy qc -x "--low_complexity_filter" --quiet test/fastq - name: harpy qc all options shell: micromamba-shell {0} run: harpy qc -a -d -c 21,40,3,0 --quiet test/fastq @@ -344,7 +344,7 @@ jobs: path: .snakemake/singularity - name: test ema shell: micromamba-shell {0} - run: harpy align ema --quiet --ema-bins 150 -g test/genome/genome.fasta.gz -x "-d" test/fastq + run: harpy align ema --quiet --ema-bins 150 -g test/genome/genome.fasta.gz test/fastq strobe: needs: [changes, pkgbuild] diff --git a/harpy/_cli_types_generic.py b/harpy/_cli_types_generic.py index 441890699..e28a3ead2 100644 --- a/harpy/_cli_types_generic.py +++ b/harpy/_cli_types_generic.py @@ -64,6 +64,8 @@ def convert(self, value, param, ctx): "vcf": ["vcf", "bcf", "vcf.gz"], "gff": [".gff",".gff3"] } + if self.filetype not in filedict.keys(): + self.fail(f"Extension validation for {self.filetype} is not yet implemented. This error should only appear during development; if you are a user and seeing this, please post an issue on GitHub: https://github.com/pdimens/harpy/issues/new?assignees=&labels=bug&projects=&template=bug_report.yml") if not os.path.exists(value): self.fail(f"{value} does not exist. Please check the spelling and try again.", param, ctx) elif not os.access(value, os.R_OK): diff --git a/harpy/_cli_types_params.py b/harpy/_cli_types_params.py index e5b7f8175..efa1ddb0e 100644 --- a/harpy/_cli_types_params.py +++ b/harpy/_cli_types_params.py @@ -6,7 +6,7 @@ class FastpParams(click.ParamType): """A class for a click type that validates fastp extra-params.""" name = "fastp_params" def convert(self, value, param, ctx): - harpy_options = "--length_required -l --max_len1 -b --detect_adapter_for_pe --disable_adapter_trimming -A --dedup -D".split() + harpy_options = "--length_required -l --trim_poly_g --max_len1 -b --detect_adapter_for_pe --disable_adapter_trimming -A --dedup -D".split() valid_options = "--failed_out -6 --phred64 --reads_to_process --fix_mgi_id -a --adapter_sequence --adapter_sequence_r2 --adapter_fasta -f --trim_front1 -t --trim_tail1 -b --max_len1 -F --trim_front2 -T --trim_tail2 -B --max_len2 --dup_calc_accuracy --dont_eval_duplication --poly_g_min_len -x --trim_poly_x --poly_x_min_len -5 --cut_front -3 --cut_tail -W --cut_window_size -M --cut_mean_quality --cut_front_window_size --cut_front_mean_quality --cut_tail_window_size --cut_tail_mean_quality --cut_right_window_size --cut_right_mean_quality -Q --disable_quality_filtering -q --qualified_quality_phred -u --unqualified_percent_limit -n --n_base_limit -e --average_qual -L --disable_length_filtering -l --length_required --length_limit -y --low_complexity_filter -Y --complexity_threshold --filter_by_index1 --filter_by_index2 --filter_by_index_threshold -c --correction --overlap_len_require --overlap_diff_limit --overlap_diff_percent_limit -U --umi --umi_loc --umi_len --umi_prefix --umi_skip -p --overrepresentation_analysis -P --overrepresentation_sampling -s --split -S --split_by_lines -d --split_prefix_digits".split() opts = 0 for i in value.split(): @@ -51,7 +51,7 @@ def convert(self, value, param, ctx): if i in harpy_options: self.fail(f"{i} is already used by Harpy when calling ema.", param, ctx) if i not in valid_options: - self.fail(f"{i} is not a valid ema option. See the fastp documentation for a list of available options: https://github.com/arshajii/ema.", param, ctx) + self.fail(f"{i} is not a valid ema option. See the ema documentation for a list of available options: https://github.com/arshajii/ema.", param, ctx) if opts < 1: self.fail("No valid options recognized. Available ema options begin with one dash (e.g. -i). See the ema documentation for a list of available options: https://github.com/arshajii/ema.", param, ctx) return value @@ -182,8 +182,8 @@ def convert(self, value, param, ctx): for idx,i in enumerate(value.split()): if i.startswith("-"): self.fail(f"{i} begins with a dash, which is the wrong format. Try using " + i.lstrip("-") + " VAL instead", param, ctx) - # if it's an odd index, it's the first (arg) of an arg val pair - if idx % 2 != 0: + # if it's an even index, it's the argument name of an arg-val pair + if idx % 2 == 0: opts += 1 if i in harpy_options: self.fail(f"{i} is already used by Harpy when calling naibr.", param, ctx) diff --git a/harpy/align.py b/harpy/align.py index 498b6c79f..bb0a6350c 100644 --- a/harpy/align.py +++ b/harpy/align.py @@ -220,7 +220,7 @@ def ema(inputs, output_dir, platform, barcode_list, fragment_density, genome, de "output_directory" : output_dir, "alignment_quality" : min_quality, "keep_unmapped" : keep_unmapped, - "fragement_density_optimization": fragment_density, + "fragment_density_optimization": fragment_density, "depth_windowsize" : depth_window, "platform" : platform, "EMA_bins" : ema_bins, diff --git a/harpy/simulate.py b/harpy/simulate.py index eb0ed959c..288a2d6f1 100644 --- a/harpy/simulate.py +++ b/harpy/simulate.py @@ -164,8 +164,8 @@ def linkedreads(genome_hap1, genome_hap2, output_dir, outer_distance, mutation_r if snakemake: command += snakemake - check_fasta(genome_hap1, quiet) - check_fasta(genome_hap2, quiet) + check_fasta(genome_hap1) + check_fasta(genome_hap2) os.makedirs(f"{workflowdir}/", exist_ok= True) fetch_rule(workflowdir, "simulate_linkedreads.smk") From 9212929492a08f57f30abe656655a07faf471da7 Mon Sep 17 00:00:00 2001 From: pdimens Date: Mon, 4 Nov 2024 14:34:24 -0500 Subject: [PATCH 23/27] add contigs/scaff to quast and give better label names --- harpy/snakefiles/assembly.smk | 9 +++++---- harpy/snakefiles/metassembly.smk | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/harpy/snakefiles/assembly.smk b/harpy/snakefiles/assembly.smk index 0ee14767d..eadb46d51 100644 --- a/harpy/snakefiles/assembly.smk +++ b/harpy/snakefiles/assembly.smk @@ -119,7 +119,8 @@ rule scaffolding: rule QUAST_assessment: input: - assembly = f"{outdir}/scaffolds.fasta", + contigs = f"{outdir}/spades/contigs.fasta", + scaffolds = f"{outdir}/scaffolds.fasta", fastq = f"{outdir}/scaffold/interleaved.fq.gz" output: f"{outdir}/quast/report.tsv" @@ -128,14 +129,14 @@ rule QUAST_assessment: params: output_dir = f"-o {outdir}/quast", organism = f"--{organism}" if organism != "prokaryote" else "", - quast_params = "--labels harpy_cloudspades --rna-finding", - skip_things = "--no-sv" + quast_params = "--labels spades_contigs,arcs_scaffolds --rna-finding", + skip_things = "--no-sv" threads: workflow.cores conda: f"{envdir}/assembly.yaml" shell: - "quast.py --threads {threads} --pe12 {input.fastq} {params} {input.assembly} 2> {log}" + "quast.py --threads {threads} --pe12 {input.fastq} {params} {input.contigs} {input.scaffolds} 2> {log}" rule BUSCO_analysis: input: diff --git a/harpy/snakefiles/metassembly.smk b/harpy/snakefiles/metassembly.smk index c7201c3c4..92b2ff18d 100644 --- a/harpy/snakefiles/metassembly.smk +++ b/harpy/snakefiles/metassembly.smk @@ -249,7 +249,7 @@ rule QUAST_assessment: params: output_dir = f"-o {outdir}/quast", organism = f"--{organism}" if organism != "prokaryote" else "", - quast_params = "--labels contigs,scaffolds --glimmer --rna-finding" + quast_params = "--labels spades_contigs,athena_scaffolds --glimmer --rna-finding" threads: workflow.cores conda: From 7314c4d9081e8b59c30ec23cf000846d924956be Mon Sep 17 00:00:00 2001 From: pdimens Date: Mon, 4 Nov 2024 14:53:56 -0500 Subject: [PATCH 24/27] add new cli file to filters --- .github/filters.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/filters.yml b/.github/filters.yml index 54dccac03..b73bd48e4 100644 --- a/.github/filters.yml +++ b/.github/filters.yml @@ -2,6 +2,7 @@ common: &common - 'harpy/_launch.py' - 'harpy/_misc.py' - 'harpy/_cli_types_generic.py' + - 'harpy/_cli_types_params.py' - 'harpy/_parsers.py' - 'harpy/_printing.py' - 'harpy/_validations.py' From c8d46b9438a9e6577296706abcf8890d5a1df3fa Mon Sep 17 00:00:00 2001 From: pdimens Date: Mon, 4 Nov 2024 15:33:35 -0500 Subject: [PATCH 25/27] touch up printing --- harpy/popgroups.py | 6 +++--- harpy/stitchparams.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/harpy/popgroups.py b/harpy/popgroups.py index efa596270..e7608ed1a 100644 --- a/harpy/popgroups.py +++ b/harpy/popgroups.py @@ -37,10 +37,10 @@ def popgroup(inputdir, output): fqlist = [os.path.basename(i) for i in full_fqlist] bn_r = r"[\.\_][RF](?:[12])?(?:\_00[1-9])*\.f(?:ast)?q(?:\.gz)?$" if len(fqlist) == 0: - print_error("no files found", f"No FASTQ or BAM files were detected in [bold]{inputdir}[/bold]") + print_error("no files found", f"No [bold]FASTQ[/bold] or [bold]BAM[/bold] files were detected in [blue]{inputdir}[/blue]") print_solution( - "Check that FASTQ file endings conform to [green].[/green][[green]F[/green][dim]|[/dim][green]R1[/green]][green].[/green][[green]fastq[/green][dim]|[/dim][green]fq[/green]][green].gz[/green]" + - "\nCheck that BAM files end in [green].bam[/green]"+ + "Check that [bold]FASTQ[/bold] file endings conform to [green].[/green][[green]F[/green][dim]|[/dim][green]R1[/green]][green].[/green][[green]fastq[/green][dim]|[/dim][green]fq[/green]][green].gz[/green]" + + "\nCheck that [bold]BAM[/bold] files end in [green].bam[/green]"+ "\nRead the documentation for details: https://pdimens.github.io/harpy/haplotagdata/#naming-conventions" ) sys.exit(1) diff --git a/harpy/stitchparams.py b/harpy/stitchparams.py index 17f5cfec7..8d5d6a856 100644 --- a/harpy/stitchparams.py +++ b/harpy/stitchparams.py @@ -27,6 +27,6 @@ def stitchparams(output): _ = file.write('k1_ng30\tdiploid\tTRUE\t50000\t5\t1\t30\n') _ = file.write('high_ngen\tdiploid\tTRUE\t50000\t15\t1\t100') print_notice( - f"Created STITCH parameter file: {output}\n" + - "Modify the model parameters as needed, but DO NOT add/remove columns." + f"Created template STITCH parameter file: [blue]{output}[/blue]\n" + + "Modify the model parameters as needed, but [yellow bold]do not add/remove columns." ) \ No newline at end of file From 36cd006483fee0d7aa6f2637528f657b2397826c Mon Sep 17 00:00:00 2001 From: pdimens Date: Mon, 4 Nov 2024 15:33:51 -0500 Subject: [PATCH 26/27] improve logic such that the message only appears if the plugin isn't in the environment already --- harpy/hpc.py | 70 ++++++++++++++++++++++------------------------------ 1 file changed, 30 insertions(+), 40 deletions(-) diff --git a/harpy/hpc.py b/harpy/hpc.py index b9c5d00bd..05cc83a36 100644 --- a/harpy/hpc.py +++ b/harpy/hpc.py @@ -1,7 +1,10 @@ """Harpy module to create HPC profiles for running snakemake on a cluster""" import os +import subprocess +import sys import rich_click as click +from rich import print as rprint from rich.markdown import Markdown from ._printing import print_notice @@ -27,20 +30,31 @@ def hpc(): ] } +def package_exists(pkg): + """helper function to search for a package in the active conda environment""" + search_pkg = subprocess.run(f"conda list snakemake-executor-plugin-{pkg}".split(), capture_output = True, text = True) + exists = False + for i in search_pkg.stdout.split("\n"): + if i and not i.startswith("#"): + exists = True + if not exists: + print_notice(Markdown(f""" +Using this scheduler requires installing an additional Snakemake plugin, which wasn't detected in this environment. +Install the `{pkg}` plugin with: + +```bash +conda install bioconda::snakemake-executor-plugin-{pkg} +``` + """)) + @click.command() def generic(): """Configuration for a generic scheduler""" outfile = "hpc/generic/config.yaml" os.makedirs("hpc/generic", exist_ok=True) if os.path.exists(outfile): - click.echo(f"{outfile} exists, overwriting.") - print_notice(Markdown(""" -Using a scheduler requires installing an additional Snakemake plugin. If you haven't already, install the `generic-cluster` plugin with: - -```bash -conda install bioconda::snakemake-executor-plugin-cluster-generic -``` - """)) + rprint(f"HPC profile [blue]{outfile}[/blue] already exists, overwriting\n", file = sys.stderr) + package_exists("cluster-generic") with open(outfile, "w", encoding = "utf-8") as yml: yml.write("__use_yte__: true\n") yml.write("executor: cluster-generic\n") @@ -75,14 +89,8 @@ def lsf(): outfile = "hpc/lsf/config.yaml" os.makedirs("hpc/lsf", exist_ok=True) if os.path.exists(outfile): - click.echo(f"{outfile} exists, overwriting.") - print_notice(Markdown(""" -Using a scheduler requires installing an additional Snakemake plugin. If you haven't already, install the `lsf` plugin with: - -```bash -conda install bioconda::snakemake-executor-plugin-lsf -``` - """)) + rprint(f"HPC profile [blue]{outfile}[/blue] already exists, overwriting\n", file = sys.stderr) + package_exists("lsf") with open(outfile, "w", encoding = "utf-8") as yml: yml.write("__use_yte__: true\n") yml.write("executor: lsf\n") @@ -111,14 +119,8 @@ def htcondor(): os.makedirs("hpc/htcondor", exist_ok=True) outfile = "hpc/htcondor/config.yaml" if os.path.exists(outfile): - click.echo(f"{outfile} exists, overwriting.") - print_notice(Markdown(""" -Using a scheduler requires installing an additional Snakemake plugin. If you haven't already, install the `htcondor` plugin with: - -```bash -conda install bioconda::snakemake-executor-plugin-htcondor -``` - """)) + rprint(f"HPC profile [blue]{outfile}[/blue] already exists, overwriting\n", file = sys.stderr) + package_exists("htcondor") with open(outfile, "w", encoding = "utf-8") as yml: yml.write("__use_yte__: true\n") yml.write("executor: htcondor\n") @@ -146,14 +148,8 @@ def slurm(): os.makedirs("hpc/slurm", exist_ok=True) outfile = "hpc/slurm/config.yaml" if os.path.exists(outfile): - click.echo(f"{outfile} exists, overwriting.") - print_notice(Markdown(""" -Using a scheduler requires installing an additional Snakemake plugin. If you haven't already, install the `slurm` plugin with: - -```bash -conda install bioconda::snakemake-executor-plugin-slurm -``` - """)) + rprint(f"HPC profile [blue]{outfile}[/blue] already exists, overwriting\n", file = sys.stderr) + package_exists("slurm") with open(outfile, "w", encoding = "utf-8") as yml: yml.write("__use_yte__: true\n") yml.write("executor: slurm\n") @@ -181,14 +177,8 @@ def googlebatch(): os.makedirs("hpc/googlebatch", exist_ok=True) outfile = "hpc/googlebatch/config.yaml" if os.path.exists(outfile): - click.echo(f"{outfile} exists, overwriting.") - print_notice(Markdown(""" -Using a scheduler requires installing an additional Snakemake plugin. If you haven't already, install the `googlebatch` plugin with: - -```bash -conda install bioconda::snakemake-executor-plugin-googlebatch -``` - """)) + rprint(f"HPC profile [blue]{outfile}[/blue] already exists, overwriting\n", file = sys.stderr) + package_exists("googlebatch") with open(outfile, "w", encoding = "utf-8") as yml: yml.write("__use_yte__: true\n") yml.write("executor: googlebatch\n") From 408114b4a2b18d6309102672120b0be8ec3f01d9 Mon Sep 17 00:00:00 2001 From: pdimens Date: Mon, 4 Nov 2024 15:40:25 -0500 Subject: [PATCH 27/27] nicer formatting --- harpy/popgroups.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/harpy/popgroups.py b/harpy/popgroups.py index e7608ed1a..da6a6f532 100644 --- a/harpy/popgroups.py +++ b/harpy/popgroups.py @@ -4,6 +4,7 @@ import re import sys import glob +from rich import print as rprint import rich_click as click from ._printing import print_error, print_solution, print_notice @@ -46,11 +47,11 @@ def popgroup(inputdir, output): sys.exit(1) samplenames = set([re.sub(bn_r, "", i, flags = re.IGNORECASE) for i in fqlist]) - click.echo(f"{len(samplenames)} samples detected in {inputdir}", file = sys.stderr) + rprint(f"[bold]{len(samplenames)} samples [/bold] detected in [blue]{inputdir}[blue]", file = sys.stderr) if os.path.exists(output): - write_text = f"The file [green]{output}[/green] was overwritten." + write_text = f"The file [blue]{output}[/blue] was overwritten." else: - write_text = f"Created sample population grouping file [green]{output}[/green]." + write_text = f"Created sample population grouping file [blue]{output}[/blue]." with open(output, "w", encoding="utf-8") as file: for i in samplenames: _ = file.write(i + '\tpop1\n')