diff --git a/.github/filters.yml b/.github/filters.yml index e1e3b6a91..b73bd48e4 100644 --- a/.github/filters.yml +++ b/.github/filters.yml @@ -1,7 +1,8 @@ common: &common - 'harpy/_launch.py' - 'harpy/_misc.py' - - 'harpy/_cli_types.py' + - 'harpy/_cli_types_generic.py' + - 'harpy/_cli_types_params.py' - 'harpy/_parsers.py' - 'harpy/_printing.py' - 'harpy/_validations.py' diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 80ee4bed3..82935dbcf 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -229,7 +229,7 @@ jobs: path: .snakemake/singularity - name: harpy qc shell: micromamba-shell {0} - run: harpy qc -x "--trim_poly_g" --quiet test/fastq + run: harpy qc -x "--low_complexity_filter" --quiet test/fastq - name: harpy qc all options shell: micromamba-shell {0} run: harpy qc -a -d -c 21,40,3,0 --quiet test/fastq @@ -344,7 +344,7 @@ jobs: path: .snakemake/singularity - name: test ema shell: micromamba-shell {0} - run: harpy align ema --quiet --ema-bins 150 -g test/genome/genome.fasta.gz -x "-d" test/fastq + run: harpy align ema --quiet --ema-bins 150 -g test/genome/genome.fasta.gz test/fastq strobe: needs: [changes, pkgbuild] diff --git a/harpy/_cli_types.py b/harpy/_cli_types.py deleted file mode 100644 index 1bd7d190a..000000000 --- a/harpy/_cli_types.py +++ /dev/null @@ -1,54 +0,0 @@ -"""Module with helper function to set up Harpy workflows""" - -import os -import click - -class IntList(click.ParamType): - """A class for a click type which accepts an arbitrary number of integers, separated by a comma.""" - name = "int_list" - def __init__(self, max_entries): - super().__init__() - self.max_entries = max_entries - - def convert(self, value, param, ctx): - try: - parts = [i.strip() for i in value.split(',')] - if len(parts) != self.max_entries: - raise ValueError - for i in parts: - try: - int(i) - except: - raise ValueError - return [int(i) for i in parts] - except ValueError: - self.fail(f"{value} is not a valid list of integers. The value should be {self.max_entries} integers separated by a comma.", param, ctx) - -class KParam(click.ParamType): - """A class for a click type which accepts any number of odd integers separated by a comma, or the word auto.""" - name = "k_param" - def convert(self, value, param, ctx): - try: - if value == "auto": - return value - parts = [i.strip() for i in value.split(',')] - for i in parts: - if int(i) % 2 == 0 or int(i) > 128: - raise ValueError - return [int(i) for i in parts] - except ValueError: - self.fail(f"{value} is not 'auto' or odd integers <128 separated by a comma.", param, ctx) - -class ContigList(click.ParamType): - """A class for a click type which accepts a file of contigs or a list of contigs separated by a comma.""" - name = "contig_list" - def convert(self, value, param, ctx): - # check if it's a file - if os.path.exists(value): - if not os.path.isfile(value): - self.fail(f"{value} is not a file.", param, ctx) - with open(value, "r") as cont_in: - return [i.strip() for i in cont_in.readlines()] - else: - return [i.strip() for i in value.split(',')] - diff --git a/harpy/_cli_types_generic.py b/harpy/_cli_types_generic.py new file mode 100644 index 000000000..e28a3ead2 --- /dev/null +++ b/harpy/_cli_types_generic.py @@ -0,0 +1,117 @@ +"""Module with python-click types for command-line level validations of inputs""" + +import os +import click + +class IntList(click.ParamType): + """A class for a click type which accepts an arbitrary number of integers, separated by a comma.""" + name = "int_list" + def __init__(self, max_entries): + super().__init__() + self.max_entries = max_entries + + def convert(self, value, param, ctx): + try: + parts = [i.strip() for i in value.split(',')] + if len(parts) != self.max_entries: + raise ValueError + for i in parts: + try: + int(i) + except: + raise ValueError + return [int(i) for i in parts] + except ValueError: + self.fail(f"{value} is not a valid list of integers. The value should be {self.max_entries} integers separated by a comma.", param, ctx) + +class KParam(click.ParamType): + """A class for a click type which accepts any number of odd integers separated by a comma, or the word auto.""" + name = "k_param" + def convert(self, value, param, ctx): + try: + if value == "auto": + return value + parts = [i.strip() for i in value.split(',')] + for i in parts: + if int(i) % 2 == 0 or int(i) > 128: + raise ValueError + return [int(i) for i in parts] + except ValueError: + self.fail(f"{value} is not 'auto' or odd integers <128 separated by a comma.", param, ctx) + +class ContigList(click.ParamType): + """A class for a click type which accepts a file of contigs or a list of contigs separated by a comma.""" + name = "contig_list" + def convert(self, value, param, ctx): + # check if it's a file + if os.path.exists(value): + if not os.path.isfile(value): + self.fail(f"{value} is not a file.", param, ctx) + with open(value, "r") as cont_in: + return [i.strip() for i in cont_in.readlines()] + else: + return [i.strip() for i in value.split(',')] + +class InputFile(click.ParamType): + """A class for a click type that verifies that a file exists and that it has an expected extension""" + def __init__(self, filetype, gzip_ok): + super().__init__() + self.filetype = filetype + self.gzip_ok = gzip_ok + def convert(self, value, param, ctx): + filedict = { + "fasta": [".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".frn"], + "vcf": ["vcf", "bcf", "vcf.gz"], + "gff": [".gff",".gff3"] + } + if self.filetype not in filedict.keys(): + self.fail(f"Extension validation for {self.filetype} is not yet implemented. This error should only appear during development; if you are a user and seeing this, please post an issue on GitHub: https://github.com/pdimens/harpy/issues/new?assignees=&labels=bug&projects=&template=bug_report.yml") + if not os.path.exists(value): + self.fail(f"{value} does not exist. Please check the spelling and try again.", param, ctx) + elif not os.access(value, os.R_OK): + self.fail(f"{value} is not readable. Please check file/directory permissions and try again", param, ctx) + if os.path.isdir(value): + self.fail(f"{value} is a directory, but input should be a file.", param, ctx) + valid = False + lowercase = value.lower() + for ext in filedict[self.filetype]: + valid = True if lowercase.endswith(ext) else valid + if self.gzip_ok: + valid = True if lowercase.endswith(ext + ".gz") else valid + if not valid and not self.gzip_ok: + self.fail(f"{value} does not end with one of the expected extensions [" + ", ".join(filedict[self.filetype]) + "]. Please verify this is the correct file type and rename the extension for compatibility.", param, ctx) + if not valid and self.gzip_ok: + self.fail(f"{value} does not end with one of the expected extensions [" + ", ".join(filedict[self.filetype]) + "]. Please verify this is the correct file type and rename the extension for compatibility. Gzip compression (ending in .gz) is allowed.", param, ctx) + return value + +class SnakemakeParams(click.ParamType): + """A class for a click type which accepts snakemake parameters. Does validations to make sure there isn't doubling up.""" + name = "snakemake_params" + def convert(self, value, param, ctx): + forbidden = "--rerun-incomplete --ri --show-failed-logs --rerun-triggers --nolock --software-deployment-method --smd --deployment --deployment-method --conda-prefix --cores -c --directory -d --snakefile -s --configfile --configfiles".split() + available = "--dry-run --dryrun -n --profile --cache --jobs -j --local-cores --resources --res --set-threads --max-threads --set-resources --set-scatter --set-resource-scopes --default-resources --default-res --preemptible-rules --preemptible-retries --envvars --touch -t --keep-going -k --force -f --executor -e --forceall -F --forcerun -R --prioritize -P --batch --until -U --omit-from -O --shadow-prefixDIR --scheduler --wms-monitor --wms-monitor-arg --scheduler-ilp-solver --conda-base-path --no-subworkflows --nosw --precommand --groups --group-components --report --report-stylesheet --reporterPLUGIN --draft-notebook --edit-notebook --notebook-listen --lint --generate-unit-tests --containerize --export-cwl --list-rules --list -l --list-target-rules --lt --dag --rulegraph --filegraph --d3dag --summary -S --detailed-summary -D --archive --cleanup-metadata --cmFILE --cleanup-shadow --skip-script-cleanup --unlock --list-changes --lc --list-input-changes --li --list-params-changes --lp --list-untracked --lu --delete-all-output --delete-temp-output --keep-incomplete --drop-metadata --version -v --printshellcmds -p --debug-dag --nocolor --quiet -q --print-compilation --verbose --force-use-threads --allow-ambiguity -a --ignore-incomplete --ii --max-inventory-time --latency-wait --output-wait -w --wait-for-files --wait-for-files-file --queue-input-wait-time --notemp --nt --all-temp --unneeded-temp-files --keep-storage-local-copies --target-files-omit-workdir-adjustment --allowed-rules --max-jobs-per-timespan --max-jobs-per-second --max-status-checks-per-second --seconds-between-status-checks --retries --restart-times -T --wrapper-prefix --default-storage-provider --default-storage-prefix --local-storage-prefix --remote-job-local-storage-prefix --shared-fs-usage --scheduler-greediness --greediness --no-hooks --debug --runtime-profile --local-groupid --attempt --log-handler-script --log-service --job-deploy-sources --benchmark-extended --container-image --immediate-submit --is --jobscript --js --jobname --jn --flux --container-cleanup-images --use-conda --conda-not-block-search-path-envvars --list-conda-envs --conda-cleanup-envs --conda-cleanup-pkgs --conda-create-envs-only --conda-frontend --use-apptainer --use-singularity --apptainer-prefix --singularity-prefix --apptainer-args --singularity-args --use-envmodules --scheduler-solver-path --deploy-sources --target-jobs --mode --report-html-path --report-html-stylesheet-path".split() + for i in value.split(): + if i.startswith("-"): + if i in forbidden: + self.fail(f"{i} is a forbidden option because it is already used by Harpy to call Snakemake.", param, ctx) + if i not in available: + self.fail(f"{i} is not a valid Snakemake option. Run \'snakemake --help\' for a list of all Snakemake command line options.", param, ctx) + return value + +class HPCProfile(click.ParamType): + """A class for a click type which accepts a directory with a snakemake HPC profile. Does validations to make sure the config file is there.""" + name = "hpc_profile" + def convert(self, value, param, ctx): + if not os.path.exists(value): + self.fail(f"{value} does not exist. Please check the spelling and try again.", param, ctx) + elif not os.access(value, os.R_OK): + self.fail(f"{value} is not readable. Please check file/directory permissions and try again", param, ctx) + if os.path.isfile(value): + self.fail(f"{value} is a file, but input should be a directory.", param, ctx) + if not os.path.exists(f"{value}/config.yaml"): + self.fail(f"{value} does not contain the necessary config.yaml file.", param, ctx) + elif not os.access(f"{value}/config.yaml", os.R_OK): + self.fail(f"{value}/config.yaml does not have read access. Please check the file permissions and try again.", param, ctx) + return value + +### program-specific extra-params types diff --git a/harpy/_cli_types_params.py b/harpy/_cli_types_params.py new file mode 100644 index 000000000..efa1ddb0e --- /dev/null +++ b/harpy/_cli_types_params.py @@ -0,0 +1,235 @@ +"""Module with python-click types for command-line level validations of program-specific extra-params inputs""" + +import click + +class FastpParams(click.ParamType): + """A class for a click type that validates fastp extra-params.""" + name = "fastp_params" + def convert(self, value, param, ctx): + harpy_options = "--length_required -l --trim_poly_g --max_len1 -b --detect_adapter_for_pe --disable_adapter_trimming -A --dedup -D".split() + valid_options = "--failed_out -6 --phred64 --reads_to_process --fix_mgi_id -a --adapter_sequence --adapter_sequence_r2 --adapter_fasta -f --trim_front1 -t --trim_tail1 -b --max_len1 -F --trim_front2 -T --trim_tail2 -B --max_len2 --dup_calc_accuracy --dont_eval_duplication --poly_g_min_len -x --trim_poly_x --poly_x_min_len -5 --cut_front -3 --cut_tail -W --cut_window_size -M --cut_mean_quality --cut_front_window_size --cut_front_mean_quality --cut_tail_window_size --cut_tail_mean_quality --cut_right_window_size --cut_right_mean_quality -Q --disable_quality_filtering -q --qualified_quality_phred -u --unqualified_percent_limit -n --n_base_limit -e --average_qual -L --disable_length_filtering -l --length_required --length_limit -y --low_complexity_filter -Y --complexity_threshold --filter_by_index1 --filter_by_index2 --filter_by_index_threshold -c --correction --overlap_len_require --overlap_diff_limit --overlap_diff_percent_limit -U --umi --umi_loc --umi_len --umi_prefix --umi_skip -p --overrepresentation_analysis -P --overrepresentation_sampling -s --split -S --split_by_lines -d --split_prefix_digits".split() + opts = 0 + for i in value.split(): + if i.startswith("-"): + opts += 1 + if i in harpy_options: + self.fail(f"{i} is already used by Harpy when calling fastp.", param, ctx) + if i not in valid_options: + self.fail(f"{i} is not a valid fastp option. See the fastp documentation for a list of available options: https://github.com/OpenGene/fastp.", param, ctx) + if opts < 1: + self.fail("No valid options recognized. Available fastp options begin with one or two dashes (e.g. --phred64 or -a). See the fastp documentation for a list of available options: https://github.com/OpenGene/fastp.", param, ctx) + return value + +class BwaParams(click.ParamType): + """A class for a click type that validates bwa extra-params.""" + name = "bwa_params" + def convert(self, value, param, ctx): + harpy_options = "-C -v -t -R".split() + valid_options = "-k -w -d -r -c -P -A -B -O -E -L -U -p -T -a -H -M ".split() + opts = 0 + for i in value.split(): + if i.startswith("-"): + opts += 1 + if i in harpy_options: + self.fail(f"{i} is already used by Harpy when calling bwa mem.", param, ctx) + if i not in valid_options: + self.fail(f"{i} is not a valid bwa mem option. See the bwa documentation for a list of available options: https://bio-bwa.sourceforge.net/bwa.shtml.", param, ctx) + if opts < 1: + self.fail("No valid options recognized. Available bwa options begin with one dash (e.g. -M). See the bwa documentation for a list of available options: https://bio-bwa.sourceforge.net/bwa.shtml.", param, ctx) + return value + +class EmaParams(click.ParamType): + """A class for a click type that validates ema extra-params.""" + name = "ema_params" + def convert(self, value, param, ctx): + harpy_options = "-t -p -d -r -R -x".split() + valid_options = "-i".split() + opts = 0 + for i in value.split(): + if i.startswith("-"): + opts += 1 + if i in harpy_options: + self.fail(f"{i} is already used by Harpy when calling ema.", param, ctx) + if i not in valid_options: + self.fail(f"{i} is not a valid ema option. See the ema documentation for a list of available options: https://github.com/arshajii/ema.", param, ctx) + if opts < 1: + self.fail("No valid options recognized. Available ema options begin with one dash (e.g. -i). See the ema documentation for a list of available options: https://github.com/arshajii/ema.", param, ctx) + return value + +class StrobeAlignParams(click.ParamType): + """A class for a click type that validates strobealign extra-params.""" + name = "strobealign_params" + def convert(self, value, param, ctx): + harpy_options = "--use-index -i --create-index -r -N -t -U -C --rg-id --rg".split() + valid_options = "-x -A -B -O -E -L -f -S -M -R -m -k -l -u -c -s -b --aux-len --aemb --eqx --no-PG --details".split() + opts = 0 + for i in value.split(): + if i.startswith("-"): + opts += 1 + if i in harpy_options: + self.fail(f"{i} is already used by Harpy when calling strobealign.", param, ctx) + if i not in valid_options: + self.fail(f"{i} is not a valid strobealign option. See the strobealign documentation for a list of available options: https://github.com/ksahlin/strobealign.", param, ctx) + if opts < 1: + self.fail("No valid options recognized. Available strobealign options begin with one or two dashes (e.g. --eqx or -L). See the strobealign documentation for a list of available options: https://github.com/ksahlin/strobealign.", param, ctx) + return value + +class SpadesParams(click.ParamType): + """A class for a click type that validates spades extra-params.""" + name = "spades_params" + def convert(self, value, param, ctx): + harpy_options = "-t -m -k --gemcode1-1 --gemcode1-2 -o --isolate --pe1-1 --pe1-2".split() + valid_options = "--dataset --pacbio --nanopore --sanger --trusted-contigs --untrusted-contigs --assembly-graph --cov-cutoff --phred-offset --custom-hmms --gfa11".split() + valid_options += [f"--mp{x}-{orient}" for x in range(1,10) for orient in ["1","2","12", "fr", "rf", "ff"]] + valid_options += [f"--hqmp{x}-{orient}" for x in range(1,10) for orient in ["1","2","12", "s", "fr", "rf", "ff"]] + opts = 0 + for i in value.split(): + if i.startswith("-"): + opts += 1 + if i in harpy_options: + self.fail(f"{i} is already used by Harpy when calling spades.", param, ctx) + if i not in valid_options: + self.fail(f"{i} is not a valid spades option. See the spades documentation for a list of available options: http://ablab.github.io/spades/running.html.", param, ctx) + if opts < 1: + self.fail("No valid options recognized. Available spades options begin with two dashes (e.g. --cov-cutoff). See the spades documentation for a list of available options: http://ablab.github.io/spades/running.html.", param, ctx) + return value + +class ArcsParams(click.ParamType): + """A class for a click type that validates ARCS extra-params.""" + name = "arcs_params" + def convert(self, value, param, ctx): + harpy_options = "draft reads t mapq nm dist minsize span c z s l base_name".split() + valid_options = "G cut longmap window as trim ac u multfile g graph gap tsv barcodecounts m index_multiplicity d max_degree e end_length r error_percent k k_value j j_index B bin_sizeN D dist_est no_dist_est dist_median dist_upper dist_tsv samples_tsv P pair f d k o e a b r p x".split() + opts = 0 + for i in value.split(): + if i.startswith("-"): + self.fail(f"{i} begins with a dash, which would be interpreted as an argument to arcs-make rather than arcs. To avoid unexpected errors, arguments to arcs-make are disallowed. If this was inteded to be an argument to arcs, try using " + i.lstrip("-") + "=VAL instead", param, ctx) + if "=" in i: + opts += 1 + arg = i.split("=")[0].strip() + if arg in harpy_options: + self.fail(f"{arg} is already used by Harpy when calling arcs.", param, ctx) + if arg not in valid_options: + self.fail(f"{arg} is not a valid arcs option. See the documentation for a list of available options.\nTigmint: https://github.com/bcgsc/tigmint\nARCS: https://github.com/bcgsc/arcs\nLINKS: https://github.com/bcgsc/links", param, ctx) + if opts < 1: + self.fail("No valid options recognized. Available arcs options begin without dashes and must be in the form ARG=VAL, without spaces (e.g. k=15). See the documentation for a list of available options.\nTigmint: https://github.com/bcgsc/tigmint\nARCS: https://github.com/bcgsc/arcs\nLINKS: https://github.com/bcgsc/links", param, ctx) + return value + +class StitchParams(click.ParamType): + """A class for a click type that validates stitch extra-params.""" + name = "stitch_params" + def convert(self, value, param, ctx): + harpy_options = "method posfile bamlist nCores nGen chr K S use_bx_tag bxTagUpperLimit outputdir output_filename tempdir".split() + valid_options = "nStarts sampleNames_file genfile B_bit_prob outputInputInVCFFormat downsampleToCov downsampleFraction readAware chrStart chrEnd regionStart regionEnd buffer maxDifferenceBetweenReads maxEmissionMatrixDifference alphaMatThreshold emissionThreshold iSizeUpperLimit bqFilter niterations shuffleHaplotypeIterations splitReadIterations expRate maxRate minRate Jmax regenerateInput originalRegionName keepInterimFiles keepTempDir outputHaplotypeProbabilities switchModelIteration generateInputOnly restartIterations refillIterations downsampleSamples downsampleSamplesKeepList subsetSNPsfile useSoftClippedBases outputBlockSize outputSNPBlockSize inputBundleBlockSize genetic_map_file reference_haplotype_file reference_legend_file reference_sample_file reference_populations reference_phred reference_iterations reference_shuffleHaplotypeIterations initial_min_hapProb initial_max_hapProb regenerateInputWithDefaultValues plotHapSumDuringIterations plot_shuffle_haplotype_attempts plotAfterImputation save_sampleReadsInfo gridWindowSize shuffle_bin_nSNPs shuffle_bin_radius keepSampleReadsInRAM useTempdirWhileWriting output_haplotype_dosages".split() + opts = 0 + for i in value.split(): + if i.startswith("-"): + self.fail(f"{i} begins with a dash, which is the wrong format. Try using " + i.lstrip("-") + "=VAL instead", param, ctx) + if "=" in i: + opts += 1 + arg = i.split("=")[0].strip() + if arg in harpy_options: + self.fail(f"{arg} is already used by Harpy when calling stitch.", param, ctx) + if arg not in valid_options: + self.fail(f"{arg} is not a valid stitch option. See the stitch documentation for a list of available options: https://github.com/rwdavies/STITCH/blob/master/Options.md", param, ctx) + if opts < 1: + self.fail("No valid options recognized. Available stitch options begin without dashes and must be in the form ARG=VAL (e.g. downsampleFraction=0.5). See the stitch documentation for a list of available options: https://github.com/rwdavies/STITCH/blob/master/Options.md.", param, ctx) + return value + +class HapCutParams(click.ParamType): + """A class for a click type that validates hapcut2 extra-params.""" + name = "hapcut2_params" + def convert(self, value, param, ctx): + harpy_options = "--fragments --vcf --out --nf --error_analysis_mode --call_homozygous --outvcf --threshold --no_prune".split() + valid_options = "--skip_prune --sp --discrete_pruning --dp --max_iter --mi --maxcut_iter --mc".split() + opts = 0 + for i in value.split(): + if i.startswith("-"): + opts += 1 + if i in harpy_options: + self.fail(f"{i} is already used by Harpy when calling hapcut2.", param, ctx) + if i not in valid_options: + self.fail(f"{i} is not a valid hapcut2 option. See the hapcut2 documentation for a list of available options: https://github.com/vibansal/HapCUT2.", param, ctx) + if opts < 1: + self.fail("No valid options recognized. Available hapcut2 options begin with two dashes (e.g. --dp). See the hapcut2 documentation for a list of available options: https://github.com/vibansal/HapCUT2.", param, ctx) + return value + +class LeviathanParams(click.ParamType): + """A class for a click type that validates leviathan extra-params.""" + name = "leviathan_params" + def convert(self, value, param, ctx): + harpy_options = "-b -i -g -o -v --minVariantSize -c --minBarcodes -B --nbBins -t --threads -C --candidates".split() + valid_options = "-r --regionSize -n --maxLinks -M --mediumSize -L --largeSize -s --smallRate -m --mediumRate -l --largeRate -d --duplicates -s --skipTranslocations -p --poolSize".split() + opts = 0 + for i in value.split(): + if i.startswith("-"): + opts += 1 + if i in harpy_options: + self.fail(f"{i} is already used by Harpy when calling leviathan.", param, ctx) + if i not in valid_options: + self.fail(f"{i} is not a valid leviathan option. See the leviathan documentation for a list of available options: https://github.com/morispi/LEVIATHAN.", param, ctx) + if opts < 1: + self.fail("No valid options recognized. Available leviathan options begin with one or two dashes (e.g. -m or -mediumRate). See the leviathan documentation for a list of available options: https://github.com/morispi/LEVIATHAN.", param, ctx) + return value + +class NaibrParams(click.ParamType): + """A class for a click type that validates naibr extra-params.""" + name = "naibr_params" + def convert(self, value, param, ctx): + harpy_options = "bam_file prefix outdir threads min_mapq d min_sv k".split() + valid_options = "blacklist candidates".split() + opts = 0 + for idx,i in enumerate(value.split()): + if i.startswith("-"): + self.fail(f"{i} begins with a dash, which is the wrong format. Try using " + i.lstrip("-") + " VAL instead", param, ctx) + # if it's an even index, it's the argument name of an arg-val pair + if idx % 2 == 0: + opts += 1 + if i in harpy_options: + self.fail(f"{i} is already used by Harpy when calling naibr.", param, ctx) + if i not in valid_options: + self.fail(f"{i} is not a valid naibr option. See the naibr documentation for a list of available options: https://github.com/pontushojer/NAIBR.", param, ctx) + if opts < 1: + self.fail("No valid options recognized. Available naibr options begin without dashes in the form of ARGVAL (e.g. blacklist inversions.ignore). See the naibr documentation for a list of available options: https://github.com/pontushojer/NAIBR.", param, ctx) + return value + +class MpileupParams(click.ParamType): + """A class for a click type that validates mpileup extra-params.""" + name = "mpileup_params" + def convert(self, value, param, ctx): + harpy_options = "--fasta-ref -f --bam-list -b --annotate -a --output-type -O -r --regions".split() + valid_options = "-6 --illumina1.3+ -A --count-orphans -B --no-BAQ -C --adjust-MQ -D --full-BAQ -d --max-depth -E --redo-BAQ -G --read-groups -q --min-MQ -Q --min-BQ --max-BQ INT --delta-BQ INT --ignore-RG --ls --skip-all-set --ns --skip-any-set --lu --skip-all-unset --nu --skip-any-unset -s --samples -S --samples-file -t --targets -T --targets-file -x --ignore-overlaps -g --gvcf -o -X --config -e --ext-prob -F --gap-frac -h --tandem-qual -I --skip-indels -L --max-idepth -m --min-ireads -M --max-read-len -o --open-prob -p --per-sample-mF -P --platforms --ar --ambig-reads --indel-bias --del-bias --score-vs-ref --indel-size --indels-2.0 --indels-cns --seqq-offset --no-indels-cns --poly-mqual".split() + opts = 0 + for i in value.split(): + if i.startswith("-"): + opts += 1 + if i in harpy_options: + self.fail(f"{i} is already used by Harpy when calling mpileup.", param, ctx) + if i not in valid_options: + self.fail(f"{i} is not a valid mpileup option. See the mpileup documentation for a list of available options: https://samtools.github.io/bcftools/bcftools.html#mpileup.", param, ctx) + if opts < 1: + self.fail("No valid options recognized. Available mpileup options begin one or with two dashes (e.g. -d or --max-depth). See the mpileup documentation for a list of available options: https://samtools.github.io/bcftools/bcftools.html#mpileup.", param, ctx) + return value + +class FreebayesParams(click.ParamType): + """A class for a click type that validates freebayes extra-params.""" + name = "freebayes_params" + def convert(self, value, param, ctx): + harpy_options = "-r --region -p --ploidy -f --fasta-reference -L --bam-list --populations".split() + valid_options = "-t --targets -s --samples -A --cnv-map -v --vcf --gvcf --gvcf-chunkUM -& --gvcf-dont-use-chunk -@ --variant-input -l --only-use-input-alleles --haplotype-basis-alleles --report-all-haplotype-alleles --report-monomorphic -P --pvar --strict-vcf -T --theta -J --pooled-discrete -K --pooled-continuous -Z --use-reference-allele --reference-quality -n --use-best-n-alleles -E --max-complex-gap --haplotype-length --min-repeat-size --min-repeat-entropy --no-partial-observations -O --dont-left-align-indels -4 --use-duplicate-reads -m --min-mapping-quality -q --min-base-quality -R --min-supporting-allele-qsum -Y --min-supporting-mapping-qsum -Q --mismatch-base-quality-threshold -U --read-mismatch-limit -z --read-max-mismatch-fraction -$ --read-snp-limit -e --read-indel-limit -0 --standard-filters -F --min-alternate-fraction -C --min-alternate-count -3 --min-alternate-qsum -G --min-alternate-total --min-coverage --limit-coverage -g --skip-coverage --trim-complex-tail -k --no-population-priors -w --hwe-priors-off -V --binomial-obs-priors-off -a --allele-balance-priors-off --observation-bias --base-quality-cap --prob-contamination --legacy-gls --contamination-estimates --report-genotype-likelihood-max -B --genotyping-max-iterations --genotyping-max-banddepth -W --posterior-integration-limits,M -N --exclude-unobserved-genotypes -S --genotype-variant-threshold -j --use-mapping-quality -H --harmonic-indel-quality -D --read-dependence-factor -= --genotype-qualities -d --debug".split() + opts = 0 + for i in value.split(): + if i.startswith("-"): + opts += 1 + if i in harpy_options: + self.fail(f"{i} is already used by Harpy when calling freebayes.", param, ctx) + if i not in valid_options: + self.fail(f"{i} is not a valid freebayes option. See the freebayes documentation for a list of available options: https://github.com/freebayes/freebayes.", param, ctx) + if opts < 1: + self.fail("No valid options recognized. Available freebayes options begin with one or two dashes (e.g. -t or --targets). See the freebayes documentation for a list of available options: https://github.com/freebayes/freebayes.", param, ctx) + return value + + + + + diff --git a/harpy/_validations.py b/harpy/_validations.py index 5c85af42d..728304954 100644 --- a/harpy/_validations.py +++ b/harpy/_validations.py @@ -58,19 +58,6 @@ def check_envdir(dirpath): rprint(errtable, file = sys.stderr) sys.exit(1) -def validate_input_by_ext(inputfile, option, ext): - """Check that the input file for a given option has read permissions and matches the acceptable extensions """ - if isinstance(ext, list): - test = [not(inputfile.lower().endswith(i.lower())) for i in ext] - if all(test): - ext_text = " | ".join(ext) - print_error("invalid file extension", f"The input file for [bold]{option}[/bold] must end in one of:\n[green bold]{ext_text}[/green bold]") - sys.exit(1) - else: - if not inputfile.lower().endswith(ext.lower()): - print_error("invalid file extension", f"The input file for [bold]{option}[/bold] must end in [green bold]{ext}[/green bold]") - sys.exit(1) - def check_impute_params(parameters): """Validate the STITCH parameter file for column names, order, types, missing values, etc.""" with open(parameters, "r", encoding="utf-8") as paramfile: @@ -425,16 +412,8 @@ def validate_regions(regioninput, genome): sys.exit(1) return "file" -def check_fasta(genofile, quiet): +def check_fasta(genofile): """perform validations on fasta file for extensions and file contents""" - ext_options = [".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".mpfa", ".frn"] - ext_correct = 0 - for i in ext_options: - if genofile.lower().endswith(i) or genofile.lower().endswith(i + ".gz"): - ext_correct += 1 - if ext_correct == 0 and not quiet: - print_notice(f"[blue]{genofile}[/blue] has an unfamiliar FASTA file extension. Common FASTA file extensions are: [green]" + ", ".join(ext_options) + "[/green] and may also be gzipped.") - # validate fasta file contents if is_gzip(genofile): fasta = gzip.open(genofile, 'rt') diff --git a/harpy/align.py b/harpy/align.py index 8872db79a..bb0a6350c 100644 --- a/harpy/align.py +++ b/harpy/align.py @@ -3,14 +3,14 @@ import os import sys import yaml -from time import sleep from pathlib import Path from rich import box from rich.table import Table import rich_click as click from ._conda import create_conda_recipes from ._misc import fetch_report, fetch_rule, snakemake_log -from ._cli_types import ContigList +from ._cli_types_generic import ContigList, InputFile, HPCProfile, SnakemakeParams +from ._cli_types_params import BwaParams, EmaParams, StrobeAlignParams from ._launch import launch_snakemake from ._parsers import parse_fastq_inputs from ._printing import print_error, print_solution, print_notice @@ -45,7 +45,7 @@ def align(): "harpy align ema": [ { "name": "Parameters", - "options": ["--ema-bins", "--extra-params", "--genome", "--keep-unmapped", "--platform", "--min-quality", "--barcode-list"], + "options": ["--ema-bins", "--extra-params", "--fragment-density", "--genome", "--keep-unmapped", "--platform", "--min-quality", "--barcode-list"], }, { "name": "Workflow Controls", @@ -65,9 +65,9 @@ def align(): } @click.command(no_args_is_help = True, epilog= "Documentation: https://pdimens.github.io/harpy/workflows/align/bwa/") -@click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False, readable=True), required = True, help = 'Genome assembly for read mapping') +@click.option('-g', '--genome', type=InputFile("fasta", gzip_ok = True), required = True, help = 'Genome assembly for read mapping') @click.option('-w', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats') -@click.option('-x', '--extra-params', type = str, help = 'Additional bwa mem parameters, in quotes') +@click.option('-x', '--extra-params', type = BwaParams(), help = 'Additional bwa mem parameters, in quotes') @click.option('-u', '--keep-unmapped', is_flag = True, default = False, help = 'Retain unmapped sequences in the output') @click.option('-q', '--min-quality', default = 30, show_default = True, type = click.IntRange(min = 0, max = 40), help = 'Minimum mapping quality to pass filtering') @click.option('-d', '--molecule-distance', default = 100000, show_default = True, type = int, help = 'Distance cutoff to split molecules (bp)') @@ -76,10 +76,10 @@ def align(): @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--contigs', type = ContigList(), help = 'File or list of contigs to plot') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) def bwa(inputs, output_dir, genome, depth_window, threads, keep_unmapped, extra_params, min_quality, molecule_distance, snakemake, skip_reports, quiet, hpc, conda, contigs, setup_only): """ @@ -105,7 +105,7 @@ def bwa(inputs, output_dir, genome, depth_window, threads, keep_unmapped, extra_ os.makedirs(f"{workflowdir}/", exist_ok= True) fqlist, sample_count = parse_fastq_inputs(inputs) - check_fasta(genome, quiet) + check_fasta(genome) if contigs: fasta_contig_match(contigs, genome) fetch_rule(workflowdir, "align_bwa.smk") @@ -149,10 +149,11 @@ def bwa(inputs, output_dir, genome, depth_window, threads, keep_unmapped, extra_ launch_snakemake(command, "align_bwa", start_text, output_dir, sm_log, quiet, "workflow/align.bwa.summary") @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/align/ema") -@click.option('-x', '--extra-params', type = str, help = 'Additional ema align parameters, in quotes') +@click.option('-x', '--extra-params', type = EmaParams(), help = 'Additional ema align parameters, in quotes') +@click.option('-d', '--fragment-density', is_flag = True, show_default = True, default = False, help = 'Perform read fragment density optimization') @click.option('-w', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats') @click.option('-b', '--ema-bins', default = 500, show_default = True, type = click.IntRange(1,1000), help="Number of barcode bins") -@click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False, readable=True), required = True, help = 'Genome assembly for read mapping') +@click.option('-g', '--genome', type=InputFile("fasta", gzip_ok = True), required = True, help = 'Genome assembly for read mapping') @click.option('-u', '--keep-unmapped', is_flag = True, default = False, help = 'Retain unmapped sequences in the output') @click.option('-q', '--min-quality', default = 30, show_default = True, type = click.IntRange(min = 0, max = 40), help = 'Minimum mapping quality to pass filtering') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Align/ema", show_default=True, help = 'Output directory name') @@ -162,12 +163,12 @@ def bwa(inputs, output_dir, genome, depth_window, threads, keep_unmapped, extra_ @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') @click.option('--contigs', type = ContigList(), help = 'File or list of contigs to plot') @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) -def ema(inputs, output_dir, platform, barcode_list, genome, depth_window, keep_unmapped, threads, ema_bins, skip_reports, extra_params, min_quality, snakemake, quiet, hpc, conda, contigs, setup_only): +def ema(inputs, output_dir, platform, barcode_list, fragment_density, genome, depth_window, keep_unmapped, threads, ema_bins, skip_reports, extra_params, min_quality, snakemake, quiet, hpc, conda, contigs, setup_only): """ Align sequences to genome using `EMA` @@ -200,13 +201,12 @@ def ema(inputs, output_dir, platform, barcode_list, genome, depth_window, keep_u else: print_solution("Running EMA requires TELLseq barcodes provided to [green]--barcode-list[/green]. They can be acquired from the TELL-read software [dim]https://www.illumina.com/products/by-type/informatics-products/basespace-sequence-hub/apps/universal-sequencing-tell-seq-data-analysis-pipeline.html[/dim]") sys.exit(1) - if platform == "haplotag" and barcode_list: + if platform == "haplotag" and barcode_list and not quiet: print_notice("Haplotag data does not require a barcode list and the file provided to [green]--barcode-list[/green] will be ignored.") - sleep(3) os.makedirs(f"{workflowdir}/", exist_ok= True) fqlist, sample_count = parse_fastq_inputs(inputs) - check_fasta(genome, quiet) + check_fasta(genome) if contigs: fasta_contig_match(contigs, genome) fetch_rule(workflowdir, "align_ema.smk") @@ -220,6 +220,7 @@ def ema(inputs, output_dir, platform, barcode_list, genome, depth_window, keep_u "output_directory" : output_dir, "alignment_quality" : min_quality, "keep_unmapped" : keep_unmapped, + "fragment_density_optimization": fragment_density, "depth_windowsize" : depth_window, "platform" : platform, "EMA_bins" : ema_bins, @@ -254,9 +255,9 @@ def ema(inputs, output_dir, platform, barcode_list, genome, depth_window, keep_u launch_snakemake(command, "align_ema", start_text, output_dir, sm_log, quiet, "workflow/align.ema.summary") @click.command(no_args_is_help = True, epilog= "Documentation: https://pdimens.github.io/harpy/workflows/align/strobe/") -@click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False, readable=True), required = True, help = 'Genome assembly for read mapping') +@click.option('-g', '--genome', type=InputFile("fasta", gzip_ok = True), required = True, help = 'Genome assembly for read mapping') @click.option('-w', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats') -@click.option('-x', '--extra-params', type = str, help = 'Additional aligner parameters, in quotes') +@click.option('-x', '--extra-params', type = StrobeAlignParams(), help = 'Additional aligner parameters, in quotes') @click.option('-u', '--keep-unmapped', is_flag = True, default = False, help = 'Retain unmapped sequences in the output') @click.option('-q', '--min-quality', default = 30, show_default = True, type = click.IntRange(min = 0, max = 40), help = 'Minimum mapping quality to pass filtering') @click.option('-d', '--molecule-distance', default = 100000, show_default = True, type = int, help = 'Distance cutoff to split molecules (bp)') @@ -266,10 +267,10 @@ def ema(inputs, output_dir, platform, barcode_list, genome, depth_window, keep_u @click.option('--contigs', type = ContigList(), help = 'File or list of contigs to plot') @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) def strobe(inputs, output_dir, genome, read_length, keep_unmapped, depth_window, threads, extra_params, min_quality, molecule_distance, snakemake, skip_reports, quiet, hpc, conda, contigs, setup_only): """ @@ -298,7 +299,7 @@ def strobe(inputs, output_dir, genome, read_length, keep_unmapped, depth_window, os.makedirs(f"{workflowdir}/", exist_ok= True) fqlist, sample_count = parse_fastq_inputs(inputs) - check_fasta(genome, quiet) + check_fasta(genome) if contigs: fasta_contig_match(contigs, genome) fetch_rule(workflowdir, "align_strobealign.smk") diff --git a/harpy/assembly.py b/harpy/assembly.py index f2949eb66..3a678221c 100644 --- a/harpy/assembly.py +++ b/harpy/assembly.py @@ -6,10 +6,11 @@ from rich import box from rich.table import Table import rich_click as click +from ._cli_types_generic import KParam, HPCProfile, SnakemakeParams +from ._cli_types_params import SpadesParams, ArcsParams from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, snakemake_log -from ._cli_types import KParam from ._validations import validate_fastq_bx docstring = { @@ -34,9 +35,9 @@ @click.option('-b', '--bx-tag', type = click.Choice(['BX', 'BC'], case_sensitive=False), default = "BX", show_default=True, help = "The header tag with the barcode [`BX`,`BC`]") @click.option('-k', '--kmer-length', type = KParam(), show_default = True, default = "auto", help = 'K values to use for assembly (`odd` and `<128`)') @click.option('-r', '--max-memory', type = click.IntRange(min = 1000, max_open = True), show_default = True, default = 10000, help = 'Maximum memory for spades to use, in megabytes') -@click.option('-x', '--extra-params', type = str, help = 'Additional spades parameters, in quotes') +@click.option('-x', '--extra-params', type = SpadesParams(), help = 'Additional spades parameters, in quotes') # TIGMINT/ARCS/LINKS -@click.option('-y', '--arcs-extra', type = str, help = 'Additional ARCS parameters, in quotes') +@click.option('-y', '--arcs-extra', type = ArcsParams(), help = 'Additional ARCS parameters, in quotes (`option=arg` format)') @click.option("-c","--contig-length", type = int, default = 500, show_default = True, help = "Minimum contig length") @click.option("-n", "--links", type = int, default = 5, show_default = True, help = "Minimum number of links to compute scaffold") @click.option("-a", "--min-aligned", type = int, default = 5, show_default = True, help = "Minimum aligned read pairs per barcode") @@ -51,11 +52,11 @@ @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max_open = True), help = 'Number of threads to use') @click.option('-u', '--organism-type', type = click.Choice(['prokaryote', 'eukaryote', 'fungus'], case_sensitive=False), default = "eukaryote", show_default=True, help = "Organism type for assembly report [`eukaryote`,`prokaryote`,`fungus`]") @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('fastq_r1', required=True, type=click.Path(exists=True, readable=True), nargs=1) @click.argument('fastq_r2', required=True, type=click.Path(exists=True, readable=True), nargs=1) def assembly(fastq_r1, fastq_r2, bx_tag, kmer_length, max_memory, output_dir, extra_params,arcs_extra,contig_length,links,min_quality,min_aligned,mismatch,molecule_distance,molecule_length,seq_identity,span, organism_type, conda, threads, snakemake, quiet, hpc, setup_only, skip_reports): diff --git a/harpy/deconvolve.py b/harpy/deconvolve.py index d86588d7c..b2fe3be49 100644 --- a/harpy/deconvolve.py +++ b/harpy/deconvolve.py @@ -6,6 +6,7 @@ from rich import box from rich.table import Table import rich_click as click +from ._cli_types_generic import HPCProfile, SnakemakeParams from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, snakemake_log @@ -33,9 +34,9 @@ @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Deconvolve", show_default=True, help = 'Output directory name') @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--setup-only', is_flag = True, hidden = True, show_default = True, default = False, help = 'Setup the workflow and exit') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) def deconvolve(inputs, output_dir, kmer_length, window_size, density, dropout, threads, snakemake, quiet, hpc, conda, setup_only): """ diff --git a/harpy/demultiplex.py b/harpy/demultiplex.py index 8adf78842..7a0b82402 100644 --- a/harpy/demultiplex.py +++ b/harpy/demultiplex.py @@ -7,6 +7,7 @@ from rich import box from rich.table import Table import rich_click as click +from ._cli_types_generic import HPCProfile, SnakemakeParams from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, snakemake_log @@ -43,10 +44,10 @@ def demultiplex(): @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Demultiplex", show_default=True, help = 'Output directory name') @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('R1_FQ', required=True, type=click.Path(exists=True, dir_okay=False, readable=True)) @click.argument('R2_FQ', required=True, type=click.Path(exists=True, dir_okay=False, readable=True)) @click.argument('I1_FQ', required=True, type=click.Path(exists=True, dir_okay=False, readable=True)) diff --git a/harpy/hpc.py b/harpy/hpc.py index c8deb1965..05cc83a36 100644 --- a/harpy/hpc.py +++ b/harpy/hpc.py @@ -1,7 +1,10 @@ """Harpy module to create HPC profiles for running snakemake on a cluster""" import os +import subprocess +import sys import rich_click as click +from rich import print as rprint from rich.markdown import Markdown from ._printing import print_notice @@ -13,8 +16,9 @@ def hpc(): If running Harpy on an HPC system, you can leverage Snakemake to handle all the job submissions on your behalf. This command creates templates for common HPC schedulers so you can run Harpy on a cluster with minimal friction. - The subcommands create a `config.yaml` in an `hpc/` directory. You will also need - to install the associated Snakemake executor plugin for HPC job submission to work. + The subcommands create a `config.yaml` in an `hpc/system` directory (where `system` + is one of the options below). You will also need to install the associated Snakemake + executor plugin for HPC job submission to work. """ docstring = { @@ -26,20 +30,31 @@ def hpc(): ] } +def package_exists(pkg): + """helper function to search for a package in the active conda environment""" + search_pkg = subprocess.run(f"conda list snakemake-executor-plugin-{pkg}".split(), capture_output = True, text = True) + exists = False + for i in search_pkg.stdout.split("\n"): + if i and not i.startswith("#"): + exists = True + if not exists: + print_notice(Markdown(f""" +Using this scheduler requires installing an additional Snakemake plugin, which wasn't detected in this environment. +Install the `{pkg}` plugin with: + +```bash +conda install bioconda::snakemake-executor-plugin-{pkg} +``` + """)) + @click.command() def generic(): """Configuration for a generic scheduler""" outfile = "hpc/generic/config.yaml" os.makedirs("hpc/generic", exist_ok=True) if os.path.exists(outfile): - click.echo(f"{outfile} exists, overwriting.") - print_notice(Markdown(""" -Using a scheduler requires installing an additional Snakemake plugin. If you haven't already, install the `generic-cluster` plugin with: - -```bash -mamba install bioconda::snakemake-executor-plugin-cluster-generic -``` - """)) + rprint(f"HPC profile [blue]{outfile}[/blue] already exists, overwriting\n", file = sys.stderr) + package_exists("cluster-generic") with open(outfile, "w", encoding = "utf-8") as yml: yml.write("__use_yte__: true\n") yml.write("executor: cluster-generic\n") @@ -74,14 +89,8 @@ def lsf(): outfile = "hpc/lsf/config.yaml" os.makedirs("hpc/lsf", exist_ok=True) if os.path.exists(outfile): - click.echo(f"{outfile} exists, overwriting.") - print_notice(Markdown(""" -Using a scheduler requires installing an additional Snakemake plugin. If you haven't already, install the `lsf` plugin with: - -```bash -mamba install bioconda::snakemake-executor-plugin-lsf -``` - """)) + rprint(f"HPC profile [blue]{outfile}[/blue] already exists, overwriting\n", file = sys.stderr) + package_exists("lsf") with open(outfile, "w", encoding = "utf-8") as yml: yml.write("__use_yte__: true\n") yml.write("executor: lsf\n") @@ -110,14 +119,8 @@ def htcondor(): os.makedirs("hpc/htcondor", exist_ok=True) outfile = "hpc/htcondor/config.yaml" if os.path.exists(outfile): - click.echo(f"{outfile} exists, overwriting.") - print_notice(Markdown(""" -Using a scheduler requires installing an additional Snakemake plugin. If you haven't already, install the `htcondor` plugin with: - -```bash -mamba install bioconda::snakemake-executor-plugin-htcondor -``` - """)) + rprint(f"HPC profile [blue]{outfile}[/blue] already exists, overwriting\n", file = sys.stderr) + package_exists("htcondor") with open(outfile, "w", encoding = "utf-8") as yml: yml.write("__use_yte__: true\n") yml.write("executor: htcondor\n") @@ -145,14 +148,8 @@ def slurm(): os.makedirs("hpc/slurm", exist_ok=True) outfile = "hpc/slurm/config.yaml" if os.path.exists(outfile): - click.echo(f"{outfile} exists, overwriting.") - print_notice(Markdown(""" -Using a scheduler requires installing an additional Snakemake plugin. If you haven't already, install the `slurm` plugin with: - -```bash -mamba install bioconda::snakemake-executor-plugin-slurm -``` - """)) + rprint(f"HPC profile [blue]{outfile}[/blue] already exists, overwriting\n", file = sys.stderr) + package_exists("slurm") with open(outfile, "w", encoding = "utf-8") as yml: yml.write("__use_yte__: true\n") yml.write("executor: slurm\n") @@ -180,14 +177,8 @@ def googlebatch(): os.makedirs("hpc/googlebatch", exist_ok=True) outfile = "hpc/googlebatch/config.yaml" if os.path.exists(outfile): - click.echo(f"{outfile} exists, overwriting.") - print_notice(Markdown(""" -Using a scheduler requires installing an additional Snakemake plugin. If you haven't already, install the `googlebatch` plugin with: - -```bash -mamba install bioconda::snakemake-executor-plugin-googlebatch -``` - """)) + rprint(f"HPC profile [blue]{outfile}[/blue] already exists, overwriting\n", file = sys.stderr) + package_exists("googlebatch") with open(outfile, "w", encoding = "utf-8") as yml: yml.write("__use_yte__: true\n") yml.write("executor: googlebatch\n") diff --git a/harpy/impute.py b/harpy/impute.py index f70e72bc1..751d1f2f8 100644 --- a/harpy/impute.py +++ b/harpy/impute.py @@ -7,11 +7,13 @@ from rich import box from rich.table import Table import rich_click as click +from ._cli_types_generic import HPCProfile, InputFile, SnakemakeParams +from ._cli_types_params import StitchParams from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, fetch_report, fetch_script, snakemake_log from ._parsers import parse_alignment_inputs, biallelic_contigs -from ._validations import validate_input_by_ext, vcf_sample_match, check_impute_params, validate_bam_RG +from ._validations import vcf_sample_match, check_impute_params, validate_bam_RG docstring = { "harpy impute": [ @@ -27,17 +29,17 @@ } @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/impute/") -@click.option('-x', '--extra-params', type = str, help = 'Additional STITCH parameters, in quotes') +@click.option('-x', '--extra-params', type = StitchParams(), help = 'Additional STITCH parameters, in quotes') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Impute", show_default=True, help = 'Output directory name') -@click.option('-p', '--parameters', required = True, type=click.Path(exists=True, dir_okay=False), help = 'STITCH parameter file (tab-delimited)') +@click.option('-p', '--parameters', required = True, type=click.Path(exists=True, dir_okay=False, readable=True), help = 'STITCH parameter file (tab-delimited)') @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max_open = True), help = 'Number of threads to use') -@click.option('-v', '--vcf', required = True, type=click.Path(exists=True, dir_okay=False, readable=True), help = 'Path to BCF/VCF file') +@click.option('-v', '--vcf', required = True, type = InputFile("vcf", gzip_ok = False), help = 'Path to BCF/VCF file') @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.option('--vcf-samples', is_flag = True, show_default = True, default = False, help = 'Use samples present in vcf file for imputation rather than those found the inputs') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) def impute(inputs, output_dir, parameters, threads, vcf, vcf_samples, extra_params, snakemake, skip_reports, quiet, hpc, conda, setup_only): @@ -68,7 +70,6 @@ def impute(inputs, output_dir, parameters, threads, vcf, vcf_samples, extra_para command += snakemake os.makedirs(f"{workflowdir}/", exist_ok= True) - validate_input_by_ext(vcf, "--vcf", ["vcf", "bcf", "vcf.gz"]) params = check_impute_params(parameters) bamlist, n = parse_alignment_inputs(inputs) validate_bam_RG(bamlist, threads, quiet) diff --git a/harpy/metassembly.py b/harpy/metassembly.py index a32454c2c..1fb8ddb81 100644 --- a/harpy/metassembly.py +++ b/harpy/metassembly.py @@ -9,7 +9,8 @@ from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, snakemake_log -from ._cli_types import KParam +from ._cli_types_generic import HPCProfile, KParam, SnakemakeParams +from ._cli_types_params import SpadesParams from ._validations import validate_fastq_bx docstring = { @@ -31,17 +32,17 @@ @click.option('-k', '--kmer-length', type = KParam(), show_default = True, default = "auto", help = 'K values to use for assembly (`odd` and `<128`)') @click.option('-r', '--max-memory', type = click.IntRange(min = 1000, max_open = True), show_default = True, default = 10000, help = 'Maximum memory for spades to use, in megabytes') @click.option('--ignore-bx', is_flag = True, show_default = True, default = False, help = 'Ignore linked-read info for initial spades assembly') -@click.option('-x', '--extra-params', type = str, help = 'Additional spades parameters, in quotes') +@click.option('-x', '--extra-params', type = SpadesParams(), help = 'Additional spades parameters, in quotes') # Common Workflow @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Metassembly", show_default=True, help = 'Output directory name') @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max_open = True), help = 'Number of threads to use') @click.option('-u', '--organism-type', type = click.Choice(['prokaryote', 'eukaryote', 'fungus'], case_sensitive=False), default = "eukaryote", show_default=True, help = "Organism type for assembly report [`eukaryote`,`prokaryote`,`fungus`]") @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('fastq_r1', required=True, type=click.Path(exists=True, readable=True), nargs=1) @click.argument('fastq_r2', required=True, type=click.Path(exists=True, readable=True), nargs=1) def metassembly(fastq_r1, fastq_r2, bx_tag, kmer_length, max_memory, ignore_bx, output_dir, extra_params, conda, threads, snakemake, quiet, hpc, organism_type, setup_only, skip_reports): diff --git a/harpy/phase.py b/harpy/phase.py index c5e650bd4..240f93e7f 100644 --- a/harpy/phase.py +++ b/harpy/phase.py @@ -9,9 +9,10 @@ from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, fetch_report, snakemake_log -from ._cli_types import ContigList +from ._cli_types_generic import ContigList, HPCProfile, InputFile, SnakemakeParams +from ._cli_types_params import HapCutParams from ._parsers import parse_alignment_inputs -from ._validations import check_fasta, vcf_sample_match, validate_bam_RG, validate_input_by_ext, vcf_contig_match +from ._validations import check_fasta, vcf_sample_match, validate_bam_RG, vcf_contig_match docstring = { "harpy phase": [ @@ -27,21 +28,21 @@ } @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/phase") -@click.option('-x', '--extra-params', type = str, help = 'Additional HapCut2 parameters, in quotes') -@click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False, readable=True), help = 'Path to genome assembly if wanting to also extract reads spanning indels') +@click.option('-x', '--extra-params', type = HapCutParams(), help = 'Additional HapCut2 parameters, in quotes') +@click.option('-g', '--genome', type=InputFile("fasta", gzip_ok = True), help = 'Path to genome assembly if wanting to also extract reads spanning indels') @click.option('-b', '--ignore-bx', is_flag = True, show_default = True, default = False, help = 'Ignore barcodes when phasing') @click.option('-d', '--molecule-distance', default = 100000, show_default = True, type = int, help = 'Distance cutoff to split molecules (bp)') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Phase", show_default=True, help = 'Output directory name') @click.option('-p', '--prune-threshold', default = 7, show_default = True, type = click.IntRange(0,100), help = 'PHRED-scale threshold (%) for pruning low-confidence SNPs (larger prunes more.)') @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 2, max_open = True), help = 'Number of threads to use') -@click.option('-v', '--vcf', required = True, type=click.Path(exists=True, dir_okay=False, readable=True), help = 'Path to BCF/VCF file') +@click.option('-v', '--vcf', required = True, type = InputFile("vcf", gzip_ok = False), help = 'Path to BCF/VCF file') @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--contigs', type = ContigList(), help = 'File or list of contigs to plot') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.option('--vcf-samples', is_flag = True, show_default = True, default = False, help = 'Use samples present in vcf file for phasing rather than those found the inputs') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) def phase(inputs, output_dir, vcf, threads, molecule_distance, prune_threshold, vcf_samples, genome, snakemake, extra_params, ignore_bx, skip_reports, quiet, hpc, conda, contigs, setup_only): @@ -70,10 +71,9 @@ def phase(inputs, output_dir, vcf, threads, molecule_distance, prune_threshold, os.makedirs(f"{workflowdir}/input", exist_ok= True) bamlist, n = parse_alignment_inputs(inputs) samplenames = vcf_sample_match(vcf, bamlist, vcf_samples) - validate_input_by_ext(vcf, "--vcf", ["vcf", "bcf", "vcf.gz"]) validate_bam_RG(bamlist, threads, quiet) if genome: - check_fasta(genome, quiet) + check_fasta(genome) if contigs: vcf_contig_match(contigs, vcf) fetch_rule(workflowdir, "phase.smk") diff --git a/harpy/popgroups.py b/harpy/popgroups.py index efa596270..da6a6f532 100644 --- a/harpy/popgroups.py +++ b/harpy/popgroups.py @@ -4,6 +4,7 @@ import re import sys import glob +from rich import print as rprint import rich_click as click from ._printing import print_error, print_solution, print_notice @@ -37,20 +38,20 @@ def popgroup(inputdir, output): fqlist = [os.path.basename(i) for i in full_fqlist] bn_r = r"[\.\_][RF](?:[12])?(?:\_00[1-9])*\.f(?:ast)?q(?:\.gz)?$" if len(fqlist) == 0: - print_error("no files found", f"No FASTQ or BAM files were detected in [bold]{inputdir}[/bold]") + print_error("no files found", f"No [bold]FASTQ[/bold] or [bold]BAM[/bold] files were detected in [blue]{inputdir}[/blue]") print_solution( - "Check that FASTQ file endings conform to [green].[/green][[green]F[/green][dim]|[/dim][green]R1[/green]][green].[/green][[green]fastq[/green][dim]|[/dim][green]fq[/green]][green].gz[/green]" + - "\nCheck that BAM files end in [green].bam[/green]"+ + "Check that [bold]FASTQ[/bold] file endings conform to [green].[/green][[green]F[/green][dim]|[/dim][green]R1[/green]][green].[/green][[green]fastq[/green][dim]|[/dim][green]fq[/green]][green].gz[/green]" + + "\nCheck that [bold]BAM[/bold] files end in [green].bam[/green]"+ "\nRead the documentation for details: https://pdimens.github.io/harpy/haplotagdata/#naming-conventions" ) sys.exit(1) samplenames = set([re.sub(bn_r, "", i, flags = re.IGNORECASE) for i in fqlist]) - click.echo(f"{len(samplenames)} samples detected in {inputdir}", file = sys.stderr) + rprint(f"[bold]{len(samplenames)} samples [/bold] detected in [blue]{inputdir}[blue]", file = sys.stderr) if os.path.exists(output): - write_text = f"The file [green]{output}[/green] was overwritten." + write_text = f"The file [blue]{output}[/blue] was overwritten." else: - write_text = f"Created sample population grouping file [green]{output}[/green]." + write_text = f"Created sample population grouping file [blue]{output}[/blue]." with open(output, "w", encoding="utf-8") as file: for i in samplenames: _ = file.write(i + '\tpop1\n') diff --git a/harpy/preflight.py b/harpy/preflight.py index ee30f0a43..5880cad23 100755 --- a/harpy/preflight.py +++ b/harpy/preflight.py @@ -6,6 +6,7 @@ from rich import box from rich.table import Table import rich_click as click +from ._cli_types_generic import HPCProfile, SnakemakeParams from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, fetch_report, snakemake_log @@ -41,9 +42,9 @@ def preflight(): @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max_open = True), help = 'Number of threads to use') @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True), nargs=-1) def fastq(inputs, output_dir, threads, snakemake, quiet, hpc, conda, setup_only): """ @@ -101,8 +102,8 @@ def fastq(inputs, output_dir, threads, snakemake, quiet, hpc, conda, setup_only) @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max_open = True), help = 'Number of threads to use') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Preflight/bam", show_default=True, help = 'Output directory name') -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) diff --git a/harpy/qc.py b/harpy/qc.py index 7eb95da4a..52ea8cd26 100644 --- a/harpy/qc.py +++ b/harpy/qc.py @@ -9,7 +9,8 @@ from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_report, fetch_rule, snakemake_log -from ._cli_types import IntList +from ._cli_types_generic import HPCProfile, IntList, SnakemakeParams +from ._cli_types_params import FastpParams from ._parsers import parse_fastq_inputs docstring = { @@ -28,7 +29,7 @@ @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/qc") @click.option('-c', '--deconvolve', type = IntList(4), default = "0,0,0,0", help = 'Accepts the QuickDeconvolution parameters for `k`,`w`,`d`,`a` (in that order)') @click.option('-d', '--deduplicate', is_flag = True, default = False, help = 'Identify and remove PCR duplicates') -@click.option('-x', '--extra-params', type = str, help = 'Additional Fastp parameters, in quotes') +@click.option('-x', '--extra-params', type = FastpParams(), help = 'Additional Fastp parameters, in quotes') @click.option('-m', '--max-length', default = 150, show_default = True, type=int, help = 'Maximum length to trim sequences down to') @click.option('-n', '--min-length', default = 30, show_default = True, type=int, help = 'Discard reads shorter than this length') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "QC", show_default=True, help = 'Output directory name') @@ -36,10 +37,10 @@ @click.option('-a', '--trim-adapters', is_flag = True, default = False, help = 'Detect and trim adapters') @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--setup-only', is_flag = True, hidden = True, show_default = True, default = False, help = 'Setup the workflow and exit') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, default = False, help = 'Don\'t show output text while running') @click.option('--skip-reports', is_flag = True, default = False, help = 'Don\'t generate HTML reports') -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) def qc(inputs, output_dir, min_length, max_length, trim_adapters, deduplicate, deconvolve, extra_params, threads, snakemake, skip_reports, quiet, hpc, conda, setup_only): """ diff --git a/harpy/simulate.py b/harpy/simulate.py index b31124024..288a2d6f1 100644 --- a/harpy/simulate.py +++ b/harpy/simulate.py @@ -6,11 +6,12 @@ from rich import box from rich.table import Table import rich_click as click +from ._cli_types_generic import HPCProfile, InputFile, SnakemakeParams from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, fetch_script, snakemake_log from ._printing import print_error -from ._validations import validate_input_by_ext, check_fasta +from ._validations import check_fasta @click.group(options_metavar='', context_settings={"help_option_names" : ["-h", "--help"]}) def simulate(): @@ -123,7 +124,7 @@ def simulate(): } @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/simulate/simulate-linkedreads") -@click.option('-b', '--barcodes', type = click.Path(exists=True, dir_okay=False), help = "File of linked-read barcodes to add to reads") +@click.option('-b', '--barcodes', type = click.Path(exists=True, dir_okay=False, readable=True), help = "File of linked-read barcodes to add to reads") @click.option('-s', '--distance-sd', type = click.IntRange(min = 1), default = 15, show_default=True, help = "Standard deviation of read-pair distance") @click.option('-m', '--molecules-per', type = click.IntRange(min = 1), default = 10, show_default=True, help = "Average number of molecules per partition") @click.option('-l', '--molecule-length', type = click.IntRange(min = 2), default = 100, show_default=True, help = "Mean molecule length (kbp)") @@ -133,13 +134,13 @@ def simulate(): @click.option('-p', '--partitions', type = click.IntRange(min = 1), default=1500, show_default=True, help = "Number (in thousands) of partitions/beads to generate") @click.option('-n', '--read-pairs', type = click.FloatRange(min = 0.001), default = 600, show_default=True, help = "Number (in millions) of read pairs to simulate") @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max_open = True), help = 'Number of threads to use') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--setup-only', is_flag = True, hidden = True, show_default = True, default = False, help = 'Setup the workflow and exit') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') -@click.argument('genome_hap1', required=True, type=click.Path(exists=True, dir_okay=False, readable=True), nargs=1) -@click.argument('genome_hap2', required=True, type=click.Path(exists=True, dir_okay=False, readable=True), nargs=1) +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') +@click.argument('genome_hap1', required=True, type = InputFile("fasta", gzip_ok = True), nargs=1) +@click.argument('genome_hap2', required=True, type = InputFile("fasta", gzip_ok = True), nargs=1) def linkedreads(genome_hap1, genome_hap2, output_dir, outer_distance, mutation_rate, distance_sd, barcodes, read_pairs, molecule_length, partitions, molecules_per, threads, snakemake, quiet, hpc, conda, setup_only): """ Create linked reads from a genome @@ -163,8 +164,8 @@ def linkedreads(genome_hap1, genome_hap2, output_dir, outer_distance, mutation_r if snakemake: command += snakemake - check_fasta(genome_hap1, quiet) - check_fasta(genome_hap2, quiet) + check_fasta(genome_hap1) + check_fasta(genome_hap2) os.makedirs(f"{workflowdir}/", exist_ok= True) fetch_rule(workflowdir, "simulate_linkedreads.smk") @@ -207,17 +208,17 @@ def linkedreads(genome_hap1, genome_hap2, output_dir, outer_distance, mutation_r launch_snakemake(command, "simulate_linkedreads", start_text, output_dir, sm_log, quiet, "workflow/simulate.reads.summary") @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "This workflow can be quite technical, please read the docs for more information: https://pdimens.github.io/harpy/workflows/simulate/simulate-variants") -@click.option('-s', '--snp-vcf', type=click.Path(exists=True, dir_okay=False, readable=True), help = 'VCF file of known snps to simulate') -@click.option('-i', '--indel-vcf', type=click.Path(exists=True, dir_okay=False, readable=True), help = 'VCF file of known indels to simulate') +@click.option('-s', '--snp-vcf', type=InputFile("vcf", gzip_ok = False), help = 'VCF file of known snps to simulate') +@click.option('-i', '--indel-vcf', type=InputFile("vcf", gzip_ok = False), help = 'VCF file of known indels to simulate') @click.option('-n', '--snp-count', type = click.IntRange(min = 0), default=0, show_default=False, help = "Number of random snps to simluate") @click.option('-m', '--indel-count', type = click.IntRange(min = 0), default=0, show_default=False, help = "Number of random indels to simluate") @click.option('-r', '--titv-ratio', type = click.FloatRange(min=0), default= 0.5, show_choices=True, show_default=True, help = "Transition/Transversion ratio for snps") @click.option('-d', '--indel-ratio', type = click.FloatRange(min=0), default= 1, show_choices=True, show_default=True, help = "Insertion/Deletion ratio for indels") @click.option('-a', '--indel-size-alpha', type = click.FloatRange(min=0), hidden = True, default = 2.0, help = "Exponent Alpha for power-law-fitted size distribution") @click.option('-l', '--indel-size-constant', type = click.FloatRange(min=0), default = 0.5, hidden = True, help = "Exponent constant for power-law-fitted size distribution") -@click.option('-c', '--centromeres', type = click.Path(exists=True, dir_okay=False, readable=True), help = "GFF3 file of centromeres to avoid") +@click.option('-c', '--centromeres', type = InputFile("gff", gzip_ok = True), help = "GFF3 file of centromeres to avoid") @click.option('-y', '--snp-gene-constraints', type = click.Choice(["noncoding", "coding", "2d", "4d"]), help = "How to constrain randomly simulated SNPs {`noncoding`,`coding`,`2d`,`4d`}") -@click.option('-g', '--genes', type = click.Path(exists=True, readable=True), help = "GFF3 file of genes to use with `--snp-gene-constraints`") +@click.option('-g', '--genes', type = InputFile("gff", gzip_ok = True), help = "GFF3 file of genes to avoid when simulating") @click.option('-z', '--heterozygosity', type = click.FloatRange(0,1), default = 0, show_default=True, help = 'heterozygosity to simulate diploid variants') @click.option('--only-vcf', is_flag = True, default = False, help = 'If setting heterozygosity, only create the vcf rather than the fasta files') @click.option('-e', '--exclude-chr', type = click.Path(exists=True, dir_okay=False, readable=True), help = "Text file of chromosomes to avoid") @@ -225,11 +226,11 @@ def linkedreads(genome_hap1, genome_hap2, output_dir, outer_distance, mutation_r @click.option('-p', '--prefix', type = str, default= "sim", show_default=True, help = "Naming prefix for output files") @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--setup-only', is_flag = True, hidden = True, show_default = True, default = False, help = 'Setup the workflow and exit') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--randomseed', type = click.IntRange(min = 1), help = "Random seed for simulation") -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') -@click.argument('genome', required=True, type=click.Path(exists=True, dir_okay=False, readable=True), nargs=1) +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') +@click.argument('genome', required=True, type=InputFile("fasta", gzip_ok = True), nargs=1) def snpindel(genome, snp_vcf, indel_vcf, only_vcf, output_dir, prefix, snp_count, indel_count, titv_ratio, indel_ratio, indel_size_alpha, indel_size_constant, centromeres, genes, snp_gene_constraints, heterozygosity, exclude_chr, randomseed, snakemake, quiet, hpc, conda, setup_only): """ Introduce snps and/or indels into a genome @@ -271,23 +272,19 @@ def snpindel(genome, snp_vcf, indel_vcf, only_vcf, output_dir, prefix, snp_count # instantiate workflow directory # move necessary files to workflow dir os.makedirs(f"{workflowdir}/input/", exist_ok= True) - check_fasta(genome, quiet) + check_fasta(genome) start_text.add_row("Input Genome:", os.path.basename(genome)) if snp_vcf: - validate_input_by_ext(snp_vcf, "--snp-vcf", ["vcf","vcf.gz","bcf"]) start_text.add_row("SNP File:", os.path.basename(snp_vcf)) elif snp_count > 0: start_text.add_row("Random SNPs:", f"{snp_count}") if indel_vcf: - validate_input_by_ext(indel_vcf, "--indel-vcf", ["vcf","vcf.gz","bcf"]) start_text.add_row("Indel File:", os.path.basename(indel_vcf)) elif indel_count > 0: start_text.add_row("Random Indels:", f"{indel_count}") if centromeres: - validate_input_by_ext(centromeres, "--centromeres", [".gff",".gff3",".gff.gz", ".gff3.gz"]) start_text.add_row("Centromere GFF:", os.path.basename(centromeres)) if genes: - validate_input_by_ext(genes, "--genes", [".gff",".gff3",".gff.gz", ".gff3.gz"]) start_text.add_row("Genes GFF:", os.path.basename(genes)) if exclude_chr: start_text.add_row("Excluded Chromosomes:", os.path.basename(exclude_chr)) @@ -345,8 +342,8 @@ def snpindel(genome, snp_vcf, indel_vcf, only_vcf, output_dir, prefix, snp_count @click.option('-n', '--count', type = click.IntRange(min = 0), default=0, show_default=False, help = "Number of random inversions to simluate") @click.option('-m', '--min-size', type = click.IntRange(min = 1), default = 1000, show_default= True, help = "Minimum inversion size (bp)") @click.option('-x', '--max-size', type = click.IntRange(min = 1), default = 100000, show_default= True, help = "Maximum inversion size (bp)") -@click.option('-c', '--centromeres', type = click.Path(exists=True, dir_okay=False, readable=True), help = "GFF3 file of centromeres to avoid") -@click.option('-g', '--genes', type = click.Path(exists=True, dir_okay=False, readable=True), help = "GFF3 file of genes to avoid when simulating") +@click.option('-c', '--centromeres', type = InputFile("gff", gzip_ok = True), help = "GFF3 file of centromeres to avoid") +@click.option('-g', '--genes', type = InputFile("gff", gzip_ok = True), help = "GFF3 file of genes to avoid when simulating") @click.option('-z', '--heterozygosity', type = click.FloatRange(0,1), default = 0, show_default=True, help = 'heterozygosity to simulate diploid variants') @click.option('-e', '--exclude-chr', type = click.Path(exists=True, dir_okay=False, readable=True), help = "Text file of chromosomes to avoid") @click.option('-p', '--prefix', type = str, default= "sim", show_default=True, help = "Naming prefix for output files") @@ -354,11 +351,11 @@ def snpindel(genome, snp_vcf, indel_vcf, only_vcf, output_dir, prefix, snp_count @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Simulate/inversion", show_default=True, help = 'Output directory name') @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--setup-only', is_flag = True, hidden = True, show_default = True, default = False, help = 'Setup the workflow and exit') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--randomseed', type = click.IntRange(min = 1), help = "Random seed for simulation") -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') -@click.argument('genome', required=True, type=click.Path(exists=True, dir_okay=False, readable=True), nargs=1) +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') +@click.argument('genome', required=True, type=InputFile("fasta", gzip_ok = True), nargs=1) def inversion(genome, vcf, only_vcf, prefix, output_dir, count, min_size, max_size, centromeres, genes, heterozygosity, exclude_chr, randomseed, snakemake, quiet, hpc, conda, setup_only): """ Introduce inversions into a genome @@ -387,22 +384,19 @@ def inversion(genome, vcf, only_vcf, prefix, output_dir, count, min_size, max_si # instantiate workflow directory # move necessary files to workflow dir os.makedirs(f"{workflowdir}/input/", exist_ok= True) - check_fasta(genome, quiet) + check_fasta(genome) start_text = Table(show_header=False,pad_edge=False, show_edge=False, padding = (0,0), box=box.SIMPLE) start_text.add_column("detail", justify="left", style="light_steel_blue", no_wrap=True) start_text.add_column(header="value", justify="left") start_text.add_row("Input Genome:", os.path.basename(genome)) if vcf: - validate_input_by_ext(vcf, "--vcf", ["vcf","vcf.gz","bcf"]) start_text.add_row("Inversion File:", os.path.basename(vcf)) else: start_text.add_row("Random Inversions:", f"{count}") if centromeres: - validate_input_by_ext(centromeres, "--centromeres", [".gff",".gff3",".gff.gz", ".gff3.gz"]) start_text.add_row("Centromere GFF:", os.path.basename(centromeres)) if genes: - validate_input_by_ext(genes, "--genes", [".gff",".gff3",".gff.gz", ".gff3.gz"]) start_text.add_row("Genes GFF:", os.path.basename(genes)) if exclude_chr: start_text.add_row("Excluded Chromosomes:", os.path.basename(exclude_chr)) @@ -457,8 +451,8 @@ def inversion(genome, vcf, only_vcf, prefix, output_dir, count, min_size, max_si @click.option('-d', '--dup-ratio', type = click.FloatRange(min=0), default = 1, show_choices=True, show_default=True, help = "Ratio for the selected variant") @click.option('-l', '--gain-ratio', type = click.FloatRange(min=0), default=1, show_default=True, help = " Relative ratio of DNA gain over DNA loss") @click.option('-y', '--max-copy', type = click.IntRange(min = 1), default=10, show_default=True, help = "Maximum number of copies") -@click.option('-c', '--centromeres', type = click.Path(exists=True, dir_okay=False, readable=True), help = "GFF3 file of centromeres to avoid") -@click.option('-g', '--genes', type = click.Path(exists=True, dir_okay=False, readable=True), help = "GFF3 file of genes to avoid when simulating (requires `--snp-coding-partition` for SNPs)") +@click.option('-c', '--centromeres', type = InputFile("gff", gzip_ok = True), help = "GFF3 file of centromeres to avoid") +@click.option('-g', '--genes', type = InputFile("gff", gzip_ok = True), help = "GFF3 file of genes to avoid when simulating") @click.option('-z', '--heterozygosity', type = click.FloatRange(0,1), default = 0, show_default=True, help = 'heterozygosity to simulate diploid variants') @click.option('--only-vcf', is_flag = True, default = False, help = 'If setting heterozygosity, only create the vcf rather than the fasta files') @click.option('-e', '--exclude-chr', type = click.Path(exists=True, dir_okay=False, readable=True), help = "Text file of chromosomes to avoid") @@ -466,11 +460,11 @@ def inversion(genome, vcf, only_vcf, prefix, output_dir, count, min_size, max_si @click.option('-p', '--prefix', type = str, default= "sim", show_default=True, help = "Naming prefix for output files") @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--setup-only', is_flag = True, hidden = True, show_default = True, default = False, help = 'Setup the workflow and exit') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--randomseed', type = click.IntRange(min = 1), help = "Random seed for simulation") -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') -@click.argument('genome', required=True, type=click.Path(exists=True, dir_okay=False, readable=True), nargs=1) +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') +@click.argument('genome', required=True, type=InputFile("fasta", gzip_ok = True), nargs=1) def cnv(genome, output_dir, vcf, only_vcf, prefix, count, min_size, max_size, dup_ratio, max_copy, gain_ratio, centromeres, genes, heterozygosity, exclude_chr, randomseed, snakemake, quiet, hpc, conda, setup_only): """ Introduce copy number variants into a genome @@ -505,22 +499,19 @@ def cnv(genome, output_dir, vcf, only_vcf, prefix, count, min_size, max_size, du # instantiate workflow directory # move necessary files to workflow dir os.makedirs(f"{workflowdir}/input/", exist_ok= True) - check_fasta(genome, quiet) + check_fasta(genome) start_text = Table(show_header=False,pad_edge=False, show_edge=False, padding = (0,0), box=box.SIMPLE) start_text.add_column("detail", justify="left", style="light_steel_blue", no_wrap=True) start_text.add_column(header="value", justify="left") start_text.add_row("Input Genome:", os.path.basename(genome)) if vcf: - validate_input_by_ext(vcf, "--vcf", ["vcf","vcf.gz","bcf"]) start_text.add_row("CNV File:", os.path.basename(vcf)) else: start_text.add_row("Random CNVs:", f"{count}") if centromeres: - validate_input_by_ext(centromeres, "--centromeres", [".gff",".gff3",".gff.gz", ".gff3.gz"]) start_text.add_row("Centromere GFF:", os.path.basename(centromeres)) if genes: - validate_input_by_ext(genes, "--genes", [".gff",".gff3",".gff.gz", ".gff3.gz"]) start_text.add_row("Genes GFF:", os.path.basename(genes)) if exclude_chr: start_text.add_row("Excluded Chromosomes:", os.path.basename(exclude_chr)) @@ -572,8 +563,8 @@ def cnv(genome, output_dir, vcf, only_vcf, prefix, count, min_size, max_size, du @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Please Documentation: https://pdimens.github.io/harpy/workflows/simulate/simulate-variants") @click.option('-v', '--vcf', type=click.Path(exists=True, dir_okay=False, readable=True), help = 'VCF file of known translocations to simulate') @click.option('-n', '--count', type = click.IntRange(min = 0), default=0, show_default=False, help = "Number of random translocations to simluate") -@click.option('-c', '--centromeres', type = click.Path(exists=True, dir_okay=False, readable=True), help = "GFF3 file of centromeres to avoid") -@click.option('-g', '--genes', type = click.Path(exists=True, dir_okay=False, readable=True), help = "GFF3 file of genes to avoid when simulating") +@click.option('-c', '--centromeres', type = InputFile("gff", gzip_ok = True), help = "GFF3 file of centromeres to avoid") +@click.option('-g', '--genes', type = InputFile("gff", gzip_ok = True), help = "GFF3 file of genes to avoid when simulating") @click.option('-z', '--heterozygosity', type = click.FloatRange(0,1), default = 0, show_default=True, help = 'heterozygosity to simulate diploid variants') @click.option('--only-vcf', is_flag = True, default = False, help = 'If setting heterozygosity, only create the vcf rather than the fasta files') @click.option('-e', '--exclude-chr', type = click.Path(exists=True, dir_okay=False, readable=True), help = "Text file of chromosomes to avoid") @@ -581,11 +572,11 @@ def cnv(genome, output_dir, vcf, only_vcf, prefix, count, min_size, max_size, du @click.option('-p', '--prefix', type = str, default= "sim", show_default=True, help = "Naming prefix for output files") @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--setup-only', is_flag = True, hidden = True, show_default = True, default = False, help = 'Setup the workflow and exit') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--randomseed', type = click.IntRange(min = 1), help = "Random seed for simulation") -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') -@click.argument('genome', required=True, type=click.Path(exists=True, dir_okay=False, readable=True), nargs=1) +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') +@click.argument('genome', required=True, type=InputFile("fasta", gzip_ok = True), nargs=1) def translocation(genome, output_dir, prefix, vcf, only_vcf, count, centromeres, genes, heterozygosity, exclude_chr, randomseed, snakemake, quiet, hpc, conda, setup_only): """ Introduce translocations into a genome @@ -613,22 +604,19 @@ def translocation(genome, output_dir, prefix, vcf, only_vcf, count, centromeres, # instantiate workflow directory # move necessary files to workflow dir os.makedirs(f"{workflowdir}/input/", exist_ok= True) - check_fasta(genome, quiet) + check_fasta(genome) start_text = Table(show_header=False,pad_edge=False, show_edge=False, padding = (0,0), box=box.SIMPLE) start_text.add_column("detail", justify="left", style="light_steel_blue", no_wrap=True) start_text.add_column(header="value", justify="left") start_text.add_row("Input Genome:", os.path.basename(genome)) if vcf: - validate_input_by_ext(vcf, "--vcf", ["vcf","vcf.gz","bcf"]) start_text.add_row("Translocation File:", os.path.basename(vcf)) else: start_text.add_row("Random Translocations", f"{count} random translocations") if centromeres: - validate_input_by_ext(centromeres, "--centromeres", [".gff",".gff3",".gff.gz", ".gff3.gz"]) start_text.add_row("Centromere GFF:", os.path.basename(centromeres)) if genes: - validate_input_by_ext(genes, "--genes", [".gff",".gff3",".gff.gz", ".gff3.gz"]) start_text.add_row("Genes GFF:", os.path.basename(genes)) if exclude_chr: start_text.add_row("Excluded Chromosomes:", os.path.basename(exclude_chr)) diff --git a/harpy/snakefiles/align_ema.smk b/harpy/snakefiles/align_ema.smk index f693a83fc..01a26891d 100644 --- a/harpy/snakefiles/align_ema.smk +++ b/harpy/snakefiles/align_ema.smk @@ -19,6 +19,7 @@ binrange = ["%03d" % i for i in range(nbins)] fqlist = config["inputs"]["fastq"] genomefile = config["inputs"]["genome"] platform = config["platform"] +frag_opt = config["fragment_density_optimization"] barcode_list = config["inputs"].get("barcode_list", "") extra = config.get("extra", "") bn = os.path.basename(genomefile) @@ -149,6 +150,7 @@ rule align_ema: RG_tag = lambda wc: "\"@RG\\tID:" + wc.get("sample") + "\\tSM:" + wc.get("sample") + "\"", bxtype = f"-p {platform}", tmpdir = lambda wc: outdir + "/." + d[wc.sample], + frag_opt = "-d" if frag_opt else "", quality = config["alignment_quality"], unmapped = "" if keep_unmapped else "-F 4", extra = extra @@ -158,7 +160,7 @@ rule align_ema: f"{envdir}/align.yaml" shell: """ - ema align -t {threads} {params.extra} -d {params.bxtype} -r {input.genome} -R {params.RG_tag} -x {input.readbin} 2> {log.ema} | + ema align -t {threads} {params.extra} {params.frag_opt} {params.bxtype} -r {input.genome} -R {params.RG_tag} -x {input.readbin} 2> {log.ema} | samtools view -h {params.unmapped} -q {params.quality} | samtools sort -T {params.tmpdir} --reference {input.genome} -O bam --write-index -m {resources.mem_mb}M -o {output.aln}##idx##{output.idx} - 2> {log.sort} rm -rf {params.tmpdir} @@ -353,7 +355,8 @@ rule workflow_summary: bx_report = outdir + "/reports/barcodes.summary.html" if (not skip_reports or len(samplenames) == 1) else [] params: beadtech = "-p" if platform == "haplotag" else f"-w {barcode_list}", - unmapped = "" if keep_unmapped else "-F 4" + unmapped = "" if keep_unmapped else "-F 4", + frag_opt = "-d" if frag_opt else "" run: summary = ["The harpy align ema workflow ran using these parameters:"] summary.append(f"The provided genome: {genomefile}") @@ -364,7 +367,7 @@ rule workflow_summary: bins += f"\tseqtk mergepe forward.fq.gz reverse.fq.gz | ema preproc {params.beadtech} -n {nbins}" summary.append(bins) ema_align = "Barcoded bins were aligned with ema align using:\n" - ema_align += f'\tema align {extra} -d -p {platform} -R "@RG\\tID:SAMPLE\\tSM:SAMPLE" |\n' + ema_align += f'\tema align {extra} {params.frag_opt} -p {platform} -R "@RG\\tID:SAMPLE\\tSM:SAMPLE" |\n' ema_align += f"\tsamtools view -h {params.unmapped} -q {config["alignment_quality"]} - |\n" ema_align += "\tsamtools sort --reference genome" summary.append(ema_align) diff --git a/harpy/snakefiles/assembly.smk b/harpy/snakefiles/assembly.smk index f8246229c..eadb46d51 100644 --- a/harpy/snakefiles/assembly.smk +++ b/harpy/snakefiles/assembly.smk @@ -119,7 +119,8 @@ rule scaffolding: rule QUAST_assessment: input: - assembly = f"{outdir}/scaffolds.fasta", + contigs = f"{outdir}/spades/contigs.fasta", + scaffolds = f"{outdir}/scaffolds.fasta", fastq = f"{outdir}/scaffold/interleaved.fq.gz" output: f"{outdir}/quast/report.tsv" @@ -128,14 +129,14 @@ rule QUAST_assessment: params: output_dir = f"-o {outdir}/quast", organism = f"--{organism}" if organism != "prokaryote" else "", - quast_params = "--labels harpy_cloudspades --rna-finding", - skip_things = "--no-sv" + quast_params = "--labels spades_contigs,arcs_scaffolds --rna-finding", + skip_things = "--no-sv" threads: workflow.cores conda: f"{envdir}/assembly.yaml" shell: - "quast.py --threads {threads} --pe12 {input.fastq} {params} {input.assembly} 2> {log}" + "quast.py --threads {threads} --pe12 {input.fastq} {params} {input.contigs} {input.scaffolds} 2> {log}" rule BUSCO_analysis: input: @@ -155,9 +156,7 @@ rule BUSCO_analysis: conda: f"{envdir}/assembly.yaml" shell: - """ - ( busco -f -i {input} -c {threads} -m genome --out_path {params} > {log} 2>&1 ) || touch {output} - """ + "( busco -f -i {input} -c {threads} -m genome --out_path {params} > {log} 2>&1 ) || touch {output}" rule build_report: input: diff --git a/harpy/snakefiles/metassembly.smk b/harpy/snakefiles/metassembly.smk index c7201c3c4..92b2ff18d 100644 --- a/harpy/snakefiles/metassembly.smk +++ b/harpy/snakefiles/metassembly.smk @@ -249,7 +249,7 @@ rule QUAST_assessment: params: output_dir = f"-o {outdir}/quast", organism = f"--{organism}" if organism != "prokaryote" else "", - quast_params = "--labels contigs,scaffolds --glimmer --rna-finding" + quast_params = "--labels spades_contigs,athena_scaffolds --glimmer --rna-finding" threads: workflow.cores conda: diff --git a/harpy/snp.py b/harpy/snp.py index becb7498c..c7830b786 100644 --- a/harpy/snp.py +++ b/harpy/snp.py @@ -7,6 +7,8 @@ from rich import box from rich.table import Table import rich_click as click +from ._cli_types_generic import HPCProfile, InputFile, SnakemakeParams +from ._cli_types_params import MpileupParams, FreebayesParams from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, fetch_report, snakemake_log @@ -47,19 +49,19 @@ def snp(): } @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/snp") -@click.option('-x', '--extra-params', type = str, help = 'Additional variant caller parameters, in quotes') -@click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False, readable=True), required = True, help = 'Genome assembly for variant calling') +@click.option('-x', '--extra-params', type = MpileupParams(), help = 'Additional mpileup parameters, in quotes') +@click.option('-g', '--genome', type=InputFile("fasta", gzip_ok = True), required = True, help = 'Genome assembly for variant calling') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "SNP/mpileup", show_default=True, help = 'Output directory name') @click.option('-n', '--ploidy', default = 2, show_default = True, type=click.IntRange(min = 1, max = 2), help = 'Ploidy of samples') @click.option('-p', '--populations', type=click.Path(exists = True, dir_okay=False, readable=True), help = "Tab-delimited file of sample\population") @click.option('-r', '--regions', type=str, default=50000, show_default=True, help = "Regions where to call variants") @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max_open = True), help = 'Number of threads to use') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) def mpileup(inputs, output_dir, regions, genome, threads, populations, ploidy, extra_params, snakemake, skip_reports, quiet, hpc, conda, setup_only): """ @@ -93,7 +95,7 @@ def mpileup(inputs, output_dir, regions, genome, threads, populations, ploidy, e os.makedirs(f"{workflowdir}/", exist_ok= True) bamlist, n = parse_alignment_inputs(inputs) validate_bam_RG(bamlist, threads, quiet) - check_fasta(genome, quiet) + check_fasta(genome) # setup regions checks regtype = validate_regions(regions, genome) @@ -152,8 +154,8 @@ def mpileup(inputs, output_dir, regions, genome, threads, populations, ploidy, e launch_snakemake(command, "snp_mpileup", start_text, output_dir, sm_log, quiet, "workflow/snp.mpileup.summary") @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/snp") -@click.option('-x', '--extra-params', type = str, help = 'Additional variant caller parameters, in quotes') -@click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False, readable=True), required = True, help = 'Genome assembly for variant calling') +@click.option('-x', '--extra-params', type = FreebayesParams(), help = 'Additional freebayes parameters, in quotes') +@click.option('-g', '--genome', type=InputFile("fasta", gzip_ok = True), required = True, help = 'Genome assembly for variant calling') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "SNP/freebayes", show_default=True, help = 'Output directory name') @click.option('-n', '--ploidy', default = 2, show_default = True, type=click.IntRange(min=1, max_open=True), help = 'Ploidy of samples') @click.option('-p', '--populations', type=click.Path(exists = True, dir_okay=False, readable=True), help = "Tab-delimited file of sample\population") @@ -161,10 +163,10 @@ def mpileup(inputs, output_dir, regions, genome, threads, populations, ploidy, e @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max_open = True), help = 'Number of threads to use') @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) def freebayes(inputs, output_dir, genome, threads, populations, ploidy, regions, extra_params, snakemake, skip_reports, quiet, hpc, conda, setup_only): """ @@ -198,7 +200,7 @@ def freebayes(inputs, output_dir, genome, threads, populations, ploidy, regions, os.makedirs(f"{workflowdir}/", exist_ok= True) bamlist, n = parse_alignment_inputs(inputs) validate_bam_RG(bamlist, threads, quiet) - check_fasta(genome, quiet) + check_fasta(genome) # setup regions checks regtype = validate_regions(regions, genome) diff --git a/harpy/stitchparams.py b/harpy/stitchparams.py index 17f5cfec7..8d5d6a856 100644 --- a/harpy/stitchparams.py +++ b/harpy/stitchparams.py @@ -27,6 +27,6 @@ def stitchparams(output): _ = file.write('k1_ng30\tdiploid\tTRUE\t50000\t5\t1\t30\n') _ = file.write('high_ngen\tdiploid\tTRUE\t50000\t15\t1\t100') print_notice( - f"Created STITCH parameter file: {output}\n" + - "Modify the model parameters as needed, but DO NOT add/remove columns." + f"Created template STITCH parameter file: [blue]{output}[/blue]\n" + + "Modify the model parameters as needed, but [yellow bold]do not add/remove columns." ) \ No newline at end of file diff --git a/harpy/sv.py b/harpy/sv.py index 40c2b5eca..e8c25f809 100644 --- a/harpy/sv.py +++ b/harpy/sv.py @@ -7,10 +7,11 @@ from rich import box from rich.table import Table import rich_click as click +from ._cli_types_generic import ContigList, HPCProfile, InputFile, SnakemakeParams +from ._cli_types_params import LeviathanParams, NaibrParams from ._conda import create_conda_recipes from ._launch import launch_snakemake from ._misc import fetch_rule, fetch_report, snakemake_log -from ._cli_types import ContigList from ._parsers import parse_alignment_inputs from ._validations import check_fasta, check_phase_vcf from ._validations import validate_popfile, validate_popsamples, fasta_contig_match @@ -52,8 +53,8 @@ def sv(): } @click.command(no_args_is_help = True, epilog= "Documentation: https://pdimens.github.io/harpy/workflows/sv/leviathan/") -@click.option('-x', '--extra-params', type = str, help = 'Additional variant caller parameters, in quotes') -@click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False, readable=True), required = True, help = 'Genome assembly for variant calling') +@click.option('-x', '--extra-params', type = str, help = 'Additional leviathan parameters, in quotes') +@click.option('-g', '--genome', type=InputFile("fasta", gzip_ok = True), required = True, help = 'Genome assembly for variant calling') @click.option('-i', '--iterations', show_default = True, default=50, type = click.IntRange(min = 10, max_open = True), help = 'Number of iterations to perform through index (reduces memory)') @click.option('-s', '--min-sv', type = click.IntRange(min = 10, max_open = True), default = 1000, show_default=True, help = 'Minimum size of SV to detect') @click.option('-b', '--min-barcodes', show_default = True, default=2, type = click.IntRange(min = 1, max_open = True), help = 'Minimum number of barcode overlaps supporting candidate SV') @@ -63,10 +64,10 @@ def sv(): @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--contigs', type = ContigList(), help = 'File or list of contigs to plot') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) def leviathan(inputs, output_dir, genome, min_sv, min_barcodes, iterations, threads, populations, extra_params, snakemake, skip_reports, quiet, hpc, conda, contigs, setup_only): """ @@ -93,7 +94,7 @@ def leviathan(inputs, output_dir, genome, min_sv, min_barcodes, iterations, thre os.makedirs(f"{workflowdir}/", exist_ok= True) bamlist, n = parse_alignment_inputs(inputs) - check_fasta(genome, quiet) + check_fasta(genome) if contigs: fasta_contig_match(contigs, genome) if populations: @@ -141,8 +142,8 @@ def leviathan(inputs, output_dir, genome, min_sv, min_barcodes, iterations, thre launch_snakemake(command, "sv_leviathan", start_text, output_dir, sm_log, quiet, "workflow/sv.leviathan.summary") @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/sv/naibr/") -@click.option('-x', '--extra-params', type = str, help = 'Additional variant caller parameters, in quotes') -@click.option('-g', '--genome', required = True, type=click.Path(exists=True, dir_okay=False, readable=True), help = 'Genome assembly') +@click.option('-x', '--extra-params', type = str, help = 'Additional naibr parameters, in quotes') +@click.option('-g', '--genome', type=InputFile("fasta", gzip_ok = True), required = True, help = 'Genome assembly for calling variants') @click.option('-b', '--min-barcodes', show_default = True, default=2, type = click.IntRange(min = 1, max_open = True), help = 'Minimum number of barcode overlaps supporting candidate SV') @click.option('-q', '--min-quality', show_default = True, default=30, type = click.IntRange(min = 0, max = 40), help = 'Minimum mapping quality of reads to use') @click.option('-s', '--min-sv', type = click.IntRange(min = 10, max_open = True), default = 1000, show_default=True, help = 'Minimum size of SV to detect') @@ -154,10 +155,10 @@ def leviathan(inputs, output_dir, genome, min_sv, min_barcodes, iterations, thre @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @click.option('--contigs', type = ContigList(), help = 'File or list of contigs to plot') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') +@click.option('--snakemake', type = SnakemakeParams(), help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) def naibr(inputs, output_dir, genome, vcf, min_sv, min_barcodes, min_quality, threads, populations, molecule_distance, extra_params, snakemake, skip_reports, quiet, hpc, conda, contigs, setup_only): """ @@ -191,7 +192,7 @@ def naibr(inputs, output_dir, genome, vcf, min_sv, min_barcodes, min_quality, th os.makedirs(f"{workflowdir}/", exist_ok= True) bamlist, n = parse_alignment_inputs(inputs) - check_fasta(genome, quiet) + check_fasta(genome) if contigs: fasta_contig_match(contigs, genome) if populations: