From 08227bb22fe3ba87160074219027084217255e48 Mon Sep 17 00:00:00 2001 From: Retype GitHub Action Date: Tue, 26 Nov 2024 02:39:33 +0000 Subject: [PATCH] Refreshes Retype-generated documentation. Process triggered by pdimens. --- 404.html | 10 +- blog/filtering_snps/index.html | 12 +- blog/index.html | 10 +- blog/simulate_diploid/index.html | 12 +- blog/software_environments/index.html | 12 +- blog/sort_by_barcode/index.html | 12 +- blog/sv_pooling/index.html | 10 +- categories/guides/index.html | 10 +- categories/index.html | 10 +- commonoptions/index.html | 12 +- development/index.html | 12 +- haplotagdata/index.html | 10 +- index.html | 19 +-- install/index.html | 12 +- resources/js/config.js | 2 +- resources/js/search.json | 2 +- sitemap.xml.gz | Bin 635 -> 636 bytes snakemake/index.html | 12 +- software/index.html | 10 +- troubleshooting/index.html | 12 +- utilities/index.html | 12 +- workflows/align/bwa/index.html | 12 +- workflows/align/ema/index.html | 12 +- workflows/align/index.html | 10 +- workflows/align/strobe/index.html | 12 +- workflows/assembly/index.html | 12 +- workflows/deconvolve/index.html | 12 +- workflows/demultiplex/index.html | 12 +- workflows/impute/index.html | 20 +-- workflows/metassembly/index.html | 12 +- workflows/other/index.html | 149 ++++++++++++------ workflows/phase/index.html | 12 +- workflows/preflight/index.html | 12 +- workflows/qc/index.html | 12 +- workflows/simulate/index.html | 10 +- .../simulate/simulate-linkedreads/index.html | 12 +- .../simulate/simulate-variants/index.html | 12 +- workflows/snp/index.html | 12 +- workflows/sv/index.html | 10 +- workflows/sv/leviathan/index.html | 12 +- workflows/sv/naibr/index.html | 12 +- 41 files changed, 326 insertions(+), 266 deletions(-) diff --git a/404.html b/404.html index c01cb19f1..28b5772f9 100644 --- a/404.html +++ b/404.html @@ -4,7 +4,7 @@ - + @@ -29,11 +29,11 @@ - + - + - +
@@ -54,7 +54,7 @@ Harpy - +
diff --git a/blog/filtering_snps/index.html b/blog/filtering_snps/index.html index 14f701f0a..97f30245c 100644 --- a/blog/filtering_snps/index.html +++ b/blog/filtering_snps/index.html @@ -4,7 +4,7 @@ - + @@ -37,12 +37,12 @@ - + - + - - + +
@@ -61,7 +61,7 @@ Harpy - +
diff --git a/blog/index.html b/blog/index.html index 53af2015f..27197a36e 100644 --- a/blog/index.html +++ b/blog/index.html @@ -4,7 +4,7 @@ - + @@ -32,11 +32,11 @@ - + - + - +
@@ -55,7 +55,7 @@ Harpy - +
diff --git a/blog/simulate_diploid/index.html b/blog/simulate_diploid/index.html index 1795572d1..294c862b0 100644 --- a/blog/simulate_diploid/index.html +++ b/blog/simulate_diploid/index.html @@ -4,7 +4,7 @@ - + @@ -37,12 +37,12 @@ - + - + - - + + @@ -62,7 +62,7 @@ Harpy - + diff --git a/blog/software_environments/index.html b/blog/software_environments/index.html index 28f7323d0..5a41782a9 100644 --- a/blog/software_environments/index.html +++ b/blog/software_environments/index.html @@ -4,7 +4,7 @@ - + @@ -37,12 +37,12 @@ - + - + - - + +
@@ -61,7 +61,7 @@ Harpy - +
diff --git a/blog/sort_by_barcode/index.html b/blog/sort_by_barcode/index.html index b4dd03495..758b1c0fa 100644 --- a/blog/sort_by_barcode/index.html +++ b/blog/sort_by_barcode/index.html @@ -4,7 +4,7 @@ - + @@ -37,12 +37,12 @@ - + - + - - + +
@@ -61,7 +61,7 @@ Harpy - +
diff --git a/blog/sv_pooling/index.html b/blog/sv_pooling/index.html index b04561e3e..9729e11ee 100644 --- a/blog/sv_pooling/index.html +++ b/blog/sv_pooling/index.html @@ -4,7 +4,7 @@ - + @@ -37,11 +37,11 @@ - + - + - +
@@ -60,7 +60,7 @@ Harpy - +
diff --git a/categories/guides/index.html b/categories/guides/index.html index 46cefa589..bdb7ac34f 100644 --- a/categories/guides/index.html +++ b/categories/guides/index.html @@ -4,7 +4,7 @@ - + @@ -29,11 +29,11 @@ - + - + - +
@@ -54,7 +54,7 @@ Harpy - +
diff --git a/categories/index.html b/categories/index.html index d60bcdc5f..b468440c6 100644 --- a/categories/index.html +++ b/categories/index.html @@ -4,7 +4,7 @@ - + @@ -29,11 +29,11 @@ - + - + - +
@@ -54,7 +54,7 @@ Harpy - +
diff --git a/commonoptions/index.html b/commonoptions/index.html index 2340e9adb..a7dce5b70 100644 --- a/commonoptions/index.html +++ b/commonoptions/index.html @@ -4,7 +4,7 @@ - + @@ -32,12 +32,12 @@ - + - + - - + +
@@ -58,7 +58,7 @@ Harpy - +
diff --git a/development/index.html b/development/index.html index 97a6d6ba9..7b584619e 100644 --- a/development/index.html +++ b/development/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + +
@@ -60,7 +60,7 @@ Harpy - +
diff --git a/haplotagdata/index.html b/haplotagdata/index.html index c59cffcef..f5a57e5ca 100644 --- a/haplotagdata/index.html +++ b/haplotagdata/index.html @@ -4,7 +4,7 @@ - + @@ -34,11 +34,11 @@ - + - + - + @@ -61,7 +61,7 @@ Harpy - + diff --git a/index.html b/index.html index 0585c7535..b97035542 100644 --- a/index.html +++ b/index.html @@ -4,7 +4,7 @@ - + @@ -34,11 +34,11 @@ - + - + - + @@ -60,7 +60,7 @@ Harpy - + @@ -423,12 +423,13 @@

│ metassembly Create a metassembly from linked-reads │ ╰─────────────────────────────────────────────────────────────────────────╯ ╭─ Other Commands ────────────────────────────────────────────────────────╮ -│ resume Resume a workflow from an existing Harpy directory │ -│ hpc Profile templates for cluster job submissions │ -│ preflight File format checks for haplotag data │ │ deconvolve Resolve clashing barcodes from different molecules │ +│ hpc Profile templates for cluster job submissions │ +│ imputeparams Create a template imputation parameter file │ │ popgroup Create a template grouping file for samples │ -│ stitchparams Create a template STITCH parameter file │ +│ preflight File format checks for haplotag data │ +│ resume Resume a workflow from an existing Harpy directory │ +│ view View a workflow log, config, or snakefile │ ╰─────────────────────────────────────────────────────────────────────────╯ diff --git a/install/index.html b/install/index.html index 52598a2ed..16f8dda3d 100644 --- a/install/index.html +++ b/install/index.html @@ -4,7 +4,7 @@ - + @@ -32,12 +32,12 @@ - + - + - - + +
@@ -58,7 +58,7 @@ Harpy - +
diff --git a/resources/js/config.js b/resources/js/config.js index c6a4cf875..47569c878 100644 --- a/resources/js/config.js +++ b/resources/js/config.js @@ -1 +1 @@ -var __DOCS_CONFIG__ = {"id":"b158wm1w8iO0Lg0z9u5gNoo9NrtIiIXvUXu","key":"G2ehFMZgA52/4wZZLaIKY7rvBxEE406pNxcvW1+1FEI.Y9kr+jKSXsPFp8DIrKfvYbuNHd231MvYGDLopumxOVeplVOEus3jQSISex7TCcvSg0KpzdaD1sbPJQvqboUAnw.31","base":"/harpy/","host":"pdimens.github.io","version":"1.0.0","useRelativePaths":true,"documentName":"index.html","appendDocumentName":false,"trailingSlash":true,"preloadSearch":false,"cacheBustingToken":"3.6.0.785879740334","cacheBustingStrategy":"query","sidebarFilterPlaceholder":"Filter","toolbarFilterPlaceholder":"Filter","showSidebarFilter":true,"filterNotFoundMsg":"No member names found containing the query \"{query}\"","maxHistoryItems":15,"homeIcon":"","access":[{"value":"public","label":"Public"},{"value":"protected","label":"Protected"}],"toolbarLinks":[{"id":"fields","label":"Fields"},{"id":"properties","label":"Properties"},{"id":"methods","label":"Methods"},{"id":"events","label":"Events"}],"sidebar":[{"n":"/","l":"Home","s":""},{"n":"install","l":"Install","s":""},{"n":"workflows","l":"Workflows","c":false,"i":[{"n":"align","l":"Align","i":[{"n":"bwa","l":"BWA","s":""},{"n":"ema","l":"EMA","s":""},{"n":"strobe","l":"Strobe","s":""}],"s":""},{"n":"assembly","l":"Assembly","s":""},{"n":"deconvolve","l":"Deconvolve","s":""},{"n":"demultiplex","l":"Demultiplex","s":""},{"n":"impute","l":"Impute","s":""},{"n":"metassembly","l":"Metassembly","s":""},{"n":"other","l":"Other","s":""},{"n":"phase","l":"Phase","s":""},{"n":"preflight","l":"Preflight","s":""},{"n":"qc","l":"QC","s":""},{"n":"simulate","l":"Simulate","i":[{"n":"simulate-linkedreads","l":"Linked Reads","s":""},{"n":"simulate-variants","l":"Variants","s":""}],"s":""},{"n":"snp","l":"SNP","s":""},{"n":"sv","l":"SV","i":[{"n":"leviathan","l":"Leviathan","s":""},{"n":"naibr","l":"Naibr","s":""}],"s":""}],"s":""},{"n":"haplotagdata","l":"Haplotag Data","s":""},{"n":"commonoptions","l":"Common Options","s":""},{"n":"troubleshooting","l":"Troubleshooting","s":""},{"n":"snakemake","l":"Snakemake Things","s":""},{"n":"software","l":"Software","s":""},{"n":"utilities","l":"Utilities","s":""},{"n":"blog","l":"Blog","v":false,"i":[{"n":"sort_by_barcode","l":" Sort data by barcode","v":false,"s":""},{"n":"simulate_diploid","l":" Simulating variants","v":false,"s":""},{"n":"sv_pooling","l":" Pooling samples for SV calling","v":false,"s":""},{"n":"software_environments","l":" Choosing a software runtime method","v":false,"s":""},{"n":"filtering_snps","l":" Filtering Variants","v":false,"s":""}]}],"search":{"mode":1,"minChars":2,"maxResults":20,"placeholder":"Search","hotkeys":["k"],"noResultsFoundMsg":"Sorry, no results found.","recognizeLanguages":true,"languages":[0],"preload":false},"resources":{"History_Title_Label":"History","History_ClearLink_Label":"Clear","History_NoHistory_Label":"No history items","API_AccessFilter_Label":"Access","API_ParameterSection_Label":"PARAMETERS","API_SignatureSection_Label":"SIGNATURE","API_CopyHint_Label":"Copy","API_CopyNameHint_Label":"Copy name","API_CopyLinkHint_Label":"Copy link","API_CopiedAckHint_Label":"Copied!","API_MoreOverloads_Label":"more","API_MoreDropdownItems_Label":"More","API_OptionalParameter_Label":"optional","API_DefaultParameterValue_Label":"Default value","API_InheritedFilter_Label":"Inherited","Search_Input_Placeholder":"Search","Toc_Contents_Label":"Contents","Toc_RelatedClasses_Label":"Related Classes","History_JustNowTime_Label":"just now","History_AgoTime_Label":"ago","History_YearTime_Label":"y","History_MonthTime_Label":"mo","History_DayTime_Label":"d","History_HourTime_Label":"h","History_MinuteTime_Label":"m","History_SecondTime_Label":"s"}}; +var __DOCS_CONFIG__ = {"id":"uNgosmUw08y5158ATr/UaHj9X1UiJTVdPXq","key":"EX5YEr2AbnhgaBp4a5d1djzwyu+muFPBGiQxcOFJhDc.lZHNpJwrqO7/BADFA02KpWW63/cYjIHP5AyOUPcfKBMYGEQST6UpjrixPxFjSx//9pndNDdJtEMXIGCJbNt5Fw.25","base":"/harpy/","host":"pdimens.github.io","version":"1.0.0","useRelativePaths":true,"documentName":"index.html","appendDocumentName":false,"trailingSlash":true,"preloadSearch":false,"cacheBustingToken":"3.6.0.785903955873","cacheBustingStrategy":"query","sidebarFilterPlaceholder":"Filter","toolbarFilterPlaceholder":"Filter","showSidebarFilter":true,"filterNotFoundMsg":"No member names found containing the query \"{query}\"","maxHistoryItems":15,"homeIcon":"","access":[{"value":"public","label":"Public"},{"value":"protected","label":"Protected"}],"toolbarLinks":[{"id":"fields","label":"Fields"},{"id":"properties","label":"Properties"},{"id":"methods","label":"Methods"},{"id":"events","label":"Events"}],"sidebar":[{"n":"/","l":"Home","s":""},{"n":"install","l":"Install","s":""},{"n":"workflows","l":"Workflows","c":false,"i":[{"n":"align","l":"Align","i":[{"n":"bwa","l":"BWA","s":""},{"n":"ema","l":"EMA","s":""},{"n":"strobe","l":"Strobe","s":""}],"s":""},{"n":"assembly","l":"Assembly","s":""},{"n":"deconvolve","l":"Deconvolve","s":""},{"n":"demultiplex","l":"Demultiplex","s":""},{"n":"impute","l":"Impute","s":""},{"n":"metassembly","l":"Metassembly","s":""},{"n":"other","l":"Other","s":""},{"n":"phase","l":"Phase","s":""},{"n":"preflight","l":"Preflight","s":""},{"n":"qc","l":"QC","s":""},{"n":"simulate","l":"Simulate","i":[{"n":"simulate-linkedreads","l":"Linked Reads","s":""},{"n":"simulate-variants","l":"Variants","s":""}],"s":""},{"n":"snp","l":"SNP","s":""},{"n":"sv","l":"SV","i":[{"n":"leviathan","l":"Leviathan","s":""},{"n":"naibr","l":"Naibr","s":""}],"s":""}],"s":""},{"n":"haplotagdata","l":"Haplotag Data","s":""},{"n":"commonoptions","l":"Common Options","s":""},{"n":"troubleshooting","l":"Troubleshooting","s":""},{"n":"snakemake","l":"Snakemake Things","s":""},{"n":"software","l":"Software","s":""},{"n":"utilities","l":"Utilities","s":""},{"n":"blog","l":"Blog","v":false,"i":[{"n":"sort_by_barcode","l":" Sort data by barcode","v":false,"s":""},{"n":"simulate_diploid","l":" Simulating variants","v":false,"s":""},{"n":"sv_pooling","l":" Pooling samples for SV calling","v":false,"s":""},{"n":"software_environments","l":" Choosing a software runtime method","v":false,"s":""},{"n":"filtering_snps","l":" Filtering Variants","v":false,"s":""}]}],"search":{"mode":1,"minChars":2,"maxResults":20,"placeholder":"Search","hotkeys":["k"],"noResultsFoundMsg":"Sorry, no results found.","recognizeLanguages":true,"languages":[0],"preload":false},"resources":{"History_Title_Label":"History","History_ClearLink_Label":"Clear","History_NoHistory_Label":"No history items","API_AccessFilter_Label":"Access","API_ParameterSection_Label":"PARAMETERS","API_SignatureSection_Label":"SIGNATURE","API_CopyHint_Label":"Copy","API_CopyNameHint_Label":"Copy name","API_CopyLinkHint_Label":"Copy link","API_CopiedAckHint_Label":"Copied!","API_MoreOverloads_Label":"more","API_MoreDropdownItems_Label":"More","API_OptionalParameter_Label":"optional","API_DefaultParameterValue_Label":"Default value","API_InheritedFilter_Label":"Inherited","Search_Input_Placeholder":"Search","Toc_Contents_Label":"Contents","Toc_RelatedClasses_Label":"Related Classes","History_JustNowTime_Label":"just now","History_AgoTime_Label":"ago","History_YearTime_Label":"y","History_MonthTime_Label":"mo","History_DayTime_Label":"d","History_HourTime_Label":"h","History_MinuteTime_Label":"m","History_SecondTime_Label":"s"}}; diff --git a/resources/js/search.json b/resources/js/search.json index c8b871fc4..9795e0772 100644 --- a/resources/js/search.json +++ b/resources/js/search.json @@ -1 +1 @@ -[[{"i":"#","p":["Using Harpy to process your haplotagged data"]},{"l":"Home","p":["Harpy is a haplotagging data processing pipeline for Linux-based systems. It uses all the magic of Snakemake under the hood to handle the worklfow decision-making, but as a user, you just interact with it like a normal command-line"]},{"l":"Harpy Modules","p":["Harpy is modular, meaning you can use different parts of it independent from each other. Need to only align reads? Great! Only want to call variants? Awesome! All modules are called by"]},{"l":"Using Harpy","p":["You can call harpy without any arguments (or with --help) to print the docstring to your terminal. You can likewise call any of the modules without arguments or with --help to see their usage (e.g."]},{"l":"Typical Linked-Read Workflows","p":["Depending on your project goals, you may want any combination of SNPs, structural variants (inversions, deletions, duplications), or phased haplotypes. Below is a flow chart outlining a general workflow of linked-read data."]}],[{"l":"Install"},{"l":"Install Harpy","p":["Harpy is hosted on Bioconda! That means to install it, you just need to have conda(or mamba) on your Linux-based system and install it with a simple command. You can install Harpy into an existing environment or create a new one for it (recommended)."]},{"l":"install into a new environment"},{"i":"recommended","l":"✨recommended✨","p":["The code snippet below creates a new environment called harpy(the -n harpy part) and installs harpy into it from the bioconda channel (-c bioconda part). You can name this environment anything (e.g."]},{"l":"install into an existing evironment","p":["If you want to install harpy into an existing environment, then with an environment already activated (via conda activate env_name) simply use the conda install command and harpy"]},{"l":"Update Harpy","p":["If you want to update Harpy, the process is quite similar:"]}],[{"i":"#","p":["Align haplotagged sequences"]},{"l":"Align Sequences to a Genome","p":["After your sequences (in FASTQ format) have been checked for quality, you will need to align them to a reference genome before you can call variants. Harpy offers several aligners for this purpose:"]}],[{"i":"#","p":["Align haplotagged sequences with BWA MEM"]},{"l":"Map Reads onto a genome with BWA MEM","p":["Once sequences have been trimmed and passed through other QC filters, they will need to be aligned to a reference genome. This module within Harpy expects filtered reads as input,"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Molecule distance","p":["The --molecule-distance option is used during the BWA alignment workflow to assign alignments a unique Molecular Identifier MI:i tag based on their haplotag barcode and the distance threshold you specify. See"]},{"l":"Quality filtering","p":["The --min-quality argument filters out alignments below a given MQ threshold. The default, 30, keeps alignments that are at least 99.9% likely correctly mapped. Set this value to"]},{"l":"Marking PCR duplicates","p":["Harpy uses samtools markdup to mark putative PCR duplicates. By using the --barcode-tag BX option, it considers the linked-read barcode for more accurate duplicate detection. Duplicate"]},{"l":"BWA workflow"}],[{"i":"#","p":["Align haplotagged sequences with EMA"]},{"l":"Map Reads onto a genome with EMA","p":["Once sequences have been trimmed and passed through other QC filters, they will need to be aligned to a reference genome. This module within Harpy expects filtered reads as input,"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Barcode whitelist","p":["Some linked-read methods (e.g. 10x, Tellseq) require the inclusion of a barcode \"whitelist.\" This file is a simple text file that has one barcode per line so a given software knows what barcodes to expect in your data."]},{"l":"Quality filtering","p":["The --min-quality argument filters out alignments below a given MQ threshold. The default, 30, keeps alignments that are at least 99.9% likely correctly mapped. Set this value to"]},{"l":"Marking PCR duplicates","p":["EMA marks duplicates in the resulting alignments, however the read with invalid barcodes are aligned separately with BWA. Therefore, Harpy uses samtools markdup to mark putative"]},{"l":"EMA workflow"}],[{"i":"#","p":["Align haplotagged sequences with strobealign"]},{"l":"Map Reads onto a genome with strobealign","p":["Once sequences have been trimmed and passed through other QC filters, they will need to be aligned to a reference genome. This module within Harpy expects filtered reads as input,"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Read Length","p":["The strobealign program uses a new strobemer design for aligning and requires its own way of indexing the genome. The index must be configured for the average read length of the sample"]},{"l":"Molecule distance","p":["The --molecule-distance option is used during the BWA alignment workflow to assign alignments a unique Molecular Identifier MI:i tag based on their haplotag barcode and the distance threshold you specify. See"]},{"l":"Quality filtering","p":["The --min-quality argument filters out alignments below a given MQ threshold. The default, 30, keeps alignments that are at least 99.9% likely correctly mapped. Set this value to"]},{"l":"Marking PCR duplicates","p":["Harpy uses samtools markdup to mark putative PCR duplicates. By using the --barcode-tag BX option, it considers the linked-read barcode for more accurate duplicate detection. Duplicate"]},{"l":"Strobealign workflow"}],[{"i":"#","p":["Create a genome assembly from linked reads"]},{"l":"Create a Genome Assembly","p":["If you have single-sample data, you might be interested in a genome assembly. Unlike metagenome assemblies, a classic genome assembly assumes there is exactly one genome present in your sequences and will try to"]},{"l":"Running Options","p":["In addition to the , the module is configured using the command-line arguments below. Since the assembly process consists of several distinct phases, the descriptions are provided with an extra badge to reflect which part of the assembly process they correspond to."]},{"l":"Deconvolved Inputs","p":["For linked-read assemblies, the barcodes need to be deconvolved in the sequence data, meaning that barcodes that are shared by reads that originate from different molecules need to have unique barcode"]},{"l":"Assembly Workflow"}],[{"i":"#","p":["Resolve barcodes shared by different molecules"]},{"l":"Resolve barcodes shared by different molecules","p":["Running is optional. In the alignment workflows (), Harpy already uses a distance-based approach to deconvolve barcodes and assign MI tags (Molecular Identifier), whereas the workflow has the"]},{"l":"Running Options"},{"l":"Resulting Barcodes","p":["After deconvolution, some barcodes may have a hyphenated suffix like -1 or -2(e.g. A01C33B41D93-1). This is how deconvolution methods create unique variants of barcodes to denote that identical barcodes"]},{"l":"Harpy Deconvolution Nuances","p":["Some of the downstream linked-read tools Harpy uses expect linked read barcodes to either look like the 16-base 10X variety or a standard haplotag (AxxCxxBxxDxx). Their pattern-matching would not recognize barcodes deconvoluted with"]}],[{"i":"#","p":["Demultiplex raw sequences into haplotag barcoded samples"]},{"l":"Demultiplex Raw Sequences","p":["When pooling samples and sequencing them in parallel on an Illumina sequencer, you will be given large multiplexed FASTQ files in return. These files contain sequences for all of your samples and need to be demultiplexed using barcodes to"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Haplotag Types"},{"l":"Gen I Demultiplex Workflow"}],[{"i":"#","p":["Impute genotypes for haplotagged data with Harpy"]},{"l":"Impute Genotypes using Sequences","p":["After variants have been called, you may want to impute missing genotypes to get the most from your data. Harpy uses STITCH to impute genotypes, a haplotype-based method that is linked-read aware. Imputing genotypes requires a variant call file"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Extra STITCH parameters","p":["You may add additional parameters to STITCH by way of the--extra-params(or -x) option. Since STITCH is a function in the R language, the parameters you add must be in R syntax (e.g."]},{"l":"Prioritize the vcf file","p":["Sometimes you want to run imputation on all the samples present in the INPUTS, but other times you may want to only impute the samples present in the --vcf file. By default, Harpy assumes you want to use all the samples"]},{"l":"Parameter file","p":["Typically, one runs STITCH multiple times, exploring how results vary with different model parameters (explained in next section). The solution Harpy uses for this is to have the user"]},{"l":"STITCH Parameters"},{"l":"Imputation Workflow"}],[{"i":"#","p":["Create a metagenome assembly from linked reads"]},{"l":"Create a Metagenome Assembly","p":["If you have mixed-sample data, you might be interested in a metagenome assembly, also known as a metassembly. Unlike a single-sample assembly, a metassembly assumes there are multiple genomes present in your sequences and will try to"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Deconvolved Inputs","p":["For linked-read assemblies, the barcodes need to be deconvolved in the sequence data, meaning that barcodes that are shared by reads that originate from different molecules need to have unique barcode"]},{"l":"Metassembly Workflow"}],[{"i":"#","p":["Generate extra files for analysis with Harpy"]},{"l":"Other Harpy modules","p":["Some parts of Harpy (variant calling, imputation) want or need extra files. You can create various files necessary for different modules using these extra modules:"]},{"l":"Other modules"},{"l":"resume","p":["When calling a workflow (e.g. ), Harpy performs various file checks and validations, sets up the Snakemake command, output folder(s), etc. In the event you want to continue a failed or manually terminated workflow without overwriting the workflow files (e.g."]},{"l":"arguments","p":["The DIRECTORY is the output directory of a previous harpy-invoked workflow, which must have the workflow/config.yaml file. For example, if you previously ran harpy align bwa -o align-bwa ..."]},{"l":"popgroup","p":["Creates a sample grouping file for variant calling"]},{"i":"arguments-1","l":"arguments","p":["This optional file is useful if you want SNP variant calling to happen on a per-population level via or on samples pooled-as-populations via ."]},{"l":"stitchparams","p":["Create a template parameter file for the module. The file is formatted correctly and serves as a starting point for using parameters that make sense for your study."]},{"i":"arguments-2","l":"arguments","p":["Typically, one runs STITCH multiple times, exploring how results vary with different model parameters. The solution Harpy uses for this is to have the user provide a tab-delimited dataframe file where the columns are the 6 STITCH model"]}],[{"i":"#","p":["Phase haplotypes for haplotagged data with Harpy"]},{"l":"Phase SNPs into Haplotypes","p":["You may want to phase your genotypes into haplotypes, as haplotypes tend to be more informative than unphased genotypes (higher polymorphism, captures relationship between genotypes). Phasing"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Prioritize the vcf file","p":["Sometimes you want to run imputation on all the samples present in the INPUTS, but other times you may want to only impute the samples present in the --vcf file. By default, Harpy assumes you want to use all the samples"]},{"l":"Molecule distance","p":["The molecule distance refers to the base-pair distance dilineating separate molecules. In other words, when two alignments on a single contig share the same barcode, how far away from each other are we willing to say they were and still consider them having"]},{"l":"Pruning threshold","p":["The pruning threshold refers to a PHRED-scale value between 0-1 (a percentage) for removing low-confidence SNPs from consideration. With Harpy, you configure this value as an integer"]},{"l":"Phasing Workflow"}],[{"i":"#","p":["Run file format checks on haplotagged FASTQ/BAM files"]},{"l":"Pre-flight checks for input files","p":["Harpy does a lot of stuff with a lot of software and each of these programs expect the incoming data to follow particular formats (plural, unfortunately). These formatting opinions/specifics are at the mercy of the original developers and while there are times when Harpy can (and does)"]},{"l":"When to run"},{"l":"Running Options","p":["In addition to the , the and modules are configured using only command-line input arguments:"]},{"l":"Workflow"}],[{"i":"#","p":["Quality trim haplotagged sequences with Harpy"]},{"l":"Quality Trim Sequences","p":["Raw sequences are not suitable for downstream analyses. They have sequencing adapters, index sequences, regions of poor quality, etc. The first step of any genetic sequence analyses is to remove these adapters and trim poor quality data. You can remove adapters,"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"QC Workflow"}],[{"i":"#","p":["Simulate genomic data"]},{"l":"Simulate Genomic Data","p":["You may be interested in benchmarking variant detection or maybe just trying out haplotagging data without any financial commitment-- that's where simulations come in handy."]},{"l":"Simulate Genomic Variants","p":["Harpy lets you simulate genomic variants via for different variant types such as single nucleotide polymorphisms (SNP), indels, inversions, copy number variants (CNV), and translocations. All you need is to provide a genome to simulate"]},{"l":"Simulate Haplotag Linked-Reads","p":["You can also simulate haplotag-style linked reads from an existing genome using . Harpy incorporates LRSIM to generate linked reads from a diploid genomic. If you only have a haploid genome, then you can create a diploid genome by simulating variants into it with"]}],[{"i":"#","p":["Simulate linked reads from a genome"]},{"l":"Simulate Linked Reads","p":["Simulate linked reads from a genome"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Mutation Rate","p":["The read simulation is two-part: first dwgsim generates forward and reverse FASTQ files from the provided genome haplotypes( HAP1_GENOME and HAP2_GENOME), then LRSIM takes over and creates linked-reads from that. The"]},{"l":"Simulating a single sample","p":["If you intend to simulate a \"single individual\" (i.e. use this module once), then you might want no additonal SNPs beyond the variants you may have already introduced into the genome and set"]},{"l":"Simulating multiple samples","p":["If you intend on simulating \"multiple individuals\" (i.e. use this module multiple times on the same genome haplotypes), it may make sense to set this value larger than 0 so there is some \"natural\" variation between your simulated individuals."]},{"l":"Partitions","p":["TL;DR: 10X partitions ≈ haplotag beads"]},{"l":"Barcodes","p":["Barcodes, if provided, must be given as nucleotide sequences, one per line. If not provided, Harpy will generate the standard set of 96^4 haplotagging barcodes (24bp) used by Meier et al."]},{"l":"Haplotagging conversion","p":["Harpy will convert (demultiplex) the simulated linked-reads into proper haplotagging ACBD format by matching the first X number of nucleotides from the start of the forward read to the barcode list. The provided barcodes will all be assigned a"]},{"l":"Choosing parameters","p":["LRSIM does internal calculations to determine the number of reads per molecule based on --read-pairs,--partitions, and --molecules-per. Understanding how these parameters affect the resulting sequences"]},{"l":"Parameter calculator","p":["Use the calculator provided below to help you make informed decisions for these parameters:"]},{"l":"Simulate Linkedreads Workflow"}],[{"i":"#","p":["Simulate snps, indels, inversions, cnv, translocations"]},{"l":"Simulate Genomic Variants","p":["Simulate snps, indels, inversions, cnv, translocations"]},{"l":"Modules","p":["There are 4 submodules with very obvious names:"]},{"l":"Running Options","p":["While there are serveral differences between individual workflow options, each has available all the like other Harpy modules. Each requires and input genome at the end of the command line, and each requires either a"]},{"l":"Simulate known variants","p":["Rather than simulating random variants, you can use a VCF file as input to any of the workflows to have simuG simulate the variants (of that type) from the VCF file. This becomes particularly"]},{"l":"Heterozygosity","p":["Each workflow has a --heterozygosity parameter where you can specify the heterozygosity of the simulated variants, which creates two new VCF files ({prefix}.hap1.vcf,{prefix}.hap2.vcf"]},{"l":"Variant Simulation Workflow"}],[{"i":"#","p":["Call SNPs and small indels"]},{"l":"Call SNPs and small indels","p":["After reads have been aligned, e.g., with , you can use those alignment files(.bam) to call variants in your data. Harpy can call SNPs and small indels using bcftools mpileup or with"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"ploidy","p":["If you are calling haploid or diploid samples, using either mpileup or freebayes will be comparable. However, if you need to call SNPs in polyploids (ploidy >2), then you will need to use"]},{"l":"regions","p":["The --regions(-r) option lets you specify the genomic regions you want to call variants on. Keep in mind that mpileup uses 1-based positions for genomic intervals, whereas freebayes"]},{"l":"populations","p":["Grouping samples changes the way the variant callers computes certain statistics when calling variants. If you have reason to believe there is a biologically meaningful grouping scheme to your samples, then you should include"]},{"l":"SNP calling workflow"}],[{"i":"#","p":["Find structural variants"]},{"l":"Find structural variants","p":["The module identifies single nucleotide polymorphisms (SNP) and small indels, but you may want to (and should!) leverage the linked-read data to identify larger structural variants (SV) like large deletions, duplications, and"]},{"l":"Caveats"},{"l":"NAIBR","p":["While our testing shows that NAIBR tends to find known inversions that LEVIATHAN misses, the program requires haplotype phased bam files as input. That means the alignments have a"]},{"l":"LEVIATHAN","p":["LEVIATHAN relies on split-read information in the sequence alignments to call variants. The EMA aligner does not report split read alignments, instead it reports secondary alignments."]}],[{"i":"#","p":["Call structural variants using Leviathan"]},{"l":"Call Structural Variants using LEVIATHAN","p":["(like indels, insertions, duplications, breakends)"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Single-sample variant calling","p":["When not using a population grouping file via --populations, variants will be called per-sample. Due to the nature of structural variant VCF files, there isn't an entirely fool-proof way"]},{"l":"Pooled-sample variant calling","p":["With the inclusion of a population grouping file via --populations, Harpy will merge the bam files of all samples within a population and call variants on these alignment pools. Preliminary work shows that this way identifies more variants and with fewer false"]},{"l":"LEVIATHAN workflow"}],[{"i":"#","p":["Call structural variants using NAIBR (plus)"]},{"l":"Call Structural Variants using NAIBR","p":["(like indels, insertions, duplications)"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Molecule distance","p":["The --molecule-distance option is used to let the program determine how far apart alignments on a contig with the same barcode can be from each other and still considered as originating from the same DNA molecule. See"]},{"l":"Single-sample variant calling","p":["When not using a population grouping file via --populations, variants will be called per-sample. Due to the nature of structural variant VCF files, there isn't an entirely fool-proof way"]},{"l":"Pooled-sample variant calling","p":["With the inclusion of a population grouping file via --populations, Harpy will merge the bam files of all samples within a population and call variants on these alignment pools. Preliminary work shows that this way identifies more variants and with fewer false"]},{"l":"Optional vcf file","p":["In order to get the best variant calling performance out of NAIBR, it requires phased bam files as input. Using --vcf is optional and not used by NAIBR directly. However, to use"]},{"i":"a-phased-input---vcf","l":"a phased input --vcf","p":["This file can be in vcf/vcf.gz/bcf format and most importantly it must be phased haplotypes. There are various ways to haplotype SNPs, but you can use to phase your SNPs into haplotypes using the haplotag barcode information. The resulting phased VCF file can then be used as input here."]},{"l":"NAIBR workflow"}],[{"l":"Haplotag data"},{"i":"what-is-haplotagging","l":"What is haplotagging?","p":["Linked-read sequencing exists to combine the throughput and accuracy of short-read sequencing with the long range haplotype information of long-read sequencing. Haplotagging is an implementation of linked-read sequencing developed by"]},{"l":"Data Format"},{"l":"Barcodes","p":["While barcodes are actually combinatorial bases, in the read headers they are represented with the format AxxCxxBxxDxx, where each barcode segment is denoted as Axx(or Bxx, etc.)."]},{"l":"barcode protocol varieties","p":["If you think haplotagging is as simple as exactly 96^4 unique barcodes, you would only be half-correct. The original haplotagging protocol in Meier et al. is good, but the authors (and others) have been working to improve this linked-read technology to improve"]},{"l":"where the barcodes go","p":["Chromium 10X linked-reads use a format where the barcode is the leading 16 bases of the forward (R1) read. However, haplotagging data does not use that format and many of the tools"]},{"l":"Read headers","p":["Like mentioned, the haplotag barcode is expected to be stored in the BX:Z: tag in the read header. This information is retained through the various Harpy steps. An example read header could look like:"]},{"l":"Read length","p":["Reads must be at least 30 base pairs in length for alignment. By default, the module removes reads <30bp."]},{"l":"Compression","p":["Harpy generally doesn't require the input sequences to be in gzipped/bgzipped format, but it's good practice to compress your reads anyway. Compressed files are expected to end with the extension"]},{"l":"Naming conventions","p":["Unfortunately, there are many different ways of naming FASTQ files, which makes it difficult to accomodate every wacky iteration currently in circulation. While Harpy tries its best to be flexible, there are limitations."]},{"l":"Barcode thresholds","p":["By the nature of linked read technologies, there will (almost always) be more DNA fragments than unique barcodes for them. As a result, it's common for barcodes to reappear in sequences. Rather than incorrectly assume that all sequences/alignments with the same barcode"]}],[{"l":"Common Harpy Options"},{"l":"Input Arguments","p":["Each of the main Harpy modules (e.g. or ) follows the format of"]},{"l":"Common command-line options","p":["Every Harpy module has a series of configuration parameters. These are arguments you need to input to configure the module to run on your data, such as the directory with the reads/alignments,"]},{"i":"--contigs","l":"--contigs","p":["Some of the workflows (like ) plot per-contig information in their reports. By default, Harpy will plot up to 30 of the largest contigs. If you are only interested in a specific set of contigs, then you can use"]},{"l":"example","p":["You could call and specify 20 threads with no output to console:"]},{"l":"The workflow folder","p":["When you run one of the main Harpy modules, the output directory will contain a workflow folder. This folder is both necessary for the module to run and is very useful to understand what the module did, be it for your own"]},{"l":"The Genome folder","p":["You will notice that many of the workflows will create a Genome folder in the working directory. This folder is to make it easier for Harpy to store the genome and the associated"]}],[{"l":"Troubleshooting","p":["Lots of stuff can go wrong during an analysis. The intent of this page is to guide you through navigating the inevitable errors associated with doing bioinformatics."]},{"l":"Troubleshooting Harpy","p":["Harpy has two steps: first it performs checks and validations, then it runs Snakemake."]},{"l":"checks and validations","p":["First, Harpy takes your command-line inputs and checks/validates the input files and parameters. If your parameters are not the correct type (e.g. a number where there should be a file), the"]},{"l":"snakemake validations","p":["Once all the file validations pass, Harpy passes the baton over to Snakemake. Snakemake builds a workflow graph of the rules and performs its own checks. If you get an error before the workflow starts processing any data (there"]},{"l":"error during a workflow","p":["Sometimes something goes wrong with one of the steps in a workflow. If/when that happens, Harpy will print the offending step and all the information Snakemake has regarding the failure. If the step had a log file, it will"]},{"l":"Common Issues"},{"l":"installation issue","p":["Conda is an awesome package manager, but was slow and used a ton of memory as dependencies increased. Recent ( 23.10+) versions of Conda now use the libmamba solver, the super-fast and super-lightweight solver from Mamba. If you're experiencing"]},{"l":"imputation or phasing failure","p":["If you use bamutils clipOverlap on alignments that are used for the or modules, they will cause both programs to error. We don't know why, but they do."]},{"l":"SAM name and ID mismatch","p":["Aligning a sample to a genome via Harpy will insert the sample name (based on the file name) into the alignment header (the @RG ID:name SM:name tag). It likewise expects, through various steps,"]}],[{"l":"Snakamake Things"},{"l":"Workflow logs","p":["Barring a few exceptions, most of Harpy's options are Snakemake workflows. This means we are all at the mercy of how Snakemake operates, which includes the .snakemake/ folder in your project directory. That folder contains"]},{"l":"Adding Snakemake Parameters","p":["Harpy relies on Snakemake under the hood to handle file and job dependencies. Most of these details have been abstracted away from the end-user, but most Harpy modules have an optional flag"]},{"l":"Common use cases","p":["You likely wont need to invoke --snakemake very often, if ever. However, here examples of some possible use cases for this parameter."]}],[{"l":"Software used in Harpy","p":["Harpy is the sum of its parts, and out of tremendous respect for the developers involved in the included software, we would like to highlight the tools directly involved in Harpy's many moving pieces."]},{"l":"Standalone Software"},{"l":"Software Packages"}],[{"l":"Utilities","p":["Harpy is the sum of its parts and some of those parts are stand-alone scripts used by the workflows that are accessible from within the Harpy conda environment. This page serves to document those scripts, since using them outside of a workflow"]},{"i":"assign_mipy","l":"assign_mi.py","p":["Assign an MI:i(Molecular Identifier) tag to each barcoded record based on a molecular distance cutoff. Input file must be coordinate sorted. This is similar to deconvolve_alignments.py"]},{"i":"bx_statspy","l":"bx_stats.py","p":["Calculates various linked-read molecule metrics from the (coordinate-sorted) input alignment file. Metrics include (per molecule):"]},{"i":"check_bampy","l":"check_bam.py","p":["Parses an aligment file to check:"]},{"i":"check_fastqpy","l":"check_fastq.py","p":["Parses a FASTQ file to check if any sequences don't conform to the SAM spec, whether BX:Z: is the last tag in the record, and the counts of:"]},{"i":"concatenate_bampy","l":"concatenate_bam.py","p":["Concatenate records from haplotagged SAM/BAM files while making sure MI tags remain unique for every sample. This is a means of accomplishing the same as samtools cat, except all"]},{"i":"count_bxpy","l":"count_bx.py","p":["Parses a FASTQ file to count:"]},{"i":"deconvolve_alignmentspy","l":"deconvolve_alignments.py","p":["Deconvolve BX-tagged barcodes and assign an MI(Molecular Identifier) tag to each barcoded record based on a molecular distance cutoff. Input file must be coordinate sorted. This is similar to"]},{"i":"depth_windowspy","l":"depth_windows.py","p":["Reads the output of samtools depth -a from stdin and calculates means within windows of a given windowsize."]},{"i":"haplotag_acbdpy","l":"haplotag_acbd.py","p":["Generates the BC_{ABCD}.txt files necessary to demultiplex Gen I haplotag barcodes into the specified output_directory."]},{"i":"infer_svpy","l":"infer_sv.py","p":["Create column in NAIBR bedpe output inferring the SV type from the orientation. Removes variants with FAIL flags and you can use the optional -f(--fail) argument to output FAIL variants to a separate file."]},{"i":"inline_to_haplotagpy","l":"inline_to_haplotag.py","p":["Converts inline nucleotide barcodes in reads to haplotag linked reads with barcodes in BX:Z and OX:Z header tags."]},{"i":"make_windowspy","l":"make_windows.py","p":["Create a BED file of fixed intervals (-w, -- window) from a FASTA or fai file (the kind generated with samtools faidx). Nearly identical to bedtools makewindows, except the intervals are nonoverlapping. The"]},{"i":"molecule_coveragepy","l":"molecule_coverage.py","p":["Using the statsfile generated by bx_stats.py from Harpy, will calculate \"molecular coverage\" across the genome. Molecular coverage is the \"effective\" alignment coverage if you treat a molecule inferred from linked-read data as"]},{"i":"parse_phaseblockspy","l":"parse_phaseblocks.py","p":["Parse a phase block file from HapCut2 to pull out summary information"]},{"l":"rename_bam","p":["Rename a sam/bam file and modify the @RG tag of the alignment file to reflect the change for both ID and SM. This process creates a new file new_name.bam and you may use -d to delete the original file. Requires"]},{"l":"separate_validbx","p":["Split a BAM file with BX tags into 2 files, one with valid ACBD barcodes ( stdout), one with invalid ACBD barcodes ( stderr)."]}],[{"l":"Blog"}],[{"i":"#","p":["Sorting data by linked-read barcode"]},{"l":"Sort data by barcode","p":["You would think sorting data would be a no-brainer, and in most cases it is. You can use seqtk or seqkit to sort FASTQ/A files by their IDs, samtools to sort SAM/BAM/CRAM files by name or coordinates. However, in the world of linked-read"]},{"l":"Sorting Alignments","p":["Let's start with BAM (or SAM/CRAM) files because the process is much simpler. Since the linked-read barcode is stored in a BX:Z tag (or less often as BC:Z:), we can use a little feature of"]},{"l":"Sorting FASTQ","p":["Sorting FASTQ files by barcode is trickier, only because there aren't (to our knowledge!) any existing convenience methods to do it. Like any bioinformatics puzzle, you could probably solve it with a sophisticated AWK command, but HTSlib tools are so much more"]},{"l":"1. convert FASTQ to SAM","p":["Yep, we're solving our problem by doing a simple file conversion to SAM/BAM. That's the easiest way to do it, surprisingly. FASTQ files can be converted to unmapped BAM files using"]},{"l":"2. sort the SAM by barcode","p":["Exactly like shown above to sort a SAM/BAM file with samtools sort, we're going to do the same on the unmapped SAM file we just created:"]},{"l":"3. convert SAM back to FASTQ","p":["Now that the data have been sorted, we need to convert it back into forward and reverse FASTQ files using samtools fastq. The -T * argument once again preserves all the tags between file formats. The"]},{"l":"as a single pipe","p":["Rather than splitting out these three processess, you can stream/pipe them in a single workflow:"]}],[{"i":"#","p":["A realistic workflow to simulate variants"]},{"l":"Simulating variants","p":["You may want to (and are encouraged to) simulate data before investing in the costs associated with linked-read sample preparation and subsequent sequencing. Harpy provides both a variant and linked-read simulators and this tutorial serves to"]},{"l":"1. Add random inversions","p":["First, we will need to simulate some inversions and set a --heterozygosity value >0 to get a diploid genome as the output. If you wanted to manually create inversions in specific areas or with specific lengths, this would be a good starting point too since"]},{"l":"2. Add snps and indels","p":["Let's say we wanted to simulate SNPs and indels like so:"]},{"i":"3-simulate-known-snps-and-indels-onto-the-diploid-genome-with-inversions","l":"3. Simulate \"known\" snps and indels onto the diploid genome with inversions","p":["We will run Harpy twice, once for each haplotype, using the corresponding VCFs from Step 2:"]},{"l":"5. Simulating linked-reads","p":["Now that you have heterozygous haplotypes created from your starting genome, you can simulate linked-reads from it using harpy simulate linkedreads. A simple implementation of that could look like:"]}],[{"i":"#","p":["Why pool samples for SV calling and when to do it"]},{"l":"Pooling samples for SV calling","p":["One of the cool benefits of linked-read data is the fact that you can call structural variants with it. Depending on the depth of your data, you may want (or need) to pool samples together. This"]},{"l":"Sample depth"},{"i":"depth-explained","l":"Depth, explained","p":["In bioinformatics, the terms \"coverage\" and \"depth\" and often used interchangeably, which is incorrect and leads to confusion. Coverage refers to the proportion of a genome that is sequenced, and"]},{"i":"depth-in-context","l":"Depth, in context","p":["Historically, one would have wanted to sequence fewer individuals at higher depth to get confident genotype calls, rather than sequence more individuals at lower depth. Recent advances in bioinformatics have enabled low-coverage whole genome sequencing"]},{"l":"The problem","p":["It's recommended to have at least 10X-12X depth to get decent structural variant calls(definitely read that in a paper that I would like to link here, but I can't seem to find it). If your data already has a minimum of 10X for each individual, great! Feel free to use"]},{"l":"The solution","p":["One way to get your low-coverage (low depth) data and still call structural variants is to pool samples together, which would effectively boost the depth. By doing this, you will"]},{"l":"Pooling considerations","p":["If pooling samples, you must pool them sensibly and with a biological context to do so. In other words, you don't just pool random samples together to inflate depth. Since haplotag data is just whole genome sequence data plus a little extra information, you should"]}],[{"i":"#","p":["Deciding between using Conda or Containers"]},{"l":"Choosing a software runtime method","p":["There are two ways you can run Harpy, using a container with the necessary software environments in it (the default), or with local conda environments(with the --conda option). If software development and containerization"]},{"i":"tldr","l":"TL;DR"},{"l":"What Harpy Provides","p":["An conda-based installation of Harpy provides only the minimal set of programs Harpy needs to begin a workflow. These include: python 3.12, snakemake-minimal, pandas, and the htslib programs (htslib, samtools, bcftools, tabix)."]},{"l":"How Harpy Provides the Other Stuff","p":["Instead of a monolithic Harpy environment, which would be impossible with the current software dependencies, there are a handful of defined conda environment recipes that Harpy workflows generate. Snakemake will make"]},{"l":"Harpy and Containers","p":["The Harpy team manages a container on Dockerhub called, you guessed it, Harpy, that is synchronously versioned with the Harpy software. In other words, if you're using Harpy v1.4, it will use the container version v1.4. The"]},{"i":"whats-the-catch","l":"What's the Catch?","p":["While local conda enviroments at runtime or containers might seem like foolproof approaches, there are drawbacks."]},{"i":"conda-caveats","l":"Conda Caveats:"},{"i":"conda-caveat-1-inconsistent","l":"⚠️ Conda Caveat 1: Inconsistent","p":["Despite our and conda's best efforts, sometimes programs just don't install correctly on some systems due to unexpected system (or conda) configurations. This results in frustrating errors where jobs fail because software that is"]},{"i":"conda-caveat-2-troubleshooting","l":"\uD83D\uDCA3 Conda Caveat 2: Troubleshooting","p":["To manually troubleshoot many of the tasks Harpy workflows perform, you may need to jump into one of the local conda environments in .snakemake/conda. That itself isn't terrible, but it's an extra step because you will"]},{"l":"Container Caveats"},{"i":"container-caveat-1-speed","l":"\uD83D\uDEA5 Container Caveat 1: Speed","p":["The overhead of Snakemake creating a container instance for a job, then cleaning it up after the job is done is not trivial and can negatively impact runtime."]},{"i":"container-caveat-2-troubleshooting","l":"\uD83D\uDCA3 Container Caveat 2: Troubleshooting","p":["The command Snakemake secretly invokes to run a job in a container is quite lengthy. In most cases that shouldn't matter to you, but when something eventually goes wrong and you need to troubleshoot, it's harder"]}],[{"i":"#","p":["A gentle introduction to the wild world of filtering SNPs"]},{"l":"Filtering Variants","p":["The discussion around filtering SNPs and indels is massive and many researchers go about it differently, each very opinionated as to why their method is the best. As a starting point, have a look at how the authors of"]},{"i":"genotype-quality-qual","l":"genotype quality (QUAL)","p":["You will obviously want higher quality genotype calls to remove false positives. The HTSlib guide suggests at least 50(e.g. -i 'QUAL=50'), but we typically filter much higher at"]},{"i":"read-depth-dp","l":"read depth (DP)","p":["Variant sites with too few reads backing up the genotype might be false positives, although this may not hold true for very low-coverage data. Conversely, a maximum cut off is important because sites with very high read depths (relative to the distribution of read depth)"]},{"i":"minor-allele-frequency-maf","l":"minor allele frequency (MAF)","p":["It's usually advisable to set a minor allele frequency threshold with which to remove sites below that threshold. The reasoning is that if a MAF is too low, it might be because of incorrectly called genotypes in a very small handful of individuals (e.g. one or two)."]},{"i":"missing-data-f_missing","l":"missing data (F_MISSING)","p":["Missing data is, frankly, not terribly useful. The amount of missing data you're willing to tolerate will depend on your study, but it's common to remove sites with >20% missing data (e.g."]}]] \ No newline at end of file +[[{"i":"#","p":["Using Harpy to process your haplotagged data"]},{"l":"Home","p":["Harpy is a haplotagging data processing pipeline for Linux-based systems. It uses all the magic of Snakemake under the hood to handle the worklfow decision-making, but as a user, you just interact with it like a normal command-line"]},{"l":"Harpy Modules","p":["Harpy is modular, meaning you can use different parts of it independent from each other. Need to only align reads? Great! Only want to call variants? Awesome! All modules are called by"]},{"l":"Using Harpy","p":["You can call harpy without any arguments (or with --help) to print the docstring to your terminal. You can likewise call any of the modules without arguments or with --help to see their usage (e.g."]},{"l":"Typical Linked-Read Workflows","p":["Depending on your project goals, you may want any combination of SNPs, structural variants (inversions, deletions, duplications), or phased haplotypes. Below is a flow chart outlining a general workflow of linked-read data."]}],[{"l":"Install"},{"l":"Install Harpy","p":["Harpy is hosted on Bioconda! That means to install it, you just need to have conda(or mamba) on your Linux-based system and install it with a simple command. You can install Harpy into an existing environment or create a new one for it (recommended)."]},{"l":"install into a new environment"},{"i":"recommended","l":"✨recommended✨","p":["The code snippet below creates a new environment called harpy(the -n harpy part) and installs harpy into it from the bioconda channel (-c bioconda part). You can name this environment anything (e.g."]},{"l":"install into an existing evironment","p":["If you want to install harpy into an existing environment, then with an environment already activated (via conda activate env_name) simply use the conda install command and harpy"]},{"l":"Update Harpy","p":["If you want to update Harpy, the process is quite similar:"]}],[{"i":"#","p":["Align haplotagged sequences"]},{"l":"Align Sequences to a Genome","p":["After your sequences (in FASTQ format) have been checked for quality, you will need to align them to a reference genome before you can call variants. Harpy offers several aligners for this purpose:"]}],[{"i":"#","p":["Align haplotagged sequences with BWA MEM"]},{"l":"Map Reads onto a genome with BWA MEM","p":["Once sequences have been trimmed and passed through other QC filters, they will need to be aligned to a reference genome. This module within Harpy expects filtered reads as input,"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Molecule distance","p":["The --molecule-distance option is used during the BWA alignment workflow to assign alignments a unique Molecular Identifier MI:i tag based on their haplotag barcode and the distance threshold you specify. See"]},{"l":"Quality filtering","p":["The --min-quality argument filters out alignments below a given MQ threshold. The default, 30, keeps alignments that are at least 99.9% likely correctly mapped. Set this value to"]},{"l":"Marking PCR duplicates","p":["Harpy uses samtools markdup to mark putative PCR duplicates. By using the --barcode-tag BX option, it considers the linked-read barcode for more accurate duplicate detection. Duplicate"]},{"l":"BWA workflow"}],[{"i":"#","p":["Align haplotagged sequences with EMA"]},{"l":"Map Reads onto a genome with EMA","p":["Once sequences have been trimmed and passed through other QC filters, they will need to be aligned to a reference genome. This module within Harpy expects filtered reads as input,"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Barcode whitelist","p":["Some linked-read methods (e.g. 10x, Tellseq) require the inclusion of a barcode \"whitelist.\" This file is a simple text file that has one barcode per line so a given software knows what barcodes to expect in your data."]},{"l":"Quality filtering","p":["The --min-quality argument filters out alignments below a given MQ threshold. The default, 30, keeps alignments that are at least 99.9% likely correctly mapped. Set this value to"]},{"l":"Marking PCR duplicates","p":["EMA marks duplicates in the resulting alignments, however the read with invalid barcodes are aligned separately with BWA. Therefore, Harpy uses samtools markdup to mark putative"]},{"l":"EMA workflow"}],[{"i":"#","p":["Align haplotagged sequences with strobealign"]},{"l":"Map Reads onto a genome with strobealign","p":["Once sequences have been trimmed and passed through other QC filters, they will need to be aligned to a reference genome. This module within Harpy expects filtered reads as input,"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Read Length","p":["The strobealign program uses a new strobemer design for aligning and requires its own way of indexing the genome. The index must be configured for the average read length of the sample"]},{"l":"Molecule distance","p":["The --molecule-distance option is used during the BWA alignment workflow to assign alignments a unique Molecular Identifier MI:i tag based on their haplotag barcode and the distance threshold you specify. See"]},{"l":"Quality filtering","p":["The --min-quality argument filters out alignments below a given MQ threshold. The default, 30, keeps alignments that are at least 99.9% likely correctly mapped. Set this value to"]},{"l":"Marking PCR duplicates","p":["Harpy uses samtools markdup to mark putative PCR duplicates. By using the --barcode-tag BX option, it considers the linked-read barcode for more accurate duplicate detection. Duplicate"]},{"l":"Strobealign workflow"}],[{"i":"#","p":["Create a genome assembly from linked reads"]},{"l":"Create a Genome Assembly","p":["If you have single-sample data, you might be interested in a genome assembly. Unlike metagenome assemblies, a classic genome assembly assumes there is exactly one genome present in your sequences and will try to"]},{"l":"Running Options","p":["In addition to the , the module is configured using the command-line arguments below. Since the assembly process consists of several distinct phases, the descriptions are provided with an extra badge to reflect which part of the assembly process they correspond to."]},{"l":"Deconvolved Inputs","p":["For linked-read assemblies, the barcodes need to be deconvolved in the sequence data, meaning that barcodes that are shared by reads that originate from different molecules need to have unique barcode"]},{"l":"Assembly Workflow"}],[{"i":"#","p":["Resolve barcodes shared by different molecules"]},{"l":"Resolve barcodes shared by different molecules","p":["Running is optional. In the alignment workflows (), Harpy already uses a distance-based approach to deconvolve barcodes and assign MI tags (Molecular Identifier), whereas the workflow has the"]},{"l":"Running Options"},{"l":"Resulting Barcodes","p":["After deconvolution, some barcodes may have a hyphenated suffix like -1 or -2(e.g. A01C33B41D93-1). This is how deconvolution methods create unique variants of barcodes to denote that identical barcodes"]},{"l":"Harpy Deconvolution Nuances","p":["Some of the downstream linked-read tools Harpy uses expect linked read barcodes to either look like the 16-base 10X variety or a standard haplotag (AxxCxxBxxDxx). Their pattern-matching would not recognize barcodes deconvoluted with"]}],[{"i":"#","p":["Demultiplex raw sequences into haplotag barcoded samples"]},{"l":"Demultiplex Raw Sequences","p":["When pooling samples and sequencing them in parallel on an Illumina sequencer, you will be given large multiplexed FASTQ files in return. These files contain sequences for all of your samples and need to be demultiplexed using barcodes to"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Haplotag Types"},{"l":"Gen I Demultiplex Workflow"}],[{"i":"#","p":["Impute genotypes for haplotagged data with Harpy"]},{"l":"Impute Genotypes using Sequences","p":["After variants have been called, you may want to impute missing genotypes to get the most from your data. Harpy uses STITCH to impute genotypes, a haplotype-based method that is linked-read aware. Imputing genotypes requires a variant call file"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Extra STITCH parameters","p":["You may add additional parameters to STITCH by way of the--extra-params(or -x) option. Since STITCH is a function in the R language, the parameters you add must be in R syntax (e.g."]},{"l":"Prioritize the vcf file","p":["Sometimes you want to run imputation on all the samples present in the INPUTS, but other times you may want to only impute the samples present in the --vcf file. By default, Harpy assumes you want to use all the samples"]},{"l":"Parameter file","p":["Typically, one runs STITCH multiple times, exploring how results vary with different model parameters (explained in next section). The solution Harpy uses for this is to have the user"]},{"l":"STITCH Parameters"},{"l":"Imputation Workflow"}],[{"i":"#","p":["Create a metagenome assembly from linked reads"]},{"l":"Create a Metagenome Assembly","p":["If you have mixed-sample data, you might be interested in a metagenome assembly, also known as a metassembly. Unlike a single-sample assembly, a metassembly assumes there are multiple genomes present in your sequences and will try to"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Deconvolved Inputs","p":["For linked-read assemblies, the barcodes need to be deconvolved in the sequence data, meaning that barcodes that are shared by reads that originate from different molecules need to have unique barcode"]},{"l":"Metassembly Workflow"}],[{"i":"#","p":["Generate extra files for analysis with Harpy"]},{"l":"Other Harpy modules","p":["Some parts of Harpy (variant calling, imputation) want or need extra files. You can create various files necessary for different modules using these extra modules:"]},{"l":"Other modules"},{"l":"imputeparams","p":["Create a template parameter file for the module. The file is formatted correctly and serves as a starting point for using parameters that make sense for your study."]},{"l":"arguments","p":["Typically, one runs STITCH multiple times, exploring how results vary with different model parameters. The solution Harpy uses for this is to have the user provide a tab-delimited dataframe file where the columns are the 6 STITCH model"]},{"l":"resume","p":["When calling a workflow (e.g. ), Harpy performs various file checks and validations, sets up the Snakemake command, output folder(s), etc. In the event you want to continue a failed or manually terminated workflow without overwriting the workflow files (e.g."]},{"i":"arguments-1","l":"arguments","p":["The DIRECTORY is the output directory of a previous harpy-invoked workflow, which must have the workflow/config.yaml file. For example, if you previously ran harpy align bwa -o align-bwa ..."]},{"l":"popgroup","p":["Creates a sample grouping file for variant calling"]},{"i":"arguments-2","l":"arguments","p":["This optional file is useful if you want SNP variant calling to happen on a per-population level via or on samples pooled-as-populations via ."]},{"l":"view","p":["This convenience command lets you view the latest workflow log file of a Harpy output directory. Use --snakefile or --config to view the workflow snakefile or config.yaml file instead, respectively. Output is printed to the screen via"]},{"i":"arguments-3","l":"arguments"}],[{"i":"#","p":["Phase haplotypes for haplotagged data with Harpy"]},{"l":"Phase SNPs into Haplotypes","p":["You may want to phase your genotypes into haplotypes, as haplotypes tend to be more informative than unphased genotypes (higher polymorphism, captures relationship between genotypes). Phasing"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Prioritize the vcf file","p":["Sometimes you want to run imputation on all the samples present in the INPUTS, but other times you may want to only impute the samples present in the --vcf file. By default, Harpy assumes you want to use all the samples"]},{"l":"Molecule distance","p":["The molecule distance refers to the base-pair distance dilineating separate molecules. In other words, when two alignments on a single contig share the same barcode, how far away from each other are we willing to say they were and still consider them having"]},{"l":"Pruning threshold","p":["The pruning threshold refers to a PHRED-scale value between 0-1 (a percentage) for removing low-confidence SNPs from consideration. With Harpy, you configure this value as an integer"]},{"l":"Phasing Workflow"}],[{"i":"#","p":["Run file format checks on haplotagged FASTQ/BAM files"]},{"l":"Pre-flight checks for input files","p":["Harpy does a lot of stuff with a lot of software and each of these programs expect the incoming data to follow particular formats (plural, unfortunately). These formatting opinions/specifics are at the mercy of the original developers and while there are times when Harpy can (and does)"]},{"l":"When to run"},{"l":"Running Options","p":["In addition to the , the and modules are configured using only command-line input arguments:"]},{"l":"Workflow"}],[{"i":"#","p":["Quality trim haplotagged sequences with Harpy"]},{"l":"Quality Trim Sequences","p":["Raw sequences are not suitable for downstream analyses. They have sequencing adapters, index sequences, regions of poor quality, etc. The first step of any genetic sequence analyses is to remove these adapters and trim poor quality data. You can remove adapters,"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"QC Workflow"}],[{"i":"#","p":["Simulate genomic data"]},{"l":"Simulate Genomic Data","p":["You may be interested in benchmarking variant detection or maybe just trying out haplotagging data without any financial commitment-- that's where simulations come in handy."]},{"l":"Simulate Genomic Variants","p":["Harpy lets you simulate genomic variants via for different variant types such as single nucleotide polymorphisms (SNP), indels, inversions, copy number variants (CNV), and translocations. All you need is to provide a genome to simulate"]},{"l":"Simulate Haplotag Linked-Reads","p":["You can also simulate haplotag-style linked reads from an existing genome using . Harpy incorporates LRSIM to generate linked reads from a diploid genomic. If you only have a haploid genome, then you can create a diploid genome by simulating variants into it with"]}],[{"i":"#","p":["Simulate linked reads from a genome"]},{"l":"Simulate Linked Reads","p":["Simulate linked reads from a genome"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Mutation Rate","p":["The read simulation is two-part: first dwgsim generates forward and reverse FASTQ files from the provided genome haplotypes( HAP1_GENOME and HAP2_GENOME), then LRSIM takes over and creates linked-reads from that. The"]},{"l":"Simulating a single sample","p":["If you intend to simulate a \"single individual\" (i.e. use this module once), then you might want no additonal SNPs beyond the variants you may have already introduced into the genome and set"]},{"l":"Simulating multiple samples","p":["If you intend on simulating \"multiple individuals\" (i.e. use this module multiple times on the same genome haplotypes), it may make sense to set this value larger than 0 so there is some \"natural\" variation between your simulated individuals."]},{"l":"Partitions","p":["TL;DR: 10X partitions ≈ haplotag beads"]},{"l":"Barcodes","p":["Barcodes, if provided, must be given as nucleotide sequences, one per line. If not provided, Harpy will generate the standard set of 96^4 haplotagging barcodes (24bp) used by Meier et al."]},{"l":"Haplotagging conversion","p":["Harpy will convert (demultiplex) the simulated linked-reads into proper haplotagging ACBD format by matching the first X number of nucleotides from the start of the forward read to the barcode list. The provided barcodes will all be assigned a"]},{"l":"Choosing parameters","p":["LRSIM does internal calculations to determine the number of reads per molecule based on --read-pairs,--partitions, and --molecules-per. Understanding how these parameters affect the resulting sequences"]},{"l":"Parameter calculator","p":["Use the calculator provided below to help you make informed decisions for these parameters:"]},{"l":"Simulate Linkedreads Workflow"}],[{"i":"#","p":["Simulate snps, indels, inversions, cnv, translocations"]},{"l":"Simulate Genomic Variants","p":["Simulate snps, indels, inversions, cnv, translocations"]},{"l":"Modules","p":["There are 4 submodules with very obvious names:"]},{"l":"Running Options","p":["While there are serveral differences between individual workflow options, each has available all the like other Harpy modules. Each requires and input genome at the end of the command line, and each requires either a"]},{"l":"Simulate known variants","p":["Rather than simulating random variants, you can use a VCF file as input to any of the workflows to have simuG simulate the variants (of that type) from the VCF file. This becomes particularly"]},{"l":"Heterozygosity","p":["Each workflow has a --heterozygosity parameter where you can specify the heterozygosity of the simulated variants, which creates two new VCF files ({prefix}.hap1.vcf,{prefix}.hap2.vcf"]},{"l":"Variant Simulation Workflow"}],[{"i":"#","p":["Call SNPs and small indels"]},{"l":"Call SNPs and small indels","p":["After reads have been aligned, e.g., with , you can use those alignment files(.bam) to call variants in your data. Harpy can call SNPs and small indels using bcftools mpileup or with"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"ploidy","p":["If you are calling haploid or diploid samples, using either mpileup or freebayes will be comparable. However, if you need to call SNPs in polyploids (ploidy >2), then you will need to use"]},{"l":"regions","p":["The --regions(-r) option lets you specify the genomic regions you want to call variants on. Keep in mind that mpileup uses 1-based positions for genomic intervals, whereas freebayes"]},{"l":"populations","p":["Grouping samples changes the way the variant callers computes certain statistics when calling variants. If you have reason to believe there is a biologically meaningful grouping scheme to your samples, then you should include"]},{"l":"SNP calling workflow"}],[{"i":"#","p":["Find structural variants"]},{"l":"Find structural variants","p":["The module identifies single nucleotide polymorphisms (SNP) and small indels, but you may want to (and should!) leverage the linked-read data to identify larger structural variants (SV) like large deletions, duplications, and"]},{"l":"Caveats"},{"l":"NAIBR","p":["While our testing shows that NAIBR tends to find known inversions that LEVIATHAN misses, the program requires haplotype phased bam files as input. That means the alignments have a"]},{"l":"LEVIATHAN","p":["LEVIATHAN relies on split-read information in the sequence alignments to call variants. The EMA aligner does not report split read alignments, instead it reports secondary alignments."]}],[{"i":"#","p":["Call structural variants using Leviathan"]},{"l":"Call Structural Variants using LEVIATHAN","p":["(like indels, insertions, duplications, breakends)"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Single-sample variant calling","p":["When not using a population grouping file via --populations, variants will be called per-sample. Due to the nature of structural variant VCF files, there isn't an entirely fool-proof way"]},{"l":"Pooled-sample variant calling","p":["With the inclusion of a population grouping file via --populations, Harpy will merge the bam files of all samples within a population and call variants on these alignment pools. Preliminary work shows that this way identifies more variants and with fewer false"]},{"l":"LEVIATHAN workflow"}],[{"i":"#","p":["Call structural variants using NAIBR (plus)"]},{"l":"Call Structural Variants using NAIBR","p":["(like indels, insertions, duplications)"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Molecule distance","p":["The --molecule-distance option is used to let the program determine how far apart alignments on a contig with the same barcode can be from each other and still considered as originating from the same DNA molecule. See"]},{"l":"Single-sample variant calling","p":["When not using a population grouping file via --populations, variants will be called per-sample. Due to the nature of structural variant VCF files, there isn't an entirely fool-proof way"]},{"l":"Pooled-sample variant calling","p":["With the inclusion of a population grouping file via --populations, Harpy will merge the bam files of all samples within a population and call variants on these alignment pools. Preliminary work shows that this way identifies more variants and with fewer false"]},{"l":"Optional vcf file","p":["In order to get the best variant calling performance out of NAIBR, it requires phased bam files as input. Using --vcf is optional and not used by NAIBR directly. However, to use"]},{"i":"a-phased-input---vcf","l":"a phased input --vcf","p":["This file can be in vcf/vcf.gz/bcf format and most importantly it must be phased haplotypes. There are various ways to haplotype SNPs, but you can use to phase your SNPs into haplotypes using the haplotag barcode information. The resulting phased VCF file can then be used as input here."]},{"l":"NAIBR workflow"}],[{"l":"Haplotag data"},{"i":"what-is-haplotagging","l":"What is haplotagging?","p":["Linked-read sequencing exists to combine the throughput and accuracy of short-read sequencing with the long range haplotype information of long-read sequencing. Haplotagging is an implementation of linked-read sequencing developed by"]},{"l":"Data Format"},{"l":"Barcodes","p":["While barcodes are actually combinatorial bases, in the read headers they are represented with the format AxxCxxBxxDxx, where each barcode segment is denoted as Axx(or Bxx, etc.)."]},{"l":"barcode protocol varieties","p":["If you think haplotagging is as simple as exactly 96^4 unique barcodes, you would only be half-correct. The original haplotagging protocol in Meier et al. is good, but the authors (and others) have been working to improve this linked-read technology to improve"]},{"l":"where the barcodes go","p":["Chromium 10X linked-reads use a format where the barcode is the leading 16 bases of the forward (R1) read. However, haplotagging data does not use that format and many of the tools"]},{"l":"Read headers","p":["Like mentioned, the haplotag barcode is expected to be stored in the BX:Z: tag in the read header. This information is retained through the various Harpy steps. An example read header could look like:"]},{"l":"Read length","p":["Reads must be at least 30 base pairs in length for alignment. By default, the module removes reads <30bp."]},{"l":"Compression","p":["Harpy generally doesn't require the input sequences to be in gzipped/bgzipped format, but it's good practice to compress your reads anyway. Compressed files are expected to end with the extension"]},{"l":"Naming conventions","p":["Unfortunately, there are many different ways of naming FASTQ files, which makes it difficult to accomodate every wacky iteration currently in circulation. While Harpy tries its best to be flexible, there are limitations."]},{"l":"Barcode thresholds","p":["By the nature of linked read technologies, there will (almost always) be more DNA fragments than unique barcodes for them. As a result, it's common for barcodes to reappear in sequences. Rather than incorrectly assume that all sequences/alignments with the same barcode"]}],[{"l":"Common Harpy Options"},{"l":"Input Arguments","p":["Each of the main Harpy modules (e.g. or ) follows the format of"]},{"l":"Common command-line options","p":["Every Harpy module has a series of configuration parameters. These are arguments you need to input to configure the module to run on your data, such as the directory with the reads/alignments,"]},{"i":"--contigs","l":"--contigs","p":["Some of the workflows (like ) plot per-contig information in their reports. By default, Harpy will plot up to 30 of the largest contigs. If you are only interested in a specific set of contigs, then you can use"]},{"l":"example","p":["You could call and specify 20 threads with no output to console:"]},{"l":"The workflow folder","p":["When you run one of the main Harpy modules, the output directory will contain a workflow folder. This folder is both necessary for the module to run and is very useful to understand what the module did, be it for your own"]},{"l":"The Genome folder","p":["You will notice that many of the workflows will create a Genome folder in the working directory. This folder is to make it easier for Harpy to store the genome and the associated"]}],[{"l":"Troubleshooting","p":["Lots of stuff can go wrong during an analysis. The intent of this page is to guide you through navigating the inevitable errors associated with doing bioinformatics."]},{"l":"Troubleshooting Harpy","p":["Harpy has two steps: first it performs checks and validations, then it runs Snakemake."]},{"l":"checks and validations","p":["First, Harpy takes your command-line inputs and checks/validates the input files and parameters. If your parameters are not the correct type (e.g. a number where there should be a file), the"]},{"l":"snakemake validations","p":["Once all the file validations pass, Harpy passes the baton over to Snakemake. Snakemake builds a workflow graph of the rules and performs its own checks. If you get an error before the workflow starts processing any data (there"]},{"l":"error during a workflow","p":["Sometimes something goes wrong with one of the steps in a workflow. If/when that happens, Harpy will print the offending step and all the information Snakemake has regarding the failure. If the step had a log file, it will"]},{"l":"Common Issues"},{"l":"installation issue","p":["Conda is an awesome package manager, but was slow and used a ton of memory as dependencies increased. Recent ( 23.10+) versions of Conda now use the libmamba solver, the super-fast and super-lightweight solver from Mamba. If you're experiencing"]},{"l":"imputation or phasing failure","p":["If you use bamutils clipOverlap on alignments that are used for the or modules, they will cause both programs to error. We don't know why, but they do."]},{"l":"SAM name and ID mismatch","p":["Aligning a sample to a genome via Harpy will insert the sample name (based on the file name) into the alignment header (the @RG ID:name SM:name tag). It likewise expects, through various steps,"]}],[{"l":"Snakamake Things"},{"l":"Workflow logs","p":["Barring a few exceptions, most of Harpy's options are Snakemake workflows. This means we are all at the mercy of how Snakemake operates, which includes the .snakemake/ folder in your project directory. That folder contains"]},{"l":"Adding Snakemake Parameters","p":["Harpy relies on Snakemake under the hood to handle file and job dependencies. Most of these details have been abstracted away from the end-user, but most Harpy modules have an optional flag"]},{"l":"Common use cases","p":["You likely wont need to invoke --snakemake very often, if ever. However, here examples of some possible use cases for this parameter."]}],[{"l":"Software used in Harpy","p":["Harpy is the sum of its parts, and out of tremendous respect for the developers involved in the included software, we would like to highlight the tools directly involved in Harpy's many moving pieces."]},{"l":"Standalone Software"},{"l":"Software Packages"}],[{"l":"Utilities","p":["Harpy is the sum of its parts and some of those parts are stand-alone scripts used by the workflows that are accessible from within the Harpy conda environment. This page serves to document those scripts, since using them outside of a workflow"]},{"i":"assign_mipy","l":"assign_mi.py","p":["Assign an MI:i(Molecular Identifier) tag to each barcoded record based on a molecular distance cutoff. Input file must be coordinate sorted. This is similar to deconvolve_alignments.py"]},{"i":"bx_statspy","l":"bx_stats.py","p":["Calculates various linked-read molecule metrics from the (coordinate-sorted) input alignment file. Metrics include (per molecule):"]},{"i":"check_bampy","l":"check_bam.py","p":["Parses an aligment file to check:"]},{"i":"check_fastqpy","l":"check_fastq.py","p":["Parses a FASTQ file to check if any sequences don't conform to the SAM spec, whether BX:Z: is the last tag in the record, and the counts of:"]},{"i":"concatenate_bampy","l":"concatenate_bam.py","p":["Concatenate records from haplotagged SAM/BAM files while making sure MI tags remain unique for every sample. This is a means of accomplishing the same as samtools cat, except all"]},{"i":"count_bxpy","l":"count_bx.py","p":["Parses a FASTQ file to count:"]},{"i":"deconvolve_alignmentspy","l":"deconvolve_alignments.py","p":["Deconvolve BX-tagged barcodes and assign an MI(Molecular Identifier) tag to each barcoded record based on a molecular distance cutoff. Input file must be coordinate sorted. This is similar to"]},{"i":"depth_windowspy","l":"depth_windows.py","p":["Reads the output of samtools depth -a from stdin and calculates means within windows of a given windowsize."]},{"i":"haplotag_acbdpy","l":"haplotag_acbd.py","p":["Generates the BC_{ABCD}.txt files necessary to demultiplex Gen I haplotag barcodes into the specified output_directory."]},{"i":"infer_svpy","l":"infer_sv.py","p":["Create column in NAIBR bedpe output inferring the SV type from the orientation. Removes variants with FAIL flags and you can use the optional -f(--fail) argument to output FAIL variants to a separate file."]},{"i":"inline_to_haplotagpy","l":"inline_to_haplotag.py","p":["Converts inline nucleotide barcodes in reads to haplotag linked reads with barcodes in BX:Z and OX:Z header tags."]},{"i":"make_windowspy","l":"make_windows.py","p":["Create a BED file of fixed intervals (-w, -- window) from a FASTA or fai file (the kind generated with samtools faidx). Nearly identical to bedtools makewindows, except the intervals are nonoverlapping. The"]},{"i":"molecule_coveragepy","l":"molecule_coverage.py","p":["Using the statsfile generated by bx_stats.py from Harpy, will calculate \"molecular coverage\" across the genome. Molecular coverage is the \"effective\" alignment coverage if you treat a molecule inferred from linked-read data as"]},{"i":"parse_phaseblockspy","l":"parse_phaseblocks.py","p":["Parse a phase block file from HapCut2 to pull out summary information"]},{"l":"rename_bam","p":["Rename a sam/bam file and modify the @RG tag of the alignment file to reflect the change for both ID and SM. This process creates a new file new_name.bam and you may use -d to delete the original file. Requires"]},{"l":"separate_validbx","p":["Split a BAM file with BX tags into 2 files, one with valid ACBD barcodes ( stdout), one with invalid ACBD barcodes ( stderr)."]}],[{"l":"Blog"}],[{"i":"#","p":["Sorting data by linked-read barcode"]},{"l":"Sort data by barcode","p":["You would think sorting data would be a no-brainer, and in most cases it is. You can use seqtk or seqkit to sort FASTQ/A files by their IDs, samtools to sort SAM/BAM/CRAM files by name or coordinates. However, in the world of linked-read"]},{"l":"Sorting Alignments","p":["Let's start with BAM (or SAM/CRAM) files because the process is much simpler. Since the linked-read barcode is stored in a BX:Z tag (or less often as BC:Z:), we can use a little feature of"]},{"l":"Sorting FASTQ","p":["Sorting FASTQ files by barcode is trickier, only because there aren't (to our knowledge!) any existing convenience methods to do it. Like any bioinformatics puzzle, you could probably solve it with a sophisticated AWK command, but HTSlib tools are so much more"]},{"l":"1. convert FASTQ to SAM","p":["Yep, we're solving our problem by doing a simple file conversion to SAM/BAM. That's the easiest way to do it, surprisingly. FASTQ files can be converted to unmapped BAM files using"]},{"l":"2. sort the SAM by barcode","p":["Exactly like shown above to sort a SAM/BAM file with samtools sort, we're going to do the same on the unmapped SAM file we just created:"]},{"l":"3. convert SAM back to FASTQ","p":["Now that the data have been sorted, we need to convert it back into forward and reverse FASTQ files using samtools fastq. The -T * argument once again preserves all the tags between file formats. The"]},{"l":"as a single pipe","p":["Rather than splitting out these three processess, you can stream/pipe them in a single workflow:"]}],[{"i":"#","p":["A realistic workflow to simulate variants"]},{"l":"Simulating variants","p":["You may want to (and are encouraged to) simulate data before investing in the costs associated with linked-read sample preparation and subsequent sequencing. Harpy provides both a variant and linked-read simulators and this tutorial serves to"]},{"l":"1. Add random inversions","p":["First, we will need to simulate some inversions and set a --heterozygosity value >0 to get a diploid genome as the output. If you wanted to manually create inversions in specific areas or with specific lengths, this would be a good starting point too since"]},{"l":"2. Add snps and indels","p":["Let's say we wanted to simulate SNPs and indels like so:"]},{"i":"3-simulate-known-snps-and-indels-onto-the-diploid-genome-with-inversions","l":"3. Simulate \"known\" snps and indels onto the diploid genome with inversions","p":["We will run Harpy twice, once for each haplotype, using the corresponding VCFs from Step 2:"]},{"l":"5. Simulating linked-reads","p":["Now that you have heterozygous haplotypes created from your starting genome, you can simulate linked-reads from it using harpy simulate linkedreads. A simple implementation of that could look like:"]}],[{"i":"#","p":["Why pool samples for SV calling and when to do it"]},{"l":"Pooling samples for SV calling","p":["One of the cool benefits of linked-read data is the fact that you can call structural variants with it. Depending on the depth of your data, you may want (or need) to pool samples together. This"]},{"l":"Sample depth"},{"i":"depth-explained","l":"Depth, explained","p":["In bioinformatics, the terms \"coverage\" and \"depth\" and often used interchangeably, which is incorrect and leads to confusion. Coverage refers to the proportion of a genome that is sequenced, and"]},{"i":"depth-in-context","l":"Depth, in context","p":["Historically, one would have wanted to sequence fewer individuals at higher depth to get confident genotype calls, rather than sequence more individuals at lower depth. Recent advances in bioinformatics have enabled low-coverage whole genome sequencing"]},{"l":"The problem","p":["It's recommended to have at least 10X-12X depth to get decent structural variant calls(definitely read that in a paper that I would like to link here, but I can't seem to find it). If your data already has a minimum of 10X for each individual, great! Feel free to use"]},{"l":"The solution","p":["One way to get your low-coverage (low depth) data and still call structural variants is to pool samples together, which would effectively boost the depth. By doing this, you will"]},{"l":"Pooling considerations","p":["If pooling samples, you must pool them sensibly and with a biological context to do so. In other words, you don't just pool random samples together to inflate depth. Since haplotag data is just whole genome sequence data plus a little extra information, you should"]}],[{"i":"#","p":["Deciding between using Conda or Containers"]},{"l":"Choosing a software runtime method","p":["There are two ways you can run Harpy, using a container with the necessary software environments in it (the default), or with local conda environments(with the --conda option). If software development and containerization"]},{"i":"tldr","l":"TL;DR"},{"l":"What Harpy Provides","p":["An conda-based installation of Harpy provides only the minimal set of programs Harpy needs to begin a workflow. These include: python 3.12, snakemake-minimal, pandas, and the htslib programs (htslib, samtools, bcftools, tabix)."]},{"l":"How Harpy Provides the Other Stuff","p":["Instead of a monolithic Harpy environment, which would be impossible with the current software dependencies, there are a handful of defined conda environment recipes that Harpy workflows generate. Snakemake will make"]},{"l":"Harpy and Containers","p":["The Harpy team manages a container on Dockerhub called, you guessed it, Harpy, that is synchronously versioned with the Harpy software. In other words, if you're using Harpy v1.4, it will use the container version v1.4. The"]},{"i":"whats-the-catch","l":"What's the Catch?","p":["While local conda enviroments at runtime or containers might seem like foolproof approaches, there are drawbacks."]},{"i":"conda-caveats","l":"Conda Caveats:"},{"i":"conda-caveat-1-inconsistent","l":"⚠️ Conda Caveat 1: Inconsistent","p":["Despite our and conda's best efforts, sometimes programs just don't install correctly on some systems due to unexpected system (or conda) configurations. This results in frustrating errors where jobs fail because software that is"]},{"i":"conda-caveat-2-troubleshooting","l":"\uD83D\uDCA3 Conda Caveat 2: Troubleshooting","p":["To manually troubleshoot many of the tasks Harpy workflows perform, you may need to jump into one of the local conda environments in .snakemake/conda. That itself isn't terrible, but it's an extra step because you will"]},{"l":"Container Caveats"},{"i":"container-caveat-1-speed","l":"\uD83D\uDEA5 Container Caveat 1: Speed","p":["The overhead of Snakemake creating a container instance for a job, then cleaning it up after the job is done is not trivial and can negatively impact runtime."]},{"i":"container-caveat-2-troubleshooting","l":"\uD83D\uDCA3 Container Caveat 2: Troubleshooting","p":["The command Snakemake secretly invokes to run a job in a container is quite lengthy. In most cases that shouldn't matter to you, but when something eventually goes wrong and you need to troubleshoot, it's harder"]}],[{"i":"#","p":["A gentle introduction to the wild world of filtering SNPs"]},{"l":"Filtering Variants","p":["The discussion around filtering SNPs and indels is massive and many researchers go about it differently, each very opinionated as to why their method is the best. As a starting point, have a look at how the authors of"]},{"i":"genotype-quality-qual","l":"genotype quality (QUAL)","p":["You will obviously want higher quality genotype calls to remove false positives. The HTSlib guide suggests at least 50(e.g. -i 'QUAL=50'), but we typically filter much higher at"]},{"i":"read-depth-dp","l":"read depth (DP)","p":["Variant sites with too few reads backing up the genotype might be false positives, although this may not hold true for very low-coverage data. Conversely, a maximum cut off is important because sites with very high read depths (relative to the distribution of read depth)"]},{"i":"minor-allele-frequency-maf","l":"minor allele frequency (MAF)","p":["It's usually advisable to set a minor allele frequency threshold with which to remove sites below that threshold. The reasoning is that if a MAF is too low, it might be because of incorrectly called genotypes in a very small handful of individuals (e.g. one or two)."]},{"i":"missing-data-f_missing","l":"missing data (F_MISSING)","p":["Missing data is, frankly, not terribly useful. The amount of missing data you're willing to tolerate will depend on your study, but it's common to remove sites with >20% missing data (e.g."]}]] \ No newline at end of file diff --git a/sitemap.xml.gz b/sitemap.xml.gz index 43cad2475f6f28c844f25d3d7da5ff71380a33be..be23f50e6584b06c3148865f08ff733a6429e70c 100644 GIT binary patch literal 636 zcmV-?0)zb@iwFP!000001I(DqvYRjrhWB}j4EsRH^^i$8yFS6`rW?k91D+VmwFM+^ zUy(~M)1K++RGDD~1UdQg3t7L|zn&XC?WFSx%pr)wc`%ixfKr+25PW|7Gia(Q3S}d#55ZNDFKW8*(fz|{9wxzU&F)-dqkmc1FRyFjBuS#{ z-5{Mzr^7D|M>^9`q(i)?1-4X;G(M~p)@>dth-%^NCE7^pE$^_p1RQx)b~(Nv@u`w#H}B(+Vrw3r0-6R4 zSfs@I|JP|AZkfQ8@hG(BK-w&LkZNHypr}fLVoY46?4$-u_mBlE=E3U0osJrM9|^50 zGfp(s*cK%R+j`5*_O%!%m#u~8k!;D023SkSg2q&vKhR29faySYJks$OsrRzUHAAx` zI)3q*R->|7o=3^&jU5peI%sn(eGP!*&_+adwLe}_G6l-4q=bSh$HsS58l@xQGD<6p zDPp4YMx4kopICN~Rq~uRpZ6^0X!XL4*GS~taZ+$?86Nu)(Dle%r{GZ3;ZaG$H}Y{P6?(WUa}{f)YP(5 z0&BtYZsG}qzbQ%^_eqvcijqfErbo0Owu-)mOhVPF@?&1%Z*Mr62JoNQ7@*=mkKNaU WmzYkz0RRC1{{sNTHNC(p7XSb@<~;%c literal 635 zcmV->0)+h^iwFP!000001I(CBlbS#fhVT9gi@PAAyNjjlEx%xM$%PtVgeiyV$xH+K z@6-5^k4iSRskBv`0O`@kx9RD(*}a}xJ@2LS3d}x;!(}j+ri4nFdLMj#`?PooUbES* zbJ|OsQ?T*-ph2|TC>n+#^a^DwtPjD}kuMv%@X_tVXc?x#b*i1=tl65B!!0hV>s*heH>D3!^2ZT+kyd$ zlvw}&IxWLzCNO0@3avSio>n|ajj$R})RjOnC9X>LQiG*?$buE~U=84o2MvRdgjThg zCYow(hmwPB^UTfmwVWoGZG`8MEa652Y@}mBW2(&`Xq7C%^q_kl>G+G(d)XG6p;;0g zzj$q@QCTfdljQTpPKXN~bcL3_0YGwS6C%6VAFn8x0%cxNLP1qi<2xvg(vfgErIp1L zG0}M=j^vn+EIY_LeN3A#1{QO)e&NPzC35a@{D9WU14-DhxHvAn4wY7@BugD|L3udf zARJE}F!`hF)&>sq;+z-4(R4gPzCO1AnyD!;OLxs_E={i-m~oupdFa>;@V$Yt&^@u! z_0#0Kwg*q(uq=4&!wn_Y2vxAmHwnK5z8A&)WYG(!1XB?w?8zoIwd|F^ zMzFk_cmUyViqgh?l$DdB;t`eU5iN - + @@ -32,12 +32,12 @@ - + - + - - + +
@@ -58,7 +58,7 @@ Harpy - +
diff --git a/software/index.html b/software/index.html index 0e22c6115..8517453ef 100644 --- a/software/index.html +++ b/software/index.html @@ -4,7 +4,7 @@ - + @@ -32,11 +32,11 @@ - + - + - +
@@ -57,7 +57,7 @@ Harpy - +
diff --git a/troubleshooting/index.html b/troubleshooting/index.html index 51114dd4a..75a7d72f8 100644 --- a/troubleshooting/index.html +++ b/troubleshooting/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + +
@@ -60,7 +60,7 @@ Harpy - +
diff --git a/utilities/index.html b/utilities/index.html index b7fd41ce2..e74fc369d 100644 --- a/utilities/index.html +++ b/utilities/index.html @@ -4,7 +4,7 @@ - + @@ -32,12 +32,12 @@ - + - + - - + + @@ -60,7 +60,7 @@ Harpy - + diff --git a/workflows/align/bwa/index.html b/workflows/align/bwa/index.html index c1ed39b20..aba87ba82 100644 --- a/workflows/align/bwa/index.html +++ b/workflows/align/bwa/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + @@ -63,7 +63,7 @@ Harpy - + diff --git a/workflows/align/ema/index.html b/workflows/align/ema/index.html index 5efda0a3c..34332a7a1 100644 --- a/workflows/align/ema/index.html +++ b/workflows/align/ema/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + @@ -63,7 +63,7 @@ Harpy - + diff --git a/workflows/align/index.html b/workflows/align/index.html index 64aff8b78..adca4cec8 100644 --- a/workflows/align/index.html +++ b/workflows/align/index.html @@ -4,7 +4,7 @@ - + @@ -32,11 +32,11 @@ - + - + - +
@@ -57,7 +57,7 @@ Harpy - +
diff --git a/workflows/align/strobe/index.html b/workflows/align/strobe/index.html index 47da730b6..2cb07eb04 100644 --- a/workflows/align/strobe/index.html +++ b/workflows/align/strobe/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + @@ -63,7 +63,7 @@ Harpy - + diff --git a/workflows/assembly/index.html b/workflows/assembly/index.html index 33b99732d..b771442f8 100644 --- a/workflows/assembly/index.html +++ b/workflows/assembly/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + @@ -61,7 +61,7 @@ Harpy - + diff --git a/workflows/deconvolve/index.html b/workflows/deconvolve/index.html index 8767bf83a..406a07223 100644 --- a/workflows/deconvolve/index.html +++ b/workflows/deconvolve/index.html @@ -4,7 +4,7 @@ - + @@ -32,12 +32,12 @@ - + - + - - + + @@ -60,7 +60,7 @@ Harpy - + diff --git a/workflows/demultiplex/index.html b/workflows/demultiplex/index.html index 6e6bbc0c5..3f26efd0b 100644 --- a/workflows/demultiplex/index.html +++ b/workflows/demultiplex/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + @@ -61,7 +61,7 @@ Harpy - + diff --git a/workflows/impute/index.html b/workflows/impute/index.html index 3ec743c2b..9b36ecd41 100644 --- a/workflows/impute/index.html +++ b/workflows/impute/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + @@ -63,7 +63,7 @@ Harpy - + @@ -399,8 +399,8 @@

example
-
# create stitch parameter file 'stitch.params'
-harpy stitchparams -o stitch.params 
+
# create the parameter file 'stitch.params'
+harpy imputeparams -o stitch.params 
 
 # run imputation
 harpy impute --threads 20 --vcf Variants/mpileup/variants.raw.bcf --parameters stitch.params Align/ema
@@ -502,8 +502,8 @@

different model parameters (explained in next section). The solution Harpy uses for this is to have the user provide a tab-delimited dataframe file where the columns are the 6 STITCH model parameters and the rows are the values for those parameters. The parameter file -is required and can be created manually or with - harpy stitchparams +is required and can be created manually or with + harpy imputeparams . If created using harpy, the resulting file includes largely meaningless values that you will need to adjust for your study. The parameter must follow a particular format:

diff --git a/workflows/metassembly/index.html b/workflows/metassembly/index.html index db185e1f5..0c49803aa 100644 --- a/workflows/metassembly/index.html +++ b/workflows/metassembly/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + @@ -61,7 +61,7 @@ Harpy - +

diff --git a/workflows/other/index.html b/workflows/other/index.html index 5eae71d47..bf10c2729 100644 --- a/workflows/other/index.html +++ b/workflows/other/index.html @@ -4,7 +4,7 @@ - + @@ -32,12 +32,12 @@ - + - + - - + +
@@ -58,7 +58,7 @@ Harpy - +
@@ -307,6 +307,10 @@

+imputeparams +Create a template imputation parameter file + + resume Continue a Harpy workflow from an existing output folder @@ -315,12 +319,73 @@

Create generic sample-group file using existing sample file names (fq.gz or bam) -stitchparams -Create template STITCH parameter file +view +View a workflow log, config, or snakefile + + + + + +

+ # + imputeparams +

+
+

Create a template parameter file for the + impute + module. +The file is formatted correctly and serves as a starting point for using parameters that make sense for your study.

+
+
usage
+
harpy imputeparams -o OUTPUTFILE
+
+
+
example
+
harpy imputeparams -o params.stitch
+
+ +

+ # + arguments +

+
+
+ + + + + + + + + + + + +
argumentshort namedescription
--output-o + required + Name of the output file
+

Typically, one runs STITCH multiple times, exploring how results vary with +different model parameters. The solution Harpy uses for this is to have the user +provide a tab-delimited dataframe file where the columns are the 6 STITCH model +parameters and the rows are the values for those parameters. To make formatting +easier, a template file is generated for you, just replace the values and add/remove +rows as necessary. See the section for the + impute + +module for details on these parameters. The template file will look like:

+
+
params.stitch
+
name model	usebx	bxlimit	k	s	ngen
+k10_ng50 diploid	TRUE	50000	3	2	10
+k1_ng30 diploid	TRUE	50000	3	1	5
+high_ngen   diploid TRUE    50000   15  1   100
+
+

# @@ -339,9 +404,9 @@

usage
harpy resume [--conda] DIRECTORY
- +

- # + # arguments

@@ -368,7 +433,7 @@

--conda toggle -generate a .harpy_envs/ folder with the necessary conda enviroments +Generate a /workflow/envs folder with the necessary conda enviroments @@ -382,7 +447,7 @@

which means you can also manually modify the config.yaml file (advanced, not recommended unless you are confident with what you're doing).

resume - also requires an existing and populated .harpy_envs/ directory in the current work directory, like the kind all + also requires an existing and populated workdir/envs/ directory in the target directory, like the kind all main harpy workflows would create. If one is not present, you can use --conda to create one.

@@ -399,9 +464,9 @@

usage example
harpy popgroup -o samples.groups data/
- +

- # + # arguments

@@ -430,7 +495,7 @@

file path required - name of the output file + Name of the output file @@ -462,27 +527,27 @@

sample5 pop3
- +

- # - stitchparams + # + view

-

Create a template parameter file for the - impute - module. The file is formatted correctly and serves -as a starting point for using parameters that make sense for your study.

+

This convenience command lets you view the latest workflow log file +of a Harpy output directory. Use --snakefile or --config to view the workflow +snakefile or config.yaml file instead, respectively. Output is printed to the screen via less and +accepts the typical keyboard shortcuts to navigate the output.

usage
-
harpy stitchparams -o OUTPUTFILE
+
harpy view [-s] [-c] DIRECTORY
example
-
harpy stitchparams -o params.stitch
+
harpy view Align/bwa
- +

- # + # arguments

@@ -497,31 +562,25 @@

---output --o +DIRECTORY + required - name of the output file + Output directory of an existing harpy workflow + + +--snakemake +-s +View the workflow snakefile instead + + +--config +-c +View the config.yaml file instead -

Typically, one runs STITCH multiple times, exploring how results vary with -different model parameters. The solution Harpy uses for this is to have the user -provide a tab-delimited dataframe file where the columns are the 6 STITCH model -parameters and the rows are the values for those parameters. To make formatting -easier, a template file is generated for you, just replace the values and add/remove -rows as necessary. See the section for the - impute - -module for details on these parameters. The template file will look like:

-
-
params.stitch
-
name model	usebx	bxlimit	k	s	ngen
-k10_ng50 diploid	TRUE	50000	3	2	10
-k1_ng30 diploid	TRUE	50000	3	1	5
-high_ngen   diploid TRUE    50000   15  1   100
-
diff --git a/workflows/phase/index.html b/workflows/phase/index.html index 5367aebf9..b669d1365 100644 --- a/workflows/phase/index.html +++ b/workflows/phase/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + @@ -61,7 +61,7 @@ Harpy - + diff --git a/workflows/preflight/index.html b/workflows/preflight/index.html index 14936d6dd..158fe35a8 100644 --- a/workflows/preflight/index.html +++ b/workflows/preflight/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + +
@@ -60,7 +60,7 @@ Harpy - +
diff --git a/workflows/qc/index.html b/workflows/qc/index.html index 52390d150..5767beb7c 100644 --- a/workflows/qc/index.html +++ b/workflows/qc/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + @@ -61,7 +61,7 @@ Harpy - + diff --git a/workflows/simulate/index.html b/workflows/simulate/index.html index cf1c3146e..d23518cc4 100644 --- a/workflows/simulate/index.html +++ b/workflows/simulate/index.html @@ -4,7 +4,7 @@ - + @@ -32,11 +32,11 @@ - + - + - +
@@ -57,7 +57,7 @@ Harpy - +
diff --git a/workflows/simulate/simulate-linkedreads/index.html b/workflows/simulate/simulate-linkedreads/index.html index 268ba7794..e8ad48a6a 100644 --- a/workflows/simulate/simulate-linkedreads/index.html +++ b/workflows/simulate/simulate-linkedreads/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + @@ -63,7 +63,7 @@ Harpy - + diff --git a/workflows/simulate/simulate-variants/index.html b/workflows/simulate/simulate-variants/index.html index c043731fb..646233a60 100644 --- a/workflows/simulate/simulate-variants/index.html +++ b/workflows/simulate/simulate-variants/index.html @@ -4,7 +4,7 @@ - + @@ -32,12 +32,12 @@ - + - + - - + + @@ -59,7 +59,7 @@ Harpy - + diff --git a/workflows/snp/index.html b/workflows/snp/index.html index 28586a4d9..c2661d411 100644 --- a/workflows/snp/index.html +++ b/workflows/snp/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + @@ -61,7 +61,7 @@ Harpy - + diff --git a/workflows/sv/index.html b/workflows/sv/index.html index 482700124..4ac76ecd9 100644 --- a/workflows/sv/index.html +++ b/workflows/sv/index.html @@ -4,7 +4,7 @@ - + @@ -32,11 +32,11 @@ - + - + - +
@@ -57,7 +57,7 @@ Harpy - +
diff --git a/workflows/sv/leviathan/index.html b/workflows/sv/leviathan/index.html index 34bdd090a..eff3b68aa 100644 --- a/workflows/sv/leviathan/index.html +++ b/workflows/sv/leviathan/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + @@ -61,7 +61,7 @@ Harpy - + diff --git a/workflows/sv/naibr/index.html b/workflows/sv/naibr/index.html index 3c8dffe5b..fb50cc0ad 100644 --- a/workflows/sv/naibr/index.html +++ b/workflows/sv/naibr/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + @@ -61,7 +61,7 @@ Harpy - +