diff --git a/404.html b/404.html index c9d05dc12..97eec0a1d 100644 --- a/404.html +++ b/404.html @@ -4,7 +4,7 @@ - + @@ -29,11 +29,11 @@ - + - + - +
diff --git a/blog/filtering_snps/index.html b/blog/filtering_snps/index.html index ec086a076..687157a48 100644 --- a/blog/filtering_snps/index.html +++ b/blog/filtering_snps/index.html @@ -4,7 +4,7 @@ - + @@ -37,12 +37,12 @@ - + - + - - + +
diff --git a/blog/index.html b/blog/index.html index 1fda70ce9..ddfc15dcb 100644 --- a/blog/index.html +++ b/blog/index.html @@ -4,7 +4,7 @@ - + @@ -32,11 +32,11 @@ - + - + - +
diff --git a/blog/simulate_diploid/index.html b/blog/simulate_diploid/index.html index e0b43ec6c..b5fc2fef3 100644 --- a/blog/simulate_diploid/index.html +++ b/blog/simulate_diploid/index.html @@ -4,7 +4,7 @@ - + @@ -37,12 +37,12 @@ - + - + - - + + diff --git a/blog/software_environments/index.html b/blog/software_environments/index.html index b645f1139..5f7e2471f 100644 --- a/blog/software_environments/index.html +++ b/blog/software_environments/index.html @@ -4,7 +4,7 @@ - + @@ -37,12 +37,12 @@ - + - + - - + +
diff --git a/blog/sort_by_barcode/index.html b/blog/sort_by_barcode/index.html index 3ee3018f0..711a1739f 100644 --- a/blog/sort_by_barcode/index.html +++ b/blog/sort_by_barcode/index.html @@ -4,7 +4,7 @@ - + @@ -37,12 +37,12 @@ - + - + - - + +
diff --git a/blog/sv_pooling/index.html b/blog/sv_pooling/index.html index ad49f4d65..562f6076a 100644 --- a/blog/sv_pooling/index.html +++ b/blog/sv_pooling/index.html @@ -4,7 +4,7 @@ - + @@ -37,11 +37,11 @@ - + - + - +
diff --git a/categories/guides/index.html b/categories/guides/index.html index a857fdd86..197b812a4 100644 --- a/categories/guides/index.html +++ b/categories/guides/index.html @@ -4,7 +4,7 @@ - + @@ -29,11 +29,11 @@ - + - + - +
diff --git a/categories/index.html b/categories/index.html index e2c860cf7..77d27337c 100644 --- a/categories/index.html +++ b/categories/index.html @@ -4,7 +4,7 @@ - + @@ -29,11 +29,11 @@ - + - + - +
diff --git a/commonoptions/index.html b/commonoptions/index.html index fb32fac6f..34a8d1f4a 100644 --- a/commonoptions/index.html +++ b/commonoptions/index.html @@ -4,7 +4,7 @@ - + @@ -32,12 +32,12 @@ - + - + - - + +
diff --git a/development/index.html b/development/index.html index 706ba395d..dc0d90563 100644 --- a/development/index.html +++ b/development/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + +
diff --git a/haplotagdata/index.html b/haplotagdata/index.html index ffb9ffb74..aa8e108d7 100644 --- a/haplotagdata/index.html +++ b/haplotagdata/index.html @@ -4,7 +4,7 @@ - + @@ -34,11 +34,11 @@ - + - + - + diff --git a/index.html b/index.html index df2be5355..601e73a5a 100644 --- a/index.html +++ b/index.html @@ -4,7 +4,7 @@ - + @@ -34,11 +34,11 @@ - + - + - + diff --git a/install/index.html b/install/index.html index b585197f7..34180a415 100644 --- a/install/index.html +++ b/install/index.html @@ -4,7 +4,7 @@ - + @@ -32,12 +32,12 @@ - + - + - - + +
diff --git a/resources/js/config.js b/resources/js/config.js index 91c0cc715..baf1c9b0d 100644 --- a/resources/js/config.js +++ b/resources/js/config.js @@ -1 +1 @@ -var __DOCS_CONFIG__ = {"id":"GaoZ1391KMi+Zyq3HjD0T5TvJak0zX/gWYT","key":"FkzMktlKOJIIZ3TwOPUVNNxkFO+iRDM6MDyd5toS9cI.IR+J9VgnmUBI5wVJNeN9CedQG3u21X2vsY+AIbjpKYIQhIoBqTuvAIdMFBZgZx2f1udjQhQPrMV72wERR/uoqA.112","base":"/harpy/","host":"pdimens.github.io","version":"1.0.0","useRelativePaths":true,"documentName":"index.html","appendDocumentName":false,"trailingSlash":true,"preloadSearch":false,"cacheBustingToken":"3.6.0.784140609068","cacheBustingStrategy":"query","sidebarFilterPlaceholder":"Filter","toolbarFilterPlaceholder":"Filter","showSidebarFilter":true,"filterNotFoundMsg":"No member names found containing the query \"{query}\"","maxHistoryItems":15,"homeIcon":"","access":[{"value":"public","label":"Public"},{"value":"protected","label":"Protected"}],"toolbarLinks":[{"id":"fields","label":"Fields"},{"id":"properties","label":"Properties"},{"id":"methods","label":"Methods"},{"id":"events","label":"Events"}],"sidebar":[{"n":"/","l":"Home","s":""},{"n":"install","l":"Install","s":""},{"n":"workflows","l":"Workflows","c":false,"i":[{"n":"align","l":"Align","i":[{"n":"bwa","l":"BWA","s":""},{"n":"ema","l":"EMA","s":""},{"n":"strobe","l":"Strobe","s":""}],"s":""},{"n":"assembly","l":"Assembly","s":""},{"n":"deconvolve","l":"Deconvolve","s":""},{"n":"demultiplex","l":"Demultiplex","s":""},{"n":"impute","l":"Impute","s":""},{"n":"metassembly","l":"Metassembly","s":""},{"n":"other","l":"Other","s":""},{"n":"phase","l":"Phase","s":""},{"n":"preflight","l":"Preflight","s":""},{"n":"qc","l":"QC","s":""},{"n":"simulate","l":"Simulate","i":[{"n":"simulate-linkedreads","l":"Linked 
Reads","s":""},{"n":"simulate-variants","l":"Variants","s":""}],"s":""},{"n":"snp","l":"SNP","s":""},{"n":"sv","l":"SV","i":[{"n":"leviathan","l":"Leviathan","s":""},{"n":"naibr","l":"Naibr","s":""}],"s":""}],"s":""},{"n":"haplotagdata","l":"Haplotag Data","s":""},{"n":"commonoptions","l":"Common Options","s":""},{"n":"troubleshooting","l":"Troubleshooting","s":""},{"n":"snakemake","l":"Snakemake Things","s":""},{"n":"software","l":"Software","s":""},{"n":"development","l":"Development","s":""},{"n":"blog","l":"Blog","v":false,"i":[{"n":"sort_by_barcode","l":" Sort data by barcode","v":false,"s":""},{"n":"simulate_diploid","l":" Simulating variants","v":false,"s":""},{"n":"sv_pooling","l":" Pooling samples for SV calling","v":false,"s":""},{"n":"software_environments","l":" Choosing a software runtime method","v":false,"s":""},{"n":"filtering_snps","l":" Filtering Variants","v":false,"s":""}]}],"search":{"mode":1,"minChars":2,"maxResults":20,"placeholder":"Search","hotkeys":["k"],"noResultsFoundMsg":"Sorry, no results found.","recognizeLanguages":true,"languages":[0],"preload":false},"resources":{"History_Title_Label":"History","History_ClearLink_Label":"Clear","History_NoHistory_Label":"No history items","API_AccessFilter_Label":"Access","API_ParameterSection_Label":"PARAMETERS","API_SignatureSection_Label":"SIGNATURE","API_CopyHint_Label":"Copy","API_CopyNameHint_Label":"Copy name","API_CopyLinkHint_Label":"Copy link","API_CopiedAckHint_Label":"Copied!","API_MoreOverloads_Label":"more","API_MoreDropdownItems_Label":"More","API_OptionalParameter_Label":"optional","API_DefaultParameterValue_Label":"Default value","API_InheritedFilter_Label":"Inherited","Search_Input_Placeholder":"Search","Toc_Contents_Label":"Contents","Toc_RelatedClasses_Label":"Related Classes","History_JustNowTime_Label":"just 
now","History_AgoTime_Label":"ago","History_YearTime_Label":"y","History_MonthTime_Label":"mo","History_DayTime_Label":"d","History_HourTime_Label":"h","History_MinuteTime_Label":"m","History_SecondTime_Label":"s"}}; +var __DOCS_CONFIG__ = {"id":"J3aE072Zb/m/3KqySBDQ/apUbmMz2139/KG","key":"aEwnfN9IXSEPsoNhWI1cq6EkjITQ4n0/CpXcNPTFFY0.OHeGX0QuzBbOsY988QTwahxsBcB85fXlpThdDP9oF4M5xXlkMc7AWFlwLzXPrVI3J0WtGWHDmP5dOHEj0haf2g.80","base":"/harpy/","host":"pdimens.github.io","version":"1.0.0","useRelativePaths":true,"documentName":"index.html","appendDocumentName":false,"trailingSlash":true,"preloadSearch":false,"cacheBustingToken":"3.6.0.784234966201","cacheBustingStrategy":"query","sidebarFilterPlaceholder":"Filter","toolbarFilterPlaceholder":"Filter","showSidebarFilter":true,"filterNotFoundMsg":"No member names found containing the query \"{query}\"","maxHistoryItems":15,"homeIcon":"","access":[{"value":"public","label":"Public"},{"value":"protected","label":"Protected"}],"toolbarLinks":[{"id":"fields","label":"Fields"},{"id":"properties","label":"Properties"},{"id":"methods","label":"Methods"},{"id":"events","label":"Events"}],"sidebar":[{"n":"/","l":"Home","s":""},{"n":"install","l":"Install","s":""},{"n":"workflows","l":"Workflows","c":false,"i":[{"n":"align","l":"Align","i":[{"n":"bwa","l":"BWA","s":""},{"n":"ema","l":"EMA","s":""},{"n":"strobe","l":"Strobe","s":""}],"s":""},{"n":"assembly","l":"Assembly","s":""},{"n":"deconvolve","l":"Deconvolve","s":""},{"n":"demultiplex","l":"Demultiplex","s":""},{"n":"impute","l":"Impute","s":""},{"n":"metassembly","l":"Metassembly","s":""},{"n":"other","l":"Other","s":""},{"n":"phase","l":"Phase","s":""},{"n":"preflight","l":"Preflight","s":""},{"n":"qc","l":"QC","s":""},{"n":"simulate","l":"Simulate","i":[{"n":"simulate-linkedreads","l":"Linked 
Reads","s":""},{"n":"simulate-variants","l":"Variants","s":""}],"s":""},{"n":"snp","l":"SNP","s":""},{"n":"sv","l":"SV","i":[{"n":"leviathan","l":"Leviathan","s":""},{"n":"naibr","l":"Naibr","s":""}],"s":""}],"s":""},{"n":"haplotagdata","l":"Haplotag Data","s":""},{"n":"commonoptions","l":"Common Options","s":""},{"n":"troubleshooting","l":"Troubleshooting","s":""},{"n":"snakemake","l":"Snakemake Things","s":""},{"n":"software","l":"Software","s":""},{"n":"development","l":"Development","s":""},{"n":"blog","l":"Blog","v":false,"i":[{"n":"sort_by_barcode","l":" Sort data by barcode","v":false,"s":""},{"n":"simulate_diploid","l":" Simulating variants","v":false,"s":""},{"n":"sv_pooling","l":" Pooling samples for SV calling","v":false,"s":""},{"n":"software_environments","l":" Choosing a software runtime method","v":false,"s":""},{"n":"filtering_snps","l":" Filtering Variants","v":false,"s":""}]}],"search":{"mode":1,"minChars":2,"maxResults":20,"placeholder":"Search","hotkeys":["k"],"noResultsFoundMsg":"Sorry, no results found.","recognizeLanguages":true,"languages":[0],"preload":false},"resources":{"History_Title_Label":"History","History_ClearLink_Label":"Clear","History_NoHistory_Label":"No history items","API_AccessFilter_Label":"Access","API_ParameterSection_Label":"PARAMETERS","API_SignatureSection_Label":"SIGNATURE","API_CopyHint_Label":"Copy","API_CopyNameHint_Label":"Copy name","API_CopyLinkHint_Label":"Copy link","API_CopiedAckHint_Label":"Copied!","API_MoreOverloads_Label":"more","API_MoreDropdownItems_Label":"More","API_OptionalParameter_Label":"optional","API_DefaultParameterValue_Label":"Default value","API_InheritedFilter_Label":"Inherited","Search_Input_Placeholder":"Search","Toc_Contents_Label":"Contents","Toc_RelatedClasses_Label":"Related Classes","History_JustNowTime_Label":"just 
now","History_AgoTime_Label":"ago","History_YearTime_Label":"y","History_MonthTime_Label":"mo","History_DayTime_Label":"d","History_HourTime_Label":"h","History_MinuteTime_Label":"m","History_SecondTime_Label":"s"}}; diff --git a/resources/js/search.json b/resources/js/search.json index 13338b18d..8524a1762 100644 --- a/resources/js/search.json +++ b/resources/js/search.json @@ -1 +1 @@ -[[{"i":"#","p":["Using Harpy to process your haplotagged data"]},{"l":"Home","p":["Harpy is a haplotagging data processing pipeline for Linux-based systems. It uses all the magic of Snakemake under the hood to handle the worklfow decision-making, but as a user, you just interact with it like a normal command-line"]},{"l":"Harpy Modules","p":["Harpy is modular, meaning you can use different parts of it independent from each other. Need to only align reads? Great! Only want to call variants? Awesome! All modules are called by"]},{"l":"Using Harpy","p":["You can call harpy without any arguments (or with --help) to print the docstring to your terminal. You can likewise call any of the modules without arguments or with --help to see their usage (e.g."]},{"l":"Linked-Read Workflow","p":["Depending on your project goals, you may want any combination of SNPs, structural variants (inversions, deletions, duplications), or phased haplotypes. Below is a flow chart outlining a general workflow of linked-read data."]}],[{"l":"Install"},{"l":"Install Harpy","p":["Harpy is hosted on Bioconda! That means to install it, you just need to have mamba(or conda) on your Linux-based system and install it with a simple command. You can install Harpy into an existing environment or create a new one for it (recommended)."]},{"l":"install into a new environment"},{"i":"recommended","l":"✨recommended✨","p":["The code snippet below creates a new environment called harpy(the -n harpy part) and installs harpy into it from the bioconda channel (-c bioconda part). 
You can name this environment anything (e.g."]},{"l":"install into an existing evironment","p":["If you want to install harpy into an existing environment, then with an environment already activated (via mamba activate env_name) simply use the mamba install command and harpy"]},{"l":"Update Harpy","p":["If you want to update Harpy, the process is quite similar:"]}],[{"i":"#","p":["Align haplotagged sequences"]},{"l":"Align Sequences to a Genome","p":["After your sequences (in FASTQ format) have been checked for quality, you will need to align them to a reference genome before you can call variants. Harpy offers several aligners for this purpose:"]}],[{"i":"#","p":["Align haplotagged sequences with BWA MEM"]},{"l":"Map Reads onto a genome with BWA MEM","p":["Once sequences have been trimmed and passed through other QC filters, they will need to be aligned to a reference genome. This module within Harpy expects filtered reads as input,"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Molecule distance","p":["The --molecule-distance option is used during the BWA alignment workflow to assign alignments a unique Molecular Identifier MI:i tag based on their haplotag barcode and the distance threshold you specify. See"]},{"l":"Quality filtering","p":["The --min-quality argument filters out alignments below a given MQ threshold. The default, 30, keeps alignments that are at least 99.9% likely correctly mapped. Set this value to"]},{"l":"Marking PCR duplicates","p":["Harpy uses samtools markdup to mark putative PCR duplicates. By using the --barcode-tag BX option, it considers the linked-read barcode for more accurate duplicate detection. Duplicate"]},{"l":"BWA workflow"}],[{"i":"#","p":["Align haplotagged sequences with EMA"]},{"l":"Map Reads onto a genome with EMA","p":["Once sequences have been trimmed and passed through other QC filters, they will need to be aligned to a reference genome. 
This module within Harpy expects filtered reads as input,"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Barcode whitelist","p":["Some linked-read methods (e.g. 10x, Tellseq) require the inclusion of a barcode \"whitelist.\" This file is a simple text file that has one barcode per line so a given software knows what barcodes to expect in your data."]},{"l":"Quality filtering","p":["The --min-quality argument filters out alignments below a given MQ threshold. The default, 30, keeps alignments that are at least 99.9% likely correctly mapped. Set this value to"]},{"l":"Marking PCR duplicates","p":["EMA marks duplicates in the resulting alignments, however the read with invalid barcodes are aligned separately with BWA. Therefore, Harpy uses samtools markdup to mark putative"]},{"l":"EMA workflow"}],[{"i":"#","p":["Align haplotagged sequences with strobealign"]},{"l":"Map Reads onto a genome with strobealign","p":["Once sequences have been trimmed and passed through other QC filters, they will need to be aligned to a reference genome. This module within Harpy expects filtered reads as input,"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Read Length","p":["The strobealign program uses a new strobemer design for aligning and requires its own way of indexing the genome. The index must be configured for the average read length of the sample"]},{"l":"Molecule distance","p":["The --molecule-distance option is used during the BWA alignment workflow to assign alignments a unique Molecular Identifier MI:i tag based on their haplotag barcode and the distance threshold you specify. See"]},{"l":"Quality filtering","p":["The --min-quality argument filters out alignments below a given MQ threshold. The default, 30, keeps alignments that are at least 99.9% likely correctly mapped. 
Set this value to"]},{"l":"Marking PCR duplicates","p":["Harpy uses samtools markdup to mark putative PCR duplicates. By using the --barcode-tag BX option, it considers the linked-read barcode for more accurate duplicate detection. Duplicate"]},{"l":"Strobealign workflow"}],[{"i":"#","p":["Create a genome assembly from linked reads"]},{"l":"Create a Genome Assembly","p":["If you have single-sample data, you might be interested in a genome assembly. Unlike metagenome assemblies, a classic genome assembly assumes there is exactly one genome present in your sequences and will try to"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Deconvolved Inputs","p":["For linked-read assemblies, the barcodes need to be deconvolved in the sequence data, meaning that barcodes that are shared by reads that originate from different molecules need to have unique barcode"]},{"l":"Assembly Workflow"}],[{"i":"#","p":["Resolve barcodes shared by different molecules"]},{"l":"Resolve barcodes shared by different molecules","p":["Running is optional. In the alignment workflows (), Harpy already uses a distance-based approach to deconvolve barcodes and assign MI tags (Molecular Identifier), whereas the workflow has the"]},{"l":"Running Options"},{"l":"Resulting Barcodes","p":["After deconvolution, some barcodes may have a hyphenated suffix like -1 or -2(e.g. A01C33B41D93-1). This is how deconvolution methods create unique variants of barcodes to denote that identical barcodes"]},{"l":"Harpy Deconvolution Nuances","p":["Some of the downstream linked-read tools Harpy uses expect linked read barcodes to either look like the 16-base 10X variety or a standard haplotag (AxxCxxBxxDxx). 
Their pattern-matching would not recognize barcodes deconvoluted with"]}],[{"i":"#","p":["Demultiplex raw sequences into haplotag barcoded samples"]},{"l":"Demultiplex Raw Sequences","p":["When pooling samples and sequencing them in parallel on an Illumina sequencer, you will be given large multiplexed FASTQ files in return. These files contain sequences for all of your samples and need to be demultiplexed using barcodes to"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Haplotag Types"},{"l":"Gen I Demultiplex Workflow"}],[{"i":"#","p":["Impute genotypes for haplotagged data with Harpy"]},{"l":"Impute Genotypes using Sequences","p":["After variants have been called, you may want to impute missing genotypes to get the most from your data. Harpy uses STITCH to impute genotypes, a haplotype-based method that is linked-read aware. Imputing genotypes requires a variant call file"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Extra STITCH parameters","p":["You may add additional parameters to STITCH by way of the--extra-params(or -x) option. Since STITCH is a function in the R language, the parameters you add must be in R syntax (e.g."]},{"l":"Prioritize the vcf file","p":["Sometimes you want to run imputation on all the samples present in the INPUTS, but other times you may want to only impute the samples present in the --vcf file. By default, Harpy assumes you want to use all the samples"]},{"l":"Parameter file","p":["Typically, one runs STITCH multiple times, exploring how results vary with different model parameters (explained in next section). 
The solution Harpy uses for this is to have the user"]},{"l":"STITCH Parameters"},{"l":"Imputation Workflow"}],[{"i":"#","p":["Create a metagenome assembly from linked reads"]},{"l":"Create a Metagenome Assembly","p":["If you have mixed-sample data, you might be interested in a metagenome assembly, also known as a metassembly. Unlike a single-sample assembly, a metassembly assumes there are multiple genomes present in your sequences and will try to"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Deconvolved Inputs","p":["For linked-read assemblies, the barcodes need to be deconvolved in the sequence data, meaning that barcodes that are shared by reads that originate from different molecules need to have unique barcode"]},{"l":"Metassembly Workflow"}],[{"i":"#","p":["Generate extra files for analysis with Harpy"]},{"l":"Other Harpy modules","p":["Some parts of Harpy (variant calling, imputation) want or need extra files. You can create various files necessary for different modules using these extra modules:"]},{"l":"Other modules"},{"l":"resume","p":["When calling a workflow (e.g. ), Harpy performs various file checks and validations, sets up the Snakemake command, output folder(s), etc. In the event you want to continue a failed or manually terminated workflow without overwriting the workflow"]},{"l":"arguments","p":["The DIRECTORY is the output directory of a previous harpy-invoked workflow, which must have the workflow/config.yaml file. For example, if you previously ran harpy align bwa -o align-bwa ..."]},{"l":"popgroup","p":["Creates a sample grouping file for variant calling"]},{"i":"arguments-1","l":"arguments","p":["This optional file is useful if you want SNP variant calling to happen on a per-population level via or on samples pooled-as-populations via ."]},{"l":"stitchparams","p":["Create a template parameter file for the module. 
The file is formatted correctly and serves as a starting point for using parameters that make sense for your study."]},{"i":"arguments-2","l":"arguments","p":["Typically, one runs STITCH multiple times, exploring how results vary with different model parameters. The solution Harpy uses for this is to have the user provide a tab-delimited dataframe file where the columns are the 6 STITCH model"]}],[{"i":"#","p":["Phase haplotypes for haplotagged data with Harpy"]},{"l":"Phase SNPs into Haplotypes","p":["You may want to phase your genotypes into haplotypes, as haplotypes tend to be more informative than unphased genotypes (higher polymorphism, captures relationship between genotypes). Phasing"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Prioritize the vcf file","p":["Sometimes you want to run imputation on all the samples present in the INPUTS, but other times you may want to only impute the samples present in the --vcf file. By default, Harpy assumes you want to use all the samples"]},{"l":"Molecule distance","p":["The molecule distance refers to the base-pair distance dilineating separate molecules. In other words, when two alignments on a single contig share the same barcode, how far away from each other are we willing to say they were and still consider them having"]},{"l":"Pruning threshold","p":["The pruning threshold refers to a PHRED-scale value between 0-1 (a percentage) for removing low-confidence SNPs from consideration. With Harpy, you configure this value as an integer"]},{"l":"Phasing Workflow"}],[{"i":"#","p":["Run file format checks on haplotagged FASTQ/BAM files"]},{"l":"Pre-flight checks for input files","p":["Harpy does a lot of stuff with a lot of software and each of these programs expect the incoming data to follow particular formats (plural, unfortunately). 
These formatting opinions/specifics are at the mercy of the original developers and while there are times when Harpy can (and does)"]},{"l":"When to run"},{"l":"Running Options","p":["In addition to the , the and modules are configured using only command-line input arguments:"]},{"l":"Workflow"}],[{"i":"#","p":["Quality trim haplotagged sequences with Harpy"]},{"l":"Quality Trim Sequences","p":["Raw sequences are not suitable for downstream analyses. They have sequencing adapters, index sequences, regions of poor quality, etc. The first step of any genetic sequence analyses is to remove these adapters and trim poor quality data. You can remove adapters,"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"QC Workflow"}],[{"i":"#","p":["Simulate genomic data"]},{"l":"Simulate Genomic Data","p":["You may be interested in benchmarking variant detection or maybe just trying out haplotagging data without any financial commitment-- that's where simulations come in handy."]},{"l":"Simulate Genomic Variants","p":["Harpy lets you simulate genomic variants via for different variant types such as single nucleotide polymorphisms (SNP), indels, inversions, copy number variants (CNV), and translocations. All you need is to provide a genome to simulate"]},{"l":"Simulate Haplotag Linked-Reads","p":["You can also simulate haplotag-style linked reads from an existing genome using . Harpy incorporates LRSIM to generate linked reads from a diploid genomic. 
If you only have a haploid genome, then you can create a diploid genome by simulating variants into it with"]}],[{"i":"#","p":["Simulate linked reads from a genome"]},{"l":"Simulate Linked Reads","p":["Simulate linked reads from a genome"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Mutation Rate","p":["The read simulation is two-part: first dwgsim generates forward and reverse FASTQ files from the provided genome haplotypes( HAP1_GENOME and HAP2_GENOME), then LRSIM takes over and creates linked-reads from that. The"]},{"l":"Simulating a single sample","p":["If you intend to simulate a \"single individual\" (i.e. use this module once), then you might want no additonal SNPs beyond the variants you may have already introduced into the genome and set"]},{"l":"Simulating multiple samples","p":["If you intend on simulating \"multiple individuals\" (i.e. use this module multiple times on the same genome haplotypes), it may make sense to set this value larger than 0 so there is some \"natural\" variation between your simulated individuals."]},{"l":"Partitions","p":["TL;DR: 10X partitions ≈ haplotag beads"]},{"l":"Barcodes","p":["Barcodes, if provided, must be given as 16-basepair nucleotide sequences, one per line. If not provided, Harpy will download the standard 10X Genomics 4M-with-alts-february-2016.txt"]},{"l":"10X to Haplotag conversion","p":["Harpy will convert the simulated 10X-style reads, where the 16-basepair barcode is at the beginning of read 1, to haplotag format, where the barcode is coded in the sequence header under the"]},{"l":"Choosing parameters","p":["LRSIM does internal calculations to determine the number of reads per molecule based on --read-pairs,--partitions, and --molecules-per. 
Understanding how these parameters affect the resulting sequences"]},{"l":"Parameter calculator","p":["Conveniently, we provide a calculator to help you make informed decisions for these parameters:"]},{"l":"Simulate Linkedreads Workflow"}],[{"i":"#","p":["Simulate snps, indels, inversions, cnv, translocations"]},{"l":"Simulate Genomic Variants","p":["Simulate snps, indels, inversions, cnv, translocations"]},{"l":"Modules","p":["There are 4 submodules with very obvious names:"]},{"l":"Running Options","p":["While there are serveral differences between individual workflow options, each has available all the like other Harpy modules. Each requires and input genome at the end of the command line, and each requires either a"]},{"l":"Simulate known variants","p":["Rather than simulating random variants, you can use a VCF file as input to any of the workflows to have simuG simulate the variants (of that type) from the VCF file. This becomes particularly"]},{"l":"Heterozygosity","p":["Each workflow has a --heterozygosity parameter where you can specify the heterozygosity of the simulated variants, which creates two new VCF files ({prefix}.hap1.vcf,{prefix}.hap2.vcf"]}],[{"i":"#","p":["Call SNPs and small indels"]},{"l":"Call SNPs and small indels","p":["After reads have been aligned, e.g., with , you can use those alignment files(.bam) to call variants in your data. Harpy can call SNPs and small indels using bcftools mpileup or with"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"ploidy","p":["If you are calling haploid or diploid samples, using either mpileup or freebayes will be comparable. However, if you need to call SNPs in polyploids (ploidy >2), then you will need to use"]},{"l":"regions","p":["The --regions(-r) option lets you specify the genomic regions you want to call variants on. 
Keep in mind that mpileup uses 1-based positions for genomic intervals, whereas freebayes"]},{"l":"populations","p":["Grouping samples changes the way the variant callers computes certain statistics when calling variants. If you have reason to believe there is a biologically meaningful grouping scheme to your samples, then you should include"]},{"l":"SNP calling workflow"}],[{"i":"#","p":["Find structural variants"]},{"l":"Find structural variants","p":["The module identifies single nucleotide polymorphisms (SNP) and small indels, but you may want to (and should!) leverage the linked-read data to identify larger structural variants (SV) like large deletions, duplications, and"]},{"l":"Caveats"},{"l":"NAIBR","p":["While our testing shows that NAIBR tends to find known inversions that LEVIATHAN misses, the program requires haplotype phased bam files as input. That means the alignments have a"]},{"l":"LEVIATHAN","p":["LEVIATHAN relies on split-read information in the sequence alignments to call variants. The EMA aligner does not report split read alignments, instead it reports secondary alignments."]}],[{"i":"#","p":["Call structural variants using Leviathan"]},{"l":"Call Structural Variants using LEVIATHAN","p":["(like indels, insertions, duplications, breakends)"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Single-sample variant calling","p":["When not using a population grouping file via --populations, variants will be called per-sample. Due to the nature of structural variant VCF files, there isn't an entirely fool-proof way"]},{"l":"Pooled-sample variant calling","p":["With the inclusion of a population grouping file via --populations, Harpy will merge the bam files of all samples within a population and call variants on these alignment pools. 
Preliminary work shows that this way identifies more variants and with fewer false"]},{"l":"LEVIATHAN workflow"}],[{"i":"#","p":["Call structural variants using NAIBR (plus)"]},{"l":"Call Structural Variants using NAIBR","p":["(like indels, insertions, duplications)"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Molecule distance","p":["The --molecule-distance option is used to let the program determine how far apart alignments on a contig with the same barcode can be from each other and still considered as originating from the same DNA molecule. See"]},{"l":"Single-sample variant calling","p":["When not using a population grouping file via --populations, variants will be called per-sample. Due to the nature of structural variant VCF files, there isn't an entirely fool-proof way"]},{"l":"Pooled-sample variant calling","p":["With the inclusion of a population grouping file via --populations, Harpy will merge the bam files of all samples within a population and call variants on these alignment pools. Preliminary work shows that this way identifies more variants and with fewer false"]},{"l":"Optional vcf file","p":["In order to get the best variant calling performance out of NAIBR, it requires phased bam files as input. Using --vcf is optional and not used by NAIBR directly. However, to use"]},{"i":"a-phased-input---vcf","l":"a phased input --vcf","p":["This file can be in vcf/vcf.gz/bcf format and most importantly it must be phased haplotypes. There are various ways to haplotype SNPs, but you can use to phase your SNPs into haplotypes using the haplotag barcode information. 
The resulting phased VCF file can then be used as input here."]},{"l":"NAIBR workflow"}],[{"l":"Haplotag data"},{"i":"what-is-haplotagging","l":"What is haplotagging?","p":["Linked-read sequencing exists to combine the throughput and accuracy of short-read sequencing with the long range haplotype information of long-read sequencing. Haplotagging is an implementation of linked-read sequencing developed by"]},{"l":"Data Format"},{"l":"Barcodes","p":["While barcodes are actually combinatorial bases, in the read headers they are represented with the format AxxCxxBxxDxx, where each barcode segment is denoted as Axx(or Bxx, etc.)."]},{"l":"barcode protocol varieties","p":["If you think haplotagging is as simple as exactly 96^4 unique barcodes, you would only be half-correct. The original haplotagging protocol in Meier et al. is good, but the authors (and others) have been working to improve this linked-read technology to improve"]},{"l":"where the barcodes go","p":["Chromium 10X linked-reads use a format where the barcode is the leading 16 bases of the forward (R1) read. However, haplotagging data does not use that format and many of the tools"]},{"l":"Read headers","p":["Like mentioned, the haplotag barcode is expected to be stored in the BX:Z: tag in the read header. This information is retained through the various Harpy steps. An example read header could look like:"]},{"l":"Read length","p":["Reads must be at least 30 base pairs in length for alignment. By default, the module removes reads <30bp."]},{"l":"Compression","p":["Harpy generally doesn't require the input sequences to be in gzipped/bgzipped format, but it's good practice to compress your reads anyway. Compressed files are expected to end with the extension"]},{"l":"Naming conventions","p":["Unfortunately, there are many different ways of naming FASTQ files, which makes it difficult to accomodate every wacky iteration currently in circulation. 
While Harpy tries its best to be flexible, there are limitations."]},{"l":"Barcode thresholds","p":["By the nature of linked read technologies, there will (almost always) be more DNA fragments than unique barcodes for them. As a result, it's common for barcodes to reappear in sequences. Rather than incorrectly assume that all sequences/alignments with the same barcode"]}],[{"l":"Common Harpy Options"},{"l":"Input Arguments","p":["Each of the main Harpy modules (e.g. or ) follows the format of"]},{"l":"Common command-line options","p":["Every Harpy module has a series of configuration parameters. These are arguments you need to input to configure the module to run on your data, such as the directory with the reads/alignments,"]},{"i":"--contigs","l":"--contigs","p":["Some of the workflows (like ) plot per-contig information in their reports. By default, Harpy will plot up to 30 of the largest contigs. If you are only interested in a specific set of contigs, then you can use"]},{"l":"example","p":["You could call and specify 20 threads with no output to console:"]},{"l":"The workflow folder","p":["When you run one of the main Harpy modules, the output directory will contain a workflow folder. This folder is both necessary for the module to run and is very useful to understand what the module did, be it for your own"]},{"l":"The Genome folder","p":["You will notice that many of the workflows will create a Genome folder in the working directory. This folder is to make it easier for Harpy to store the genome and the associated"]}],[{"l":"Troubleshooting","p":["Lots of stuff can go wrong during an analysis. 
The intent of this page is to guide you through navigating the inevitable errors associated with doing bioinformatics."]},{"l":"Troubleshooting Harpy","p":["Harpy has two steps: first it performs checks and validations, then it runs Snakemake."]},{"l":"checks and validations","p":["First, Harpy takes your command-line inputs and checks/validates the input files and parameters. If your parameters are not the correct type (e.g. a number where there should be a file), the"]},{"l":"snakemake validations","p":["Once all the file validations pass, Harpy passes the baton over to Snakemake. Snakemake builds a workflow graph of the rules and performs its own checks. If you get an error before the workflow starts processing any data (there"]},{"l":"error during a workflow","p":["Sometimes something goes wrong with one of the steps in a workflow. If/when that happens, Harpy will print the offending step and all the information Snakemake has regarding the failure. If the step had a log file, it will"]},{"l":"Common Issues"},{"l":"installation issue","p":["Conda is an awesome package manager, but it's slow and uses a ton of memory as dependencies increase. Harpy has a lot of dependencies and you might stall out conda trying to install it. Use mamba instead-- it'll work where conda fails."]},{"l":"imputation or phasing failure","p":["If you use bamutils clipOverlap on alignments that are used for the or modules, they will cause both programs to error. We don't know why, but they do."]},{"l":"SAM name and ID mismatch","p":["Aligning a sample to a genome via Harpy will insert the sample name (based on the file name) into the alignment header (the @RG ID:name SM:name tag). It likewise expects, through various steps,"]}],[{"l":"Snakamake Things"},{"l":"Workflow logs","p":["Barring a few exceptions, most of Harpy's options are Snakemake workflows. This means we are all at the mercy of how Snakemake operates, which includes the .snakemake/ folder in your project directory. 
That folder contains"]},{"l":"Adding Snakemake Parameters","p":["Harpy relies on Snakemake under the hood to handle file and job dependencies. Most of these details have been abstracted away from the end-user, but every module of Harpy (except"]},{"l":"Common use cases","p":["You likely wont need to invoke --snakemake very often, if ever. However, here examples of some possible use cases for this parameter."]}],[{"l":"Software used in Harpy","p":["Harpy is the sum of its parts, and out of tremendous respect for the developers involved in the included software, we would like to highlight the tools directly involved in Harpy's many moving pieces."]},{"l":"Standalone Software"},{"l":"Software Packages"}],[{"l":"Developing Harpy","p":["Harpy is an open source program written using a combination of BASH, R, RMarkdown, Python, and Snakemake. This page provides information on Harpy's development and how to contribute to it, if you were inclined to do so."]},{"l":"Installing dev version","p":["The process follows cloning the harpy repository, installing the preconfigured conda environment, and running the resources/buildlocal.sh script to move all the necessary files to the"]},{"i":"harpys-components","l":"Harpy's components"},{"l":"source code","p":["Harpy runs in two stages:"]},{"l":"bioconda recipe","p":["For the ease of installation for end-users, Harpy has a recipe and build script in Bioconda, which makes it available for download and installation. A copy of the recipe is also"]},{"l":"The Harpy repository"},{"l":"repo structure","p":["Harpy exists as a Git repository and has 5 standard branches that are used in specific ways during development. 
Git is a popular version control system and discussing its use is out of the scope of this documentation, however there is no"]},{"l":"development workflow","p":["The dev workflow is reasonably standard:"]},{"l":"containerization","p":["As of Harpy v1.0, the software dependencies that the Snakemake workflows use are pre-configured as a Docker image that is uploaded to Dockerhub. Updating or editing this container can be done automatically or manually."]},{"l":"automatically","p":["The testing GitHub Action will automatically create a Dockerfile with (a hidden harpy command) and build a new Docker container, then upload it to dockerhub with the latest tag. This process is triggered on"]},{"l":"manually","p":["The dockerfile for that container is created by using a hidden harpy command"]},{"l":"Automations"},{"l":"testing","p":["CI ( C ontinuous I ntegration) is a term describing automated actions that do things to/with your code and are triggered by how you interact with a repository. Harpy has a series of GitHub Actions triggered by interactions with the"]},{"l":"releases","p":["There is an automation that gets triggered every time Harpy is tagged with the new version. It strips out the unnecessary files and will upload a cleaned tarball to the new release (reducing filesize by orders of magnitude). The automation will also"]}],[{"l":"Blog"}],[{"i":"#","p":["Sorting data by linked-read barcode"]},{"l":"Sort data by barcode","p":["You would think sorting data would be a no-brainer, and in most cases it is. You can use seqtk or seqkit to sort FASTQ/A files by their IDs, samtools to sort SAM/BAM/CRAM files by name or coordinates. However, in the world of linked-read"]},{"l":"Sorting Alignments","p":["Let's start with BAM (or SAM/CRAM) files because the process is much simpler. 
Since the linked-read barcode is stored in a BX:Z tag (or less often as BC:Z:), we can use a little feature of"]},{"l":"Sorting FASTQ","p":["Sorting FASTQ files by barcode is trickier, only because there aren't (to our knowledge!) any existing convenience methods to do it. Like any bioinformatics puzzle, you could probably solve it with a sophisticated AWK command, but HTSlib tools are so much more"]},{"l":"1. convert FASTQ to SAM","p":["Yep, we're solving our problem by doing a simple file conversion to SAM/BAM. That's the easiest way to do it, surprisingly. FASTQ files can be converted to unmapped BAM files using"]},{"l":"2. sort the SAM by barcode","p":["Exactly like shown above to sort a SAM/BAM file with samtools sort, we're going to do the same on the unmapped SAM file we just created:"]},{"l":"3. convert SAM back to FASTQ","p":["Now that the data have been sorted, we need to convert it back into forward and reverse FASTQ files using samtools fastq. The -T * argument once again preserves all the tags between file formats. The"]},{"l":"as a single pipe","p":["Rather than splitting out these three processess, you can stream/pipe them in a single workflow:"]}],[{"i":"#","p":["A realistic workflow to simulate variants"]},{"l":"Simulating variants","p":["You may want to (and are encouraged to) simulate data before investing in the costs associated with linked-read sample preparation and subsequent sequencing. Harpy provides both a variant and linked-read simulators and this tutorial serves to"]},{"l":"1. Add random inversions","p":["First, we will need to simulate some inversions and set a --heterozygosity value >0 to get a diploid genome as the output. If you wanted to manually create inversions in specific areas or with specific lengths, this would be a good starting point too since"]},{"l":"2. 
Add snps and indels","p":["Let's say we wanted to simulate SNPs and indels like so:"]},{"i":"3-simulate-known-snps-and-indels-onto-the-diploid-genome-with-inversions","l":"3. Simulate \"known\" snps and indels onto the diploid genome with inversions","p":["We will run Harpy twice, once for each haplotype, using the corresponding VCFs from Step 2:"]},{"l":"5. Simulating linked-reads","p":["Now that you have heterozygous haplotypes created from your starting genome, you can simulate linked-reads from it using harpy simulate linkedreads. A simple implementation of that could look like:"]}],[{"i":"#","p":["Why pool samples for SV calling and when to do it"]},{"l":"Pooling samples for SV calling","p":["One of the cool benefits of linked-read data is the fact that you can call structural variants with it. Depending on the depth of your data, you may want (or need) to pool samples together. This"]},{"l":"Sample depth"},{"i":"depth-explained","l":"Depth, explained","p":["In bioinformatics, the terms \"coverage\" and \"depth\" and often used interchangeably, which is incorrect and leads to confusion. Coverage refers to the proportion of a genome that is sequenced, and"]},{"i":"depth-in-context","l":"Depth, in context","p":["Historically, one would have wanted to sequence fewer individuals at higher depth to get confident genotype calls, rather than sequence more individuals at lower depth. Recent advances in bioinformatics have enabled low-coverage whole genome sequencing"]},{"l":"The problem","p":["It's recommended to have at least 10X-12X depth to get decent structural variant calls(definitely read that in a paper that I would like to link here, but I can't seem to find it). If your data already has a minimum of 10X for each individual, great! Feel free to use"]},{"l":"The solution","p":["One way to get your low-coverage (low depth) data and still call structural variants is to pool samples together, which would effectively boost the depth. 
By doing this, you will"]},{"l":"Pooling considerations","p":["If pooling samples, you must pool them sensibly and with a biological context to do so. In other words, you don't just pool random samples together to inflate depth. Since haplotag data is just whole genome sequence data plus a little extra information, you should"]}],[{"i":"#","p":["Deciding between using Conda or Containers"]},{"l":"Choosing a software runtime method","p":["There are two ways you can run Harpy, using a container with the necessary software environments in it (the default), or with local conda environments(with the --conda option). If software development and containerization"]},{"i":"tldr","l":"TL;DR"},{"l":"What Harpy Provides","p":["An conda-based installation of Harpy provides only the minimal set of programs Harpy needs to begin a workflow. These include: python 3.12, snakemake-minimal, pandas, and the htslib programs (htslib, samtools, bcftools, tabix)."]},{"l":"How Harpy Provides the Other Stuff","p":["Instead of a monolithic Harpy environment, which would be impossible with the current software dependencies, there are a handful of defined conda environment recipes that Harpy workflows generate. Snakemake will make"]},{"l":"Harpy and Containers","p":["The Harpy team manages a container on Dockerhub called, you guessed it, Harpy, that is synchronously versioned with the Harpy software. In other words, if you're using Harpy v1.4, it will use the container version v1.4. The"]},{"i":"whats-the-catch","l":"What's the Catch?","p":["While local conda enviroments at runtime or containers might seem like foolproof approaches, there are drawbacks."]},{"i":"conda-caveats","l":"Conda Caveats:"},{"i":"conda-caveat-1-inconsistent","l":"⚠️ Conda Caveat 1: Inconsistent","p":["Despite our and conda's best efforts, sometimes programs just don't install correctly on some systems due to unexpected system (or conda) configurations. 
This results in frustrating errors where jobs fail because software that is"]},{"i":"conda-caveat-2-troubleshooting","l":"\uD83D\uDCA3 Conda Caveat 2: Troubleshooting","p":["To manually troubleshoot many of the tasks Harpy workflows perform, you may need to jump into one of the local conda environments in .snakemake/conda. That itself isn't terrible, but it's an extra step because you will"]},{"l":"Container Caveats"},{"i":"container-caveat-1-speed","l":"\uD83D\uDEA5 Container Caveat 1: Speed","p":["The overhead of Snakemake creating a container instance for a job, then cleaning it up after the job is done is not trivial and can negatively impact runtime."]},{"i":"container-caveat-2-troubleshooting","l":"\uD83D\uDCA3 Container Caveat 2: Troubleshooting","p":["The command Snakemake secretly invokes to run a job in a container is quite lengthy. In most cases that shouldn't matter to you, but when something eventually goes wrong and you need to troubleshoot, it's harder"]}],[{"i":"#","p":["A gentle introduction to the wild world of filtering SNPs"]},{"l":"Filtering Variants","p":["The discussion around filtering SNPs and indels is massive and many researchers go about it differently, each very opinionated as to why their method is the best. As a starting point, have a look at how the authors of"]},{"i":"genotype-quality-qual","l":"genotype quality (QUAL)","p":["You will obviously want higher quality genotype calls to remove false positives. The HTSlib guide suggests at least 50(e.g. -i 'QUAL=50'), but we typically filter much higher at"]},{"i":"read-depth-dp","l":"read depth (DP)","p":["Variant sites with too few reads backing up the genotype might be false positives, although this may not hold true for very low-coverage data. 
Conversely, a maximum cut off is important because sites with very high read depths (relative to the distribution of read depth)"]},{"i":"minor-allele-frequency-maf","l":"minor allele frequency (MAF)","p":["It's usually advisable to set a minor allele frequency threshold with which to remove sites below that threshold. The reasoning is that if a MAF is too low, it might be because of incorrectly called genotypes in a very small handful of individuals (e.g. one or two)."]},{"i":"missing-data-f_missing","l":"missing data (F_MISSING)","p":["Missing data is, frankly, not terribly useful. The amount of missing data you're willing to tolerate will depend on your study, but it's common to remove sites with >20% missing data (e.g."]}]] \ No newline at end of file +[[{"i":"#","p":["Using Harpy to process your haplotagged data"]},{"l":"Home","p":["Harpy is a haplotagging data processing pipeline for Linux-based systems. It uses all the magic of Snakemake under the hood to handle the worklfow decision-making, but as a user, you just interact with it like a normal command-line"]},{"l":"Harpy Modules","p":["Harpy is modular, meaning you can use different parts of it independent from each other. Need to only align reads? Great! Only want to call variants? Awesome! All modules are called by"]},{"l":"Using Harpy","p":["You can call harpy without any arguments (or with --help) to print the docstring to your terminal. You can likewise call any of the modules without arguments or with --help to see their usage (e.g."]},{"l":"Linked-Read Workflow","p":["Depending on your project goals, you may want any combination of SNPs, structural variants (inversions, deletions, duplications), or phased haplotypes. Below is a flow chart outlining a general workflow of linked-read data."]}],[{"l":"Install"},{"l":"Install Harpy","p":["Harpy is hosted on Bioconda! That means to install it, you just need to have mamba(or conda) on your Linux-based system and install it with a simple command. 
You can install Harpy into an existing environment or create a new one for it (recommended)."]},{"l":"install into a new environment"},{"i":"recommended","l":"✨recommended✨","p":["The code snippet below creates a new environment called harpy(the -n harpy part) and installs harpy into it from the bioconda channel (-c bioconda part). You can name this environment anything (e.g."]},{"l":"install into an existing evironment","p":["If you want to install harpy into an existing environment, then with an environment already activated (via mamba activate env_name) simply use the mamba install command and harpy"]},{"l":"Update Harpy","p":["If you want to update Harpy, the process is quite similar:"]}],[{"i":"#","p":["Align haplotagged sequences"]},{"l":"Align Sequences to a Genome","p":["After your sequences (in FASTQ format) have been checked for quality, you will need to align them to a reference genome before you can call variants. Harpy offers several aligners for this purpose:"]}],[{"i":"#","p":["Align haplotagged sequences with BWA MEM"]},{"l":"Map Reads onto a genome with BWA MEM","p":["Once sequences have been trimmed and passed through other QC filters, they will need to be aligned to a reference genome. This module within Harpy expects filtered reads as input,"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Molecule distance","p":["The --molecule-distance option is used during the BWA alignment workflow to assign alignments a unique Molecular Identifier MI:i tag based on their haplotag barcode and the distance threshold you specify. See"]},{"l":"Quality filtering","p":["The --min-quality argument filters out alignments below a given MQ threshold. The default, 30, keeps alignments that are at least 99.9% likely correctly mapped. Set this value to"]},{"l":"Marking PCR duplicates","p":["Harpy uses samtools markdup to mark putative PCR duplicates. 
By using the --barcode-tag BX option, it considers the linked-read barcode for more accurate duplicate detection. Duplicate"]},{"l":"BWA workflow"}],[{"i":"#","p":["Align haplotagged sequences with EMA"]},{"l":"Map Reads onto a genome with EMA","p":["Once sequences have been trimmed and passed through other QC filters, they will need to be aligned to a reference genome. This module within Harpy expects filtered reads as input,"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Barcode whitelist","p":["Some linked-read methods (e.g. 10x, Tellseq) require the inclusion of a barcode \"whitelist.\" This file is a simple text file that has one barcode per line so a given software knows what barcodes to expect in your data."]},{"l":"Quality filtering","p":["The --min-quality argument filters out alignments below a given MQ threshold. The default, 30, keeps alignments that are at least 99.9% likely correctly mapped. Set this value to"]},{"l":"Marking PCR duplicates","p":["EMA marks duplicates in the resulting alignments, however the read with invalid barcodes are aligned separately with BWA. Therefore, Harpy uses samtools markdup to mark putative"]},{"l":"EMA workflow"}],[{"i":"#","p":["Align haplotagged sequences with strobealign"]},{"l":"Map Reads onto a genome with strobealign","p":["Once sequences have been trimmed and passed through other QC filters, they will need to be aligned to a reference genome. This module within Harpy expects filtered reads as input,"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Read Length","p":["The strobealign program uses a new strobemer design for aligning and requires its own way of indexing the genome. 
The index must be configured for the average read length of the sample"]},{"l":"Molecule distance","p":["The --molecule-distance option is used during the BWA alignment workflow to assign alignments a unique Molecular Identifier MI:i tag based on their haplotag barcode and the distance threshold you specify. See"]},{"l":"Quality filtering","p":["The --min-quality argument filters out alignments below a given MQ threshold. The default, 30, keeps alignments that are at least 99.9% likely correctly mapped. Set this value to"]},{"l":"Marking PCR duplicates","p":["Harpy uses samtools markdup to mark putative PCR duplicates. By using the --barcode-tag BX option, it considers the linked-read barcode for more accurate duplicate detection. Duplicate"]},{"l":"Strobealign workflow"}],[{"i":"#","p":["Create a genome assembly from linked reads"]},{"l":"Create a Genome Assembly","p":["If you have single-sample data, you might be interested in a genome assembly. Unlike metagenome assemblies, a classic genome assembly assumes there is exactly one genome present in your sequences and will try to"]},{"l":"Running Options","p":["In addition to the , the module is configured using the command-line arguments below. Since the assembly process consists of several distinct phases, the options are shown with an extra column to reflect which part of the assembly process they correspond to."]},{"l":"Deconvolved Inputs","p":["For linked-read assemblies, the barcodes need to be deconvolved in the sequence data, meaning that barcodes that are shared by reads that originate from different molecules need to have unique barcode"]},{"l":"Assembly Workflow"}],[{"i":"#","p":["Resolve barcodes shared by different molecules"]},{"l":"Resolve barcodes shared by different molecules","p":["Running is optional. 
In the alignment workflows (), Harpy already uses a distance-based approach to deconvolve barcodes and assign MI tags (Molecular Identifier), whereas the workflow has the"]},{"l":"Running Options"},{"l":"Resulting Barcodes","p":["After deconvolution, some barcodes may have a hyphenated suffix like -1 or -2(e.g. A01C33B41D93-1). This is how deconvolution methods create unique variants of barcodes to denote that identical barcodes"]},{"l":"Harpy Deconvolution Nuances","p":["Some of the downstream linked-read tools Harpy uses expect linked read barcodes to either look like the 16-base 10X variety or a standard haplotag (AxxCxxBxxDxx). Their pattern-matching would not recognize barcodes deconvoluted with"]}],[{"i":"#","p":["Demultiplex raw sequences into haplotag barcoded samples"]},{"l":"Demultiplex Raw Sequences","p":["When pooling samples and sequencing them in parallel on an Illumina sequencer, you will be given large multiplexed FASTQ files in return. These files contain sequences for all of your samples and need to be demultiplexed using barcodes to"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Haplotag Types"},{"l":"Gen I Demultiplex Workflow"}],[{"i":"#","p":["Impute genotypes for haplotagged data with Harpy"]},{"l":"Impute Genotypes using Sequences","p":["After variants have been called, you may want to impute missing genotypes to get the most from your data. Harpy uses STITCH to impute genotypes, a haplotype-based method that is linked-read aware. Imputing genotypes requires a variant call file"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Extra STITCH parameters","p":["You may add additional parameters to STITCH by way of the--extra-params(or -x) option. 
Since STITCH is a function in the R language, the parameters you add must be in R syntax (e.g."]},{"l":"Prioritize the vcf file","p":["Sometimes you want to run imputation on all the samples present in the INPUTS, but other times you may want to only impute the samples present in the --vcf file. By default, Harpy assumes you want to use all the samples"]},{"l":"Parameter file","p":["Typically, one runs STITCH multiple times, exploring how results vary with different model parameters (explained in next section). The solution Harpy uses for this is to have the user"]},{"l":"STITCH Parameters"},{"l":"Imputation Workflow"}],[{"i":"#","p":["Create a metagenome assembly from linked reads"]},{"l":"Create a Metagenome Assembly","p":["If you have mixed-sample data, you might be interested in a metagenome assembly, also known as a metassembly. Unlike a single-sample assembly, a metassembly assumes there are multiple genomes present in your sequences and will try to"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Deconvolved Inputs","p":["For linked-read assemblies, the barcodes need to be deconvolved in the sequence data, meaning that barcodes that are shared by reads that originate from different molecules need to have unique barcode"]},{"l":"Metassembly Workflow"}],[{"i":"#","p":["Generate extra files for analysis with Harpy"]},{"l":"Other Harpy modules","p":["Some parts of Harpy (variant calling, imputation) want or need extra files. You can create various files necessary for different modules using these extra modules:"]},{"l":"Other modules"},{"l":"resume","p":["When calling a workflow (e.g. ), Harpy performs various file checks and validations, sets up the Snakemake command, output folder(s), etc. 
In the event you want to continue a failed or manually terminated workflow without overwriting the workflow"]},{"l":"arguments","p":["The DIRECTORY is the output directory of a previous harpy-invoked workflow, which must have the workflow/config.yaml file. For example, if you previously ran harpy align bwa -o align-bwa ..."]},{"l":"popgroup","p":["Creates a sample grouping file for variant calling"]},{"i":"arguments-1","l":"arguments","p":["This optional file is useful if you want SNP variant calling to happen on a per-population level via or on samples pooled-as-populations via ."]},{"l":"stitchparams","p":["Create a template parameter file for the module. The file is formatted correctly and serves as a starting point for using parameters that make sense for your study."]},{"i":"arguments-2","l":"arguments","p":["Typically, one runs STITCH multiple times, exploring how results vary with different model parameters. The solution Harpy uses for this is to have the user provide a tab-delimited dataframe file where the columns are the 6 STITCH model"]}],[{"i":"#","p":["Phase haplotypes for haplotagged data with Harpy"]},{"l":"Phase SNPs into Haplotypes","p":["You may want to phase your genotypes into haplotypes, as haplotypes tend to be more informative than unphased genotypes (higher polymorphism, captures relationship between genotypes). Phasing"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Prioritize the vcf file","p":["Sometimes you want to run imputation on all the samples present in the INPUTS, but other times you may want to only impute the samples present in the --vcf file. By default, Harpy assumes you want to use all the samples"]},{"l":"Molecule distance","p":["The molecule distance refers to the base-pair distance dilineating separate molecules. 
In other words, when two alignments on a single contig share the same barcode, how far away from each other are we willing to say they were and still consider them having"]},{"l":"Pruning threshold","p":["The pruning threshold refers to a PHRED-scale value between 0-1 (a percentage) for removing low-confidence SNPs from consideration. With Harpy, you configure this value as an integer"]},{"l":"Phasing Workflow"}],[{"i":"#","p":["Run file format checks on haplotagged FASTQ/BAM files"]},{"l":"Pre-flight checks for input files","p":["Harpy does a lot of stuff with a lot of software and each of these programs expect the incoming data to follow particular formats (plural, unfortunately). These formatting opinions/specifics are at the mercy of the original developers and while there are times when Harpy can (and does)"]},{"l":"When to run"},{"l":"Running Options","p":["In addition to the , the and modules are configured using only command-line input arguments:"]},{"l":"Workflow"}],[{"i":"#","p":["Quality trim haplotagged sequences with Harpy"]},{"l":"Quality Trim Sequences","p":["Raw sequences are not suitable for downstream analyses. They have sequencing adapters, index sequences, regions of poor quality, etc. The first step of any genetic sequence analyses is to remove these adapters and trim poor quality data. You can remove adapters,"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"QC Workflow"}],[{"i":"#","p":["Simulate genomic data"]},{"l":"Simulate Genomic Data","p":["You may be interested in benchmarking variant detection or maybe just trying out haplotagging data without any financial commitment-- that's where simulations come in handy."]},{"l":"Simulate Genomic Variants","p":["Harpy lets you simulate genomic variants via for different variant types such as single nucleotide polymorphisms (SNP), indels, inversions, copy number variants (CNV), and translocations. 
All you need is to provide a genome to simulate"]},{"l":"Simulate Haplotag Linked-Reads","p":["You can also simulate haplotag-style linked reads from an existing genome using . Harpy incorporates LRSIM to generate linked reads from a diploid genomic. If you only have a haploid genome, then you can create a diploid genome by simulating variants into it with"]}],[{"i":"#","p":["Simulate linked reads from a genome"]},{"l":"Simulate Linked Reads","p":["Simulate linked reads from a genome"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Mutation Rate","p":["The read simulation is two-part: first dwgsim generates forward and reverse FASTQ files from the provided genome haplotypes( HAP1_GENOME and HAP2_GENOME), then LRSIM takes over and creates linked-reads from that. The"]},{"l":"Simulating a single sample","p":["If you intend to simulate a \"single individual\" (i.e. use this module once), then you might want no additonal SNPs beyond the variants you may have already introduced into the genome and set"]},{"l":"Simulating multiple samples","p":["If you intend on simulating \"multiple individuals\" (i.e. use this module multiple times on the same genome haplotypes), it may make sense to set this value larger than 0 so there is some \"natural\" variation between your simulated individuals."]},{"l":"Partitions","p":["TL;DR: 10X partitions ≈ haplotag beads"]},{"l":"Barcodes","p":["Barcodes, if provided, must be given as 16-basepair nucleotide sequences, one per line. 
If not provided, Harpy will download the standard 10X Genomics 4M-with-alts-february-2016.txt"]},{"l":"10X to Haplotag conversion","p":["Harpy will convert the simulated 10X-style reads, where the 16-basepair barcode is at the beginning of read 1, to haplotag format, where the barcode is coded in the sequence header under the"]},{"l":"Choosing parameters","p":["LRSIM does internal calculations to determine the number of reads per molecule based on --read-pairs,--partitions, and --molecules-per. Understanding how these parameters affect the resulting sequences"]},{"l":"Parameter calculator","p":["Conveniently, we provide a calculator to help you make informed decisions for these parameters:"]},{"l":"Simulate Linkedreads Workflow"}],[{"i":"#","p":["Simulate snps, indels, inversions, cnv, translocations"]},{"l":"Simulate Genomic Variants","p":["Simulate snps, indels, inversions, cnv, translocations"]},{"l":"Modules","p":["There are 4 submodules with very obvious names:"]},{"l":"Running Options","p":["While there are serveral differences between individual workflow options, each has available all the like other Harpy modules. Each requires and input genome at the end of the command line, and each requires either a"]},{"l":"Simulate known variants","p":["Rather than simulating random variants, you can use a VCF file as input to any of the workflows to have simuG simulate the variants (of that type) from the VCF file. This becomes particularly"]},{"l":"Heterozygosity","p":["Each workflow has a --heterozygosity parameter where you can specify the heterozygosity of the simulated variants, which creates two new VCF files ({prefix}.hap1.vcf,{prefix}.hap2.vcf"]}],[{"i":"#","p":["Call SNPs and small indels"]},{"l":"Call SNPs and small indels","p":["After reads have been aligned, e.g., with , you can use those alignment files(.bam) to call variants in your data. 
Harpy can call SNPs and small indels using bcftools mpileup or with"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"ploidy","p":["If you are calling haploid or diploid samples, using either mpileup or freebayes will be comparable. However, if you need to call SNPs in polyploids (ploidy >2), then you will need to use"]},{"l":"regions","p":["The --regions(-r) option lets you specify the genomic regions you want to call variants on. Keep in mind that mpileup uses 1-based positions for genomic intervals, whereas freebayes"]},{"l":"populations","p":["Grouping samples changes the way the variant callers computes certain statistics when calling variants. If you have reason to believe there is a biologically meaningful grouping scheme to your samples, then you should include"]},{"l":"SNP calling workflow"}],[{"i":"#","p":["Find structural variants"]},{"l":"Find structural variants","p":["The module identifies single nucleotide polymorphisms (SNP) and small indels, but you may want to (and should!) leverage the linked-read data to identify larger structural variants (SV) like large deletions, duplications, and"]},{"l":"Caveats"},{"l":"NAIBR","p":["While our testing shows that NAIBR tends to find known inversions that LEVIATHAN misses, the program requires haplotype phased bam files as input. That means the alignments have a"]},{"l":"LEVIATHAN","p":["LEVIATHAN relies on split-read information in the sequence alignments to call variants. 
The EMA aligner does not report split read alignments, instead it reports secondary alignments."]}],[{"i":"#","p":["Call structural variants using Leviathan"]},{"l":"Call Structural Variants using LEVIATHAN","p":["(like indels, insertions, duplications, breakends)"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Single-sample variant calling","p":["When not using a population grouping file via --populations, variants will be called per-sample. Due to the nature of structural variant VCF files, there isn't an entirely fool-proof way"]},{"l":"Pooled-sample variant calling","p":["With the inclusion of a population grouping file via --populations, Harpy will merge the bam files of all samples within a population and call variants on these alignment pools. Preliminary work shows that this way identifies more variants and with fewer false"]},{"l":"LEVIATHAN workflow"}],[{"i":"#","p":["Call structural variants using NAIBR (plus)"]},{"l":"Call Structural Variants using NAIBR","p":["(like indels, insertions, duplications)"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Molecule distance","p":["The --molecule-distance option is used to let the program determine how far apart alignments on a contig with the same barcode can be from each other and still considered as originating from the same DNA molecule. See"]},{"l":"Single-sample variant calling","p":["When not using a population grouping file via --populations, variants will be called per-sample. Due to the nature of structural variant VCF files, there isn't an entirely fool-proof way"]},{"l":"Pooled-sample variant calling","p":["With the inclusion of a population grouping file via --populations, Harpy will merge the bam files of all samples within a population and call variants on these alignment pools. 
Preliminary work shows that this way identifies more variants and with fewer false"]},{"l":"Optional vcf file","p":["In order to get the best variant calling performance out of NAIBR, it requires phased bam files as input. Using --vcf is optional and not used by NAIBR directly. However, to use"]},{"i":"a-phased-input---vcf","l":"a phased input --vcf","p":["This file can be in vcf/vcf.gz/bcf format and most importantly it must be phased haplotypes. There are various ways to haplotype SNPs, but you can use to phase your SNPs into haplotypes using the haplotag barcode information. The resulting phased VCF file can then be used as input here."]},{"l":"NAIBR workflow"}],[{"l":"Haplotag data"},{"i":"what-is-haplotagging","l":"What is haplotagging?","p":["Linked-read sequencing exists to combine the throughput and accuracy of short-read sequencing with the long range haplotype information of long-read sequencing. Haplotagging is an implementation of linked-read sequencing developed by"]},{"l":"Data Format"},{"l":"Barcodes","p":["While barcodes are actually combinatorial bases, in the read headers they are represented with the format AxxCxxBxxDxx, where each barcode segment is denoted as Axx(or Bxx, etc.)."]},{"l":"barcode protocol varieties","p":["If you think haplotagging is as simple as exactly 96^4 unique barcodes, you would only be half-correct. The original haplotagging protocol in Meier et al. is good, but the authors (and others) have been working to improve this linked-read technology to improve"]},{"l":"where the barcodes go","p":["Chromium 10X linked-reads use a format where the barcode is the leading 16 bases of the forward (R1) read. However, haplotagging data does not use that format and many of the tools"]},{"l":"Read headers","p":["Like mentioned, the haplotag barcode is expected to be stored in the BX:Z: tag in the read header. This information is retained through the various Harpy steps. 
An example read header could look like:"]},{"l":"Read length","p":["Reads must be at least 30 base pairs in length for alignment. By default, the module removes reads <30bp."]},{"l":"Compression","p":["Harpy generally doesn't require the input sequences to be in gzipped/bgzipped format, but it's good practice to compress your reads anyway. Compressed files are expected to end with the extension"]},{"l":"Naming conventions","p":["Unfortunately, there are many different ways of naming FASTQ files, which makes it difficult to accomodate every wacky iteration currently in circulation. While Harpy tries its best to be flexible, there are limitations."]},{"l":"Barcode thresholds","p":["By the nature of linked read technologies, there will (almost always) be more DNA fragments than unique barcodes for them. As a result, it's common for barcodes to reappear in sequences. Rather than incorrectly assume that all sequences/alignments with the same barcode"]}],[{"l":"Common Harpy Options"},{"l":"Input Arguments","p":["Each of the main Harpy modules (e.g. or ) follows the format of"]},{"l":"Common command-line options","p":["Every Harpy module has a series of configuration parameters. These are arguments you need to input to configure the module to run on your data, such as the directory with the reads/alignments,"]},{"i":"--contigs","l":"--contigs","p":["Some of the workflows (like ) plot per-contig information in their reports. By default, Harpy will plot up to 30 of the largest contigs. If you are only interested in a specific set of contigs, then you can use"]},{"l":"example","p":["You could call and specify 20 threads with no output to console:"]},{"l":"The workflow folder","p":["When you run one of the main Harpy modules, the output directory will contain a workflow folder. 
This folder is both necessary for the module to run and is very useful to understand what the module did, be it for your own"]},{"l":"The Genome folder","p":["You will notice that many of the workflows will create a Genome folder in the working directory. This folder is to make it easier for Harpy to store the genome and the associated"]}],[{"l":"Troubleshooting","p":["Lots of stuff can go wrong during an analysis. The intent of this page is to guide you through navigating the inevitable errors associated with doing bioinformatics."]},{"l":"Troubleshooting Harpy","p":["Harpy has two steps: first it performs checks and validations, then it runs Snakemake."]},{"l":"checks and validations","p":["First, Harpy takes your command-line inputs and checks/validates the input files and parameters. If your parameters are not the correct type (e.g. a number where there should be a file), the"]},{"l":"snakemake validations","p":["Once all the file validations pass, Harpy passes the baton over to Snakemake. Snakemake builds a workflow graph of the rules and performs its own checks. If you get an error before the workflow starts processing any data (there"]},{"l":"error during a workflow","p":["Sometimes something goes wrong with one of the steps in a workflow. If/when that happens, Harpy will print the offending step and all the information Snakemake has regarding the failure. If the step had a log file, it will"]},{"l":"Common Issues"},{"l":"installation issue","p":["Conda is an awesome package manager, but it's slow and uses a ton of memory as dependencies increase. Harpy has a lot of dependencies and you might stall out conda trying to install it. Use mamba instead-- it'll work where conda fails."]},{"l":"imputation or phasing failure","p":["If you use bamutils clipOverlap on alignments that are used for the or modules, they will cause both programs to error. 
We don't know why, but they do."]},{"l":"SAM name and ID mismatch","p":["Aligning a sample to a genome via Harpy will insert the sample name (based on the file name) into the alignment header (the @RG ID:name SM:name tag). It likewise expects, through various steps,"]}],[{"l":"Snakamake Things"},{"l":"Workflow logs","p":["Barring a few exceptions, most of Harpy's options are Snakemake workflows. This means we are all at the mercy of how Snakemake operates, which includes the .snakemake/ folder in your project directory. That folder contains"]},{"l":"Adding Snakemake Parameters","p":["Harpy relies on Snakemake under the hood to handle file and job dependencies. Most of these details have been abstracted away from the end-user, but every module of Harpy (except"]},{"l":"Common use cases","p":["You likely wont need to invoke --snakemake very often, if ever. However, here examples of some possible use cases for this parameter."]}],[{"l":"Software used in Harpy","p":["Harpy is the sum of its parts, and out of tremendous respect for the developers involved in the included software, we would like to highlight the tools directly involved in Harpy's many moving pieces."]},{"l":"Standalone Software"},{"l":"Software Packages"}],[{"l":"Developing Harpy","p":["Harpy is an open source program written using a combination of BASH, R, RMarkdown, Python, and Snakemake. This page provides information on Harpy's development and how to contribute to it, if you were inclined to do so."]},{"l":"Installing dev version","p":["The process follows cloning the harpy repository, installing the preconfigured conda environment, and running the resources/buildlocal.sh script to move all the necessary files to the"]},{"i":"harpys-components","l":"Harpy's components"},{"l":"source code","p":["Harpy runs in two stages:"]},{"l":"bioconda recipe","p":["For the ease of installation for end-users, Harpy has a recipe and build script in Bioconda, which makes it available for download and installation. 
A copy of the recipe is also"]},{"l":"The Harpy repository"},{"l":"repo structure","p":["Harpy exists as a Git repository and has 5 standard branches that are used in specific ways during development. Git is a popular version control system and discussing its use is out of the scope of this documentation, however there is no"]},{"l":"development workflow","p":["The dev workflow is reasonably standard:"]},{"l":"containerization","p":["As of Harpy v1.0, the software dependencies that the Snakemake workflows use are pre-configured as a Docker image that is uploaded to Dockerhub. Updating or editing this container can be done automatically or manually."]},{"l":"automatically","p":["The testing GitHub Action will automatically create a Dockerfile with (a hidden harpy command) and build a new Docker container, then upload it to dockerhub with the latest tag. This process is triggered on"]},{"l":"manually","p":["The dockerfile for that container is created by using a hidden harpy command"]},{"l":"Automations"},{"l":"testing","p":["CI ( C ontinuous I ntegration) is a term describing automated actions that do things to/with your code and are triggered by how you interact with a repository. Harpy has a series of GitHub Actions triggered by interactions with the"]},{"l":"releases","p":["There is an automation that gets triggered every time Harpy is tagged with the new version. It strips out the unnecessary files and will upload a cleaned tarball to the new release (reducing filesize by orders of magnitude). The automation will also"]}],[{"l":"Blog"}],[{"i":"#","p":["Sorting data by linked-read barcode"]},{"l":"Sort data by barcode","p":["You would think sorting data would be a no-brainer, and in most cases it is. You can use seqtk or seqkit to sort FASTQ/A files by their IDs, samtools to sort SAM/BAM/CRAM files by name or coordinates. 
However, in the world of linked-read"]},{"l":"Sorting Alignments","p":["Let's start with BAM (or SAM/CRAM) files because the process is much simpler. Since the linked-read barcode is stored in a BX:Z tag (or less often as BC:Z:), we can use a little feature of"]},{"l":"Sorting FASTQ","p":["Sorting FASTQ files by barcode is trickier, only because there aren't (to our knowledge!) any existing convenience methods to do it. Like any bioinformatics puzzle, you could probably solve it with a sophisticated AWK command, but HTSlib tools are so much more"]},{"l":"1. convert FASTQ to SAM","p":["Yep, we're solving our problem by doing a simple file conversion to SAM/BAM. That's the easiest way to do it, surprisingly. FASTQ files can be converted to unmapped BAM files using"]},{"l":"2. sort the SAM by barcode","p":["Exactly like shown above to sort a SAM/BAM file with samtools sort, we're going to do the same on the unmapped SAM file we just created:"]},{"l":"3. convert SAM back to FASTQ","p":["Now that the data have been sorted, we need to convert it back into forward and reverse FASTQ files using samtools fastq. The -T * argument once again preserves all the tags between file formats. The"]},{"l":"as a single pipe","p":["Rather than splitting out these three processess, you can stream/pipe them in a single workflow:"]}],[{"i":"#","p":["A realistic workflow to simulate variants"]},{"l":"Simulating variants","p":["You may want to (and are encouraged to) simulate data before investing in the costs associated with linked-read sample preparation and subsequent sequencing. Harpy provides both a variant and linked-read simulators and this tutorial serves to"]},{"l":"1. Add random inversions","p":["First, we will need to simulate some inversions and set a --heterozygosity value >0 to get a diploid genome as the output. If you wanted to manually create inversions in specific areas or with specific lengths, this would be a good starting point too since"]},{"l":"2. 
Add snps and indels","p":["Let's say we wanted to simulate SNPs and indels like so:"]},{"i":"3-simulate-known-snps-and-indels-onto-the-diploid-genome-with-inversions","l":"3. Simulate \"known\" snps and indels onto the diploid genome with inversions","p":["We will run Harpy twice, once for each haplotype, using the corresponding VCFs from Step 2:"]},{"l":"5. Simulating linked-reads","p":["Now that you have heterozygous haplotypes created from your starting genome, you can simulate linked-reads from it using harpy simulate linkedreads. A simple implementation of that could look like:"]}],[{"i":"#","p":["Why pool samples for SV calling and when to do it"]},{"l":"Pooling samples for SV calling","p":["One of the cool benefits of linked-read data is the fact that you can call structural variants with it. Depending on the depth of your data, you may want (or need) to pool samples together. This"]},{"l":"Sample depth"},{"i":"depth-explained","l":"Depth, explained","p":["In bioinformatics, the terms \"coverage\" and \"depth\" and often used interchangeably, which is incorrect and leads to confusion. Coverage refers to the proportion of a genome that is sequenced, and"]},{"i":"depth-in-context","l":"Depth, in context","p":["Historically, one would have wanted to sequence fewer individuals at higher depth to get confident genotype calls, rather than sequence more individuals at lower depth. Recent advances in bioinformatics have enabled low-coverage whole genome sequencing"]},{"l":"The problem","p":["It's recommended to have at least 10X-12X depth to get decent structural variant calls(definitely read that in a paper that I would like to link here, but I can't seem to find it). If your data already has a minimum of 10X for each individual, great! Feel free to use"]},{"l":"The solution","p":["One way to get your low-coverage (low depth) data and still call structural variants is to pool samples together, which would effectively boost the depth. 
By doing this, you will"]},{"l":"Pooling considerations","p":["If pooling samples, you must pool them sensibly and with a biological context to do so. In other words, you don't just pool random samples together to inflate depth. Since haplotag data is just whole genome sequence data plus a little extra information, you should"]}],[{"i":"#","p":["Deciding between using Conda or Containers"]},{"l":"Choosing a software runtime method","p":["There are two ways you can run Harpy, using a container with the necessary software environments in it (the default), or with local conda environments(with the --conda option). If software development and containerization"]},{"i":"tldr","l":"TL;DR"},{"l":"What Harpy Provides","p":["An conda-based installation of Harpy provides only the minimal set of programs Harpy needs to begin a workflow. These include: python 3.12, snakemake-minimal, pandas, and the htslib programs (htslib, samtools, bcftools, tabix)."]},{"l":"How Harpy Provides the Other Stuff","p":["Instead of a monolithic Harpy environment, which would be impossible with the current software dependencies, there are a handful of defined conda environment recipes that Harpy workflows generate. Snakemake will make"]},{"l":"Harpy and Containers","p":["The Harpy team manages a container on Dockerhub called, you guessed it, Harpy, that is synchronously versioned with the Harpy software. In other words, if you're using Harpy v1.4, it will use the container version v1.4. The"]},{"i":"whats-the-catch","l":"What's the Catch?","p":["While local conda enviroments at runtime or containers might seem like foolproof approaches, there are drawbacks."]},{"i":"conda-caveats","l":"Conda Caveats:"},{"i":"conda-caveat-1-inconsistent","l":"⚠️ Conda Caveat 1: Inconsistent","p":["Despite our and conda's best efforts, sometimes programs just don't install correctly on some systems due to unexpected system (or conda) configurations. 
This results in frustrating errors where jobs fail because software that is"]},{"i":"conda-caveat-2-troubleshooting","l":"\uD83D\uDCA3 Conda Caveat 2: Troubleshooting","p":["To manually troubleshoot many of the tasks Harpy workflows perform, you may need to jump into one of the local conda environments in .snakemake/conda. That itself isn't terrible, but it's an extra step because you will"]},{"l":"Container Caveats"},{"i":"container-caveat-1-speed","l":"\uD83D\uDEA5 Container Caveat 1: Speed","p":["The overhead of Snakemake creating a container instance for a job, then cleaning it up after the job is done is not trivial and can negatively impact runtime."]},{"i":"container-caveat-2-troubleshooting","l":"\uD83D\uDCA3 Container Caveat 2: Troubleshooting","p":["The command Snakemake secretly invokes to run a job in a container is quite lengthy. In most cases that shouldn't matter to you, but when something eventually goes wrong and you need to troubleshoot, it's harder"]}],[{"i":"#","p":["A gentle introduction to the wild world of filtering SNPs"]},{"l":"Filtering Variants","p":["The discussion around filtering SNPs and indels is massive and many researchers go about it differently, each very opinionated as to why their method is the best. As a starting point, have a look at how the authors of"]},{"i":"genotype-quality-qual","l":"genotype quality (QUAL)","p":["You will obviously want higher quality genotype calls to remove false positives. The HTSlib guide suggests at least 50(e.g. -i 'QUAL=50'), but we typically filter much higher at"]},{"i":"read-depth-dp","l":"read depth (DP)","p":["Variant sites with too few reads backing up the genotype might be false positives, although this may not hold true for very low-coverage data. 
Conversely, a maximum cut off is important because sites with very high read depths (relative to the distribution of read depth)"]},{"i":"minor-allele-frequency-maf","l":"minor allele frequency (MAF)","p":["It's usually advisable to set a minor allele frequency threshold with which to remove sites below that threshold. The reasoning is that if a MAF is too low, it might be because of incorrectly called genotypes in a very small handful of individuals (e.g. one or two)."]},{"i":"missing-data-f_missing","l":"missing data (F_MISSING)","p":["Missing data is, frankly, not terribly useful. The amount of missing data you're willing to tolerate will depend on your study, but it's common to remove sites with >20% missing data (e.g."]}]] \ No newline at end of file diff --git a/sitemap.xml.gz b/sitemap.xml.gz index 838a303ec..d4a468748 100644 Binary files a/sitemap.xml.gz and b/sitemap.xml.gz differ diff --git a/snakemake/index.html b/snakemake/index.html index 01f616a87..61ad83737 100644 --- a/snakemake/index.html +++ b/snakemake/index.html @@ -4,7 +4,7 @@ - + @@ -32,12 +32,12 @@ - + - + - - + +
diff --git a/software/index.html b/software/index.html index 66849b3e3..eb622377c 100644 --- a/software/index.html +++ b/software/index.html @@ -4,7 +4,7 @@ - + @@ -32,11 +32,11 @@ - + - + - +
diff --git a/troubleshooting/index.html b/troubleshooting/index.html index 66d57a710..5e6fd2eed 100644 --- a/troubleshooting/index.html +++ b/troubleshooting/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + +
diff --git a/workflows/align/bwa/index.html b/workflows/align/bwa/index.html index 25f1242ea..cb1f64c1b 100644 --- a/workflows/align/bwa/index.html +++ b/workflows/align/bwa/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + diff --git a/workflows/align/ema/index.html b/workflows/align/ema/index.html index 1d85357a8..0b00c06e1 100644 --- a/workflows/align/ema/index.html +++ b/workflows/align/ema/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + diff --git a/workflows/align/index.html b/workflows/align/index.html index 66fa84981..b63973d73 100644 --- a/workflows/align/index.html +++ b/workflows/align/index.html @@ -4,7 +4,7 @@ - + @@ -32,11 +32,11 @@ - + - + - +
diff --git a/workflows/align/strobe/index.html b/workflows/align/strobe/index.html index 2174feeff..c357055a0 100644 --- a/workflows/align/strobe/index.html +++ b/workflows/align/strobe/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + diff --git a/workflows/assembly/index.html b/workflows/assembly/index.html index a187f0d50..9449b0871 100644 --- a/workflows/assembly/index.html +++ b/workflows/assembly/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + @@ -311,92 +311,194 @@

common runtime options , the assembly - module is configured using these command-line arguments:

+ +module is configured using the command-line arguments below. Since the assembly process consists of several distinct phases, +the options are shown with an extra column to reflect which part of the assembly process they correspond to.

+ - + - + - - - - - - - - - - + - - - - - - - - - - + - - + + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - +
argument short nameprocess type default required description
FASTQ_R1 FASTQ file ‼️ FASTQ file of forward reads
FASTQ_R2 FASTQ file ‼️ FASTQ file of reverse reads
--bx-tag-bstringBX‼️Which sequence header tag encodes the linked-read barcode (BX for BX:Z or BC for BC:Z)
--extra-params -x + spades assembly + string Additional spades parameters, in quotes
--ignore-bxtoggleIgnore linked-read info for initial spades assembly
--kmer-length -k + spades assembly + list of int auto Kmer lengths to use for initial spades assembly. They must be odd and <128, separated by commas, and without spaces. (e.g. 13,23,51)
--max-memory -rint > 1000 + spades assembly +integer > 1000 10000 Maximum memory for spades to use, given in megabytes
--arcs-extra-y + arcs scaffold +stringAdditional ARCS parameters, in quotes and option=arg format
--contig-length-c + arcs scaffold +integer500Minimum contig length
--links-n + arcs scaffold +integer5Minimum number of links to compute scaffold
--min-aligned-a + arcs scaffold +integer5Minimum aligned read pairs per barcode
--min-quality-q + arcs scaffold +integer 0-400Minimum mapping quality
--mismatch-m + arcs scaffold +integer5Maximum number of mismatches
--molecule-distance-d + arcs scaffold +integer50000Distance cutoff to split molecules (bp)
--molecule-length-l + arcs scaffold +integer2000Minimum molecule length (bp)
--seq-identity-i + arcs scaffold +integer 0-10098Minimum sequence identity
--span-s + arcs scaffold +integer20Minimum number of spanning molecules to be considered assembled
--organism-type -u + report + string eukaryote Organism type for assembly report. Options: eukaryote,prokaryote,fungusOrganism type for assembly report: eukaryote,prokaryote, or fungus
diff --git a/workflows/deconvolve/index.html b/workflows/deconvolve/index.html index 968d55f15..7b2c21846 100644 --- a/workflows/deconvolve/index.html +++ b/workflows/deconvolve/index.html @@ -4,7 +4,7 @@ - + @@ -32,12 +32,12 @@ - + - + - - + + @@ -437,7 +437,7 @@

--density -d integer -3 +3 On average, \frac{1}{2^d} kmers are indexed @@ -445,7 +445,7 @@

--dropout -a integer -0 +0 Minimum cloud size to deconvolve @@ -453,7 +453,7 @@

--kmer-length -k integer -21 +21 Size of k-mers to search for similarities @@ -461,7 +461,7 @@

--window-size -w integer -40 +40 Size of window guaranteed to contain at least one kmer diff --git a/workflows/demultiplex/index.html b/workflows/demultiplex/index.html index 30159f227..0de674635 100644 --- a/workflows/demultiplex/index.html +++ b/workflows/demultiplex/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + diff --git a/workflows/impute/index.html b/workflows/impute/index.html index c8b44b00a..866f7ad9a 100644 --- a/workflows/impute/index.html +++ b/workflows/impute/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + diff --git a/workflows/metassembly/index.html b/workflows/metassembly/index.html index f1190d097..dbfe6246c 100644 --- a/workflows/metassembly/index.html +++ b/workflows/metassembly/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + diff --git a/workflows/other/index.html b/workflows/other/index.html index 609ab3efc..3670cc0ff 100644 --- a/workflows/other/index.html +++ b/workflows/other/index.html @@ -4,7 +4,7 @@ - + @@ -32,12 +32,12 @@ - + - + - - + +
diff --git a/workflows/phase/index.html b/workflows/phase/index.html index 09b8ac419..57129dfd2 100644 --- a/workflows/phase/index.html +++ b/workflows/phase/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + @@ -414,7 +414,7 @@

--molecule-distance -d integer -100000 +100000 Base-pair distance threshold to separate molecules @@ -422,7 +422,7 @@

--prune-threshold -p integer (0-100) -7 +7 PHRED-scale (%) threshold for pruning low-confidence SNPs diff --git a/workflows/preflight/index.html b/workflows/preflight/index.html index 387c29ad2..c7c20c0b4 100644 --- a/workflows/preflight/index.html +++ b/workflows/preflight/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + +
diff --git a/workflows/qc/index.html b/workflows/qc/index.html index 56ad4cc40..8585c4169 100644 --- a/workflows/qc/index.html +++ b/workflows/qc/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + @@ -396,8 +396,8 @@

--deconvolve-params -p -(int,int,int,int) -(21,40,3,0) +int,int,int,int +21,40,3,0 Accepts the QuickDeconvolution parameters for k,w,d,a, in that order @@ -423,7 +423,7 @@

--min-length -n integer -30 +30 Discard reads shorter than this length @@ -431,7 +431,7 @@

--max-length -m integer -150 +150 Maximum length to trim sequences down to diff --git a/workflows/simulate/index.html b/workflows/simulate/index.html index a11c8fe3f..98b43bf6a 100644 --- a/workflows/simulate/index.html +++ b/workflows/simulate/index.html @@ -4,7 +4,7 @@ - + @@ -32,11 +32,11 @@ - + - + - +
diff --git a/workflows/simulate/simulate-linkedreads/index.html b/workflows/simulate/simulate-linkedreads/index.html index 7b95b4c63..a37846845 100644 --- a/workflows/simulate/simulate-linkedreads/index.html +++ b/workflows/simulate/simulate-linkedreads/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + @@ -361,7 +361,6 @@

argument short name -type default required description @@ -371,7 +370,6 @@

HAP1_GENOME -file path ‼️ Haplotype 1 of the diploid genome to simulate reads @@ -379,7 +377,6 @@

HAP2_GENOME -file path ‼️ Haplotype 1 of the diploid genome to simulate reads @@ -387,7 +384,6 @@

--barcodes -b -file path 10X barcodes File of linked-read barcodes to add to reads @@ -395,56 +391,49 @@

--distance-sd -s -integer -15 +15 Standard deviation of read-pair distance --molecule-length -l -integer -100 +100 Mean molecule length (kbp) --molecules-per -m -integer -10 +10 Average number of molecules per partition --mutation-rate -r -number -0.001 +0.001 Random mutation rate for simulating reads (0 - 1.0) --outer-distance -d -integer -350 +350 Outer distance between paired-end reads (bp) --patitions -p -integer -1500 +1500 Number (in thousands) of partitions/beads to generate --read-pairs -n -number -600 +600 Number (in millions) of read pairs to simulate diff --git a/workflows/simulate/simulate-variants/index.html b/workflows/simulate/simulate-variants/index.html index 1fcb7407c..dd552cc69 100644 --- a/workflows/simulate/simulate-variants/index.html +++ b/workflows/simulate/simulate-variants/index.html @@ -4,7 +4,7 @@ - + @@ -32,12 +32,12 @@ - + - + - - + +
@@ -364,7 +364,6 @@

argument short name -type required description @@ -373,56 +372,48 @@

INPUT_GENOME -file path ‼️ The haploid genome to simulate variants onto --centromeres -c -file path GFF3 file of centromeres to avoid --exclude-chr -e -file path Text file of chromosomes to avoid, one per line --genes -g -file path GFF3 file of genes to avoid simulating over (see snpindel for caveat) --heterozygosity -z -float between [0,1] proportion of simulated variants to make heterozygous (default: 0) --only-vcf -toggle When used with --heterozygosity, will create the diploid VCFs but will not simulate a diploid genome --prefix -string Naming prefix for output files (default: sim.{module_name}) --randomseed -integer Random seed for simulation @@ -465,7 +456,6 @@

argument short name -type default description @@ -474,64 +464,55 @@

--indel-count -m -integer -0 +0 Number of random indels to simulate --indel-vcf -i -file path VCF file of known indels to simulate --indel-ratio -d -float -1 +1 Insertion/Deletion ratio for indels --indel-size-alpha -a -float -2.0 +2.0 Exponent Alpha for power-law-fitted indel size distribution --indel-size-constant -l -float -0.5 +0.5 Exponent constant for power-law-fitted indel size distribution --snp-count -n -integer -0 +0 Number of random snps to simulate --snp-gene-constraints -y -string How to constrain randomly simulated SNPs {noncoding,coding,2d,4d} when using --genes --snp-vcf -s -file path VCF file of known snps to simulate --titv-ratio -r -float -0.5 +0.5 Transition/Transversion ratio for snps @@ -578,7 +559,6 @@

argument short name -type default description @@ -587,28 +567,24 @@

--count -n -integer -0 +0 Number of random inversions to simulate --max-size -x -integer -100000 +100000 Maximum inversion size (bp) --min-size -m -integer -1000 +1000 Minimum inversion size (bp) --vcf -v -file path VCF file of known inversions to simulate @@ -632,7 +608,6 @@

argument short name -type default description @@ -641,50 +616,43 @@

--vcf -v -file path VCF file of known copy number variants to simulate --count -n -integer -0 +0 Number of random cnv to simulate --dup-ratio -d -float -1 +1 Tandem/Dispersed duplication ratio --gain-ratio -l -float -1 +1 Relative ratio of DNA gain over DNA loss --max-size -x -integer -100000 +100000 Maximum cnv size (bp) --max-copy -y -integer -10 +10 Maximum number of copies --min-size -m -integer -1000 +1000 Minimum cnv size (bp) @@ -730,7 +698,6 @@

argument short name -type default description @@ -739,14 +706,12 @@

--count -n -integer -0 +0 Number of random inversions to simulate --vcf -v -file path VCF file of known inversions to simulate diff --git a/workflows/snp/index.html b/workflows/snp/index.html index ecf2997d4..3b14bd279 100644 --- a/workflows/snp/index.html +++ b/workflows/snp/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + @@ -439,7 +439,7 @@

--ploidy -n integer -2 +2 Ploidy of samples @@ -455,7 +455,7 @@

--regions -r integer/file path/string -50000 +50000 Regions to call variants on (see below) diff --git a/workflows/sv/index.html b/workflows/sv/index.html index 7830c26e3..60b99485a 100644 --- a/workflows/sv/index.html +++ b/workflows/sv/index.html @@ -4,7 +4,7 @@ - + @@ -32,11 +32,11 @@ - + - + - +
diff --git a/workflows/sv/leviathan/index.html b/workflows/sv/leviathan/index.html index 26f573572..8fa4ab541 100644 --- a/workflows/sv/leviathan/index.html +++ b/workflows/sv/leviathan/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + @@ -452,7 +452,7 @@

--iterations -i integer -50 +50 Number of iterations to perform through index (reduces memory) @@ -460,7 +460,7 @@

--min-barcodes -b integer -2 +2 Minimum number of barcode overlaps supporting candidate SV @@ -468,7 +468,7 @@

--min-sv -m integer -1000 +1000 Minimum size of SV to detect diff --git a/workflows/sv/naibr/index.html b/workflows/sv/naibr/index.html index ec33ead47..4d3f9e4e0 100644 --- a/workflows/sv/naibr/index.html +++ b/workflows/sv/naibr/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + @@ -439,7 +439,7 @@

--min-barcodes -b integer -2 +2 Minimum number of barcode overlaps supporting candidate SV @@ -447,7 +447,7 @@

--min-quality -q integer (0-40) -30 +30 Minimum MQ (SAM mapping quality) to pass filtering @@ -455,7 +455,7 @@

--min-sv -n integer -1000 +1000 Minimum size of SV to detect @@ -463,7 +463,7 @@

--molecule-distance -m integer -100000 +100000 Base-pair distance threshold to separate molecules