diff --git a/resources/js/config.js b/resources/js/config.js
index bbaebb5a0..10e4bd43e 100644
--- a/resources/js/config.js
+++ b/resources/js/config.js
@@ -1 +1 @@
-var __DOCS_CONFIG__ = {"id":"QJje139DqBZr+AfCQzqp+wAKFOADlqUghf4","key":"TmCxVCWok48UkFNnu5TnJ23EQBjJ6LHSSMDaKqKn1XE.jOIMLliBUfs+qN/gCeiFGHtmv9MEP2Qq58C5xuS6SDOy8AVTdt5tm99epNJ1t7bF4YlVjj8XbaRsIm0YyIIDXQ.7","base":"/harpy/","host":"pdimens.github.io","version":"1.0.0","useRelativePaths":true,"documentName":"index.html","appendDocumentName":false,"trailingSlash":true,"preloadSearch":false,"cacheBustingToken":"3.6.0.784319887743","cacheBustingStrategy":"query","sidebarFilterPlaceholder":"Filter","toolbarFilterPlaceholder":"Filter","showSidebarFilter":true,"filterNotFoundMsg":"No member names found containing the query \"{query}\"","maxHistoryItems":15,"homeIcon":"","access":[{"value":"public","label":"Public"},{"value":"protected","label":"Protected"}],"toolbarLinks":[{"id":"fields","label":"Fields"},{"id":"properties","label":"Properties"},{"id":"methods","label":"Methods"},{"id":"events","label":"Events"}],"sidebar":[{"n":"/","l":"Home","s":""},{"n":"install","l":"Install","s":""},{"n":"workflows","l":"Workflows","c":false,"i":[{"n":"align","l":"Align","i":[{"n":"bwa","l":"BWA","s":""},{"n":"ema","l":"EMA","s":""},{"n":"strobe","l":"Strobe","s":""}],"s":""},{"n":"assembly","l":"Assembly","s":""},{"n":"deconvolve","l":"Deconvolve","s":""},{"n":"demultiplex","l":"Demultiplex","s":""},{"n":"impute","l":"Impute","s":""},{"n":"metassembly","l":"Metassembly","s":""},{"n":"other","l":"Other","s":""},{"n":"phase","l":"Phase","s":""},{"n":"preflight","l":"Preflight","s":""},{"n":"qc","l":"QC","s":""},{"n":"simulate","l":"Simulate","i":[{"n":"simulate-linkedreads","l":"Linked Reads","s":""},{"n":"simulate-variants","l":"Variants","s":""}],"s":""},{"n":"snp","l":"SNP","s":""},{"n":"sv","l":"SV","i":[{"n":"leviathan","l":"Leviathan","s":""},{"n":"naibr","l":"Naibr","s":""}],"s":""}],"s":""},{"n":"haplotagdata","l":"Haplotag Data","s":""},{"n":"commonoptions","l":"Common Options","s":""},{"n":"troubleshooting","l":"Troubleshooting","s":""},{"n":"snakemake","l":"Snakemake Things","s":""},{"n":"software","l":"Software","s":""},{"n":"development","l":"Development","s":""},{"n":"blog","l":"Blog","v":false,"i":[{"n":"sort_by_barcode","l":" Sort data by barcode","v":false,"s":""},{"n":"simulate_diploid","l":" Simulating variants","v":false,"s":""},{"n":"sv_pooling","l":" Pooling samples for SV calling","v":false,"s":""},{"n":"software_environments","l":" Choosing a software runtime method","v":false,"s":""},{"n":"filtering_snps","l":" Filtering Variants","v":false,"s":""}]}],"search":{"mode":1,"minChars":2,"maxResults":20,"placeholder":"Search","hotkeys":["k"],"noResultsFoundMsg":"Sorry, no results found.","recognizeLanguages":true,"languages":[0],"preload":false},"resources":{"History_Title_Label":"History","History_ClearLink_Label":"Clear","History_NoHistory_Label":"No history items","API_AccessFilter_Label":"Access","API_ParameterSection_Label":"PARAMETERS","API_SignatureSection_Label":"SIGNATURE","API_CopyHint_Label":"Copy","API_CopyNameHint_Label":"Copy name","API_CopyLinkHint_Label":"Copy link","API_CopiedAckHint_Label":"Copied!","API_MoreOverloads_Label":"more","API_MoreDropdownItems_Label":"More","API_OptionalParameter_Label":"optional","API_DefaultParameterValue_Label":"Default value","API_InheritedFilter_Label":"Inherited","Search_Input_Placeholder":"Search","Toc_Contents_Label":"Contents","Toc_RelatedClasses_Label":"Related Classes","History_JustNowTime_Label":"just now","History_AgoTime_Label":"ago","History_YearTime_Label":"y","History_MonthTime_Label":"mo","History_DayTime_Label":"d","History_HourTime_Label":"h","History_MinuteTime_Label":"m","History_SecondTime_Label":"s"}};
+var __DOCS_CONFIG__ = {"id":"uzGODrl0mREmP9L3/OfSXjWQjbsJgFn158Q","key":"U4RB4rcnFIaqAtzcQAIRCOiytVznxV3bFdU6rpbzv6c.CIeA26iUIsKgPt90wMJ4a8QjJwT5VI+cHc/mbruk6X0644gb6/AtgrWJP3a48Lcsl1jKEXR1VHz2VOdIm6Cb4A.38","base":"/harpy/","host":"pdimens.github.io","version":"1.0.0","useRelativePaths":true,"documentName":"index.html","appendDocumentName":false,"trailingSlash":true,"preloadSearch":false,"cacheBustingToken":"3.6.0.784397146456","cacheBustingStrategy":"query","sidebarFilterPlaceholder":"Filter","toolbarFilterPlaceholder":"Filter","showSidebarFilter":true,"filterNotFoundMsg":"No member names found containing the query \"{query}\"","maxHistoryItems":15,"homeIcon":"","access":[{"value":"public","label":"Public"},{"value":"protected","label":"Protected"}],"toolbarLinks":[{"id":"fields","label":"Fields"},{"id":"properties","label":"Properties"},{"id":"methods","label":"Methods"},{"id":"events","label":"Events"}],"sidebar":[{"n":"/","l":"Home","s":""},{"n":"install","l":"Install","s":""},{"n":"workflows","l":"Workflows","c":false,"i":[{"n":"align","l":"Align","i":[{"n":"bwa","l":"BWA","s":""},{"n":"ema","l":"EMA","s":""},{"n":"strobe","l":"Strobe","s":""}],"s":""},{"n":"assembly","l":"Assembly","s":""},{"n":"deconvolve","l":"Deconvolve","s":""},{"n":"demultiplex","l":"Demultiplex","s":""},{"n":"impute","l":"Impute","s":""},{"n":"metassembly","l":"Metassembly","s":""},{"n":"other","l":"Other","s":""},{"n":"phase","l":"Phase","s":""},{"n":"preflight","l":"Preflight","s":""},{"n":"qc","l":"QC","s":""},{"n":"simulate","l":"Simulate","i":[{"n":"simulate-linkedreads","l":"Linked Reads","s":""},{"n":"simulate-variants","l":"Variants","s":""}],"s":""},{"n":"snp","l":"SNP","s":""},{"n":"sv","l":"SV","i":[{"n":"leviathan","l":"Leviathan","s":""},{"n":"naibr","l":"Naibr","s":""}],"s":""}],"s":""},{"n":"haplotagdata","l":"Haplotag Data","s":""},{"n":"commonoptions","l":"Common Options","s":""},{"n":"troubleshooting","l":"Troubleshooting","s":""},{"n":"snakemake","l":"Snakemake Things","s":""},{"n":"software","l":"Software","s":""},{"n":"utilities","l":"Utilities","s":""},{"n":"blog","l":"Blog","v":false,"i":[{"n":"sort_by_barcode","l":" Sort data by barcode","v":false,"s":""},{"n":"simulate_diploid","l":" Simulating variants","v":false,"s":""},{"n":"sv_pooling","l":" Pooling samples for SV calling","v":false,"s":""},{"n":"software_environments","l":" Choosing a software runtime method","v":false,"s":""},{"n":"filtering_snps","l":" Filtering Variants","v":false,"s":""}]}],"search":{"mode":1,"minChars":2,"maxResults":20,"placeholder":"Search","hotkeys":["k"],"noResultsFoundMsg":"Sorry, no results found.","recognizeLanguages":true,"languages":[0],"preload":false},"resources":{"History_Title_Label":"History","History_ClearLink_Label":"Clear","History_NoHistory_Label":"No history items","API_AccessFilter_Label":"Access","API_ParameterSection_Label":"PARAMETERS","API_SignatureSection_Label":"SIGNATURE","API_CopyHint_Label":"Copy","API_CopyNameHint_Label":"Copy name","API_CopyLinkHint_Label":"Copy link","API_CopiedAckHint_Label":"Copied!","API_MoreOverloads_Label":"more","API_MoreDropdownItems_Label":"More","API_OptionalParameter_Label":"optional","API_DefaultParameterValue_Label":"Default value","API_InheritedFilter_Label":"Inherited","Search_Input_Placeholder":"Search","Toc_Contents_Label":"Contents","Toc_RelatedClasses_Label":"Related Classes","History_JustNowTime_Label":"just now","History_AgoTime_Label":"ago","History_YearTime_Label":"y","History_MonthTime_Label":"mo","History_DayTime_Label":"d","History_HourTime_Label":"h","History_MinuteTime_Label":"m","History_SecondTime_Label":"s"}};
diff --git a/resources/js/search.json b/resources/js/search.json
index 1b78654ab..6de64feed 100644
--- a/resources/js/search.json
+++ b/resources/js/search.json
@@ -1 +1 @@
-[[{"i":"#","p":["Using Harpy to process your haplotagged data"]},{"l":"Home","p":["Harpy is a haplotagging data processing pipeline for Linux-based systems. It uses all the magic of Snakemake under the hood to handle the worklfow decision-making, but as a user, you just interact with it like a normal command-line"]},{"l":"Harpy Modules","p":["Harpy is modular, meaning you can use different parts of it independent from each other. Need to only align reads? Great! Only want to call variants? Awesome! All modules are called by"]},{"l":"Using Harpy","p":["You can call harpy without any arguments (or with --help) to print the docstring to your terminal. You can likewise call any of the modules without arguments or with --help to see their usage (e.g."]},{"l":"Linked-Read Workflow","p":["Depending on your project goals, you may want any combination of SNPs, structural variants (inversions, deletions, duplications), or phased haplotypes. Below is a flow chart outlining a general workflow of linked-read data."]}],[{"l":"Install"},{"l":"Install Harpy","p":["Harpy is hosted on Bioconda! That means to install it, you just need to have mamba(or conda) on your Linux-based system and install it with a simple command. You can install Harpy into an existing environment or create a new one for it (recommended)."]},{"l":"install into a new environment"},{"i":"recommended","l":"✨recommended✨","p":["The code snippet below creates a new environment called harpy(the -n harpy part) and installs harpy into it from the bioconda channel (-c bioconda part). You can name this environment anything (e.g."]},{"l":"install into an existing evironment","p":["If you want to install harpy into an existing environment, then with an environment already activated (via mamba activate env_name) simply use the mamba install command and harpy"]},{"l":"Update Harpy","p":["If you want to update Harpy, the process is quite similar:"]}],[{"i":"#","p":["Align haplotagged sequences"]},{"l":"Align Sequences to a Genome","p":["After your sequences (in FASTQ format) have been checked for quality, you will need to align them to a reference genome before you can call variants. Harpy offers several aligners for this purpose:"]}],[{"i":"#","p":["Align haplotagged sequences with BWA MEM"]},{"l":"Map Reads onto a genome with BWA MEM","p":["Once sequences have been trimmed and passed through other QC filters, they will need to be aligned to a reference genome. This module within Harpy expects filtered reads as input,"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Molecule distance","p":["The --molecule-distance option is used during the BWA alignment workflow to assign alignments a unique Molecular Identifier MI:i tag based on their haplotag barcode and the distance threshold you specify. See"]},{"l":"Quality filtering","p":["The --min-quality argument filters out alignments below a given MQ threshold. The default, 30, keeps alignments that are at least 99.9% likely correctly mapped. Set this value to"]},{"l":"Marking PCR duplicates","p":["Harpy uses samtools markdup to mark putative PCR duplicates. By using the --barcode-tag BX option, it considers the linked-read barcode for more accurate duplicate detection. Duplicate"]},{"l":"BWA workflow"}],[{"i":"#","p":["Align haplotagged sequences with EMA"]},{"l":"Map Reads onto a genome with EMA","p":["Once sequences have been trimmed and passed through other QC filters, they will need to be aligned to a reference genome. This module within Harpy expects filtered reads as input,"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Barcode whitelist","p":["Some linked-read methods (e.g. 10x, Tellseq) require the inclusion of a barcode \"whitelist.\" This file is a simple text file that has one barcode per line so a given software knows what barcodes to expect in your data."]},{"l":"Quality filtering","p":["The --min-quality argument filters out alignments below a given MQ threshold. The default, 30, keeps alignments that are at least 99.9% likely correctly mapped. Set this value to"]},{"l":"Marking PCR duplicates","p":["EMA marks duplicates in the resulting alignments, however the read with invalid barcodes are aligned separately with BWA. Therefore, Harpy uses samtools markdup to mark putative"]},{"l":"EMA workflow"}],[{"i":"#","p":["Align haplotagged sequences with strobealign"]},{"l":"Map Reads onto a genome with strobealign","p":["Once sequences have been trimmed and passed through other QC filters, they will need to be aligned to a reference genome. This module within Harpy expects filtered reads as input,"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Read Length","p":["The strobealign program uses a new strobemer design for aligning and requires its own way of indexing the genome. The index must be configured for the average read length of the sample"]},{"l":"Molecule distance","p":["The --molecule-distance option is used during the BWA alignment workflow to assign alignments a unique Molecular Identifier MI:i tag based on their haplotag barcode and the distance threshold you specify. See"]},{"l":"Quality filtering","p":["The --min-quality argument filters out alignments below a given MQ threshold. The default, 30, keeps alignments that are at least 99.9% likely correctly mapped. Set this value to"]},{"l":"Marking PCR duplicates","p":["Harpy uses samtools markdup to mark putative PCR duplicates. By using the --barcode-tag BX option, it considers the linked-read barcode for more accurate duplicate detection. Duplicate"]},{"l":"Strobealign workflow"}],[{"i":"#","p":["Create a genome assembly from linked reads"]},{"l":"Create a Genome Assembly","p":["If you have single-sample data, you might be interested in a genome assembly. Unlike metagenome assemblies, a classic genome assembly assumes there is exactly one genome present in your sequences and will try to"]},{"l":"Running Options","p":["In addition to the , the module is configured using the command-line arguments below. Since the assembly process consists of several distinct phases, the descriptions are provided with an extra badge to reflect which part of the assembly process they correspond to."]},{"l":"Deconvolved Inputs","p":["For linked-read assemblies, the barcodes need to be deconvolved in the sequence data, meaning that barcodes that are shared by reads that originate from different molecules need to have unique barcode"]},{"l":"Assembly Workflow"}],[{"i":"#","p":["Resolve barcodes shared by different molecules"]},{"l":"Resolve barcodes shared by different molecules","p":["Running is optional. In the alignment workflows (), Harpy already uses a distance-based approach to deconvolve barcodes and assign MI tags (Molecular Identifier), whereas the workflow has the"]},{"l":"Running Options"},{"l":"Resulting Barcodes","p":["After deconvolution, some barcodes may have a hyphenated suffix like -1 or -2(e.g. A01C33B41D93-1). This is how deconvolution methods create unique variants of barcodes to denote that identical barcodes"]},{"l":"Harpy Deconvolution Nuances","p":["Some of the downstream linked-read tools Harpy uses expect linked read barcodes to either look like the 16-base 10X variety or a standard haplotag (AxxCxxBxxDxx). Their pattern-matching would not recognize barcodes deconvoluted with"]}],[{"i":"#","p":["Demultiplex raw sequences into haplotag barcoded samples"]},{"l":"Demultiplex Raw Sequences","p":["When pooling samples and sequencing them in parallel on an Illumina sequencer, you will be given large multiplexed FASTQ files in return. These files contain sequences for all of your samples and need to be demultiplexed using barcodes to"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Haplotag Types"},{"l":"Gen I Demultiplex Workflow"}],[{"i":"#","p":["Impute genotypes for haplotagged data with Harpy"]},{"l":"Impute Genotypes using Sequences","p":["After variants have been called, you may want to impute missing genotypes to get the most from your data. Harpy uses STITCH to impute genotypes, a haplotype-based method that is linked-read aware. Imputing genotypes requires a variant call file"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Extra STITCH parameters","p":["You may add additional parameters to STITCH by way of the--extra-params(or -x) option. Since STITCH is a function in the R language, the parameters you add must be in R syntax (e.g."]},{"l":"Prioritize the vcf file","p":["Sometimes you want to run imputation on all the samples present in the INPUTS, but other times you may want to only impute the samples present in the --vcf file. By default, Harpy assumes you want to use all the samples"]},{"l":"Parameter file","p":["Typically, one runs STITCH multiple times, exploring how results vary with different model parameters (explained in next section). The solution Harpy uses for this is to have the user"]},{"l":"STITCH Parameters"},{"l":"Imputation Workflow"}],[{"i":"#","p":["Create a metagenome assembly from linked reads"]},{"l":"Create a Metagenome Assembly","p":["If you have mixed-sample data, you might be interested in a metagenome assembly, also known as a metassembly. Unlike a single-sample assembly, a metassembly assumes there are multiple genomes present in your sequences and will try to"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Deconvolved Inputs","p":["For linked-read assemblies, the barcodes need to be deconvolved in the sequence data, meaning that barcodes that are shared by reads that originate from different molecules need to have unique barcode"]},{"l":"Metassembly Workflow"}],[{"i":"#","p":["Generate extra files for analysis with Harpy"]},{"l":"Other Harpy modules","p":["Some parts of Harpy (variant calling, imputation) want or need extra files. You can create various files necessary for different modules using these extra modules:"]},{"l":"Other modules"},{"l":"resume","p":["When calling a workflow (e.g. ), Harpy performs various file checks and validations, sets up the Snakemake command, output folder(s), etc. In the event you want to continue a failed or manually terminated workflow without overwriting the workflow"]},{"l":"arguments","p":["The DIRECTORY is the output directory of a previous harpy-invoked workflow, which must have the workflow/config.yaml file. For example, if you previously ran harpy align bwa -o align-bwa ..."]},{"l":"popgroup","p":["Creates a sample grouping file for variant calling"]},{"i":"arguments-1","l":"arguments","p":["This optional file is useful if you want SNP variant calling to happen on a per-population level via or on samples pooled-as-populations via ."]},{"l":"stitchparams","p":["Create a template parameter file for the module. The file is formatted correctly and serves as a starting point for using parameters that make sense for your study."]},{"i":"arguments-2","l":"arguments","p":["Typically, one runs STITCH multiple times, exploring how results vary with different model parameters. The solution Harpy uses for this is to have the user provide a tab-delimited dataframe file where the columns are the 6 STITCH model"]}],[{"i":"#","p":["Phase haplotypes for haplotagged data with Harpy"]},{"l":"Phase SNPs into Haplotypes","p":["You may want to phase your genotypes into haplotypes, as haplotypes tend to be more informative than unphased genotypes (higher polymorphism, captures relationship between genotypes). Phasing"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Prioritize the vcf file","p":["Sometimes you want to run imputation on all the samples present in the INPUTS, but other times you may want to only impute the samples present in the --vcf file. By default, Harpy assumes you want to use all the samples"]},{"l":"Molecule distance","p":["The molecule distance refers to the base-pair distance dilineating separate molecules. In other words, when two alignments on a single contig share the same barcode, how far away from each other are we willing to say they were and still consider them having"]},{"l":"Pruning threshold","p":["The pruning threshold refers to a PHRED-scale value between 0-1 (a percentage) for removing low-confidence SNPs from consideration. With Harpy, you configure this value as an integer"]},{"l":"Phasing Workflow"}],[{"i":"#","p":["Run file format checks on haplotagged FASTQ/BAM files"]},{"l":"Pre-flight checks for input files","p":["Harpy does a lot of stuff with a lot of software and each of these programs expect the incoming data to follow particular formats (plural, unfortunately). These formatting opinions/specifics are at the mercy of the original developers and while there are times when Harpy can (and does)"]},{"l":"When to run"},{"l":"Running Options","p":["In addition to the , the and modules are configured using only command-line input arguments:"]},{"l":"Workflow"}],[{"i":"#","p":["Quality trim haplotagged sequences with Harpy"]},{"l":"Quality Trim Sequences","p":["Raw sequences are not suitable for downstream analyses. They have sequencing adapters, index sequences, regions of poor quality, etc. The first step of any genetic sequence analyses is to remove these adapters and trim poor quality data. You can remove adapters,"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"QC Workflow"}],[{"i":"#","p":["Simulate genomic data"]},{"l":"Simulate Genomic Data","p":["You may be interested in benchmarking variant detection or maybe just trying out haplotagging data without any financial commitment-- that's where simulations come in handy."]},{"l":"Simulate Genomic Variants","p":["Harpy lets you simulate genomic variants via for different variant types such as single nucleotide polymorphisms (SNP), indels, inversions, copy number variants (CNV), and translocations. All you need is to provide a genome to simulate"]},{"l":"Simulate Haplotag Linked-Reads","p":["You can also simulate haplotag-style linked reads from an existing genome using . Harpy incorporates LRSIM to generate linked reads from a diploid genomic. If you only have a haploid genome, then you can create a diploid genome by simulating variants into it with"]}],[{"i":"#","p":["Simulate linked reads from a genome"]},{"l":"Simulate Linked Reads","p":["Simulate linked reads from a genome"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Mutation Rate","p":["The read simulation is two-part: first dwgsim generates forward and reverse FASTQ files from the provided genome haplotypes( HAP1_GENOME and HAP2_GENOME), then LRSIM takes over and creates linked-reads from that. The"]},{"l":"Simulating a single sample","p":["If you intend to simulate a \"single individual\" (i.e. use this module once), then you might want no additonal SNPs beyond the variants you may have already introduced into the genome and set"]},{"l":"Simulating multiple samples","p":["If you intend on simulating \"multiple individuals\" (i.e. use this module multiple times on the same genome haplotypes), it may make sense to set this value larger than 0 so there is some \"natural\" variation between your simulated individuals."]},{"l":"Partitions","p":["TL;DR: 10X partitions ≈ haplotag beads"]},{"l":"Barcodes","p":["Barcodes, if provided, must be given as 16-basepair nucleotide sequences, one per line. If not provided, Harpy will download the standard 10X Genomics 4M-with-alts-february-2016.txt"]},{"l":"10X to Haplotag conversion","p":["Harpy will convert the simulated 10X-style reads, where the 16-basepair barcode is at the beginning of read 1, to haplotag format, where the barcode is coded in the sequence header under the"]},{"l":"Choosing parameters","p":["LRSIM does internal calculations to determine the number of reads per molecule based on --read-pairs,--partitions, and --molecules-per. Understanding how these parameters affect the resulting sequences"]},{"l":"Parameter calculator","p":["Conveniently, we provide a calculator to help you make informed decisions for these parameters:"]},{"l":"Simulate Linkedreads Workflow"}],[{"i":"#","p":["Simulate snps, indels, inversions, cnv, translocations"]},{"l":"Simulate Genomic Variants","p":["Simulate snps, indels, inversions, cnv, translocations"]},{"l":"Modules","p":["There are 4 submodules with very obvious names:"]},{"l":"Running Options","p":["While there are serveral differences between individual workflow options, each has available all the like other Harpy modules. Each requires and input genome at the end of the command line, and each requires either a"]},{"l":"Simulate known variants","p":["Rather than simulating random variants, you can use a VCF file as input to any of the workflows to have simuG simulate the variants (of that type) from the VCF file. This becomes particularly"]},{"l":"Heterozygosity","p":["Each workflow has a --heterozygosity parameter where you can specify the heterozygosity of the simulated variants, which creates two new VCF files ({prefix}.hap1.vcf,{prefix}.hap2.vcf"]}],[{"i":"#","p":["Call SNPs and small indels"]},{"l":"Call SNPs and small indels","p":["After reads have been aligned, e.g., with , you can use those alignment files(.bam) to call variants in your data. Harpy can call SNPs and small indels using bcftools mpileup or with"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"ploidy","p":["If you are calling haploid or diploid samples, using either mpileup or freebayes will be comparable. However, if you need to call SNPs in polyploids (ploidy >2), then you will need to use"]},{"l":"regions","p":["The --regions(-r) option lets you specify the genomic regions you want to call variants on. Keep in mind that mpileup uses 1-based positions for genomic intervals, whereas freebayes"]},{"l":"populations","p":["Grouping samples changes the way the variant callers computes certain statistics when calling variants. If you have reason to believe there is a biologically meaningful grouping scheme to your samples, then you should include"]},{"l":"SNP calling workflow"}],[{"i":"#","p":["Find structural variants"]},{"l":"Find structural variants","p":["The module identifies single nucleotide polymorphisms (SNP) and small indels, but you may want to (and should!) leverage the linked-read data to identify larger structural variants (SV) like large deletions, duplications, and"]},{"l":"Caveats"},{"l":"NAIBR","p":["While our testing shows that NAIBR tends to find known inversions that LEVIATHAN misses, the program requires haplotype phased bam files as input. That means the alignments have a"]},{"l":"LEVIATHAN","p":["LEVIATHAN relies on split-read information in the sequence alignments to call variants. The EMA aligner does not report split read alignments, instead it reports secondary alignments."]}],[{"i":"#","p":["Call structural variants using Leviathan"]},{"l":"Call Structural Variants using LEVIATHAN","p":["(like indels, insertions, duplications, breakends)"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Single-sample variant calling","p":["When not using a population grouping file via --populations, variants will be called per-sample. Due to the nature of structural variant VCF files, there isn't an entirely fool-proof way"]},{"l":"Pooled-sample variant calling","p":["With the inclusion of a population grouping file via --populations, Harpy will merge the bam files of all samples within a population and call variants on these alignment pools. Preliminary work shows that this way identifies more variants and with fewer false"]},{"l":"LEVIATHAN workflow"}],[{"i":"#","p":["Call structural variants using NAIBR (plus)"]},{"l":"Call Structural Variants using NAIBR","p":["(like indels, insertions, duplications)"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Molecule distance","p":["The --molecule-distance option is used to let the program determine how far apart alignments on a contig with the same barcode can be from each other and still considered as originating from the same DNA molecule. See"]},{"l":"Single-sample variant calling","p":["When not using a population grouping file via --populations, variants will be called per-sample. Due to the nature of structural variant VCF files, there isn't an entirely fool-proof way"]},{"l":"Pooled-sample variant calling","p":["With the inclusion of a population grouping file via --populations, Harpy will merge the bam files of all samples within a population and call variants on these alignment pools. Preliminary work shows that this way identifies more variants and with fewer false"]},{"l":"Optional vcf file","p":["In order to get the best variant calling performance out of NAIBR, it requires phased bam files as input. Using --vcf is optional and not used by NAIBR directly. However, to use"]},{"i":"a-phased-input---vcf","l":"a phased input --vcf","p":["This file can be in vcf/vcf.gz/bcf format and most importantly it must be phased haplotypes. There are various ways to haplotype SNPs, but you can use to phase your SNPs into haplotypes using the haplotag barcode information. The resulting phased VCF file can then be used as input here."]},{"l":"NAIBR workflow"}],[{"l":"Haplotag data"},{"i":"what-is-haplotagging","l":"What is haplotagging?","p":["Linked-read sequencing exists to combine the throughput and accuracy of short-read sequencing with the long range haplotype information of long-read sequencing. Haplotagging is an implementation of linked-read sequencing developed by"]},{"l":"Data Format"},{"l":"Barcodes","p":["While barcodes are actually combinatorial bases, in the read headers they are represented with the format AxxCxxBxxDxx, where each barcode segment is denoted as Axx(or Bxx, etc.)."]},{"l":"barcode protocol varieties","p":["If you think haplotagging is as simple as exactly 96^4 unique barcodes, you would only be half-correct. The original haplotagging protocol in Meier et al. is good, but the authors (and others) have been working to improve this linked-read technology to improve"]},{"l":"where the barcodes go","p":["Chromium 10X linked-reads use a format where the barcode is the leading 16 bases of the forward (R1) read. However, haplotagging data does not use that format and many of the tools"]},{"l":"Read headers","p":["Like mentioned, the haplotag barcode is expected to be stored in the BX:Z: tag in the read header. This information is retained through the various Harpy steps. An example read header could look like:"]},{"l":"Read length","p":["Reads must be at least 30 base pairs in length for alignment. By default, the module removes reads <30bp."]},{"l":"Compression","p":["Harpy generally doesn't require the input sequences to be in gzipped/bgzipped format, but it's good practice to compress your reads anyway. Compressed files are expected to end with the extension"]},{"l":"Naming conventions","p":["Unfortunately, there are many different ways of naming FASTQ files, which makes it difficult to accomodate every wacky iteration currently in circulation. While Harpy tries its best to be flexible, there are limitations."]},{"l":"Barcode thresholds","p":["By the nature of linked read technologies, there will (almost always) be more DNA fragments than unique barcodes for them. As a result, it's common for barcodes to reappear in sequences. Rather than incorrectly assume that all sequences/alignments with the same barcode"]}],[{"l":"Common Harpy Options"},{"l":"Input Arguments","p":["Each of the main Harpy modules (e.g. or ) follows the format of"]},{"l":"Common command-line options","p":["Every Harpy module has a series of configuration parameters. These are arguments you need to input to configure the module to run on your data, such as the directory with the reads/alignments,"]},{"i":"--contigs","l":"--contigs","p":["Some of the workflows (like ) plot per-contig information in their reports. By default, Harpy will plot up to 30 of the largest contigs. If you are only interested in a specific set of contigs, then you can use"]},{"l":"example","p":["You could call and specify 20 threads with no output to console:"]},{"l":"The workflow folder","p":["When you run one of the main Harpy modules, the output directory will contain a workflow folder. This folder is both necessary for the module to run and is very useful to understand what the module did, be it for your own"]},{"l":"The Genome folder","p":["You will notice that many of the workflows will create a Genome folder in the working directory. This folder is to make it easier for Harpy to store the genome and the associated"]}],[{"l":"Troubleshooting","p":["Lots of stuff can go wrong during an analysis. The intent of this page is to guide you through navigating the inevitable errors associated with doing bioinformatics."]},{"l":"Troubleshooting Harpy","p":["Harpy has two steps: first it performs checks and validations, then it runs Snakemake."]},{"l":"checks and validations","p":["First, Harpy takes your command-line inputs and checks/validates the input files and parameters. If your parameters are not the correct type (e.g. a number where there should be a file), the"]},{"l":"snakemake validations","p":["Once all the file validations pass, Harpy passes the baton over to Snakemake. Snakemake builds a workflow graph of the rules and performs its own checks. If you get an error before the workflow starts processing any data (there"]},{"l":"error during a workflow","p":["Sometimes something goes wrong with one of the steps in a workflow. If/when that happens, Harpy will print the offending step and all the information Snakemake has regarding the failure. If the step had a log file, it will"]},{"l":"Common Issues"},{"l":"installation issue","p":["Conda is an awesome package manager, but it's slow and uses a ton of memory as dependencies increase. Harpy has a lot of dependencies and you might stall out conda trying to install it. Use mamba instead-- it'll work where conda fails."]},{"l":"imputation or phasing failure","p":["If you use bamutils clipOverlap on alignments that are used for the or modules, they will cause both programs to error. We don't know why, but they do."]},{"l":"SAM name and ID mismatch","p":["Aligning a sample to a genome via Harpy will insert the sample name (based on the file name) into the alignment header (the @RG ID:name SM:name tag). It likewise expects, through various steps,"]}],[{"l":"Snakamake Things"},{"l":"Workflow logs","p":["Barring a few exceptions, most of Harpy's options are Snakemake workflows. This means we are all at the mercy of how Snakemake operates, which includes the .snakemake/ folder in your project directory. That folder contains"]},{"l":"Adding Snakemake Parameters","p":["Harpy relies on Snakemake under the hood to handle file and job dependencies. Most of these details have been abstracted away from the end-user, but every module of Harpy (except"]},{"l":"Common use cases","p":["You likely wont need to invoke --snakemake very often, if ever. However, here examples of some possible use cases for this parameter."]}],[{"l":"Software used in Harpy","p":["Harpy is the sum of its parts, and out of tremendous respect for the developers involved in the included software, we would like to highlight the tools directly involved in Harpy's many moving pieces."]},{"l":"Standalone Software"},{"l":"Software Packages"}],[{"l":"Developing Harpy","p":["Harpy is an open source program written using a combination of BASH, R, RMarkdown, Python, and Snakemake. This page provides information on Harpy's development and how to contribute to it, if you were inclined to do so."]},{"l":"Installing dev version","p":["The process follows cloning the harpy repository, installing the preconfigured conda environment, and running the resources/buildlocal.sh script to move all the necessary files to the"]},{"i":"harpys-components","l":"Harpy's components"},{"l":"source code","p":["Harpy runs in two stages:"]},{"l":"bioconda recipe","p":["For the ease of installation for end-users, Harpy has a recipe and build script in Bioconda, which makes it available for download and installation. A copy of the recipe is also"]},{"l":"The Harpy repository"},{"l":"repo structure","p":["Harpy exists as a Git repository and has 5 standard branches that are used in specific ways during development. Git is a popular version control system and discussing its use is out of the scope of this documentation, however there is no"]},{"l":"development workflow","p":["The dev workflow is reasonably standard:"]},{"l":"containerization","p":["As of Harpy v1.0, the software dependencies that the Snakemake workflows use are pre-configured as a Docker image that is uploaded to Dockerhub. Updating or editing this container can be done automatically or manually."]},{"l":"automatically","p":["The testing GitHub Action will automatically create a Dockerfile with (a hidden harpy command) and build a new Docker container, then upload it to dockerhub with the latest tag. This process is triggered on"]},{"l":"manually","p":["The dockerfile for that container is created by using a hidden harpy command"]},{"l":"Automations"},{"l":"testing","p":["CI ( C ontinuous I ntegration) is a term describing automated actions that do things to/with your code and are triggered by how you interact with a repository. Harpy has a series of GitHub Actions triggered by interactions with the"]},{"l":"releases","p":["There is an automation that gets triggered every time Harpy is tagged with the new version. It strips out the unnecessary files and will upload a cleaned tarball to the new release (reducing filesize by orders of magnitude). The automation will also"]}],[{"l":"Blog"}],[{"i":"#","p":["Sorting data by linked-read barcode"]},{"l":"Sort data by barcode","p":["You would think sorting data would be a no-brainer, and in most cases it is. You can use seqtk or seqkit to sort FASTQ/A files by their IDs, samtools to sort SAM/BAM/CRAM files by name or coordinates. However, in the world of linked-read"]},{"l":"Sorting Alignments","p":["Let's start with BAM (or SAM/CRAM) files because the process is much simpler. Since the linked-read barcode is stored in a BX:Z tag (or less often as BC:Z:), we can use a little feature of"]},{"l":"Sorting FASTQ","p":["Sorting FASTQ files by barcode is trickier, only because there aren't (to our knowledge!) any existing convenience methods to do it. Like any bioinformatics puzzle, you could probably solve it with a sophisticated AWK command, but HTSlib tools are so much more"]},{"l":"1. convert FASTQ to SAM","p":["Yep, we're solving our problem by doing a simple file conversion to SAM/BAM. That's the easiest way to do it, surprisingly. FASTQ files can be converted to unmapped BAM files using"]},{"l":"2. sort the SAM by barcode","p":["Exactly like shown above to sort a SAM/BAM file with samtools sort, we're going to do the same on the unmapped SAM file we just created:"]},{"l":"3. convert SAM back to FASTQ","p":["Now that the data have been sorted, we need to convert it back into forward and reverse FASTQ files using samtools fastq. The -T * argument once again preserves all the tags between file formats. The"]},{"l":"as a single pipe","p":["Rather than splitting out these three processess, you can stream/pipe them in a single workflow:"]}],[{"i":"#","p":["A realistic workflow to simulate variants"]},{"l":"Simulating variants","p":["You may want to (and are encouraged to) simulate data before investing in the costs associated with linked-read sample preparation and subsequent sequencing. Harpy provides both a variant and linked-read simulators and this tutorial serves to"]},{"l":"1. Add random inversions","p":["First, we will need to simulate some inversions and set a --heterozygosity value >0 to get a diploid genome as the output. If you wanted to manually create inversions in specific areas or with specific lengths, this would be a good starting point too since"]},{"l":"2. Add snps and indels","p":["Let's say we wanted to simulate SNPs and indels like so:"]},{"i":"3-simulate-known-snps-and-indels-onto-the-diploid-genome-with-inversions","l":"3. Simulate \"known\" snps and indels onto the diploid genome with inversions","p":["We will run Harpy twice, once for each haplotype, using the corresponding VCFs from Step 2:"]},{"l":"5. Simulating linked-reads","p":["Now that you have heterozygous haplotypes created from your starting genome, you can simulate linked-reads from it using harpy simulate linkedreads. A simple implementation of that could look like:"]}],[{"i":"#","p":["Why pool samples for SV calling and when to do it"]},{"l":"Pooling samples for SV calling","p":["One of the cool benefits of linked-read data is the fact that you can call structural variants with it. Depending on the depth of your data, you may want (or need) to pool samples together. This"]},{"l":"Sample depth"},{"i":"depth-explained","l":"Depth, explained","p":["In bioinformatics, the terms \"coverage\" and \"depth\" and often used interchangeably, which is incorrect and leads to confusion. Coverage refers to the proportion of a genome that is sequenced, and"]},{"i":"depth-in-context","l":"Depth, in context","p":["Historically, one would have wanted to sequence fewer individuals at higher depth to get confident genotype calls, rather than sequence more individuals at lower depth. Recent advances in bioinformatics have enabled low-coverage whole genome sequencing"]},{"l":"The problem","p":["It's recommended to have at least 10X-12X depth to get decent structural variant calls(definitely read that in a paper that I would like to link here, but I can't seem to find it). If your data already has a minimum of 10X for each individual, great! Feel free to use"]},{"l":"The solution","p":["One way to get your low-coverage (low depth) data and still call structural variants is to pool samples together, which would effectively boost the depth. By doing this, you will"]},{"l":"Pooling considerations","p":["If pooling samples, you must pool them sensibly and with a biological context to do so. In other words, you don't just pool random samples together to inflate depth. Since haplotag data is just whole genome sequence data plus a little extra information, you should"]}],[{"i":"#","p":["Deciding between using Conda or Containers"]},{"l":"Choosing a software runtime method","p":["There are two ways you can run Harpy, using a container with the necessary software environments in it (the default), or with local conda environments(with the --conda option). If software development and containerization"]},{"i":"tldr","l":"TL;DR"},{"l":"What Harpy Provides","p":["An conda-based installation of Harpy provides only the minimal set of programs Harpy needs to begin a workflow. These include: python 3.12, snakemake-minimal, pandas, and the htslib programs (htslib, samtools, bcftools, tabix)."]},{"l":"How Harpy Provides the Other Stuff","p":["Instead of a monolithic Harpy environment, which would be impossible with the current software dependencies, there are a handful of defined conda environment recipes that Harpy workflows generate. Snakemake will make"]},{"l":"Harpy and Containers","p":["The Harpy team manages a container on Dockerhub called, you guessed it, Harpy, that is synchronously versioned with the Harpy software. In other words, if you're using Harpy v1.4, it will use the container version v1.4. The"]},{"i":"whats-the-catch","l":"What's the Catch?","p":["While local conda enviroments at runtime or containers might seem like foolproof approaches, there are drawbacks."]},{"i":"conda-caveats","l":"Conda Caveats:"},{"i":"conda-caveat-1-inconsistent","l":"⚠️ Conda Caveat 1: Inconsistent","p":["Despite our and conda's best efforts, sometimes programs just don't install correctly on some systems due to unexpected system (or conda) configurations. This results in frustrating errors where jobs fail because software that is"]},{"i":"conda-caveat-2-troubleshooting","l":"\uD83D\uDCA3 Conda Caveat 2: Troubleshooting","p":["To manually troubleshoot many of the tasks Harpy workflows perform, you may need to jump into one of the local conda environments in .snakemake/conda. That itself isn't terrible, but it's an extra step because you will"]},{"l":"Container Caveats"},{"i":"container-caveat-1-speed","l":"\uD83D\uDEA5 Container Caveat 1: Speed","p":["The overhead of Snakemake creating a container instance for a job, then cleaning it up after the job is done is not trivial and can negatively impact runtime."]},{"i":"container-caveat-2-troubleshooting","l":"\uD83D\uDCA3 Container Caveat 2: Troubleshooting","p":["The command Snakemake secretly invokes to run a job in a container is quite lengthy. In most cases that shouldn't matter to you, but when something eventually goes wrong and you need to troubleshoot, it's harder"]}],[{"i":"#","p":["A gentle introduction to the wild world of filtering SNPs"]},{"l":"Filtering Variants","p":["The discussion around filtering SNPs and indels is massive and many researchers go about it differently, each very opinionated as to why their method is the best. As a starting point, have a look at how the authors of"]},{"i":"genotype-quality-qual","l":"genotype quality (QUAL)","p":["You will obviously want higher quality genotype calls to remove false positives. The HTSlib guide suggests at least 50(e.g. -i 'QUAL=50'), but we typically filter much higher at"]},{"i":"read-depth-dp","l":"read depth (DP)","p":["Variant sites with too few reads backing up the genotype might be false positives, although this may not hold true for very low-coverage data. Conversely, a maximum cut off is important because sites with very high read depths (relative to the distribution of read depth)"]},{"i":"minor-allele-frequency-maf","l":"minor allele frequency (MAF)","p":["It's usually advisable to set a minor allele frequency threshold with which to remove sites below that threshold. The reasoning is that if a MAF is too low, it might be because of incorrectly called genotypes in a very small handful of individuals (e.g. one or two)."]},{"i":"missing-data-f_missing","l":"missing data (F_MISSING)","p":["Missing data is, frankly, not terribly useful. The amount of missing data you're willing to tolerate will depend on your study, but it's common to remove sites with >20% missing data (e.g."]}]]
\ No newline at end of file
+[[{"i":"#","p":["Using Harpy to process your haplotagged data"]},{"l":"Home","p":["Harpy is a haplotagging data processing pipeline for Linux-based systems. It uses all the magic of Snakemake under the hood to handle the worklfow decision-making, but as a user, you just interact with it like a normal command-line"]},{"l":"Harpy Modules","p":["Harpy is modular, meaning you can use different parts of it independent from each other. Need to only align reads? Great! Only want to call variants? Awesome! All modules are called by"]},{"l":"Using Harpy","p":["You can call harpy without any arguments (or with --help) to print the docstring to your terminal. You can likewise call any of the modules without arguments or with --help to see their usage (e.g."]},{"l":"Linked-Read Workflow","p":["Depending on your project goals, you may want any combination of SNPs, structural variants (inversions, deletions, duplications), or phased haplotypes. Below is a flow chart outlining a general workflow of linked-read data."]}],[{"l":"Install"},{"l":"Install Harpy","p":["Harpy is hosted on Bioconda! That means to install it, you just need to have mamba(or conda) on your Linux-based system and install it with a simple command. You can install Harpy into an existing environment or create a new one for it (recommended)."]},{"l":"install into a new environment"},{"i":"recommended","l":"✨recommended✨","p":["The code snippet below creates a new environment called harpy(the -n harpy part) and installs harpy into it from the bioconda channel (-c bioconda part). You can name this environment anything (e.g."]},{"l":"install into an existing evironment","p":["If you want to install harpy into an existing environment, then with an environment already activated (via mamba activate env_name) simply use the mamba install command and harpy"]},{"l":"Update Harpy","p":["If you want to update Harpy, the process is quite similar:"]}],[{"i":"#","p":["Align haplotagged sequences"]},{"l":"Align Sequences to a Genome","p":["After your sequences (in FASTQ format) have been checked for quality, you will need to align them to a reference genome before you can call variants. Harpy offers several aligners for this purpose:"]}],[{"i":"#","p":["Align haplotagged sequences with BWA MEM"]},{"l":"Map Reads onto a genome with BWA MEM","p":["Once sequences have been trimmed and passed through other QC filters, they will need to be aligned to a reference genome. This module within Harpy expects filtered reads as input,"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Molecule distance","p":["The --molecule-distance option is used during the BWA alignment workflow to assign alignments a unique Molecular Identifier MI:i tag based on their haplotag barcode and the distance threshold you specify. See"]},{"l":"Quality filtering","p":["The --min-quality argument filters out alignments below a given MQ threshold. The default, 30, keeps alignments that are at least 99.9% likely correctly mapped. Set this value to"]},{"l":"Marking PCR duplicates","p":["Harpy uses samtools markdup to mark putative PCR duplicates. By using the --barcode-tag BX option, it considers the linked-read barcode for more accurate duplicate detection. Duplicate"]},{"l":"BWA workflow"}],[{"i":"#","p":["Align haplotagged sequences with EMA"]},{"l":"Map Reads onto a genome with EMA","p":["Once sequences have been trimmed and passed through other QC filters, they will need to be aligned to a reference genome. This module within Harpy expects filtered reads as input,"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Barcode whitelist","p":["Some linked-read methods (e.g. 10x, Tellseq) require the inclusion of a barcode \"whitelist.\" This file is a simple text file that has one barcode per line so a given software knows what barcodes to expect in your data."]},{"l":"Quality filtering","p":["The --min-quality argument filters out alignments below a given MQ threshold. The default, 30, keeps alignments that are at least 99.9% likely correctly mapped. Set this value to"]},{"l":"Marking PCR duplicates","p":["EMA marks duplicates in the resulting alignments, however the read with invalid barcodes are aligned separately with BWA. Therefore, Harpy uses samtools markdup to mark putative"]},{"l":"EMA workflow"}],[{"i":"#","p":["Align haplotagged sequences with strobealign"]},{"l":"Map Reads onto a genome with strobealign","p":["Once sequences have been trimmed and passed through other QC filters, they will need to be aligned to a reference genome. This module within Harpy expects filtered reads as input,"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Read Length","p":["The strobealign program uses a new strobemer design for aligning and requires its own way of indexing the genome. The index must be configured for the average read length of the sample"]},{"l":"Molecule distance","p":["The --molecule-distance option is used during the BWA alignment workflow to assign alignments a unique Molecular Identifier MI:i tag based on their haplotag barcode and the distance threshold you specify. See"]},{"l":"Quality filtering","p":["The --min-quality argument filters out alignments below a given MQ threshold. The default, 30, keeps alignments that are at least 99.9% likely correctly mapped. Set this value to"]},{"l":"Marking PCR duplicates","p":["Harpy uses samtools markdup to mark putative PCR duplicates. By using the --barcode-tag BX option, it considers the linked-read barcode for more accurate duplicate detection. Duplicate"]},{"l":"Strobealign workflow"}],[{"i":"#","p":["Create a genome assembly from linked reads"]},{"l":"Create a Genome Assembly","p":["If you have single-sample data, you might be interested in a genome assembly. Unlike metagenome assemblies, a classic genome assembly assumes there is exactly one genome present in your sequences and will try to"]},{"l":"Running Options","p":["In addition to the , the module is configured using the command-line arguments below. Since the assembly process consists of several distinct phases, the descriptions are provided with an extra badge to reflect which part of the assembly process they correspond to."]},{"l":"Deconvolved Inputs","p":["For linked-read assemblies, the barcodes need to be deconvolved in the sequence data, meaning that barcodes that are shared by reads that originate from different molecules need to have unique barcode"]},{"l":"Assembly Workflow"}],[{"i":"#","p":["Resolve barcodes shared by different molecules"]},{"l":"Resolve barcodes shared by different molecules","p":["Running is optional. In the alignment workflows (), Harpy already uses a distance-based approach to deconvolve barcodes and assign MI tags (Molecular Identifier), whereas the workflow has the"]},{"l":"Running Options"},{"l":"Resulting Barcodes","p":["After deconvolution, some barcodes may have a hyphenated suffix like -1 or -2(e.g. A01C33B41D93-1). This is how deconvolution methods create unique variants of barcodes to denote that identical barcodes"]},{"l":"Harpy Deconvolution Nuances","p":["Some of the downstream linked-read tools Harpy uses expect linked read barcodes to either look like the 16-base 10X variety or a standard haplotag (AxxCxxBxxDxx). Their pattern-matching would not recognize barcodes deconvoluted with"]}],[{"i":"#","p":["Demultiplex raw sequences into haplotag barcoded samples"]},{"l":"Demultiplex Raw Sequences","p":["When pooling samples and sequencing them in parallel on an Illumina sequencer, you will be given large multiplexed FASTQ files in return. These files contain sequences for all of your samples and need to be demultiplexed using barcodes to"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Haplotag Types"},{"l":"Gen I Demultiplex Workflow"}],[{"i":"#","p":["Impute genotypes for haplotagged data with Harpy"]},{"l":"Impute Genotypes using Sequences","p":["After variants have been called, you may want to impute missing genotypes to get the most from your data. Harpy uses STITCH to impute genotypes, a haplotype-based method that is linked-read aware. Imputing genotypes requires a variant call file"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Extra STITCH parameters","p":["You may add additional parameters to STITCH by way of the--extra-params(or -x) option. Since STITCH is a function in the R language, the parameters you add must be in R syntax (e.g."]},{"l":"Prioritize the vcf file","p":["Sometimes you want to run imputation on all the samples present in the INPUTS, but other times you may want to only impute the samples present in the --vcf file. By default, Harpy assumes you want to use all the samples"]},{"l":"Parameter file","p":["Typically, one runs STITCH multiple times, exploring how results vary with different model parameters (explained in next section). The solution Harpy uses for this is to have the user"]},{"l":"STITCH Parameters"},{"l":"Imputation Workflow"}],[{"i":"#","p":["Create a metagenome assembly from linked reads"]},{"l":"Create a Metagenome Assembly","p":["If you have mixed-sample data, you might be interested in a metagenome assembly, also known as a metassembly. Unlike a single-sample assembly, a metassembly assumes there are multiple genomes present in your sequences and will try to"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Deconvolved Inputs","p":["For linked-read assemblies, the barcodes need to be deconvolved in the sequence data, meaning that barcodes that are shared by reads that originate from different molecules need to have unique barcode"]},{"l":"Metassembly Workflow"}],[{"i":"#","p":["Generate extra files for analysis with Harpy"]},{"l":"Other Harpy modules","p":["Some parts of Harpy (variant calling, imputation) want or need extra files. You can create various files necessary for different modules using these extra modules:"]},{"l":"Other modules"},{"l":"resume","p":["When calling a workflow (e.g. ), Harpy performs various file checks and validations, sets up the Snakemake command, output folder(s), etc. In the event you want to continue a failed or manually terminated workflow without overwriting the workflow"]},{"l":"arguments","p":["The DIRECTORY is the output directory of a previous harpy-invoked workflow, which must have the workflow/config.yaml file. For example, if you previously ran harpy align bwa -o align-bwa ..."]},{"l":"popgroup","p":["Creates a sample grouping file for variant calling"]},{"i":"arguments-1","l":"arguments","p":["This optional file is useful if you want SNP variant calling to happen on a per-population level via or on samples pooled-as-populations via ."]},{"l":"stitchparams","p":["Create a template parameter file for the module. The file is formatted correctly and serves as a starting point for using parameters that make sense for your study."]},{"i":"arguments-2","l":"arguments","p":["Typically, one runs STITCH multiple times, exploring how results vary with different model parameters. The solution Harpy uses for this is to have the user provide a tab-delimited dataframe file where the columns are the 6 STITCH model"]}],[{"i":"#","p":["Phase haplotypes for haplotagged data with Harpy"]},{"l":"Phase SNPs into Haplotypes","p":["You may want to phase your genotypes into haplotypes, as haplotypes tend to be more informative than unphased genotypes (higher polymorphism, captures relationship between genotypes). Phasing"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Prioritize the vcf file","p":["Sometimes you want to run imputation on all the samples present in the INPUTS, but other times you may want to only impute the samples present in the --vcf file. By default, Harpy assumes you want to use all the samples"]},{"l":"Molecule distance","p":["The molecule distance refers to the base-pair distance dilineating separate molecules. In other words, when two alignments on a single contig share the same barcode, how far away from each other are we willing to say they were and still consider them having"]},{"l":"Pruning threshold","p":["The pruning threshold refers to a PHRED-scale value between 0-1 (a percentage) for removing low-confidence SNPs from consideration. With Harpy, you configure this value as an integer"]},{"l":"Phasing Workflow"}],[{"i":"#","p":["Run file format checks on haplotagged FASTQ/BAM files"]},{"l":"Pre-flight checks for input files","p":["Harpy does a lot of stuff with a lot of software and each of these programs expect the incoming data to follow particular formats (plural, unfortunately). These formatting opinions/specifics are at the mercy of the original developers and while there are times when Harpy can (and does)"]},{"l":"When to run"},{"l":"Running Options","p":["In addition to the , the and modules are configured using only command-line input arguments:"]},{"l":"Workflow"}],[{"i":"#","p":["Quality trim haplotagged sequences with Harpy"]},{"l":"Quality Trim Sequences","p":["Raw sequences are not suitable for downstream analyses. They have sequencing adapters, index sequences, regions of poor quality, etc. The first step of any genetic sequence analyses is to remove these adapters and trim poor quality data. You can remove adapters,"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"QC Workflow"}],[{"i":"#","p":["Simulate genomic data"]},{"l":"Simulate Genomic Data","p":["You may be interested in benchmarking variant detection or maybe just trying out haplotagging data without any financial commitment-- that's where simulations come in handy."]},{"l":"Simulate Genomic Variants","p":["Harpy lets you simulate genomic variants via for different variant types such as single nucleotide polymorphisms (SNP), indels, inversions, copy number variants (CNV), and translocations. All you need is to provide a genome to simulate"]},{"l":"Simulate Haplotag Linked-Reads","p":["You can also simulate haplotag-style linked reads from an existing genome using . Harpy incorporates LRSIM to generate linked reads from a diploid genomic. If you only have a haploid genome, then you can create a diploid genome by simulating variants into it with"]}],[{"i":"#","p":["Simulate linked reads from a genome"]},{"l":"Simulate Linked Reads","p":["Simulate linked reads from a genome"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Mutation Rate","p":["The read simulation is two-part: first dwgsim generates forward and reverse FASTQ files from the provided genome haplotypes( HAP1_GENOME and HAP2_GENOME), then LRSIM takes over and creates linked-reads from that. The"]},{"l":"Simulating a single sample","p":["If you intend to simulate a \"single individual\" (i.e. use this module once), then you might want no additonal SNPs beyond the variants you may have already introduced into the genome and set"]},{"l":"Simulating multiple samples","p":["If you intend on simulating \"multiple individuals\" (i.e. use this module multiple times on the same genome haplotypes), it may make sense to set this value larger than 0 so there is some \"natural\" variation between your simulated individuals."]},{"l":"Partitions","p":["TL;DR: 10X partitions ≈ haplotag beads"]},{"l":"Barcodes","p":["Barcodes, if provided, must be given as 16-basepair nucleotide sequences, one per line. If not provided, Harpy will download the standard 10X Genomics 4M-with-alts-february-2016.txt"]},{"l":"10X to Haplotag conversion","p":["Harpy will convert the simulated 10X-style reads, where the 16-basepair barcode is at the beginning of read 1, to haplotag format, where the barcode is coded in the sequence header under the"]},{"l":"Choosing parameters","p":["LRSIM does internal calculations to determine the number of reads per molecule based on --read-pairs,--partitions, and --molecules-per. Understanding how these parameters affect the resulting sequences"]},{"l":"Parameter calculator","p":["Conveniently, we provide a calculator to help you make informed decisions for these parameters:"]},{"l":"Simulate Linkedreads Workflow"}],[{"i":"#","p":["Simulate snps, indels, inversions, cnv, translocations"]},{"l":"Simulate Genomic Variants","p":["Simulate snps, indels, inversions, cnv, translocations"]},{"l":"Modules","p":["There are 4 submodules with very obvious names:"]},{"l":"Running Options","p":["While there are serveral differences between individual workflow options, each has available all the like other Harpy modules. Each requires and input genome at the end of the command line, and each requires either a"]},{"l":"Simulate known variants","p":["Rather than simulating random variants, you can use a VCF file as input to any of the workflows to have simuG simulate the variants (of that type) from the VCF file. This becomes particularly"]},{"l":"Heterozygosity","p":["Each workflow has a --heterozygosity parameter where you can specify the heterozygosity of the simulated variants, which creates two new VCF files ({prefix}.hap1.vcf,{prefix}.hap2.vcf"]}],[{"i":"#","p":["Call SNPs and small indels"]},{"l":"Call SNPs and small indels","p":["After reads have been aligned, e.g., with , you can use those alignment files(.bam) to call variants in your data. Harpy can call SNPs and small indels using bcftools mpileup or with"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"ploidy","p":["If you are calling haploid or diploid samples, using either mpileup or freebayes will be comparable. However, if you need to call SNPs in polyploids (ploidy >2), then you will need to use"]},{"l":"regions","p":["The --regions(-r) option lets you specify the genomic regions you want to call variants on. Keep in mind that mpileup uses 1-based positions for genomic intervals, whereas freebayes"]},{"l":"populations","p":["Grouping samples changes the way the variant callers computes certain statistics when calling variants. If you have reason to believe there is a biologically meaningful grouping scheme to your samples, then you should include"]},{"l":"SNP calling workflow"}],[{"i":"#","p":["Find structural variants"]},{"l":"Find structural variants","p":["The module identifies single nucleotide polymorphisms (SNP) and small indels, but you may want to (and should!) leverage the linked-read data to identify larger structural variants (SV) like large deletions, duplications, and"]},{"l":"Caveats"},{"l":"NAIBR","p":["While our testing shows that NAIBR tends to find known inversions that LEVIATHAN misses, the program requires haplotype phased bam files as input. That means the alignments have a"]},{"l":"LEVIATHAN","p":["LEVIATHAN relies on split-read information in the sequence alignments to call variants. The EMA aligner does not report split read alignments, instead it reports secondary alignments."]}],[{"i":"#","p":["Call structural variants using Leviathan"]},{"l":"Call Structural Variants using LEVIATHAN","p":["(like indels, insertions, duplications, breakends)"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Single-sample variant calling","p":["When not using a population grouping file via --populations, variants will be called per-sample. Due to the nature of structural variant VCF files, there isn't an entirely fool-proof way"]},{"l":"Pooled-sample variant calling","p":["With the inclusion of a population grouping file via --populations, Harpy will merge the bam files of all samples within a population and call variants on these alignment pools. Preliminary work shows that this way identifies more variants and with fewer false"]},{"l":"LEVIATHAN workflow"}],[{"i":"#","p":["Call structural variants using NAIBR (plus)"]},{"l":"Call Structural Variants using NAIBR","p":["(like indels, insertions, duplications)"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Molecule distance","p":["The --molecule-distance option is used to let the program determine how far apart alignments on a contig with the same barcode can be from each other and still considered as originating from the same DNA molecule. See"]},{"l":"Single-sample variant calling","p":["When not using a population grouping file via --populations, variants will be called per-sample. Due to the nature of structural variant VCF files, there isn't an entirely fool-proof way"]},{"l":"Pooled-sample variant calling","p":["With the inclusion of a population grouping file via --populations, Harpy will merge the bam files of all samples within a population and call variants on these alignment pools. Preliminary work shows that this way identifies more variants and with fewer false"]},{"l":"Optional vcf file","p":["In order to get the best variant calling performance out of NAIBR, it requires phased bam files as input. Using --vcf is optional and not used by NAIBR directly. However, to use"]},{"i":"a-phased-input---vcf","l":"a phased input --vcf","p":["This file can be in vcf/vcf.gz/bcf format and most importantly it must be phased haplotypes. There are various ways to haplotype SNPs, but you can use to phase your SNPs into haplotypes using the haplotag barcode information. The resulting phased VCF file can then be used as input here."]},{"l":"NAIBR workflow"}],[{"l":"Haplotag data"},{"i":"what-is-haplotagging","l":"What is haplotagging?","p":["Linked-read sequencing exists to combine the throughput and accuracy of short-read sequencing with the long range haplotype information of long-read sequencing. Haplotagging is an implementation of linked-read sequencing developed by"]},{"l":"Data Format"},{"l":"Barcodes","p":["While barcodes are actually combinatorial bases, in the read headers they are represented with the format AxxCxxBxxDxx, where each barcode segment is denoted as Axx(or Bxx, etc.)."]},{"l":"barcode protocol varieties","p":["If you think haplotagging is as simple as exactly 96^4 unique barcodes, you would only be half-correct. The original haplotagging protocol in Meier et al. is good, but the authors (and others) have been working to improve this linked-read technology to improve"]},{"l":"where the barcodes go","p":["Chromium 10X linked-reads use a format where the barcode is the leading 16 bases of the forward (R1) read. However, haplotagging data does not use that format and many of the tools"]},{"l":"Read headers","p":["Like mentioned, the haplotag barcode is expected to be stored in the BX:Z: tag in the read header. This information is retained through the various Harpy steps. An example read header could look like:"]},{"l":"Read length","p":["Reads must be at least 30 base pairs in length for alignment. By default, the module removes reads <30bp."]},{"l":"Compression","p":["Harpy generally doesn't require the input sequences to be in gzipped/bgzipped format, but it's good practice to compress your reads anyway. Compressed files are expected to end with the extension"]},{"l":"Naming conventions","p":["Unfortunately, there are many different ways of naming FASTQ files, which makes it difficult to accomodate every wacky iteration currently in circulation. While Harpy tries its best to be flexible, there are limitations."]},{"l":"Barcode thresholds","p":["By the nature of linked read technologies, there will (almost always) be more DNA fragments than unique barcodes for them. As a result, it's common for barcodes to reappear in sequences. Rather than incorrectly assume that all sequences/alignments with the same barcode"]}],[{"l":"Common Harpy Options"},{"l":"Input Arguments","p":["Each of the main Harpy modules (e.g. or ) follows the format of"]},{"l":"Common command-line options","p":["Every Harpy module has a series of configuration parameters. These are arguments you need to input to configure the module to run on your data, such as the directory with the reads/alignments,"]},{"i":"--contigs","l":"--contigs","p":["Some of the workflows (like ) plot per-contig information in their reports. By default, Harpy will plot up to 30 of the largest contigs. If you are only interested in a specific set of contigs, then you can use"]},{"l":"example","p":["You could call and specify 20 threads with no output to console:"]},{"l":"The workflow folder","p":["When you run one of the main Harpy modules, the output directory will contain a workflow folder. This folder is both necessary for the module to run and is very useful to understand what the module did, be it for your own"]},{"l":"The Genome folder","p":["You will notice that many of the workflows will create a Genome folder in the working directory. This folder is to make it easier for Harpy to store the genome and the associated"]}],[{"l":"Troubleshooting","p":["Lots of stuff can go wrong during an analysis. The intent of this page is to guide you through navigating the inevitable errors associated with doing bioinformatics."]},{"l":"Troubleshooting Harpy","p":["Harpy has two steps: first it performs checks and validations, then it runs Snakemake."]},{"l":"checks and validations","p":["First, Harpy takes your command-line inputs and checks/validates the input files and parameters. If your parameters are not the correct type (e.g. a number where there should be a file), the"]},{"l":"snakemake validations","p":["Once all the file validations pass, Harpy passes the baton over to Snakemake. Snakemake builds a workflow graph of the rules and performs its own checks. If you get an error before the workflow starts processing any data (there"]},{"l":"error during a workflow","p":["Sometimes something goes wrong with one of the steps in a workflow. If/when that happens, Harpy will print the offending step and all the information Snakemake has regarding the failure. If the step had a log file, it will"]},{"l":"Common Issues"},{"l":"installation issue","p":["Conda is an awesome package manager, but it's slow and uses a ton of memory as dependencies increase. Harpy has a lot of dependencies and you might stall out conda trying to install it. Use mamba instead-- it'll work where conda fails."]},{"l":"imputation or phasing failure","p":["If you use bamutils clipOverlap on alignments that are used for the or modules, they will cause both programs to error. We don't know why, but they do."]},{"l":"SAM name and ID mismatch","p":["Aligning a sample to a genome via Harpy will insert the sample name (based on the file name) into the alignment header (the @RG ID:name SM:name tag). It likewise expects, through various steps,"]}],[{"l":"Snakamake Things"},{"l":"Workflow logs","p":["Barring a few exceptions, most of Harpy's options are Snakemake workflows. This means we are all at the mercy of how Snakemake operates, which includes the .snakemake/ folder in your project directory. That folder contains"]},{"l":"Adding Snakemake Parameters","p":["Harpy relies on Snakemake under the hood to handle file and job dependencies. Most of these details have been abstracted away from the end-user, but every module of Harpy (except"]},{"l":"Common use cases","p":["You likely wont need to invoke --snakemake very often, if ever. However, here examples of some possible use cases for this parameter."]}],[{"l":"Software used in Harpy","p":["Harpy is the sum of its parts, and out of tremendous respect for the developers involved in the included software, we would like to highlight the tools directly involved in Harpy's many moving pieces."]},{"l":"Standalone Software"},{"l":"Software Packages"}],[{"l":"Utilities","p":["Harpy is the sum of its parts and some of those parts are stand-alone scripts used by the workflows that are accessible from within the Harpy conda environment. This page serves to document those scripts, since using them outside of a workflow"]},{"i":"10xtohaplotagpy","l":"10xtoHaplotag.py","p":["Converts 10x linked reads to haplotag linked reads with barcodes in BX:Z and OX:Z header tags."]},{"i":"assign_mipy","l":"assign_mi.py","p":["Assign an MI:i(Molecular Identifier) tag to each barcoded record based on a molecular distance cutoff. Unmapped records are discarded in the output. Records without a BX:Z tag or"]},{"i":"bx_statspy","l":"bx_stats.py","p":["Calculates various linked-read molecule metrics from the (coordinate-sorted) input alignment file. Metrics include (per molecule):"]},{"i":"check_bampy","l":"check_bam.py","p":["Parses an aligment file to check:"]},{"i":"check_fastqpy","l":"check_fastq.py","p":["Parses a FASTQ file to check if any sequences don't conform to the SAM spec, whether BX:Z: is the last tag in the record, and the counts of:"]},{"i":"concatenate_bampy","l":"concatenate_bam.py","p":["Concatenate records from haplotagged SAM/BAM files while making sure MI:i tags remain unique for every sample. This is a means of accomplishing the same as samtools cat, except all"]},{"i":"count_bxpy","l":"count_bx.py","p":["Parses a FASTQ file to count: total sequences, total number of BX tags, number of valid haplotagging BX tags, number of invalid BX tags, number of invalid BX tag segments (i.e."]},{"i":"depth_windowspy","l":"depth_windows.py","p":["Reads the output of samtools depth -a from stdin and calculates means within windows of a given windowsize."]},{"i":"haplotag_acbdpy","l":"haplotag_acbd.py","p":["Generates the BC_{ABCD}.txt files necessary to demultiplex Gen I haplotag barcodes into the specified output_directory."]},{"i":"infer_svpy","l":"infer_sv.py","p":["Create column in NAIBR bedpe output inferring the SV type from the orientation. Removes variants with FAIL flags and you can use the optional -f(--fail) argument to output FAIL variants to a separate file."]},{"i":"make_windowspy","l":"make_windows.py","p":["Create a BED file of fixed intervals (-w, -- window) from a FASTA or fai file (the kind generated with samtools faidx). Nearly identical to bedtools makewindows, except the intervals are nonoverlapping. The"]},{"i":"molecule_coveragepy","l":"molecule_coverage.py","p":["Using the statsfile generated by bx_stats.py from Harpy, will calculate \"molecular coverage\" across the genome. Molecular coverage is the \"effective\" alignment coverage if you treat a molecule inferred from linked-read data as"]},{"i":"parse_phaseblockspy","l":"parse_phaseblocks.py","p":["Parse a phase block file from HapCut2 to pull out summary information"]},{"l":"rename_bam","p":["Rename a sam/bam file and modify the @RG tag of the alignment file to reflect the change for both ID and SM. This process creates a new file new_name.bam and you may use -d to delete the original file. Requires"]},{"l":"separate_validbx","p":["Split a BAM file with BX tags into 2 files, one with valid ACBD barcodes ( stdout), one with invalid ACBD barcodes ( stderr)."]}],[{"l":"Blog"}],[{"i":"#","p":["Sorting data by linked-read barcode"]},{"l":"Sort data by barcode","p":["You would think sorting data would be a no-brainer, and in most cases it is. You can use seqtk or seqkit to sort FASTQ/A files by their IDs, samtools to sort SAM/BAM/CRAM files by name or coordinates. However, in the world of linked-read"]},{"l":"Sorting Alignments","p":["Let's start with BAM (or SAM/CRAM) files because the process is much simpler. Since the linked-read barcode is stored in a BX:Z tag (or less often as BC:Z:), we can use a little feature of"]},{"l":"Sorting FASTQ","p":["Sorting FASTQ files by barcode is trickier, only because there aren't (to our knowledge!) any existing convenience methods to do it. Like any bioinformatics puzzle, you could probably solve it with a sophisticated AWK command, but HTSlib tools are so much more"]},{"l":"1. convert FASTQ to SAM","p":["Yep, we're solving our problem by doing a simple file conversion to SAM/BAM. That's the easiest way to do it, surprisingly. FASTQ files can be converted to unmapped BAM files using"]},{"l":"2. sort the SAM by barcode","p":["Exactly like shown above to sort a SAM/BAM file with samtools sort, we're going to do the same on the unmapped SAM file we just created:"]},{"l":"3. convert SAM back to FASTQ","p":["Now that the data have been sorted, we need to convert it back into forward and reverse FASTQ files using samtools fastq. The -T * argument once again preserves all the tags between file formats. The"]},{"l":"as a single pipe","p":["Rather than splitting out these three processess, you can stream/pipe them in a single workflow:"]}],[{"i":"#","p":["A realistic workflow to simulate variants"]},{"l":"Simulating variants","p":["You may want to (and are encouraged to) simulate data before investing in the costs associated with linked-read sample preparation and subsequent sequencing. Harpy provides both a variant and linked-read simulators and this tutorial serves to"]},{"l":"1. Add random inversions","p":["First, we will need to simulate some inversions and set a --heterozygosity value >0 to get a diploid genome as the output. If you wanted to manually create inversions in specific areas or with specific lengths, this would be a good starting point too since"]},{"l":"2. Add snps and indels","p":["Let's say we wanted to simulate SNPs and indels like so:"]},{"i":"3-simulate-known-snps-and-indels-onto-the-diploid-genome-with-inversions","l":"3. Simulate \"known\" snps and indels onto the diploid genome with inversions","p":["We will run Harpy twice, once for each haplotype, using the corresponding VCFs from Step 2:"]},{"l":"5. Simulating linked-reads","p":["Now that you have heterozygous haplotypes created from your starting genome, you can simulate linked-reads from it using harpy simulate linkedreads. A simple implementation of that could look like:"]}],[{"i":"#","p":["Why pool samples for SV calling and when to do it"]},{"l":"Pooling samples for SV calling","p":["One of the cool benefits of linked-read data is the fact that you can call structural variants with it. Depending on the depth of your data, you may want (or need) to pool samples together. This"]},{"l":"Sample depth"},{"i":"depth-explained","l":"Depth, explained","p":["In bioinformatics, the terms \"coverage\" and \"depth\" and often used interchangeably, which is incorrect and leads to confusion. Coverage refers to the proportion of a genome that is sequenced, and"]},{"i":"depth-in-context","l":"Depth, in context","p":["Historically, one would have wanted to sequence fewer individuals at higher depth to get confident genotype calls, rather than sequence more individuals at lower depth. Recent advances in bioinformatics have enabled low-coverage whole genome sequencing"]},{"l":"The problem","p":["It's recommended to have at least 10X-12X depth to get decent structural variant calls(definitely read that in a paper that I would like to link here, but I can't seem to find it). If your data already has a minimum of 10X for each individual, great! Feel free to use"]},{"l":"The solution","p":["One way to get your low-coverage (low depth) data and still call structural variants is to pool samples together, which would effectively boost the depth. By doing this, you will"]},{"l":"Pooling considerations","p":["If pooling samples, you must pool them sensibly and with a biological context to do so. In other words, you don't just pool random samples together to inflate depth. Since haplotag data is just whole genome sequence data plus a little extra information, you should"]}],[{"i":"#","p":["Deciding between using Conda or Containers"]},{"l":"Choosing a software runtime method","p":["There are two ways you can run Harpy, using a container with the necessary software environments in it (the default), or with local conda environments(with the --conda option). If software development and containerization"]},{"i":"tldr","l":"TL;DR"},{"l":"What Harpy Provides","p":["An conda-based installation of Harpy provides only the minimal set of programs Harpy needs to begin a workflow. These include: python 3.12, snakemake-minimal, pandas, and the htslib programs (htslib, samtools, bcftools, tabix)."]},{"l":"How Harpy Provides the Other Stuff","p":["Instead of a monolithic Harpy environment, which would be impossible with the current software dependencies, there are a handful of defined conda environment recipes that Harpy workflows generate. Snakemake will make"]},{"l":"Harpy and Containers","p":["The Harpy team manages a container on Dockerhub called, you guessed it, Harpy, that is synchronously versioned with the Harpy software. In other words, if you're using Harpy v1.4, it will use the container version v1.4. The"]},{"i":"whats-the-catch","l":"What's the Catch?","p":["While local conda enviroments at runtime or containers might seem like foolproof approaches, there are drawbacks."]},{"i":"conda-caveats","l":"Conda Caveats:"},{"i":"conda-caveat-1-inconsistent","l":"⚠️ Conda Caveat 1: Inconsistent","p":["Despite our and conda's best efforts, sometimes programs just don't install correctly on some systems due to unexpected system (or conda) configurations. This results in frustrating errors where jobs fail because software that is"]},{"i":"conda-caveat-2-troubleshooting","l":"\uD83D\uDCA3 Conda Caveat 2: Troubleshooting","p":["To manually troubleshoot many of the tasks Harpy workflows perform, you may need to jump into one of the local conda environments in .snakemake/conda. That itself isn't terrible, but it's an extra step because you will"]},{"l":"Container Caveats"},{"i":"container-caveat-1-speed","l":"\uD83D\uDEA5 Container Caveat 1: Speed","p":["The overhead of Snakemake creating a container instance for a job, then cleaning it up after the job is done is not trivial and can negatively impact runtime."]},{"i":"container-caveat-2-troubleshooting","l":"\uD83D\uDCA3 Container Caveat 2: Troubleshooting","p":["The command Snakemake secretly invokes to run a job in a container is quite lengthy. In most cases that shouldn't matter to you, but when something eventually goes wrong and you need to troubleshoot, it's harder"]}],[{"i":"#","p":["A gentle introduction to the wild world of filtering SNPs"]},{"l":"Filtering Variants","p":["The discussion around filtering SNPs and indels is massive and many researchers go about it differently, each very opinionated as to why their method is the best. As a starting point, have a look at how the authors of"]},{"i":"genotype-quality-qual","l":"genotype quality (QUAL)","p":["You will obviously want higher quality genotype calls to remove false positives. The HTSlib guide suggests at least 50(e.g. -i 'QUAL=50'), but we typically filter much higher at"]},{"i":"read-depth-dp","l":"read depth (DP)","p":["Variant sites with too few reads backing up the genotype might be false positives, although this may not hold true for very low-coverage data. Conversely, a maximum cut off is important because sites with very high read depths (relative to the distribution of read depth)"]},{"i":"minor-allele-frequency-maf","l":"minor allele frequency (MAF)","p":["It's usually advisable to set a minor allele frequency threshold with which to remove sites below that threshold. The reasoning is that if a MAF is too low, it might be because of incorrectly called genotypes in a very small handful of individuals (e.g. one or two)."]},{"i":"missing-data-f_missing","l":"missing data (F_MISSING)","p":["Missing data is, frankly, not terribly useful. The amount of missing data you're willing to tolerate will depend on your study, but it's common to remove sites with >20% missing data (e.g."]}]]
\ No newline at end of file
diff --git a/sitemap.xml.gz b/sitemap.xml.gz
index 26f1f176c..d32df0c5b 100644
Binary files a/sitemap.xml.gz and b/sitemap.xml.gz differ
diff --git a/snakemake/index.html b/snakemake/index.html
index 7ed243ec5..1a1dc76fa 100644
--- a/snakemake/index.html
+++ b/snakemake/index.html
@@ -4,7 +4,7 @@
-
+
@@ -32,12 +32,12 @@
-
+
-
+
-
-
+
+
Harpy is the sum of its parts and some of those parts are stand-alone scripts
+used by the workflows that are accessible from within the Harpy conda environment.
+This page serves to document those scripts, since using them outside of a workflow
+might be useful too. You can call up the docstring for any one of these utilities
+by calling the program without any arguments.
Converts 10x linked reads to haplotag linked reads with barcodes in BX:Z and OX:Z header tags.
+
+
+ #
+ assign_mi.py
+
+
+
+
assign_mi.py -c cutoff -o output.bam input.bam
+
+
Assign an MI:i (Molecular Identifier) tag to each barcoded
+record based on a molecular distance cutoff. Unmapped records
+are discarded in the output. Records without a BX:Z tag or
+with an invalid barcode (00 as one of its segments) are presevered
+but are not assigned an MI:i tag. Input file must be coordinate sorted.
+
+
+ #
+ bx_stats.py
+
+
+
+
bx_stats.py -o output.gz input.bam
+
+
Calculates various linked-read molecule metrics from the (coordinate-sorted) input alignment file.
+Metrics include (per molecule):
+
+
number of reads
+
position start
+
position end
+
length of molecule inferred from alignments
+
total aligned basepairs
+
total length of inferred inserts
+
molecule coverage (%) based on aligned bases
+
molecule coverage (%) based on total inferred insert length
+
+
+
+ #
+ check_bam.py
+
+
+
+
check_bam.py input.bam > output.txt
+
+
Parses an aligment file to check:
+
+
if the sample name matches the RG tag
+
whether BX:Z is the last tag in the record
+
the counts of:
+
+
total alignments
+
alignments with an MI:i tag
+
alignments without BX:Z tag
+
incorrect BX:Z tag
+
+
+
+
+
+ #
+ check_fastq.py
+
+
+
+
check_bam.py input.bam > output.txt
+
+
Parses a FASTQ file to check if any sequences don't conform to the SAM spec,
+whether BX:Z: is the last tag in the record, and the counts of:
Concatenate records from haplotagged SAM/BAM files while making sure MI:i tags remain unique for every sample.
+This is a means of accomplishing the same as samtools cat, except all MI (Molecule Identifier) tags are updated
+so individuals don't have overlapping MI tags (which would mess up all the linked-read data). You can either provide
+all the files you want to concatenate, or a single file featuring filenames with the -b option.
+
+
+ #
+ count_bx.py
+
+
+
+
count_bx.py input.fastq > output.txt
+
+
Parses a FASTQ file to count: total sequences, total number of BX tags,
+number of valid haplotagging BX tags, number of invalid BX tags, number of
+invalid BX tag segments (i.e. A00, C00, B00, D00).
+
+
+ #
+ depth_windows.py
+
+
+
+
samtools depth -a file.bam | depth_windows.py windowsize > output.txt
+
+
Reads the output of samtools depth -a from stdin and calculates means within windows of a given windowsize.
+
+
+ #
+ haplotag_acbd.py
+
+
+
+
haplotag_acbd.py output_directory
+
+
Generates the BC_{ABCD}.txt files necessary to demultiplex Gen I haplotag barcodes into the specified output_directory.
Create column in NAIBR bedpe output inferring the SV type from the orientation. Removes variants with FAIL flags
+and you can use the optional -f (--fail) argument to output FAIL variants to a separate file.
Create a BED file of fixed intervals (-w, --window) from a FASTA or fai file (the kind generated with samtools faidx).
+Nearly identical to bedtools makewindows, except the intervals are nonoverlapping. The -m (--mode) option specified
+whether indexing starts at 0 or 1.
Using the statsfile generated by bx_stats.py from Harpy, will calculate "molecular coverage" across the genome.
+Molecular coverage is the "effective" alignment coverage if you treat a molecule inferred from linked-read data as
+one contiguous alignment, even though the reads that make up that molecule don't cover its entire length. Requires a
+FASTA fai index (the kind created with samtools faidx) to know the actual sizes of the contigs.
+
+
+ #
+ parse_phaseblocks.py
+
+
+
+
parse_phaseblocks.py input > output.txt
+
+
Parse a phase block file from HapCut2 to pull out summary information
+
+
+ #
+ rename_bam
+
+
+
+
rename_bam.py [-d] new_name input.bam
+
+
Rename a sam/bam file and modify the @RG tag of the alignment file to reflect the change for both ID and SM.
+This process creates a new file new_name.bam and you may use -d to delete the original file. Requires samtools.
The original LRSIM is a lengthy Perl script that, like Harpy, outsources
-to various other programs (SURVIVOR, DWGSIM, samtools, msort) and acts as a workflow through these programs. The Harpy
-version of LRSIM keeps only the novel LRSIM code that creates linked reads from reads simulated by DWGSIM. The
-rest of LRSIM's components are reincorporated into the Snakemake workflow governing the
+to various other programs (SURVIVOR, DWGSIM, samtools, msort) and acts as a workflow through these programs. The Harpy
+version of LRSIM keeps only the novel LRSIM code that creates linked reads from reads simulated by DWGSIM. The
+rest of LRSIM's components are reincorporated into the Snakemake workflow governing the simulate linkedreads
-module, while removing the SURVIVOR part since
+ module, while removing the SURVIVOR part since simulate {snpindel,...} are used for that purpose.
@@ -320,12 +340,12 @@
-
dependencies are expected to be on the PATH, not hardcoded to the folder LRSIM is running from
+
dependencies are expected to be on the PATH, not hardcoded to the folder LRSIM is running from
-r parameter changed to folder prefix since Harpy uses -g for the haplotypes
outputs are coded a little differently for flexibility (and use the -r parameter for some parts)
Random mutation rate for simulating reads (0 - 1.0)
--outer-distance
-d
350
-
Outer distance between paired-end reads (bp)
--patitions
-p
1500
-
Number (in thousands) of partitions/beads to generate
--read-pairs
-n
600
-
Number (in millions) of read pairs to simulate
@@ -511,7 +524,7 @@
entire barcode due to a segment failing to be associated with a beadtag segment. In the simulated data, since
10X barcodes don't feature segments, failure to associate the first 16 bases of read 1 with barcodes provided
to --barcodes will appear as BX:Z:A00C00B00D00. The original 10X barcode (or first 16 bases of read 1)
-will be removed from the sequence and stored in the TX:Z sequence header tag, e.g. TX:Z:ATATGTACTCATACCA.
+will be removed from the sequence and stored in the OX:Z sequence header tag, e.g. OX:Z:ATATGTACTCATACCA.
The paired reverse read will also have these tags. The diagram below attempts to simplify this visually.