[Stripped-markup HTML diffs: the hunks for the fourteen pages below were garbled during extraction, leaving only whitespace, hunk headers, and stray menu text. The recoverable change is identical in every file: a "Development" entry added after "Submit an Issue" in both copies of the page navigation, plus paired -/+ lines near the head of each page with no visible text (likely updated asset references).]
diff --git a/404.html b/404.html
index 706100a27..d13b4d104 100644
diff --git a/blog/filtering_snps/index.html b/blog/filtering_snps/index.html
index 3b4340966..ea6f6ff95 100644
diff --git a/blog/index.html b/blog/index.html
index 177f7fd34..c5269d08c 100644
diff --git a/blog/simulate_diploid/index.html b/blog/simulate_diploid/index.html
index 30fea07b0..81b79a4f1 100644
diff --git a/blog/software_environments/index.html b/blog/software_environments/index.html
index d0ad677ef..866ae7fd8 100644
diff --git a/blog/sort_by_barcode/index.html b/blog/sort_by_barcode/index.html
index bf1ed9232..b7e6962c1 100644
diff --git a/blog/sv_pooling/index.html b/blog/sv_pooling/index.html
index 81b77ced1..cc6402a0f 100644
diff --git a/categories/guides/index.html b/categories/guides/index.html
index 7ab594027..6d5ebf4e2 100644
diff --git a/categories/index.html b/categories/index.html
index 2e07fd5c5..9ee721ccb 100644
diff --git a/commonoptions/index.html b/commonoptions/index.html
index a4d4da071..70a6b8f08 100644
diff --git a/development/index.html b/development/index.html
index c90a1414c..a625e1457 100644
diff --git a/haplotagdata/index.html b/haplotagdata/index.html
index 0fb9398e5..d4abcaf29 100644
diff --git a/index.html b/index.html
index 76602b925..822740052 100644
diff --git a/install/index.html b/install/index.html
index c40bceac5..fdccb4950 100644
  • diff --git a/resources/js/config.js b/resources/js/config.js index bbaebb5a0..10e4bd43e 100644 --- a/resources/js/config.js +++ b/resources/js/config.js @@ -1 +1 @@ -var __DOCS_CONFIG__ = {"id":"QJje139DqBZr+AfCQzqp+wAKFOADlqUghf4","key":"TmCxVCWok48UkFNnu5TnJ23EQBjJ6LHSSMDaKqKn1XE.jOIMLliBUfs+qN/gCeiFGHtmv9MEP2Qq58C5xuS6SDOy8AVTdt5tm99epNJ1t7bF4YlVjj8XbaRsIm0YyIIDXQ.7","base":"/harpy/","host":"pdimens.github.io","version":"1.0.0","useRelativePaths":true,"documentName":"index.html","appendDocumentName":false,"trailingSlash":true,"preloadSearch":false,"cacheBustingToken":"3.6.0.784319887743","cacheBustingStrategy":"query","sidebarFilterPlaceholder":"Filter","toolbarFilterPlaceholder":"Filter","showSidebarFilter":true,"filterNotFoundMsg":"No member names found containing the query \"{query}\"","maxHistoryItems":15,"homeIcon":"","access":[{"value":"public","label":"Public"},{"value":"protected","label":"Protected"}],"toolbarLinks":[{"id":"fields","label":"Fields"},{"id":"properties","label":"Properties"},{"id":"methods","label":"Methods"},{"id":"events","label":"Events"}],"sidebar":[{"n":"/","l":"Home","s":""},{"n":"install","l":"Install","s":""},{"n":"workflows","l":"Workflows","c":false,"i":[{"n":"align","l":"Align","i":[{"n":"bwa","l":"BWA","s":""},{"n":"ema","l":"EMA","s":""},{"n":"strobe","l":"Strobe","s":""}],"s":""},{"n":"assembly","l":"Assembly","s":""},{"n":"deconvolve","l":"Deconvolve","s":""},{"n":"demultiplex","l":"Demultiplex","s":""},{"n":"impute","l":"Impute","s":""},{"n":"metassembly","l":"Metassembly","s":""},{"n":"other","l":"Other","s":""},{"n":"phase","l":"Phase","s":""},{"n":"preflight","l":"Preflight","s":""},{"n":"qc","l":"QC","s":""},{"n":"simulate","l":"Simulate","i":[{"n":"simulate-linkedreads","l":"Linked Reads","s":""},{"n":"simulate-variants","l":"Variants","s":""}],"s":""},{"n":"snp","l":"SNP","s":""},{"n":"sv","l":"SV","i":[{"n":"leviathan","l":"Leviathan","s":""},{"n":"naibr","l":"Naibr","s":""}],"s":""}],"s":""},{"n":"haplotagdata","l":"Haplotag Data","s":""},{"n":"commonoptions","l":"Common Options","s":""},{"n":"troubleshooting","l":"Troubleshooting","s":""},{"n":"snakemake","l":"Snakemake Things","s":""},{"n":"software","l":"Software","s":""},{"n":"development","l":"Development","s":""},{"n":"blog","l":"Blog","v":false,"i":[{"n":"sort_by_barcode","l":" Sort data by barcode","v":false,"s":""},{"n":"simulate_diploid","l":" Simulating variants","v":false,"s":""},{"n":"sv_pooling","l":" Pooling samples for SV calling","v":false,"s":""},{"n":"software_environments","l":" Choosing a software runtime method","v":false,"s":""},{"n":"filtering_snps","l":" Filtering Variants","v":false,"s":""}]}],"search":{"mode":1,"minChars":2,"maxResults":20,"placeholder":"Search","hotkeys":["k"],"noResultsFoundMsg":"Sorry, no results found.","recognizeLanguages":true,"languages":[0],"preload":false},"resources":{"History_Title_Label":"History","History_ClearLink_Label":"Clear","History_NoHistory_Label":"No history items","API_AccessFilter_Label":"Access","API_ParameterSection_Label":"PARAMETERS","API_SignatureSection_Label":"SIGNATURE","API_CopyHint_Label":"Copy","API_CopyNameHint_Label":"Copy name","API_CopyLinkHint_Label":"Copy link","API_CopiedAckHint_Label":"Copied!","API_MoreOverloads_Label":"more","API_MoreDropdownItems_Label":"More","API_OptionalParameter_Label":"optional","API_DefaultParameterValue_Label":"Default value","API_InheritedFilter_Label":"Inherited","Search_Input_Placeholder":"Search","Toc_Contents_Label":"Contents","Toc_RelatedClasses_Label":"Related 
Classes","History_JustNowTime_Label":"just now","History_AgoTime_Label":"ago","History_YearTime_Label":"y","History_MonthTime_Label":"mo","History_DayTime_Label":"d","History_HourTime_Label":"h","History_MinuteTime_Label":"m","History_SecondTime_Label":"s"}}; +var __DOCS_CONFIG__ = {"id":"uzGODrl0mREmP9L3/OfSXjWQjbsJgFn158Q","key":"U4RB4rcnFIaqAtzcQAIRCOiytVznxV3bFdU6rpbzv6c.CIeA26iUIsKgPt90wMJ4a8QjJwT5VI+cHc/mbruk6X0644gb6/AtgrWJP3a48Lcsl1jKEXR1VHz2VOdIm6Cb4A.38","base":"/harpy/","host":"pdimens.github.io","version":"1.0.0","useRelativePaths":true,"documentName":"index.html","appendDocumentName":false,"trailingSlash":true,"preloadSearch":false,"cacheBustingToken":"3.6.0.784397146456","cacheBustingStrategy":"query","sidebarFilterPlaceholder":"Filter","toolbarFilterPlaceholder":"Filter","showSidebarFilter":true,"filterNotFoundMsg":"No member names found containing the query \"{query}\"","maxHistoryItems":15,"homeIcon":"","access":[{"value":"public","label":"Public"},{"value":"protected","label":"Protected"}],"toolbarLinks":[{"id":"fields","label":"Fields"},{"id":"properties","label":"Properties"},{"id":"methods","label":"Methods"},{"id":"events","label":"Events"}],"sidebar":[{"n":"/","l":"Home","s":""},{"n":"install","l":"Install","s":""},{"n":"workflows","l":"Workflows","c":false,"i":[{"n":"align","l":"Align","i":[{"n":"bwa","l":"BWA","s":""},{"n":"ema","l":"EMA","s":""},{"n":"strobe","l":"Strobe","s":""}],"s":""},{"n":"assembly","l":"Assembly","s":""},{"n":"deconvolve","l":"Deconvolve","s":""},{"n":"demultiplex","l":"Demultiplex","s":""},{"n":"impute","l":"Impute","s":""},{"n":"metassembly","l":"Metassembly","s":""},{"n":"other","l":"Other","s":""},{"n":"phase","l":"Phase","s":""},{"n":"preflight","l":"Preflight","s":""},{"n":"qc","l":"QC","s":""},{"n":"simulate","l":"Simulate","i":[{"n":"simulate-linkedreads","l":"Linked Reads","s":""},{"n":"simulate-variants","l":"Variants","s":""}],"s":""},{"n":"snp","l":"SNP","s":""},{"n":"sv","l":"SV","i":[{"n":"leviathan","l":"Leviathan","s":""},{"n":"naibr","l":"Naibr","s":""}],"s":""}],"s":""},{"n":"haplotagdata","l":"Haplotag Data","s":""},{"n":"commonoptions","l":"Common Options","s":""},{"n":"troubleshooting","l":"Troubleshooting","s":""},{"n":"snakemake","l":"Snakemake Things","s":""},{"n":"software","l":"Software","s":""},{"n":"utilities","l":"Utilities","s":""},{"n":"blog","l":"Blog","v":false,"i":[{"n":"sort_by_barcode","l":" Sort data by barcode","v":false,"s":""},{"n":"simulate_diploid","l":" Simulating variants","v":false,"s":""},{"n":"sv_pooling","l":" Pooling samples for SV calling","v":false,"s":""},{"n":"software_environments","l":" Choosing a software runtime method","v":false,"s":""},{"n":"filtering_snps","l":" Filtering Variants","v":false,"s":""}]}],"search":{"mode":1,"minChars":2,"maxResults":20,"placeholder":"Search","hotkeys":["k"],"noResultsFoundMsg":"Sorry, no results found.","recognizeLanguages":true,"languages":[0],"preload":false},"resources":{"History_Title_Label":"History","History_ClearLink_Label":"Clear","History_NoHistory_Label":"No history items","API_AccessFilter_Label":"Access","API_ParameterSection_Label":"PARAMETERS","API_SignatureSection_Label":"SIGNATURE","API_CopyHint_Label":"Copy","API_CopyNameHint_Label":"Copy name","API_CopyLinkHint_Label":"Copy link","API_CopiedAckHint_Label":"Copied!","API_MoreOverloads_Label":"more","API_MoreDropdownItems_Label":"More","API_OptionalParameter_Label":"optional","API_DefaultParameterValue_Label":"Default 
value","API_InheritedFilter_Label":"Inherited","Search_Input_Placeholder":"Search","Toc_Contents_Label":"Contents","Toc_RelatedClasses_Label":"Related Classes","History_JustNowTime_Label":"just now","History_AgoTime_Label":"ago","History_YearTime_Label":"y","History_MonthTime_Label":"mo","History_DayTime_Label":"d","History_HourTime_Label":"h","History_MinuteTime_Label":"m","History_SecondTime_Label":"s"}}; diff --git a/resources/js/search.json b/resources/js/search.json index 1b78654ab..6de64feed 100644 --- a/resources/js/search.json +++ b/resources/js/search.json @@ -1 +1 @@ -[[{"i":"#","p":["Using Harpy to process your haplotagged data"]},{"l":"Home","p":["Harpy is a haplotagging data processing pipeline for Linux-based systems. It uses all the magic of Snakemake under the hood to handle the worklfow decision-making, but as a user, you just interact with it like a normal command-line"]},{"l":"Harpy Modules","p":["Harpy is modular, meaning you can use different parts of it independent from each other. Need to only align reads? Great! Only want to call variants? Awesome! All modules are called by"]},{"l":"Using Harpy","p":["You can call harpy without any arguments (or with --help) to print the docstring to your terminal. You can likewise call any of the modules without arguments or with --help to see their usage (e.g."]},{"l":"Linked-Read Workflow","p":["Depending on your project goals, you may want any combination of SNPs, structural variants (inversions, deletions, duplications), or phased haplotypes. Below is a flow chart outlining a general workflow of linked-read data."]}],[{"l":"Install"},{"l":"Install Harpy","p":["Harpy is hosted on Bioconda! That means to install it, you just need to have mamba(or conda) on your Linux-based system and install it with a simple command. You can install Harpy into an existing environment or create a new one for it (recommended)."]},{"l":"install into a new environment"},{"i":"recommended","l":"✨recommended✨","p":["The code snippet below creates a new environment called harpy(the -n harpy part) and installs harpy into it from the bioconda channel (-c bioconda part). You can name this environment anything (e.g."]},{"l":"install into an existing evironment","p":["If you want to install harpy into an existing environment, then with an environment already activated (via mamba activate env_name) simply use the mamba install command and harpy"]},{"l":"Update Harpy","p":["If you want to update Harpy, the process is quite similar:"]}],[{"i":"#","p":["Align haplotagged sequences"]},{"l":"Align Sequences to a Genome","p":["After your sequences (in FASTQ format) have been checked for quality, you will need to align them to a reference genome before you can call variants. Harpy offers several aligners for this purpose:"]}],[{"i":"#","p":["Align haplotagged sequences with BWA MEM"]},{"l":"Map Reads onto a genome with BWA MEM","p":["Once sequences have been trimmed and passed through other QC filters, they will need to be aligned to a reference genome. This module within Harpy expects filtered reads as input,"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Molecule distance","p":["The --molecule-distance option is used during the BWA alignment workflow to assign alignments a unique Molecular Identifier MI:i tag based on their haplotag barcode and the distance threshold you specify. 
See"]},{"l":"Quality filtering","p":["The --min-quality argument filters out alignments below a given MQ threshold. The default, 30, keeps alignments that are at least 99.9% likely correctly mapped. Set this value to"]},{"l":"Marking PCR duplicates","p":["Harpy uses samtools markdup to mark putative PCR duplicates. By using the --barcode-tag BX option, it considers the linked-read barcode for more accurate duplicate detection. Duplicate"]},{"l":"BWA workflow"}],[{"i":"#","p":["Align haplotagged sequences with EMA"]},{"l":"Map Reads onto a genome with EMA","p":["Once sequences have been trimmed and passed through other QC filters, they will need to be aligned to a reference genome. This module within Harpy expects filtered reads as input,"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Barcode whitelist","p":["Some linked-read methods (e.g. 10x, Tellseq) require the inclusion of a barcode \"whitelist.\" This file is a simple text file that has one barcode per line so a given software knows what barcodes to expect in your data."]},{"l":"Quality filtering","p":["The --min-quality argument filters out alignments below a given MQ threshold. The default, 30, keeps alignments that are at least 99.9% likely correctly mapped. Set this value to"]},{"l":"Marking PCR duplicates","p":["EMA marks duplicates in the resulting alignments, however the read with invalid barcodes are aligned separately with BWA. Therefore, Harpy uses samtools markdup to mark putative"]},{"l":"EMA workflow"}],[{"i":"#","p":["Align haplotagged sequences with strobealign"]},{"l":"Map Reads onto a genome with strobealign","p":["Once sequences have been trimmed and passed through other QC filters, they will need to be aligned to a reference genome. This module within Harpy expects filtered reads as input,"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Read Length","p":["The strobealign program uses a new strobemer design for aligning and requires its own way of indexing the genome. The index must be configured for the average read length of the sample"]},{"l":"Molecule distance","p":["The --molecule-distance option is used during the BWA alignment workflow to assign alignments a unique Molecular Identifier MI:i tag based on their haplotag barcode and the distance threshold you specify. See"]},{"l":"Quality filtering","p":["The --min-quality argument filters out alignments below a given MQ threshold. The default, 30, keeps alignments that are at least 99.9% likely correctly mapped. Set this value to"]},{"l":"Marking PCR duplicates","p":["Harpy uses samtools markdup to mark putative PCR duplicates. By using the --barcode-tag BX option, it considers the linked-read barcode for more accurate duplicate detection. Duplicate"]},{"l":"Strobealign workflow"}],[{"i":"#","p":["Create a genome assembly from linked reads"]},{"l":"Create a Genome Assembly","p":["If you have single-sample data, you might be interested in a genome assembly. Unlike metagenome assemblies, a classic genome assembly assumes there is exactly one genome present in your sequences and will try to"]},{"l":"Running Options","p":["In addition to the , the module is configured using the command-line arguments below. 
Since the assembly process consists of several distinct phases, the descriptions are provided with an extra badge to reflect which part of the assembly process they correspond to."]},{"l":"Deconvolved Inputs","p":["For linked-read assemblies, the barcodes need to be deconvolved in the sequence data, meaning that barcodes that are shared by reads that originate from different molecules need to have unique barcode"]},{"l":"Assembly Workflow"}],[{"i":"#","p":["Resolve barcodes shared by different molecules"]},{"l":"Resolve barcodes shared by different molecules","p":["Running is optional. In the alignment workflows (), Harpy already uses a distance-based approach to deconvolve barcodes and assign MI tags (Molecular Identifier), whereas the workflow has the"]},{"l":"Running Options"},{"l":"Resulting Barcodes","p":["After deconvolution, some barcodes may have a hyphenated suffix like -1 or -2(e.g. A01C33B41D93-1). This is how deconvolution methods create unique variants of barcodes to denote that identical barcodes"]},{"l":"Harpy Deconvolution Nuances","p":["Some of the downstream linked-read tools Harpy uses expect linked read barcodes to either look like the 16-base 10X variety or a standard haplotag (AxxCxxBxxDxx). Their pattern-matching would not recognize barcodes deconvoluted with"]}],[{"i":"#","p":["Demultiplex raw sequences into haplotag barcoded samples"]},{"l":"Demultiplex Raw Sequences","p":["When pooling samples and sequencing them in parallel on an Illumina sequencer, you will be given large multiplexed FASTQ files in return. These files contain sequences for all of your samples and need to be demultiplexed using barcodes to"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Haplotag Types"},{"l":"Gen I Demultiplex Workflow"}],[{"i":"#","p":["Impute genotypes for haplotagged data with Harpy"]},{"l":"Impute Genotypes using Sequences","p":["After variants have been called, you may want to impute missing genotypes to get the most from your data. Harpy uses STITCH to impute genotypes, a haplotype-based method that is linked-read aware. Imputing genotypes requires a variant call file"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Extra STITCH parameters","p":["You may add additional parameters to STITCH by way of the--extra-params(or -x) option. Since STITCH is a function in the R language, the parameters you add must be in R syntax (e.g."]},{"l":"Prioritize the vcf file","p":["Sometimes you want to run imputation on all the samples present in the INPUTS, but other times you may want to only impute the samples present in the --vcf file. By default, Harpy assumes you want to use all the samples"]},{"l":"Parameter file","p":["Typically, one runs STITCH multiple times, exploring how results vary with different model parameters (explained in next section). The solution Harpy uses for this is to have the user"]},{"l":"STITCH Parameters"},{"l":"Imputation Workflow"}],[{"i":"#","p":["Create a metagenome assembly from linked reads"]},{"l":"Create a Metagenome Assembly","p":["If you have mixed-sample data, you might be interested in a metagenome assembly, also known as a metassembly. 
Unlike a single-sample assembly, a metassembly assumes there are multiple genomes present in your sequences and will try to"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Deconvolved Inputs","p":["For linked-read assemblies, the barcodes need to be deconvolved in the sequence data, meaning that barcodes that are shared by reads that originate from different molecules need to have unique barcode"]},{"l":"Metassembly Workflow"}],[{"i":"#","p":["Generate extra files for analysis with Harpy"]},{"l":"Other Harpy modules","p":["Some parts of Harpy (variant calling, imputation) want or need extra files. You can create various files necessary for different modules using these extra modules:"]},{"l":"Other modules"},{"l":"resume","p":["When calling a workflow (e.g. ), Harpy performs various file checks and validations, sets up the Snakemake command, output folder(s), etc. In the event you want to continue a failed or manually terminated workflow without overwriting the workflow"]},{"l":"arguments","p":["The DIRECTORY is the output directory of a previous harpy-invoked workflow, which must have the workflow/config.yaml file. For example, if you previously ran harpy align bwa -o align-bwa ..."]},{"l":"popgroup","p":["Creates a sample grouping file for variant calling"]},{"i":"arguments-1","l":"arguments","p":["This optional file is useful if you want SNP variant calling to happen on a per-population level via or on samples pooled-as-populations via ."]},{"l":"stitchparams","p":["Create a template parameter file for the module. The file is formatted correctly and serves as a starting point for using parameters that make sense for your study."]},{"i":"arguments-2","l":"arguments","p":["Typically, one runs STITCH multiple times, exploring how results vary with different model parameters. The solution Harpy uses for this is to have the user provide a tab-delimited dataframe file where the columns are the 6 STITCH model"]}],[{"i":"#","p":["Phase haplotypes for haplotagged data with Harpy"]},{"l":"Phase SNPs into Haplotypes","p":["You may want to phase your genotypes into haplotypes, as haplotypes tend to be more informative than unphased genotypes (higher polymorphism, captures relationship between genotypes). Phasing"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Prioritize the vcf file","p":["Sometimes you want to run imputation on all the samples present in the INPUTS, but other times you may want to only impute the samples present in the --vcf file. By default, Harpy assumes you want to use all the samples"]},{"l":"Molecule distance","p":["The molecule distance refers to the base-pair distance dilineating separate molecules. In other words, when two alignments on a single contig share the same barcode, how far away from each other are we willing to say they were and still consider them having"]},{"l":"Pruning threshold","p":["The pruning threshold refers to a PHRED-scale value between 0-1 (a percentage) for removing low-confidence SNPs from consideration. With Harpy, you configure this value as an integer"]},{"l":"Phasing Workflow"}],[{"i":"#","p":["Run file format checks on haplotagged FASTQ/BAM files"]},{"l":"Pre-flight checks for input files","p":["Harpy does a lot of stuff with a lot of software and each of these programs expect the incoming data to follow particular formats (plural, unfortunately). 
These formatting opinions/specifics are at the mercy of the original developers and while there are times when Harpy can (and does)"]},{"l":"When to run"},{"l":"Running Options","p":["In addition to the , the and modules are configured using only command-line input arguments:"]},{"l":"Workflow"}],[{"i":"#","p":["Quality trim haplotagged sequences with Harpy"]},{"l":"Quality Trim Sequences","p":["Raw sequences are not suitable for downstream analyses. They have sequencing adapters, index sequences, regions of poor quality, etc. The first step of any genetic sequence analyses is to remove these adapters and trim poor quality data. You can remove adapters,"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"QC Workflow"}],[{"i":"#","p":["Simulate genomic data"]},{"l":"Simulate Genomic Data","p":["You may be interested in benchmarking variant detection or maybe just trying out haplotagging data without any financial commitment-- that's where simulations come in handy."]},{"l":"Simulate Genomic Variants","p":["Harpy lets you simulate genomic variants via for different variant types such as single nucleotide polymorphisms (SNP), indels, inversions, copy number variants (CNV), and translocations. All you need is to provide a genome to simulate"]},{"l":"Simulate Haplotag Linked-Reads","p":["You can also simulate haplotag-style linked reads from an existing genome using . Harpy incorporates LRSIM to generate linked reads from a diploid genomic. If you only have a haploid genome, then you can create a diploid genome by simulating variants into it with"]}],[{"i":"#","p":["Simulate linked reads from a genome"]},{"l":"Simulate Linked Reads","p":["Simulate linked reads from a genome"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Mutation Rate","p":["The read simulation is two-part: first dwgsim generates forward and reverse FASTQ files from the provided genome haplotypes( HAP1_GENOME and HAP2_GENOME), then LRSIM takes over and creates linked-reads from that. The"]},{"l":"Simulating a single sample","p":["If you intend to simulate a \"single individual\" (i.e. use this module once), then you might want no additonal SNPs beyond the variants you may have already introduced into the genome and set"]},{"l":"Simulating multiple samples","p":["If you intend on simulating \"multiple individuals\" (i.e. use this module multiple times on the same genome haplotypes), it may make sense to set this value larger than 0 so there is some \"natural\" variation between your simulated individuals."]},{"l":"Partitions","p":["TL;DR: 10X partitions ≈ haplotag beads"]},{"l":"Barcodes","p":["Barcodes, if provided, must be given as 16-basepair nucleotide sequences, one per line. If not provided, Harpy will download the standard 10X Genomics 4M-with-alts-february-2016.txt"]},{"l":"10X to Haplotag conversion","p":["Harpy will convert the simulated 10X-style reads, where the 16-basepair barcode is at the beginning of read 1, to haplotag format, where the barcode is coded in the sequence header under the"]},{"l":"Choosing parameters","p":["LRSIM does internal calculations to determine the number of reads per molecule based on --read-pairs,--partitions, and --molecules-per. 
Understanding how these parameters affect the resulting sequences"]},{"l":"Parameter calculator","p":["Conveniently, we provide a calculator to help you make informed decisions for these parameters:"]},{"l":"Simulate Linkedreads Workflow"}],[{"i":"#","p":["Simulate snps, indels, inversions, cnv, translocations"]},{"l":"Simulate Genomic Variants","p":["Simulate snps, indels, inversions, cnv, translocations"]},{"l":"Modules","p":["There are 4 submodules with very obvious names:"]},{"l":"Running Options","p":["While there are serveral differences between individual workflow options, each has available all the like other Harpy modules. Each requires and input genome at the end of the command line, and each requires either a"]},{"l":"Simulate known variants","p":["Rather than simulating random variants, you can use a VCF file as input to any of the workflows to have simuG simulate the variants (of that type) from the VCF file. This becomes particularly"]},{"l":"Heterozygosity","p":["Each workflow has a --heterozygosity parameter where you can specify the heterozygosity of the simulated variants, which creates two new VCF files ({prefix}.hap1.vcf,{prefix}.hap2.vcf"]}],[{"i":"#","p":["Call SNPs and small indels"]},{"l":"Call SNPs and small indels","p":["After reads have been aligned, e.g., with , you can use those alignment files(.bam) to call variants in your data. Harpy can call SNPs and small indels using bcftools mpileup or with"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"ploidy","p":["If you are calling haploid or diploid samples, using either mpileup or freebayes will be comparable. However, if you need to call SNPs in polyploids (ploidy >2), then you will need to use"]},{"l":"regions","p":["The --regions(-r) option lets you specify the genomic regions you want to call variants on. Keep in mind that mpileup uses 1-based positions for genomic intervals, whereas freebayes"]},{"l":"populations","p":["Grouping samples changes the way the variant callers computes certain statistics when calling variants. If you have reason to believe there is a biologically meaningful grouping scheme to your samples, then you should include"]},{"l":"SNP calling workflow"}],[{"i":"#","p":["Find structural variants"]},{"l":"Find structural variants","p":["The module identifies single nucleotide polymorphisms (SNP) and small indels, but you may want to (and should!) leverage the linked-read data to identify larger structural variants (SV) like large deletions, duplications, and"]},{"l":"Caveats"},{"l":"NAIBR","p":["While our testing shows that NAIBR tends to find known inversions that LEVIATHAN misses, the program requires haplotype phased bam files as input. That means the alignments have a"]},{"l":"LEVIATHAN","p":["LEVIATHAN relies on split-read information in the sequence alignments to call variants. The EMA aligner does not report split read alignments, instead it reports secondary alignments."]}],[{"i":"#","p":["Call structural variants using Leviathan"]},{"l":"Call Structural Variants using LEVIATHAN","p":["(like indels, insertions, duplications, breakends)"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Single-sample variant calling","p":["When not using a population grouping file via --populations, variants will be called per-sample. 
Due to the nature of structural variant VCF files, there isn't an entirely fool-proof way"]},{"l":"Pooled-sample variant calling","p":["With the inclusion of a population grouping file via --populations, Harpy will merge the bam files of all samples within a population and call variants on these alignment pools. Preliminary work shows that this way identifies more variants and with fewer false"]},{"l":"LEVIATHAN workflow"}],[{"i":"#","p":["Call structural variants using NAIBR (plus)"]},{"l":"Call Structural Variants using NAIBR","p":["(like indels, insertions, duplications)"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Molecule distance","p":["The --molecule-distance option is used to let the program determine how far apart alignments on a contig with the same barcode can be from each other and still considered as originating from the same DNA molecule. See"]},{"l":"Single-sample variant calling","p":["When not using a population grouping file via --populations, variants will be called per-sample. Due to the nature of structural variant VCF files, there isn't an entirely fool-proof way"]},{"l":"Pooled-sample variant calling","p":["With the inclusion of a population grouping file via --populations, Harpy will merge the bam files of all samples within a population and call variants on these alignment pools. Preliminary work shows that this way identifies more variants and with fewer false"]},{"l":"Optional vcf file","p":["In order to get the best variant calling performance out of NAIBR, it requires phased bam files as input. Using --vcf is optional and not used by NAIBR directly. However, to use"]},{"i":"a-phased-input---vcf","l":"a phased input --vcf","p":["This file can be in vcf/vcf.gz/bcf format and most importantly it must be phased haplotypes. There are various ways to haplotype SNPs, but you can use to phase your SNPs into haplotypes using the haplotag barcode information. The resulting phased VCF file can then be used as input here."]},{"l":"NAIBR workflow"}],[{"l":"Haplotag data"},{"i":"what-is-haplotagging","l":"What is haplotagging?","p":["Linked-read sequencing exists to combine the throughput and accuracy of short-read sequencing with the long range haplotype information of long-read sequencing. Haplotagging is an implementation of linked-read sequencing developed by"]},{"l":"Data Format"},{"l":"Barcodes","p":["While barcodes are actually combinatorial bases, in the read headers they are represented with the format AxxCxxBxxDxx, where each barcode segment is denoted as Axx(or Bxx, etc.)."]},{"l":"barcode protocol varieties","p":["If you think haplotagging is as simple as exactly 96^4 unique barcodes, you would only be half-correct. The original haplotagging protocol in Meier et al. is good, but the authors (and others) have been working to improve this linked-read technology to improve"]},{"l":"where the barcodes go","p":["Chromium 10X linked-reads use a format where the barcode is the leading 16 bases of the forward (R1) read. However, haplotagging data does not use that format and many of the tools"]},{"l":"Read headers","p":["Like mentioned, the haplotag barcode is expected to be stored in the BX:Z: tag in the read header. This information is retained through the various Harpy steps. An example read header could look like:"]},{"l":"Read length","p":["Reads must be at least 30 base pairs in length for alignment. 
By default, the module removes reads <30bp."]},{"l":"Compression","p":["Harpy generally doesn't require the input sequences to be in gzipped/bgzipped format, but it's good practice to compress your reads anyway. Compressed files are expected to end with the extension"]},{"l":"Naming conventions","p":["Unfortunately, there are many different ways of naming FASTQ files, which makes it difficult to accomodate every wacky iteration currently in circulation. While Harpy tries its best to be flexible, there are limitations."]},{"l":"Barcode thresholds","p":["By the nature of linked read technologies, there will (almost always) be more DNA fragments than unique barcodes for them. As a result, it's common for barcodes to reappear in sequences. Rather than incorrectly assume that all sequences/alignments with the same barcode"]}],[{"l":"Common Harpy Options"},{"l":"Input Arguments","p":["Each of the main Harpy modules (e.g. or ) follows the format of"]},{"l":"Common command-line options","p":["Every Harpy module has a series of configuration parameters. These are arguments you need to input to configure the module to run on your data, such as the directory with the reads/alignments,"]},{"i":"--contigs","l":"--contigs","p":["Some of the workflows (like ) plot per-contig information in their reports. By default, Harpy will plot up to 30 of the largest contigs. If you are only interested in a specific set of contigs, then you can use"]},{"l":"example","p":["You could call and specify 20 threads with no output to console:"]},{"l":"The workflow folder","p":["When you run one of the main Harpy modules, the output directory will contain a workflow folder. This folder is both necessary for the module to run and is very useful to understand what the module did, be it for your own"]},{"l":"The Genome folder","p":["You will notice that many of the workflows will create a Genome folder in the working directory. This folder is to make it easier for Harpy to store the genome and the associated"]}],[{"l":"Troubleshooting","p":["Lots of stuff can go wrong during an analysis. The intent of this page is to guide you through navigating the inevitable errors associated with doing bioinformatics."]},{"l":"Troubleshooting Harpy","p":["Harpy has two steps: first it performs checks and validations, then it runs Snakemake."]},{"l":"checks and validations","p":["First, Harpy takes your command-line inputs and checks/validates the input files and parameters. If your parameters are not the correct type (e.g. a number where there should be a file), the"]},{"l":"snakemake validations","p":["Once all the file validations pass, Harpy passes the baton over to Snakemake. Snakemake builds a workflow graph of the rules and performs its own checks. If you get an error before the workflow starts processing any data (there"]},{"l":"error during a workflow","p":["Sometimes something goes wrong with one of the steps in a workflow. If/when that happens, Harpy will print the offending step and all the information Snakemake has regarding the failure. If the step had a log file, it will"]},{"l":"Common Issues"},{"l":"installation issue","p":["Conda is an awesome package manager, but it's slow and uses a ton of memory as dependencies increase. Harpy has a lot of dependencies and you might stall out conda trying to install it. Use mamba instead-- it'll work where conda fails."]},{"l":"imputation or phasing failure","p":["If you use bamutils clipOverlap on alignments that are used for the or modules, they will cause both programs to error. 
We don't know why, but they do."]},{"l":"SAM name and ID mismatch","p":["Aligning a sample to a genome via Harpy will insert the sample name (based on the file name) into the alignment header (the @RG ID:name SM:name tag). It likewise expects, through various steps,"]}],[{"l":"Snakamake Things"},{"l":"Workflow logs","p":["Barring a few exceptions, most of Harpy's options are Snakemake workflows. This means we are all at the mercy of how Snakemake operates, which includes the .snakemake/ folder in your project directory. That folder contains"]},{"l":"Adding Snakemake Parameters","p":["Harpy relies on Snakemake under the hood to handle file and job dependencies. Most of these details have been abstracted away from the end-user, but every module of Harpy (except"]},{"l":"Common use cases","p":["You likely wont need to invoke --snakemake very often, if ever. However, here examples of some possible use cases for this parameter."]}],[{"l":"Software used in Harpy","p":["Harpy is the sum of its parts, and out of tremendous respect for the developers involved in the included software, we would like to highlight the tools directly involved in Harpy's many moving pieces."]},{"l":"Standalone Software"},{"l":"Software Packages"}],[{"l":"Developing Harpy","p":["Harpy is an open source program written using a combination of BASH, R, RMarkdown, Python, and Snakemake. This page provides information on Harpy's development and how to contribute to it, if you were inclined to do so."]},{"l":"Installing dev version","p":["The process follows cloning the harpy repository, installing the preconfigured conda environment, and running the resources/buildlocal.sh script to move all the necessary files to the"]},{"i":"harpys-components","l":"Harpy's components"},{"l":"source code","p":["Harpy runs in two stages:"]},{"l":"bioconda recipe","p":["For the ease of installation for end-users, Harpy has a recipe and build script in Bioconda, which makes it available for download and installation. A copy of the recipe is also"]},{"l":"The Harpy repository"},{"l":"repo structure","p":["Harpy exists as a Git repository and has 5 standard branches that are used in specific ways during development. Git is a popular version control system and discussing its use is out of the scope of this documentation, however there is no"]},{"l":"development workflow","p":["The dev workflow is reasonably standard:"]},{"l":"containerization","p":["As of Harpy v1.0, the software dependencies that the Snakemake workflows use are pre-configured as a Docker image that is uploaded to Dockerhub. Updating or editing this container can be done automatically or manually."]},{"l":"automatically","p":["The testing GitHub Action will automatically create a Dockerfile with (a hidden harpy command) and build a new Docker container, then upload it to dockerhub with the latest tag. This process is triggered on"]},{"l":"manually","p":["The dockerfile for that container is created by using a hidden harpy command"]},{"l":"Automations"},{"l":"testing","p":["CI ( C ontinuous I ntegration) is a term describing automated actions that do things to/with your code and are triggered by how you interact with a repository. Harpy has a series of GitHub Actions triggered by interactions with the"]},{"l":"releases","p":["There is an automation that gets triggered every time Harpy is tagged with the new version. It strips out the unnecessary files and will upload a cleaned tarball to the new release (reducing filesize by orders of magnitude). 
The automation will also"]}],[{"l":"Blog"}],[{"i":"#","p":["Sorting data by linked-read barcode"]},{"l":"Sort data by barcode","p":["You would think sorting data would be a no-brainer, and in most cases it is. You can use seqtk or seqkit to sort FASTQ/A files by their IDs, samtools to sort SAM/BAM/CRAM files by name or coordinates. However, in the world of linked-read"]},{"l":"Sorting Alignments","p":["Let's start with BAM (or SAM/CRAM) files because the process is much simpler. Since the linked-read barcode is stored in a BX:Z tag (or less often as BC:Z:), we can use a little feature of"]},{"l":"Sorting FASTQ","p":["Sorting FASTQ files by barcode is trickier, only because there aren't (to our knowledge!) any existing convenience methods to do it. Like any bioinformatics puzzle, you could probably solve it with a sophisticated AWK command, but HTSlib tools are so much more"]},{"l":"1. convert FASTQ to SAM","p":["Yep, we're solving our problem by doing a simple file conversion to SAM/BAM. That's the easiest way to do it, surprisingly. FASTQ files can be converted to unmapped BAM files using"]},{"l":"2. sort the SAM by barcode","p":["Exactly like shown above to sort a SAM/BAM file with samtools sort, we're going to do the same on the unmapped SAM file we just created:"]},{"l":"3. convert SAM back to FASTQ","p":["Now that the data have been sorted, we need to convert it back into forward and reverse FASTQ files using samtools fastq. The -T * argument once again preserves all the tags between file formats. The"]},{"l":"as a single pipe","p":["Rather than splitting out these three processess, you can stream/pipe them in a single workflow:"]}],[{"i":"#","p":["A realistic workflow to simulate variants"]},{"l":"Simulating variants","p":["You may want to (and are encouraged to) simulate data before investing in the costs associated with linked-read sample preparation and subsequent sequencing. Harpy provides both a variant and linked-read simulators and this tutorial serves to"]},{"l":"1. Add random inversions","p":["First, we will need to simulate some inversions and set a --heterozygosity value >0 to get a diploid genome as the output. If you wanted to manually create inversions in specific areas or with specific lengths, this would be a good starting point too since"]},{"l":"2. Add snps and indels","p":["Let's say we wanted to simulate SNPs and indels like so:"]},{"i":"3-simulate-known-snps-and-indels-onto-the-diploid-genome-with-inversions","l":"3. Simulate \"known\" snps and indels onto the diploid genome with inversions","p":["We will run Harpy twice, once for each haplotype, using the corresponding VCFs from Step 2:"]},{"l":"5. Simulating linked-reads","p":["Now that you have heterozygous haplotypes created from your starting genome, you can simulate linked-reads from it using harpy simulate linkedreads. A simple implementation of that could look like:"]}],[{"i":"#","p":["Why pool samples for SV calling and when to do it"]},{"l":"Pooling samples for SV calling","p":["One of the cool benefits of linked-read data is the fact that you can call structural variants with it. Depending on the depth of your data, you may want (or need) to pool samples together. This"]},{"l":"Sample depth"},{"i":"depth-explained","l":"Depth, explained","p":["In bioinformatics, the terms \"coverage\" and \"depth\" and often used interchangeably, which is incorrect and leads to confusion. 
Coverage refers to the proportion of a genome that is sequenced, and"]},{"i":"depth-in-context","l":"Depth, in context","p":["Historically, one would have wanted to sequence fewer individuals at higher depth to get confident genotype calls, rather than sequence more individuals at lower depth. Recent advances in bioinformatics have enabled low-coverage whole genome sequencing"]},{"l":"The problem","p":["It's recommended to have at least 10X-12X depth to get decent structural variant calls(definitely read that in a paper that I would like to link here, but I can't seem to find it). If your data already has a minimum of 10X for each individual, great! Feel free to use"]},{"l":"The solution","p":["One way to get your low-coverage (low depth) data and still call structural variants is to pool samples together, which would effectively boost the depth. By doing this, you will"]},{"l":"Pooling considerations","p":["If pooling samples, you must pool them sensibly and with a biological context to do so. In other words, you don't just pool random samples together to inflate depth. Since haplotag data is just whole genome sequence data plus a little extra information, you should"]}],[{"i":"#","p":["Deciding between using Conda or Containers"]},{"l":"Choosing a software runtime method","p":["There are two ways you can run Harpy, using a container with the necessary software environments in it (the default), or with local conda environments(with the --conda option). If software development and containerization"]},{"i":"tldr","l":"TL;DR"},{"l":"What Harpy Provides","p":["An conda-based installation of Harpy provides only the minimal set of programs Harpy needs to begin a workflow. These include: python 3.12, snakemake-minimal, pandas, and the htslib programs (htslib, samtools, bcftools, tabix)."]},{"l":"How Harpy Provides the Other Stuff","p":["Instead of a monolithic Harpy environment, which would be impossible with the current software dependencies, there are a handful of defined conda environment recipes that Harpy workflows generate. Snakemake will make"]},{"l":"Harpy and Containers","p":["The Harpy team manages a container on Dockerhub called, you guessed it, Harpy, that is synchronously versioned with the Harpy software. In other words, if you're using Harpy v1.4, it will use the container version v1.4. The"]},{"i":"whats-the-catch","l":"What's the Catch?","p":["While local conda enviroments at runtime or containers might seem like foolproof approaches, there are drawbacks."]},{"i":"conda-caveats","l":"Conda Caveats:"},{"i":"conda-caveat-1-inconsistent","l":"⚠️ Conda Caveat 1: Inconsistent","p":["Despite our and conda's best efforts, sometimes programs just don't install correctly on some systems due to unexpected system (or conda) configurations. This results in frustrating errors where jobs fail because software that is"]},{"i":"conda-caveat-2-troubleshooting","l":"\uD83D\uDCA3 Conda Caveat 2: Troubleshooting","p":["To manually troubleshoot many of the tasks Harpy workflows perform, you may need to jump into one of the local conda environments in .snakemake/conda. 
That itself isn't terrible, but it's an extra step because you will"]},{"l":"Container Caveats"},{"i":"container-caveat-1-speed","l":"\uD83D\uDEA5 Container Caveat 1: Speed","p":["The overhead of Snakemake creating a container instance for a job, then cleaning it up after the job is done is not trivial and can negatively impact runtime."]},{"i":"container-caveat-2-troubleshooting","l":"\uD83D\uDCA3 Container Caveat 2: Troubleshooting","p":["The command Snakemake secretly invokes to run a job in a container is quite lengthy. In most cases that shouldn't matter to you, but when something eventually goes wrong and you need to troubleshoot, it's harder"]}],[{"i":"#","p":["A gentle introduction to the wild world of filtering SNPs"]},{"l":"Filtering Variants","p":["The discussion around filtering SNPs and indels is massive and many researchers go about it differently, each very opinionated as to why their method is the best. As a starting point, have a look at how the authors of"]},{"i":"genotype-quality-qual","l":"genotype quality (QUAL)","p":["You will obviously want higher quality genotype calls to remove false positives. The HTSlib guide suggests at least 50(e.g. -i 'QUAL=50'), but we typically filter much higher at"]},{"i":"read-depth-dp","l":"read depth (DP)","p":["Variant sites with too few reads backing up the genotype might be false positives, although this may not hold true for very low-coverage data. Conversely, a maximum cut off is important because sites with very high read depths (relative to the distribution of read depth)"]},{"i":"minor-allele-frequency-maf","l":"minor allele frequency (MAF)","p":["It's usually advisable to set a minor allele frequency threshold with which to remove sites below that threshold. The reasoning is that if a MAF is too low, it might be because of incorrectly called genotypes in a very small handful of individuals (e.g. one or two)."]},{"i":"missing-data-f_missing","l":"missing data (F_MISSING)","p":["Missing data is, frankly, not terribly useful. The amount of missing data you're willing to tolerate will depend on your study, but it's common to remove sites with >20% missing data (e.g."]}]] \ No newline at end of file +[[{"i":"#","p":["Using Harpy to process your haplotagged data"]},{"l":"Home","p":["Harpy is a haplotagging data processing pipeline for Linux-based systems. It uses all the magic of Snakemake under the hood to handle the workflow decision-making, but as a user, you just interact with it like a normal command-line"]},{"l":"Harpy Modules","p":["Harpy is modular, meaning you can use different parts of it independent from each other. Need to only align reads? Great! Only want to call variants? Awesome! All modules are called by"]},{"l":"Using Harpy","p":["You can call harpy without any arguments (or with --help) to print the docstring to your terminal. You can likewise call any of the modules without arguments or with --help to see their usage (e.g."]},{"l":"Linked-Read Workflow","p":["Depending on your project goals, you may want any combination of SNPs, structural variants (inversions, deletions, duplications), or phased haplotypes. Below is a flow chart outlining a general workflow of linked-read data."]}],[{"l":"Install"},{"l":"Install Harpy","p":["Harpy is hosted on Bioconda! That means to install it, you just need to have mamba(or conda) on your Linux-based system and install it with a simple command. 
You can install Harpy into an existing environment or create a new one for it (recommended)."]},{"l":"install into a new environment"},{"i":"recommended","l":"✨recommended✨","p":["The code snippet below creates a new environment called harpy(the -n harpy part) and installs harpy into it from the bioconda channel (-c bioconda part). You can name this environment anything (e.g."]},{"l":"install into an existing environment","p":["If you want to install harpy into an existing environment, then with an environment already activated (via mamba activate env_name) simply use the mamba install command and harpy"]},{"l":"Update Harpy","p":["If you want to update Harpy, the process is quite similar:"]}],[{"i":"#","p":["Align haplotagged sequences"]},{"l":"Align Sequences to a Genome","p":["After your sequences (in FASTQ format) have been checked for quality, you will need to align them to a reference genome before you can call variants. Harpy offers several aligners for this purpose:"]}],[{"i":"#","p":["Align haplotagged sequences with BWA MEM"]},{"l":"Map Reads onto a genome with BWA MEM","p":["Once sequences have been trimmed and passed through other QC filters, they will need to be aligned to a reference genome. This module within Harpy expects filtered reads as input,"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Molecule distance","p":["The --molecule-distance option is used during the BWA alignment workflow to assign alignments a unique Molecular Identifier MI:i tag based on their haplotag barcode and the distance threshold you specify. See"]},{"l":"Quality filtering","p":["The --min-quality argument filters out alignments below a given MQ threshold. The default, 30, keeps alignments that are at least 99.9% likely correctly mapped. Set this value to"]},{"l":"Marking PCR duplicates","p":["Harpy uses samtools markdup to mark putative PCR duplicates. By using the --barcode-tag BX option, it considers the linked-read barcode for more accurate duplicate detection. Duplicate"]},{"l":"BWA workflow"}],[{"i":"#","p":["Align haplotagged sequences with EMA"]},{"l":"Map Reads onto a genome with EMA","p":["Once sequences have been trimmed and passed through other QC filters, they will need to be aligned to a reference genome. This module within Harpy expects filtered reads as input,"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Barcode whitelist","p":["Some linked-read methods (e.g. 10x, Tellseq) require the inclusion of a barcode \"whitelist.\" This file is a simple text file that has one barcode per line so a given software knows what barcodes to expect in your data."]},{"l":"Quality filtering","p":["The --min-quality argument filters out alignments below a given MQ threshold. The default, 30, keeps alignments that are at least 99.9% likely correctly mapped. Set this value to"]},{"l":"Marking PCR duplicates","p":["EMA marks duplicates in the resulting alignments; however, the reads with invalid barcodes are aligned separately with BWA. Therefore, Harpy uses samtools markdup to mark putative"]},{"l":"EMA workflow"}],[{"i":"#","p":["Align haplotagged sequences with strobealign"]},{"l":"Map Reads onto a genome with strobealign","p":["Once sequences have been trimmed and passed through other QC filters, they will need to be aligned to a reference genome. 
This module within Harpy expects filtered reads as input,"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Read Length","p":["The strobealign program uses a new strobemer design for aligning and requires its own way of indexing the genome. The index must be configured for the average read length of the sample"]},{"l":"Molecule distance","p":["The --molecule-distance option is used during the BWA alignment workflow to assign alignments a unique Molecular Identifier MI:i tag based on their haplotag barcode and the distance threshold you specify. See"]},{"l":"Quality filtering","p":["The --min-quality argument filters out alignments below a given MQ threshold. The default, 30, keeps alignments that are at least 99.9% likely correctly mapped. Set this value to"]},{"l":"Marking PCR duplicates","p":["Harpy uses samtools markdup to mark putative PCR duplicates. By using the --barcode-tag BX option, it considers the linked-read barcode for more accurate duplicate detection. Duplicate"]},{"l":"Strobealign workflow"}],[{"i":"#","p":["Create a genome assembly from linked reads"]},{"l":"Create a Genome Assembly","p":["If you have single-sample data, you might be interested in a genome assembly. Unlike metagenome assemblies, a classic genome assembly assumes there is exactly one genome present in your sequences and will try to"]},{"l":"Running Options","p":["In addition to the , the module is configured using the command-line arguments below. Since the assembly process consists of several distinct phases, the descriptions are provided with an extra badge to reflect which part of the assembly process they correspond to."]},{"l":"Deconvolved Inputs","p":["For linked-read assemblies, the barcodes need to be deconvolved in the sequence data, meaning that barcodes that are shared by reads that originate from different molecules need to have unique barcode"]},{"l":"Assembly Workflow"}],[{"i":"#","p":["Resolve barcodes shared by different molecules"]},{"l":"Resolve barcodes shared by different molecules","p":["Running is optional. In the alignment workflows (), Harpy already uses a distance-based approach to deconvolve barcodes and assign MI tags (Molecular Identifier), whereas the workflow has the"]},{"l":"Running Options"},{"l":"Resulting Barcodes","p":["After deconvolution, some barcodes may have a hyphenated suffix like -1 or -2(e.g. A01C33B41D93-1). This is how deconvolution methods create unique variants of barcodes to denote that identical barcodes"]},{"l":"Harpy Deconvolution Nuances","p":["Some of the downstream linked-read tools Harpy uses expect linked read barcodes to either look like the 16-base 10X variety or a standard haplotag (AxxCxxBxxDxx). Their pattern-matching would not recognize barcodes deconvoluted with"]}],[{"i":"#","p":["Demultiplex raw sequences into haplotag barcoded samples"]},{"l":"Demultiplex Raw Sequences","p":["When pooling samples and sequencing them in parallel on an Illumina sequencer, you will be given large multiplexed FASTQ files in return. 
These files contain sequences for all of your samples and need to be demultiplexed using barcodes to"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Haplotag Types"},{"l":"Gen I Demultiplex Workflow"}],[{"i":"#","p":["Impute genotypes for haplotagged data with Harpy"]},{"l":"Impute Genotypes using Sequences","p":["After variants have been called, you may want to impute missing genotypes to get the most from your data. Harpy uses STITCH to impute genotypes, a haplotype-based method that is linked-read aware. Imputing genotypes requires a variant call file"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Extra STITCH parameters","p":["You may add additional parameters to STITCH by way of the--extra-params(or -x) option. Since STITCH is a function in the R language, the parameters you add must be in R syntax (e.g."]},{"l":"Prioritize the vcf file","p":["Sometimes you want to run imputation on all the samples present in the INPUTS, but other times you may want to only impute the samples present in the --vcf file. By default, Harpy assumes you want to use all the samples"]},{"l":"Parameter file","p":["Typically, one runs STITCH multiple times, exploring how results vary with different model parameters (explained in the next section). The solution Harpy uses for this is to have the user"]},{"l":"STITCH Parameters"},{"l":"Imputation Workflow"}],[{"i":"#","p":["Create a metagenome assembly from linked reads"]},{"l":"Create a Metagenome Assembly","p":["If you have mixed-sample data, you might be interested in a metagenome assembly, also known as a metassembly. Unlike a single-sample assembly, a metassembly assumes there are multiple genomes present in your sequences and will try to"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Deconvolved Inputs","p":["For linked-read assemblies, the barcodes need to be deconvolved in the sequence data, meaning that barcodes that are shared by reads that originate from different molecules need to have unique barcode"]},{"l":"Metassembly Workflow"}],[{"i":"#","p":["Generate extra files for analysis with Harpy"]},{"l":"Other Harpy modules","p":["Some parts of Harpy (variant calling, imputation) want or need extra files. You can create various files necessary for different modules using these extra modules:"]},{"l":"Other modules"},{"l":"resume","p":["When calling a workflow (e.g. ), Harpy performs various file checks and validations, sets up the Snakemake command, output folder(s), etc. In the event you want to continue a failed or manually terminated workflow without overwriting the workflow"]},{"l":"arguments","p":["The DIRECTORY is the output directory of a previous harpy-invoked workflow, which must have the workflow/config.yaml file. For example, if you previously ran harpy align bwa -o align-bwa ..."]},{"l":"popgroup","p":["Creates a sample grouping file for variant calling"]},{"i":"arguments-1","l":"arguments","p":["This optional file is useful if you want SNP variant calling to happen on a per-population level via or on samples pooled-as-populations via ."]},{"l":"stitchparams","p":["Create a template parameter file for the module.
The file is formatted correctly and serves as a starting point for using parameters that make sense for your study."]},{"i":"arguments-2","l":"arguments","p":["Typically, one runs STITCH multiple times, exploring how results vary with different model parameters. The solution Harpy uses for this is to have the user provide a tab-delimited dataframe file where the columns are the 6 STITCH model"]}],[{"i":"#","p":["Phase haplotypes for haplotagged data with Harpy"]},{"l":"Phase SNPs into Haplotypes","p":["You may want to phase your genotypes into haplotypes, as haplotypes tend to be more informative than unphased genotypes (higher polymorphism, captures relationship between genotypes). Phasing"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Prioritize the vcf file","p":["Sometimes you want to run imputation on all the samples present in the INPUTS, but other times you may want to only impute the samples present in the --vcf file. By default, Harpy assumes you want to use all the samples"]},{"l":"Molecule distance","p":["The molecule distance refers to the base-pair distance delineating separate molecules. In other words, when two alignments on a single contig share the same barcode, how far away from each other are we willing to say they were and still consider them having"]},{"l":"Pruning threshold","p":["The pruning threshold refers to a PHRED-scale value between 0-1 (a percentage) for removing low-confidence SNPs from consideration. With Harpy, you configure this value as an integer"]},{"l":"Phasing Workflow"}],[{"i":"#","p":["Run file format checks on haplotagged FASTQ/BAM files"]},{"l":"Pre-flight checks for input files","p":["Harpy does a lot of stuff with a lot of software and each of these programs expects the incoming data to follow particular formats (plural, unfortunately). These formatting opinions/specifics are at the mercy of the original developers and while there are times when Harpy can (and does)"]},{"l":"When to run"},{"l":"Running Options","p":["In addition to the , the and modules are configured using only command-line input arguments:"]},{"l":"Workflow"}],[{"i":"#","p":["Quality trim haplotagged sequences with Harpy"]},{"l":"Quality Trim Sequences","p":["Raw sequences are not suitable for downstream analyses. They have sequencing adapters, index sequences, regions of poor quality, etc. The first step of any genetic sequence analysis is to remove these adapters and trim poor quality data. You can remove adapters,"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"QC Workflow"}],[{"i":"#","p":["Simulate genomic data"]},{"l":"Simulate Genomic Data","p":["You may be interested in benchmarking variant detection or maybe just trying out haplotagging data without any financial commitment-- that's where simulations come in handy."]},{"l":"Simulate Genomic Variants","p":["Harpy lets you simulate genomic variants via for different variant types such as single nucleotide polymorphisms (SNP), indels, inversions, copy number variants (CNV), and translocations. All you need is to provide a genome to simulate"]},{"l":"Simulate Haplotag Linked-Reads","p":["You can also simulate haplotag-style linked reads from an existing genome using . Harpy incorporates LRSIM to generate linked reads from a diploid genome.
If you only have a haploid genome, then you can create a diploid genome by simulating variants into it with"]}],[{"i":"#","p":["Simulate linked reads from a genome"]},{"l":"Simulate Linked Reads","p":["Simulate linked reads from a genome"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Mutation Rate","p":["The read simulation is two-part: first dwgsim generates forward and reverse FASTQ files from the provided genome haplotypes( HAP1_GENOME and HAP2_GENOME), then LRSIM takes over and creates linked-reads from that. The"]},{"l":"Simulating a single sample","p":["If you intend to simulate a \"single individual\" (i.e. use this module once), then you might want no additional SNPs beyond the variants you may have already introduced into the genome and set"]},{"l":"Simulating multiple samples","p":["If you intend on simulating \"multiple individuals\" (i.e. use this module multiple times on the same genome haplotypes), it may make sense to set this value larger than 0 so there is some \"natural\" variation between your simulated individuals."]},{"l":"Partitions","p":["TL;DR: 10X partitions ≈ haplotag beads"]},{"l":"Barcodes","p":["Barcodes, if provided, must be given as 16-basepair nucleotide sequences, one per line. If not provided, Harpy will download the standard 10X Genomics 4M-with-alts-february-2016.txt"]},{"l":"10X to Haplotag conversion","p":["Harpy will convert the simulated 10X-style reads, where the 16-basepair barcode is at the beginning of read 1, to haplotag format, where the barcode is coded in the sequence header under the"]},{"l":"Choosing parameters","p":["LRSIM does internal calculations to determine the number of reads per molecule based on --read-pairs,--partitions, and --molecules-per. Understanding how these parameters affect the resulting sequences"]},{"l":"Parameter calculator","p":["Conveniently, we provide a calculator to help you make informed decisions for these parameters:"]},{"l":"Simulate Linkedreads Workflow"}],[{"i":"#","p":["Simulate snps, indels, inversions, cnv, translocations"]},{"l":"Simulate Genomic Variants","p":["Simulate snps, indels, inversions, cnv, translocations"]},{"l":"Modules","p":["There are 4 submodules with very obvious names:"]},{"l":"Running Options","p":["While there are several differences between individual workflow options, each has available all the like other Harpy modules. Each requires an input genome at the end of the command line, and each requires either a"]},{"l":"Simulate known variants","p":["Rather than simulating random variants, you can use a VCF file as input to any of the workflows to have simuG simulate the variants (of that type) from the VCF file. This becomes particularly"]},{"l":"Heterozygosity","p":["Each workflow has a --heterozygosity parameter where you can specify the heterozygosity of the simulated variants, which creates two new VCF files ({prefix}.hap1.vcf,{prefix}.hap2.vcf"]}],[{"i":"#","p":["Call SNPs and small indels"]},{"l":"Call SNPs and small indels","p":["After reads have been aligned, e.g., with , you can use those alignment files(.bam) to call variants in your data. Harpy can call SNPs and small indels using bcftools mpileup or with"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"ploidy","p":["If you are calling haploid or diploid samples, using either mpileup or freebayes will be comparable.
However, if you need to call SNPs in polyploids (ploidy >2), then you will need to use"]},{"l":"regions","p":["The --regions(-r) option lets you specify the genomic regions you want to call variants on. Keep in mind that mpileup uses 1-based positions for genomic intervals, whereas freebayes"]},{"l":"populations","p":["Grouping samples changes the way the variant callers compute certain statistics when calling variants. If you have reason to believe there is a biologically meaningful grouping scheme to your samples, then you should include"]},{"l":"SNP calling workflow"}],[{"i":"#","p":["Find structural variants"]},{"l":"Find structural variants","p":["The module identifies single nucleotide polymorphisms (SNP) and small indels, but you may want to (and should!) leverage the linked-read data to identify larger structural variants (SV) like large deletions, duplications, and"]},{"l":"Caveats"},{"l":"NAIBR","p":["While our testing shows that NAIBR tends to find known inversions that LEVIATHAN misses, the program requires haplotype phased bam files as input. That means the alignments have a"]},{"l":"LEVIATHAN","p":["LEVIATHAN relies on split-read information in the sequence alignments to call variants. The EMA aligner does not report split read alignments, instead it reports secondary alignments."]}],[{"i":"#","p":["Call structural variants using Leviathan"]},{"l":"Call Structural Variants using LEVIATHAN","p":["(like indels, insertions, duplications, breakends)"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Single-sample variant calling","p":["When not using a population grouping file via --populations, variants will be called per-sample. Due to the nature of structural variant VCF files, there isn't an entirely fool-proof way"]},{"l":"Pooled-sample variant calling","p":["With the inclusion of a population grouping file via --populations, Harpy will merge the bam files of all samples within a population and call variants on these alignment pools. Preliminary work shows that this way identifies more variants and with fewer false"]},{"l":"LEVIATHAN workflow"}],[{"i":"#","p":["Call structural variants using NAIBR (plus)"]},{"l":"Call Structural Variants using NAIBR","p":["(like indels, insertions, duplications)"]},{"l":"Running Options","p":["In addition to the , the module is configured using these command-line arguments:"]},{"l":"Molecule distance","p":["The --molecule-distance option is used to let the program determine how far apart alignments on a contig with the same barcode can be from each other and still be considered as originating from the same DNA molecule. See"]},{"l":"Single-sample variant calling","p":["When not using a population grouping file via --populations, variants will be called per-sample. Due to the nature of structural variant VCF files, there isn't an entirely fool-proof way"]},{"l":"Pooled-sample variant calling","p":["With the inclusion of a population grouping file via --populations, Harpy will merge the bam files of all samples within a population and call variants on these alignment pools. Preliminary work shows that this way identifies more variants and with fewer false"]},{"l":"Optional vcf file","p":["In order to get the best variant calling performance out of NAIBR, it requires phased bam files as input. Using --vcf is optional and not used by NAIBR directly.
However, to use"]},{"i":"a-phased-input---vcf","l":"a phased input --vcf","p":["This file can be in vcf/vcf.gz/bcf format and most importantly it must contain phased haplotypes. There are various ways to haplotype SNPs, but you can use to phase your SNPs into haplotypes using the haplotag barcode information. The resulting phased VCF file can then be used as input here."]},{"l":"NAIBR workflow"}],[{"l":"Haplotag data"},{"i":"what-is-haplotagging","l":"What is haplotagging?","p":["Linked-read sequencing exists to combine the throughput and accuracy of short-read sequencing with the long range haplotype information of long-read sequencing. Haplotagging is an implementation of linked-read sequencing developed by"]},{"l":"Data Format"},{"l":"Barcodes","p":["While barcodes are actually combinatorial bases, in the read headers they are represented with the format AxxCxxBxxDxx, where each barcode segment is denoted as Axx(or Bxx, etc.)."]},{"l":"barcode protocol varieties","p":["If you think haplotagging is as simple as exactly 96^4 unique barcodes, you would only be half-correct. The original haplotagging protocol in Meier et al. is good, but the authors (and others) have been working to improve this linked-read technology to improve"]},{"l":"where the barcodes go","p":["Chromium 10X linked-reads use a format where the barcode is the leading 16 bases of the forward (R1) read. However, haplotagging data does not use that format and many of the tools"]},{"l":"Read headers","p":["As mentioned, the haplotag barcode is expected to be stored in the BX:Z: tag in the read header. This information is retained through the various Harpy steps. An example read header could look like:"]},{"l":"Read length","p":["Reads must be at least 30 base pairs in length for alignment. By default, the module removes reads <30bp."]},{"l":"Compression","p":["Harpy generally doesn't require the input sequences to be in gzipped/bgzipped format, but it's good practice to compress your reads anyway. Compressed files are expected to end with the extension"]},{"l":"Naming conventions","p":["Unfortunately, there are many different ways of naming FASTQ files, which makes it difficult to accommodate every wacky iteration currently in circulation. While Harpy tries its best to be flexible, there are limitations."]},{"l":"Barcode thresholds","p":["By the nature of linked read technologies, there will (almost always) be more DNA fragments than unique barcodes for them. As a result, it's common for barcodes to reappear in sequences. Rather than incorrectly assume that all sequences/alignments with the same barcode"]}],[{"l":"Common Harpy Options"},{"l":"Input Arguments","p":["Each of the main Harpy modules (e.g. or ) follows the format of"]},{"l":"Common command-line options","p":["Every Harpy module has a series of configuration parameters. These are arguments you need to input to configure the module to run on your data, such as the directory with the reads/alignments,"]},{"i":"--contigs","l":"--contigs","p":["Some of the workflows (like ) plot per-contig information in their reports. By default, Harpy will plot up to 30 of the largest contigs. If you are only interested in a specific set of contigs, then you can use"]},{"l":"example","p":["You could call and specify 20 threads with no output to console:"]},{"l":"The workflow folder","p":["When you run one of the main Harpy modules, the output directory will contain a workflow folder.
This folder is both necessary for the module to run and is very useful to understand what the module did, be it for your own"]},{"l":"The Genome folder","p":["You will notice that many of the workflows will create a Genome folder in the working directory. This folder is to make it easier for Harpy to store the genome and the associated"]}],[{"l":"Troubleshooting","p":["Lots of stuff can go wrong during an analysis. The intent of this page is to guide you through navigating the inevitable errors associated with doing bioinformatics."]},{"l":"Troubleshooting Harpy","p":["Harpy has two steps: first it performs checks and validations, then it runs Snakemake."]},{"l":"checks and validations","p":["First, Harpy takes your command-line inputs and checks/validates the input files and parameters. If your parameters are not the correct type (e.g. a number where there should be a file), the"]},{"l":"snakemake validations","p":["Once all the file validations pass, Harpy passes the baton over to Snakemake. Snakemake builds a workflow graph of the rules and performs its own checks. If you get an error before the workflow starts processing any data (there"]},{"l":"error during a workflow","p":["Sometimes something goes wrong with one of the steps in a workflow. If/when that happens, Harpy will print the offending step and all the information Snakemake has regarding the failure. If the step had a log file, it will"]},{"l":"Common Issues"},{"l":"installation issue","p":["Conda is an awesome package manager, but it's slow and uses a ton of memory as dependencies increase. Harpy has a lot of dependencies and you might stall out conda trying to install it. Use mamba instead-- it'll work where conda fails."]},{"l":"imputation or phasing failure","p":["If you use bamutils clipOverlap on alignments that are used for the or modules, they will cause both programs to error. We don't know why, but they do."]},{"l":"SAM name and ID mismatch","p":["Aligning a sample to a genome via Harpy will insert the sample name (based on the file name) into the alignment header (the @RG ID:name SM:name tag). It likewise expects, through various steps,"]}],[{"l":"Snakemake Things"},{"l":"Workflow logs","p":["Barring a few exceptions, most of Harpy's options are Snakemake workflows. This means we are all at the mercy of how Snakemake operates, which includes the .snakemake/ folder in your project directory. That folder contains"]},{"l":"Adding Snakemake Parameters","p":["Harpy relies on Snakemake under the hood to handle file and job dependencies. Most of these details have been abstracted away from the end-user, but every module of Harpy (except"]},{"l":"Common use cases","p":["You likely won't need to invoke --snakemake very often, if ever. However, here are examples of some possible use cases for this parameter."]}],[{"l":"Software used in Harpy","p":["Harpy is the sum of its parts, and out of tremendous respect for the developers involved in the included software, we would like to highlight the tools directly involved in Harpy's many moving pieces."]},{"l":"Standalone Software"},{"l":"Software Packages"}],[{"l":"Utilities","p":["Harpy is the sum of its parts and some of those parts are stand-alone scripts used by the workflows that are accessible from within the Harpy conda environment.
This page serves to document those scripts, since using them outside of a workflow"]},{"i":"10xtohaplotagpy","l":"10xtoHaplotag.py","p":["Converts 10x linked reads to haplotag linked reads with barcodes in BX:Z and OX:Z header tags."]},{"i":"assign_mipy","l":"assign_mi.py","p":["Assign an MI:i(Molecular Identifier) tag to each barcoded record based on a molecular distance cutoff. Unmapped records are discarded in the output. Records without a BX:Z tag or"]},{"i":"bx_statspy","l":"bx_stats.py","p":["Calculates various linked-read molecule metrics from the (coordinate-sorted) input alignment file. Metrics include (per molecule):"]},{"i":"check_bampy","l":"check_bam.py","p":["Parses an alignment file to check:"]},{"i":"check_fastqpy","l":"check_fastq.py","p":["Parses a FASTQ file to check if any sequences don't conform to the SAM spec, whether BX:Z: is the last tag in the record, and the counts of:"]},{"i":"concatenate_bampy","l":"concatenate_bam.py","p":["Concatenate records from haplotagged SAM/BAM files while making sure MI:i tags remain unique for every sample. This is a means of accomplishing the same as samtools cat, except all"]},{"i":"count_bxpy","l":"count_bx.py","p":["Parses a FASTQ file to count: total sequences, total number of BX tags, number of valid haplotagging BX tags, number of invalid BX tags, number of invalid BX tag segments (i.e."]},{"i":"depth_windowspy","l":"depth_windows.py","p":["Reads the output of samtools depth -a from stdin and calculates means within windows of a given windowsize."]},{"i":"haplotag_acbdpy","l":"haplotag_acbd.py","p":["Generates the BC_{ABCD}.txt files necessary to demultiplex Gen I haplotag barcodes into the specified output_directory."]},{"i":"infer_svpy","l":"infer_sv.py","p":["Creates a column in the NAIBR bedpe output inferring the SV type from the orientation. Removes variants with FAIL flags and you can use the optional -f(--fail) argument to output the FAIL variants to a separate file."]},{"i":"make_windowspy","l":"make_windows.py","p":["Create a BED file of fixed intervals (-w, -- window) from a FASTA or fai file (the kind generated with samtools faidx). Nearly identical to bedtools makewindows, except the intervals are nonoverlapping. The"]},{"i":"molecule_coveragepy","l":"molecule_coverage.py","p":["Using the statsfile generated by bx_stats.py from Harpy, this script calculates \"molecular coverage\" across the genome. Molecular coverage is the \"effective\" alignment coverage if you treat a molecule inferred from linked-read data as"]},{"i":"parse_phaseblockspy","l":"parse_phaseblocks.py","p":["Parse a phase block file from HapCut2 to pull out summary information."]},{"l":"rename_bam","p":["Rename a sam/bam file and modify the @RG tag of the alignment file to reflect the change for both ID and SM. This process creates a new file new_name.bam and you may use -d to delete the original file. Requires"]},{"l":"separate_validbx","p":["Split a BAM file with BX tags into 2 files, one with valid ACBD barcodes ( stdout), one with invalid ACBD barcodes ( stderr)."]}],[{"l":"Blog"}],[{"i":"#","p":["Sorting data by linked-read barcode"]},{"l":"Sort data by barcode","p":["You would think sorting data would be a no-brainer, and in most cases it is. You can use seqtk or seqkit to sort FASTQ/A files by their IDs, samtools to sort SAM/BAM/CRAM files by name or coordinates. However, in the world of linked-read"]},{"l":"Sorting Alignments","p":["Let's start with BAM (or SAM/CRAM) files because the process is much simpler.
Since the linked-read barcode is stored in a BX:Z tag (or less often as BC:Z:), we can use a little feature of"]},{"l":"Sorting FASTQ","p":["Sorting FASTQ files by barcode is trickier, only because there aren't (to our knowledge!) any existing convenience methods to do it. Like any bioinformatics puzzle, you could probably solve it with a sophisticated AWK command, but HTSlib tools are so much more"]},{"l":"1. convert FASTQ to SAM","p":["Yep, we're solving our problem by doing a simple file conversion to SAM/BAM. That's the easiest way to do it, surprisingly. FASTQ files can be converted to unmapped BAM files using"]},{"l":"2. sort the SAM by barcode","p":["Exactly like shown above to sort a SAM/BAM file with samtools sort, we're going to do the same on the unmapped SAM file we just created:"]},{"l":"3. convert SAM back to FASTQ","p":["Now that the data have been sorted, we need to convert it back into forward and reverse FASTQ files using samtools fastq. The -T * argument once again preserves all the tags between file formats. The"]},{"l":"as a single pipe","p":["Rather than splitting out these three processes, you can stream/pipe them in a single workflow:"]}],[{"i":"#","p":["A realistic workflow to simulate variants"]},{"l":"Simulating variants","p":["You may want to (and are encouraged to) simulate data before investing in the costs associated with linked-read sample preparation and subsequent sequencing. Harpy provides both variant and linked-read simulators and this tutorial serves to"]},{"l":"1. Add random inversions","p":["First, we will need to simulate some inversions and set a --heterozygosity value >0 to get a diploid genome as the output. If you wanted to manually create inversions in specific areas or with specific lengths, this would be a good starting point too since"]},{"l":"2. Add snps and indels","p":["Let's say we wanted to simulate SNPs and indels like so:"]},{"i":"3-simulate-known-snps-and-indels-onto-the-diploid-genome-with-inversions","l":"3. Simulate \"known\" snps and indels onto the diploid genome with inversions","p":["We will run Harpy twice, once for each haplotype, using the corresponding VCFs from Step 2:"]},{"l":"5. Simulating linked-reads","p":["Now that you have heterozygous haplotypes created from your starting genome, you can simulate linked-reads from it using harpy simulate linkedreads. A simple implementation of that could look like:"]}],[{"i":"#","p":["Why pool samples for SV calling and when to do it"]},{"l":"Pooling samples for SV calling","p":["One of the cool benefits of linked-read data is the fact that you can call structural variants with it. Depending on the depth of your data, you may want (or need) to pool samples together. This"]},{"l":"Sample depth"},{"i":"depth-explained","l":"Depth, explained","p":["In bioinformatics, the terms \"coverage\" and \"depth\" are often used interchangeably, which is incorrect and leads to confusion. Coverage refers to the proportion of a genome that is sequenced, and"]},{"i":"depth-in-context","l":"Depth, in context","p":["Historically, one would have wanted to sequence fewer individuals at higher depth to get confident genotype calls, rather than sequence more individuals at lower depth. Recent advances in bioinformatics have enabled low-coverage whole genome sequencing"]},{"l":"The problem","p":["It's recommended to have at least 10X-12X depth to get decent structural variant calls(definitely read that in a paper that I would like to link here, but I can't seem to find it).
If your data already has a minimum of 10X for each individual, great! Feel free to use"]},{"l":"The solution","p":["One way to get your low-coverage (low depth) data and still call structural variants is to pool samples together, which would effectively boost the depth. By doing this, you will"]},{"l":"Pooling considerations","p":["If pooling samples, you must pool them sensibly and with a biological context to do so. In other words, you don't just pool random samples together to inflate depth. Since haplotag data is just whole genome sequence data plus a little extra information, you should"]}],[{"i":"#","p":["Deciding between using Conda or Containers"]},{"l":"Choosing a software runtime method","p":["There are two ways you can run Harpy, using a container with the necessary software environments in it (the default), or with local conda environments(with the --conda option). If software development and containerization"]},{"i":"tldr","l":"TL;DR"},{"l":"What Harpy Provides","p":["A conda-based installation of Harpy provides only the minimal set of programs Harpy needs to begin a workflow. These include: python 3.12, snakemake-minimal, pandas, and the htslib programs (htslib, samtools, bcftools, tabix)."]},{"l":"How Harpy Provides the Other Stuff","p":["Instead of a monolithic Harpy environment, which would be impossible with the current software dependencies, there are a handful of defined conda environment recipes that Harpy workflows generate. Snakemake will make"]},{"l":"Harpy and Containers","p":["The Harpy team manages a container on Dockerhub called, you guessed it, Harpy, that is synchronously versioned with the Harpy software. In other words, if you're using Harpy v1.4, it will use the container version v1.4. The"]},{"i":"whats-the-catch","l":"What's the Catch?","p":["While local conda environments at runtime or containers might seem like foolproof approaches, there are drawbacks."]},{"i":"conda-caveats","l":"Conda Caveats:"},{"i":"conda-caveat-1-inconsistent","l":"⚠️ Conda Caveat 1: Inconsistent","p":["Despite our and conda's best efforts, sometimes programs just don't install correctly on some systems due to unexpected system (or conda) configurations. This results in frustrating errors where jobs fail because software that is"]},{"i":"conda-caveat-2-troubleshooting","l":"\uD83D\uDCA3 Conda Caveat 2: Troubleshooting","p":["To manually troubleshoot many of the tasks Harpy workflows perform, you may need to jump into one of the local conda environments in .snakemake/conda. That itself isn't terrible, but it's an extra step because you will"]}],[{"i":"#","p":["A gentle introduction to the wild world of filtering SNPs"]},{"l":"Filtering Variants","p":["The discussion around filtering SNPs and indels is massive and many researchers go about it differently, each very opinionated as to why their method is the best.
As a starting point, have a look at how the authors of"]},{"i":"genotype-quality-qual","l":"genotype quality (QUAL)","p":["You will obviously want higher quality genotype calls to remove false positives. The HTSlib guide suggests at least 50(e.g. -i 'QUAL=50'), but we typically filter much higher at"]},{"i":"read-depth-dp","l":"read depth (DP)","p":["Variant sites with too few reads backing up the genotype might be false positives, although this may not hold true for very low-coverage data. Conversely, a maximum cutoff is important because sites with very high read depths (relative to the distribution of read depth)"]},{"i":"minor-allele-frequency-maf","l":"minor allele frequency (MAF)","p":["It's usually advisable to set a minor allele frequency threshold and remove sites below it. The reasoning is that if a MAF is too low, it might be because of incorrectly called genotypes in a very small handful of individuals (e.g. one or two)."]},{"i":"missing-data-f_missing","l":"missing data (F_MISSING)","p":["Missing data is, frankly, not terribly useful. The amount of missing data you're willing to tolerate will depend on your study, but it's common to remove sites with >20% missing data (e.g."]}]] \ No newline at end of file diff --git a/sitemap.xml.gz b/sitemap.xml.gz index 26f1f176c..d32df0c5b 100644 Binary files a/sitemap.xml.gz and b/sitemap.xml.gz differ diff --git a/snakemake/index.html b/snakemake/index.html index 7ed243ec5..1a1dc76fa 100644 --- a/snakemake/index.html +++ b/snakemake/index.html @@ -4,7 +4,7 @@ - + @@ -32,12 +32,12 @@ - + - + - - + +
    @@ -99,7 +99,7 @@
  • - + @@ -118,6 +118,16 @@ Submit an Issue
  • +
  • + + + + + + + Development + +
  • @@ -223,7 +233,7 @@
  • - + @@ -242,6 +252,16 @@ Submit an Issue
  • +
  • + + + + + + + Development + +
  • @@ -267,7 +287,7 @@

# - Snakamake Things + Snakemake Things

    diff --git a/software/index.html b/software/index.html index c745c7422..16ae014e3 100644 --- a/software/index.html +++ b/software/index.html @@ -4,7 +4,7 @@ - + @@ -32,11 +32,11 @@ - + - + - +
    - + Next - Development + Utilities diff --git a/static/lr_conversion.png b/static/lr_conversion.png index 009399333..9ea067213 100644 Binary files a/static/lr_conversion.png and b/static/lr_conversion.png differ diff --git a/static/lr_conversion.svg b/static/lr_conversion.svg index cb64466f0..1f247525e 100644 --- a/static/lr_conversion.svg +++ b/static/lr_conversion.svg @@ -26,11 +26,11 @@ inkscape:pagecheckerboard="0" inkscape:deskcolor="#d1d1d1" inkscape:document-units="mm" - inkscape:zoom="1.2094621" - inkscape:cx="422.50187" - inkscape:cy="212.49116" + inkscape:zoom="1.7104377" + inkscape:cx="258.41339" + inkscape:cy="171.88583" inkscape:window-width="1920" - inkscape:window-height="1052" + inkscape:window-height="1020" inkscape:window-x="0" inkscape:window-y="0" inkscape:window-maximized="1" @@ -126,8 +126,8 @@ inkscape:groupmode="layer" id="layer1" transform="translate(-10.701555,-116.78158)">@seq_id/1 TX:Z: BX:Z:@seq_id/1 OX:Z: BX:Z:@seq_id/2 TX:Z: BX:Z:@seq_id/2 OX:Z: BX:Z: - + @@ -34,12 +34,12 @@ - + - + - - + +
    @@ -101,7 +101,7 @@
  • - + @@ -120,6 +120,16 @@ Submit an Issue
  • +
  • + + + + + + + Development + +
  • @@ -225,7 +235,7 @@
  • - + @@ -244,6 +254,16 @@ Submit an Issue
  • +
  • + + + + + + + Development + +
  • diff --git a/utilities/index.html b/utilities/index.html new file mode 100644 index 000000000..8c531a80f --- /dev/null +++ b/utilities/index.html @@ -0,0 +1,562 @@ + + + + + + + + + + + + + Utilities | Harpy haplotag + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + +
    +
    + + +
    + + +
    + + + +
    + + +
    +
    + +
    + + + + + + + + +
    + +
    +
    +
    +
    + +
    + + +

    + # + Utilities +

    +
    +

    Harpy is the sum of its parts and some of those parts are stand-alone scripts +used by the workflows that are accessible from within the Harpy conda environment. +This page serves to document those scripts, since using them outside of a workflow +might be useful too. You can call up the docstring for any one of these utilities +by calling the program without any arguments.

    + +

    + # + 10xtoHaplotag.py +

    +
    +
    +
    10xtoHaplotag.py -f <forward.fq.gz> -r <reverse.fq.gz> -b <barcodes.txt> -p <prefix> > barcodes.conversion.txt
    +
    +

    Converts 10x linked reads to haplotag linked reads with barcodes in BX:Z and OX:Z header tags.
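For example, converting a pair of 10x FASTQ files using the standard 10X barcode list mentioned in the simulation docs (all file names here are hypothetical):

10xtoHaplotag.py -f sample_R1.fq.gz -r sample_R2.fq.gz -b 4M-with-alts-february-2016.txt -p sample > sample.conversions.txt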

    + +

    + # + assign_mi.py +

    +
    +
    +
    assign_mi.py -c cutoff -o output.bam input.bam
    +
    +

Assign an MI:i (Molecular Identifier) tag to each barcoded +record based on a molecular distance cutoff. Unmapped records +are discarded in the output. Records without a BX:Z tag or +with an invalid barcode (00 as one of its segments) are preserved +but are not assigned an MI:i tag. Input file must be coordinate sorted.
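As a concrete sketch (file names hypothetical), coordinate-sorting first and using the 100,000 bp molecule distance that the alignment workflows default to:

samtools sort -o sample.sorted.bam sample.bam
assign_mi.py -c 100000 -o sample.mi.bam sample.sorted.bam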

    + +

    + # + bx_stats.py +

    +
    +
    +
    bx_stats.py -o output.gz input.bam
    +
    +

Calculates various linked-read molecule metrics from the (coordinate-sorted) input alignment file (see the brief example after this list). +Metrics include (per molecule):

    +
      +
    • number of reads
    • +
    • position start
    • +
    • position end
    • +
    • length of molecule inferred from alignments
    • +
    • total aligned basepairs
    • +
    • total length of inferred inserts
    • +
    • molecule coverage (%) based on aligned bases
    • +
    • molecule coverage (%) based on total inferred insert length
    • +
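A quick way to generate and eyeball the table (hypothetical names; the -o output is gzipped, so zcat works):

bx_stats.py -o sample.bxstats.gz sample.sorted.bam
zcat sample.bxstats.gz | head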
    + +

    + # + check_bam.py +

    +
    +
    +
    check_bam.py input.bam > output.txt
    +
    +

Parses an alignment file to check:

    +
      +
    • if the sample name matches the RG tag
    • +
    • whether BX:Z is the last tag in the record
    • +
    • the counts of: +
        +
      • total alignments
      • +
      • alignments with an MI:i tag
      • +
      • alignments without BX:Z tag
      • +
      • incorrect BX:Z tag
      • +
      +
    • +
    + +

    + # + check_fastq.py +

    +
    +
    +
check_fastq.py input.fastq > output.txt
    +
    +

    Parses a FASTQ file to check if any sequences don't conform to the SAM spec, +whether BX:Z: is the last tag in the record, and the counts of:

    +
      +
    • total reads
    • +
    • reads without BX:Z tag
    • +
    • reads with incorrect BX:Z tag
    • +
    + +

    + # + concatenate_bam.py +

    +
    +
    +
    concatenate_bam.py -o output.bam file_1.bam file_2.bam...file_N.bam
    +# or #
    +concatenate_bam.py -o output.bam -b bam_files.txt
    +
    +

Concatenate records from haplotagged SAM/BAM files while making sure MI:i tags remain unique for every sample. +This is a means of accomplishing the same as samtools cat, except all MI (Molecule Identifier) tags are updated +so individuals don't have overlapping MI tags (which would mess up all the linked-read data). You can either provide +all the files you want to concatenate, or use the -b option with a single file listing the input filenames.
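For example (hypothetical paths), pooling every BAM in a directory via a file list, assuming one filename per line:

ls population_A/*.bam > bam_files.txt
concatenate_bam.py -o population_A.bam -b bam_files.txt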

    + +

    + # + count_bx.py +

    +
    +
    +
    count_bx.py input.fastq > output.txt
    +
    +

    Parses a FASTQ file to count: total sequences, total number of BX tags, +number of valid haplotagging BX tags, number of invalid BX tags, number of +invalid BX tag segments (i.e. A00, C00, B00, D00).

    + +

    + # + depth_windows.py +

    +
    +
    +
    samtools depth -a file.bam | depth_windows.py windowsize > output.txt
    +
    +

    Reads the output of samtools depth -a from stdin and calculates means within windows of a given windowsize.
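For example, mean depth in 10,000 bp windows (file names hypothetical):

samtools depth -a sample.bam | depth_windows.py 10000 > sample.depth.10kb.txt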

    + +

    + # + haplotag_acbd.py +

    +
    +
    +
    haplotag_acbd.py output_directory
    +
    +

    Generates the BC_{ABCD}.txt files necessary to demultiplex Gen I haplotag barcodes into the specified output_directory.

    + +

    + # + infer_sv.py +

    +
    +
    +
    infer_sv.py file.bedpe [-f fail.bedpe] > outfile.bedpe
    +
    +

Creates a column in the NAIBR bedpe output inferring the SV type from the orientation. Removes variants with FAIL flags, +and you can use the optional -f (--fail) argument to output the FAIL variants to a separate file.
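For example (hypothetical names), sending the passing, SV-typed records to stdout and the FAIL records to their own file:

infer_sv.py sample.bedpe -f sample.fail.bedpe > sample.sv.bedpe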

    + +

    + # + make_windows.py +

    +
    +
    +
    make_windows.py -w <window.size> -m <0,1> input.fasta[.fai] > output.bed
    +
    +

Create a BED file of fixed intervals (-w, --window) from a FASTA or fai file (the kind generated with samtools faidx). +Nearly identical to bedtools makewindows, except the intervals are nonoverlapping. The -m (--mode) option specifies +whether indexing starts at 0 or 1.
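For example, 50,000 bp non-overlapping windows with 1-based indexing, building the fai index first (names hypothetical):

samtools faidx genome.fasta
make_windows.py -w 50000 -m 1 genome.fasta.fai > genome.50kb.bed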

    + +

    + # + molecule_coverage.py +

    +
    +
    +
    molecule_coverage.py -f genome.fasta.fai statsfile > output.cov
    +
    +

Using the statsfile generated by bx_stats.py from Harpy, this script calculates "molecular coverage" across the genome. +Molecular coverage is the "effective" alignment coverage if you treat a molecule inferred from linked-read data as +one contiguous alignment, even though the reads that make up that molecule don't cover its entire length. Requires a +FASTA fai index (the kind created with samtools faidx) to know the actual sizes of the contigs.
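A sketch of the intended two-step flow, feeding it the bx_stats.py output (hypothetical names):

bx_stats.py -o sample.bxstats.gz sample.sorted.bam
molecule_coverage.py -f genome.fasta.fai sample.bxstats.gz > sample.cov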

    + +

    + # + parse_phaseblocks.py +

    +
    +
    +
    parse_phaseblocks.py input > output.txt
    +
    +

Parse a phase block file from HapCut2 to pull out summary information.

    + +

    + # + rename_bam +

    +
    +
    +
    rename_bam.py [-d] new_name input.bam
    +
    +

    Rename a sam/bam file and modify the @RG tag of the alignment file to reflect the change for both ID and SM. +This process creates a new file new_name.bam and you may use -d to delete the original file. Requires samtools.
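For example (hypothetical names), renaming a sample and deleting the original file:

rename_bam.py -d sample_027 old_name.bam

This should leave you with sample_027.bam, with the @RG ID and SM fields updated to match.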

    + +

    + # + separate_validbx +

    +
    +
    +
    separate_validbx input.bam > valid.bam 2> invalid.bam
    +
    +

    Split a BAM file with BX tags into 2 files, one with valid ACBD barcodes (stdout), one with invalid ACBD barcodes (stderr).
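If you only want the records with valid barcodes, the invalid stream can simply be discarded (hypothetical names):

separate_validbx sample.bam > sample.valid.bam 2> /dev/null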

    + + + + +
    + +
    + +
    +
    +
    +
      +
    +
    + +
    +
    +
    + + + + + + + +
    + +
    +
    + + + +
    + + +
    + + + + diff --git a/workflows/align/bwa/index.html b/workflows/align/bwa/index.html index 9b2304fc9..1a51c3ef5 100644 --- a/workflows/align/bwa/index.html +++ b/workflows/align/bwa/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + @@ -104,7 +104,7 @@
  • - + @@ -123,6 +123,16 @@ Submit an Issue
  • +
  • + + + + + + + Development + +
  • @@ -228,7 +238,7 @@
  • - + @@ -247,6 +257,16 @@ Submit an Issue
  • +
  • + + + + + + + Development + +
  • @@ -387,7 +407,6 @@

    short name type default -required description @@ -397,15 +416,15 @@

    file/directory paths -‼️ -Files or directories containing input FASTQ files + + required + Files or directories containing input FASTQ files --contigs file path or list - Contigs to plot in the report @@ -413,7 +432,6 @@

    -x string - Additional EMA-align/BWA arguments, in quotes @@ -421,15 +439,15 @@

    -g file path -‼️ -Genome assembly for read mapping + + required + Genome assembly for read mapping --keep-unmapped -u toggle false - Output unmapped sequences too @@ -437,7 +455,6 @@

    -q integer (0-40) 30 - Minimum MQ (SAM mapping quality) to pass filtering @@ -445,7 +462,6 @@

    -d integer 100000 - Base-pair distance threshold to separate molecules diff --git a/workflows/align/ema/index.html b/workflows/align/ema/index.html index 30f149a9e..a8ca538a0 100644 --- a/workflows/align/ema/index.html +++ b/workflows/align/ema/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + @@ -104,7 +104,7 @@
  • - + @@ -123,6 +123,16 @@ Submit an Issue
  • +
  • + + + + + + + Development + +
  • @@ -228,7 +238,7 @@
  • - + @@ -247,6 +257,16 @@ Submit an Issue
  • +
  • + + + + + + + Development + +
  • @@ -405,7 +425,6 @@

    short name type default -required description @@ -415,15 +434,15 @@

    file/directory paths -‼️ -Files or directories containing input FASTQ files + + required + Files or directories containing input FASTQ files --contigs file path or list - Contigs to plot in the report @@ -431,7 +450,6 @@

    -d toggle false - Perform read fragment density optimization @@ -439,7 +457,6 @@

    -e integer (1-1000) 500 - Number of barcode bins for EMA @@ -447,7 +464,6 @@

    -x string - Additional EMA-align arguments, in quotes @@ -455,15 +471,15 @@

    -g file path -‼️ -Genome assembly for read mapping + + required + Genome assembly for read mapping --keep-unmapped -u toggle false - Output unmapped sequences too @@ -471,7 +487,6 @@

    -q integer (0-40) 30 - Minimum MQ (SAM mapping quality) to pass filtering @@ -479,15 +494,15 @@

    -p string haplotag -‼️ -Linked read technology: haplotag or 10x + + required + Linked read technology: haplotag or 10x --whitelist -w file path - Path to barcode whitelist (--platform 10x only) diff --git a/workflows/align/index.html b/workflows/align/index.html index c70437135..9b13a896b 100644 --- a/workflows/align/index.html +++ b/workflows/align/index.html @@ -4,7 +4,7 @@ - + @@ -32,11 +32,11 @@ - + - + - +
    @@ -98,7 +98,7 @@
  • - + @@ -117,6 +117,16 @@ Submit an Issue
  • +
  • + + + + + + + Development + +
  • @@ -222,7 +232,7 @@
  • - + @@ -241,6 +251,16 @@ Submit an Issue
  • +
  • + + + + + + + Development + +
  • diff --git a/workflows/align/strobe/index.html b/workflows/align/strobe/index.html index a547bce73..ff4f06b2c 100644 --- a/workflows/align/strobe/index.html +++ b/workflows/align/strobe/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + @@ -104,7 +104,7 @@
  • - + @@ -123,6 +123,16 @@ Submit an Issue
  • +
  • + + + + + + + Development + +
  • @@ -228,7 +238,7 @@
  • - + @@ -247,6 +257,16 @@ Submit an Issue
  • +
  • + + + + + + + Development + +
  • @@ -387,7 +407,6 @@

    short name type default -required description @@ -397,15 +416,15 @@

    file/directory paths -‼️ -Files or directories containing input FASTQ files + + required + Files or directories containing input FASTQ files --contigs file path or list - Contigs to plot in the report @@ -413,7 +432,6 @@

    -x string - Additional EMA-align/BWA arguments, in quotes @@ -421,15 +439,15 @@

    -g file path -‼️ -Genome assembly for read mapping + + required + Genome assembly for read mapping --keep-unmapped -u toggle false - Output unmapped sequences too @@ -437,7 +455,6 @@

    -d integer (0-40) 30 - Minimum MQ (SAM mapping quality) to pass filtering @@ -445,7 +462,6 @@

    -m integer 100000 - Base-pair distance threshold to separate molecules @@ -453,7 +469,6 @@

    -l choice auto - Average read length for creating index. Options: [auto, 50, 75, 100, 125, 150, 250, 400] diff --git a/workflows/assembly/index.html b/workflows/assembly/index.html index f5f2117d5..22ad2b88a 100644 --- a/workflows/assembly/index.html +++ b/workflows/assembly/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + @@ -102,7 +102,7 @@
  • - + @@ -121,6 +121,16 @@ Submit an Issue
  • +
  • + + + + + + + Development + +
  • @@ -226,7 +236,7 @@
  • - + @@ -245,6 +255,16 @@ Submit an Issue
  • +
  • + + + + + + + Development + +
  • @@ -321,7 +341,6 @@

    argument short name default -required description @@ -330,21 +349,22 @@

    FASTQ_R1 -‼️ -FASTQ file of forward reads + + required + FASTQ file of forward reads FASTQ_R2 -‼️ -FASTQ file of reverse reads + + required + FASTQ file of reverse reads --extra-params -x - spades assembly Additional spades parameters, in quotes @@ -353,7 +373,6 @@

    --kmer-length -k auto - spades assembly Kmer lengths to use for initial spades assembly. They must be odd and <128, separated by commas, and without spaces. (e.g. 13,23,51) @@ -362,7 +381,6 @@

    --max-memory -r 10000 - spades assembly Maximum memory for spades to use, given in megabytes @@ -371,7 +389,6 @@

    --arcs-extra -y - arcs scaffold Additional ARCS parameters, in quotes and option=arg format @@ -380,7 +397,6 @@

    --contig-length -c 500 - arcs scaffold Minimum contig length @@ -389,7 +405,6 @@

    --links -n 5 - arcs scaffold Minimum number of links to compute scaffold @@ -398,7 +413,6 @@

    --min-aligned -a 5 - arcs scaffold Minimum aligned read pairs per barcode @@ -407,7 +421,6 @@

    --min-quality -q 0 - arcs scaffold Minimum mapping quality @@ -416,7 +429,6 @@

    --mismatch -m 5 - arcs scaffold Maximum number of mismatches @@ -425,7 +437,6 @@

    --molecule-distance -d 50000 - arcs scaffold Distance cutoff to split molecules (bp) @@ -434,7 +445,6 @@

    --molecule-length -l 2000 - arcs scaffold Minimum molecule length (bp) @@ -443,7 +453,6 @@

    --seq-identity -i 98 - arcs scaffold Minimum sequence identity @@ -452,7 +461,6 @@

    --span -s 20 - arcs scaffold Minimum number of spanning molecules to be considered assembled @@ -461,7 +469,6 @@

    --organism-type -u eukaryote - report Organism type for assembly report: eukaryote,prokaryote, or fungus diff --git a/workflows/deconvolve/index.html b/workflows/deconvolve/index.html index c9499166f..b0bc32613 100644 --- a/workflows/deconvolve/index.html +++ b/workflows/deconvolve/index.html @@ -4,7 +4,7 @@ - + @@ -32,12 +32,12 @@ - + - + - - + + @@ -101,7 +101,7 @@
  • - + @@ -120,6 +120,16 @@ Submit an Issue
  • +
  • + + + + + + + Development + +
  • @@ -225,7 +235,7 @@
  • - + @@ -244,6 +254,16 @@ Submit an Issue
  • +
  • + + + + + + + Development + +
  • @@ -419,7 +439,6 @@

    argument short name default -required description @@ -428,35 +447,32 @@

    INPUTS -‼️ -Files or directories containing input FASTQ files + + required + Files or directories containing input FASTQ files --density -d 3 - On average, \frac{1}{2^d} kmers are indexed --dropout -a 0 - Minimum cloud size to deconvolve --kmer-length -k 21 - Size of k-mers to search for similarities --window-size -w 40 - Size of window guaranteed to contain at least one kmer diff --git a/workflows/demultiplex/index.html b/workflows/demultiplex/index.html index 4ab6967d4..ca617df4f 100644 --- a/workflows/demultiplex/index.html +++ b/workflows/demultiplex/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + @@ -102,7 +102,7 @@
  • - + @@ -121,6 +121,16 @@ Submit an Issue
  • +
  • + + + + + + + Development + +
  • @@ -226,7 +236,7 @@
  • - + @@ -245,6 +255,16 @@ Submit an Issue
  • +
  • + + + + + + + Development + +
  • @@ -312,7 +332,6 @@

    argument short name -required description @@ -320,38 +339,44 @@

    METHOD -‼️ -Haplotag technology of the sequences [gen1] + + required + Haplotag technology of the sequences [gen1] R1_FQ -‼️ -The forward multiplexed FASTQ file + + required + The forward multiplexed FASTQ file R2_FQ -‼️ -The reverse multiplexed FASTQ file + + required + The reverse multiplexed FASTQ file I1_FQ -‼️ -The forward FASTQ index file provided by the sequencing facility + + required + The forward FASTQ index file provided by the sequencing facility I2_FQ -‼️ -The reverse FASTQ index file provided by the sequencing facility + + required + The reverse FASTQ index file provided by the sequencing facility --schema -s -‼️ -Tab-delimited file of sample<tab>barcode + + required + Tab-delimited file of sample<tab>barcode diff --git a/workflows/impute/index.html b/workflows/impute/index.html index 76e0a2348..20e3a10ae 100644 --- a/workflows/impute/index.html +++ b/workflows/impute/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + @@ -104,7 +104,7 @@
  • - + @@ -123,6 +123,16 @@ Submit an Issue
  • +
  • + + + + + + + Development + +
  • @@ -228,7 +238,7 @@
  • - + @@ -247,6 +257,16 @@ Submit an Issue
  • +
  • + + + + + + + Development + +
  • @@ -403,7 +423,6 @@

    argument short name default -required description @@ -412,36 +431,37 @@

INPUTS -‼️ -Files or directories containing input BAM files + + required + Files or directories containing input BAM files  --extra-params -x - Extra arguments to add to the STITCH R function, provided in quotes and R syntax  --parameters -p -‼️ -STITCH parameter file (tab-delimited) + + required + STITCH parameter file (tab-delimited)  --vcf -v -‼️ -Path to VCF/BCF file + + required + Path to VCF/BCF file  --vcf-samples - -Use samples present in vcf file for imputation rather than those found the directory +Use samples present in vcf file for imputation rather than those found in the directory (see below)
  • - + @@ -121,6 +121,16 @@ Submit an Issue
  • +
  • + + + + + + + Development + +
  • @@ -226,7 +236,7 @@
  • - + @@ -245,6 +255,16 @@ Submit an Issue
  • +
  • + + + + + + + Development + +
  • @@ -319,7 +339,6 @@

    argument short name default -required description @@ -328,56 +347,54 @@

    FASTQ_R1 -‼️ -deconvolved FASTQ file of forward reads + + required + deconvolved FASTQ file of forward reads FASTQ_R2 -‼️ -deconvolved FASTQ file of reverse reads + + required + deconvolved FASTQ file of reverse reads --bx-tag -b BX -‼️ -Which sequence header tag encodes the linked-read barcode (BX for BX:Z or BC for BC:Z) + + required + Which sequence header tag encodes the linked-read barcode (BX for BX:Z or BC for BC:Z) --extra-params -x - Additional spades parameters, in quotes --ignore-bx - Ignore linked-read info for initial spades assembly --kmer-length -k auto - Kmer lengths to use for initial spades assembly. They must be odd and <128, separated by commas, and without spaces. (e.g. 13,23,51) --max-memory -r 10000 - Maximum memory for spades to use, given in megabytes --organism-type -u eukaryote - Organism type for assembly report. Options: eukaryote,prokaryote,fungus diff --git a/workflows/other/index.html b/workflows/other/index.html index 14490ddb8..b2424c473 100644 --- a/workflows/other/index.html +++ b/workflows/other/index.html @@ -4,7 +4,7 @@ - + @@ -32,12 +32,12 @@ - + - + - - + +
    @@ -99,7 +99,7 @@
  • - + @@ -118,6 +118,16 @@ Submit an Issue
  • +
  • + + + + + + + Development + +
  • @@ -223,7 +233,7 @@
  • - + @@ -242,6 +252,16 @@ Submit an Issue
  • +
  • + + + + + + + Development + +
  • diff --git a/workflows/phase/index.html b/workflows/phase/index.html index 74150059f..d2fa7e4f6 100644 --- a/workflows/phase/index.html +++ b/workflows/phase/index.html @@ -4,7 +4,7 @@ - + @@ -34,12 +34,12 @@ - + - + - - + + @@ -102,7 +102,7 @@
  • - + @@ -121,6 +121,16 @@ Submit an Issue
  • +
  • + + + + + + + Development + +
  • @@ -226,7 +236,7 @@
  • - + @@ -245,6 +255,16 @@ Submit an Issue
  • +
  • + + + + + + + Development + +
  • @@ -364,7 +384,6 @@

    argument short name default -required description @@ -373,63 +392,58 @@

INPUTS -‼️ -Files or directories containing input BAM files + + required + Files or directories containing input BAM files  --contigs - Contigs to plot in the report  --extra-params -x - Additional Hapcut2 arguments, in quotes  --genome -g - Path to genome if wanting to also use reads spanning indels  --ignore-bx -b - Ignore haplotag barcodes for phasing  --molecule-distance -d 100000 - Base-pair distance threshold to separate molecules  --prune-threshold -p 7 - PHRED-scale (%) threshold for pruning low-confidence SNPs  --vcf -v -‼️ -Path to BCF/VCF file + + required + Path to BCF/VCF file  --vcf-samples - Use samples present in vcf file for imputation rather than those found in the directory
    @@ -101,7 +101,7 @@
diff --git a/workflows/preflight/index.html index e3c375e7a..6696f6cb5 100644 --- a/workflows/preflight/index.html +++ b/workflows/preflight/index.html

    argument short name default -required description @@ -412,8 +431,9 @@

INPUTS -‼️ -Files or directories containing input fastq or bam files + + required + Files or directories containing input FASTQ or BAM files
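A minimal sketch with a placeholder directory; whether a fastq/bam mode must be named in the invocation is an assumption to verify against the preflight help text:

```bash
# Hypothetical sketch: run the preflight format checks on a directory of
# FASTQ files; Raw/ is a placeholder.
harpy preflight fastq Raw/
```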
diff --git a/workflows/qc/index.html index 31de643b8..f3f91319b 100644 --- a/workflows/qc/index.html +++ b/workflows/qc/index.html

    argument short name default -required description @@ -380,28 +399,26 @@

INPUTS -‼️ -Files or directories containing input FASTQ files + + required + Files or directories containing input FASTQ files
--deconvolve -c - Resolve barcode clashes between reads from different molecules
--deconvolve-params -p 21,40,3,0 - Accepts the QuickDeconvolution parameters for k, w, d, a, in that order
--deduplicate -d - Identify and remove PCR duplicates recommended

--extra-params -x - Additional fastp arguments, in quotes
--min-length -n 30 - Discard reads shorter than this length
--max-length -m 150 - Maximum length to trim sequences down to
--trim-adapters -a - Detect and remove adapter sequences recommended
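A minimal QC sketch combining the two options the table marks as recommended, with a placeholder input directory:

```bash
# Hypothetical sketch: trim adapters and remove PCR duplicates, discarding
# reads shorter than 30 bp; Raw/ is a placeholder input directory.
harpy qc --trim-adapters --deduplicate --min-length 30 Raw/
```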
    @@ -98,7 +98,7 @@
diff --git a/workflows/simulate/simulate-linkedreads/index.html index c93542dac..cf4e57e19 100644 --- a/workflows/simulate/simulate-linkedreads/index.html +++ b/workflows/simulate/simulate-linkedreads/index.html

The original LRSIM is a lengthy Perl script that, like Harpy, outsources to various other programs (SURVIVOR, DWGSIM, samtools, msort) and acts as a workflow through them. The Harpy version of LRSIM keeps only the novel LRSIM code that creates linked reads from the reads simulated by DWGSIM. The rest of LRSIM's components are reincorporated into the Snakemake workflow governing the simulate linkedreads module, minus the SURVIVOR part, since the simulate {snpindel,...} modules are used for that purpose.

      -
• dependencies are expected to be on the PATH, not hardcoded to the folder LRSIM is running from
• -r parameter changed to a folder prefix, since Harpy uses -g for the haplotypes
• outputs are coded a little differently for flexibility (and use the -r parameter for some parts)
• SURVIVOR variant simulation functionality removed entirely
• DWGSIM, samtools, msort, and extractReads functionality moved into the Harpy workflow
• uses a newer version of DWGSIM

    argument short name default -required description @@ -371,70 +390,64 @@

HAP1_GENOME -‼️ -Haplotype 1 of the diploid genome to simulate reads + + required + Haplotype 1 of the diploid genome to simulate reads
HAP2_GENOME -‼️ -Haplotype 1 of the diploid genome to simulate reads + + required + Haplotype 2 of the diploid genome to simulate reads
--barcodes -b 10X barcodes - File of linked-read barcodes to add to reads
--distance-sd -s 15 - Standard deviation of read-pair distance
--molecule-length -l 100 - Mean molecule length (kbp)
--molecules-per -m 10 - Average number of molecules per partition
--mutation-rate -r 0.001 - Random mutation rate for simulating reads (0 - 1.0)
--outer-distance -d 350 - Outer distance between paired-end reads (bp)
--partitions -p 1500 - Number (in thousands) of partitions/beads to generate
--read-pairs -n 600 - Number (in millions) of read pairs to simulate
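A minimal simulation sketch, assuming the corrected --partitions spelling and using placeholder haplotype FASTAs:

```bash
# Hypothetical sketch: simulate linked reads from a diploid genome supplied
# as two haplotype FASTAs; values mirror the defaults in the table above.
harpy simulate linkedreads --molecules-per 10 --partitions 1500 hap1.fasta hap2.fasta
```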

    entire barcode due to a segment failing to be associated with a beadtag segment. In the simulated data, since 10X barcodes don't feature segments, failure to associate the first 16 bases of read 1 with barcodes provided to --barcodes will appear as BX:Z:A00C00B00D00. The original 10X barcode (or first 16 bases of read 1) -will be removed from the sequence and stored in the TX:Z sequence header tag, e.g. TX:Z:ATATGTACTCATACCA. +will be removed from the sequence and stored in the OX:Z sequence header tag, e.g. OX:Z:ATATGTACTCATACCA. The paired reverse read will also have these tags. The diagram below attempts to simplify this visually.
10X linked read barcode conversion into AxxCxxBxxDxx haplotag barcode format
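As a rough illustration of the conversion described above (not Harpy's actual implementation), the sketch below moves the leading 16 bp of each forward read into an OX:Z header tag; assigning the BX:Z haplotag by looking the barcode up in the --barcodes list is omitted:

```bash
# Illustration only: strip the first 16 bp of every R1 record and carry it
# in an OX:Z header tag, trimming the quality string to match.
awk 'NR % 4 == 1 { hdr = $0 }
     NR % 4 == 2 { print hdr " OX:Z:" substr($0, 1, 16); print substr($0, 17) }
     NR % 4 == 3 { print }
     NR % 4 == 0 { print substr($0, 17) }' sim_R1.fq > tagged_R1.fq
```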
    @@ -99,7 +99,7 @@
diff --git a/workflows/simulate/simulate-variants/index.html index e11d274f8..8b7f10491 100644 --- a/workflows/simulate/simulate-variants/index.html +++ b/workflows/simulate/simulate-variants/index.html

argument -short name -required description
INPUT_GENOME - -‼️ -The haploid genome to simulate variants onto + + + required + The haploid genome to simulate variants onto
--centromeres --c - +-c GFF3 file of centromeres to avoid
--exclude-chr --e - +-e Text file of chromosomes to avoid, one per line
--genes --g - +-g GFF3 file of genes to avoid simulating over (see snpindel for caveat)
--heterozygosity --z - +-z Proportion of simulated variants to make heterozygous (default: 0)
--only-vcf - When used with --heterozygosity, will create the diploid VCFs but will not simulate a diploid genome
--prefix - Naming prefix for output files (default: sim.{module_name})
--randomseed - Random seed for simulation
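A minimal sketch, assuming the snpindel submodule referenced elsewhere on this page and a placeholder genome:

```bash
# Hypothetical sketch: simulate SNPs/indels onto a haploid genome and make
# half of the simulated variants heterozygous.
harpy simulate snpindel --heterozygosity 0.5 genome.fasta
```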
diff --git a/workflows/snp/index.html index 3cc4cb37a..c9a85a8c1 100644 --- a/workflows/snp/index.html +++ b/workflows/snp/index.html

    argument short name default -required description @@ -414,42 +433,40 @@

INPUTS -‼️ -Files or directories containing input BAM files + + required + Files or directories containing input BAM files
--extra-params -x - Additional mpileup/freebayes arguments, in quotes
--genome -g -‼️ -Genome assembly for variant calling + + required + Genome assembly for variant calling
--ploidy -n 2 - Ploidy of samples
--populations -p - Tab-delimited file of sample<tab>group
--regions -r 50000 - Regions to call variants on (see below)
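A minimal sketch using the mpileup caller mentioned above, with placeholder file names:

```bash
# Hypothetical sketch: call variants in 50 kbp windows from the alignments
# in Align/ against a placeholder genome assembly.
harpy snp mpileup --genome genome.fasta --regions 50000 Align/
```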
    @@ -98,7 +98,7 @@
diff --git a/workflows/sv/leviathan/index.html index ca0c71741..6591b8dbd 100644 --- a/workflows/sv/leviathan/index.html +++ b/workflows/sv/leviathan/index.html

    argument short name default -required description @@ -419,56 +438,52 @@

INPUTS -‼️ -Files or directories containing input BAM files + + required + Files or directories containing input BAM files
--contigs - Contigs to plot in the report
--extra-params -x - Additional leviathan arguments, in quotes
--genome -g -‼️ -Genome assembly that was used to create alignments + + required + Genome assembly that was used to create alignments
--iterations -i 50 - Number of iterations to perform through the index (reduces memory)
--min-barcodes -b 2 - Minimum number of barcode overlaps supporting a candidate SV
--min-sv -m 1000 - Minimum size of SV to detect
--populations -p - Tab-delimited file of sample<tab>group
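A minimal sketch with placeholder file names; --populations is optional and is shown here to illustrate pooled calling:

```bash
# Hypothetical sketch: call SVs with LEVIATHAN, pooling samples by the
# groups assigned in samples.groups (a placeholder file).
harpy sv leviathan --genome genome.fasta --populations samples.groups Align/
```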
diff --git a/workflows/sv/naibr/index.html index 8bdc0fe3d..86635cf8e 100644 --- a/workflows/sv/naibr/index.html +++ b/workflows/sv/naibr/index.html

    argument short name default -required description @@ -406,71 +425,67 @@

INPUTS -‼️ -Files or directories containing input BAM files + + required + Files or directories containing input BAM files
--contigs - Contigs to plot in the report
--extra-params -x - Additional naibr arguments, in quotes
--genome -g -‼️ -Genome assembly for phasing bam files + + required + Genome assembly for phasing bam files
--min-barcodes -b 2 - Minimum number of barcode overlaps supporting candidate SV
--min-quality -q 30 - Minimum MQ (SAM mapping quality) to pass filtering
--min-sv -n 1000 - Minimum size of SV to detect
--molecule-distance -m 100000 - Base-pair distance threshold to separate molecules
--populations -p - Tab-delimited file of sample<tab>group
--vcf -v - + conditionally required + Phased vcf file for phasing bam files (see below)
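A minimal sketch of the conditional requirement noted above, with placeholder file names: when the alignments are not already phased, a phased VCF is supplied so the workflow can phase the BAMs first:

```bash
# Hypothetical sketch: call SVs with NAIBR from unphased alignments,
# letting the workflow phase the BAMs with a phased VCF first.
harpy sv naibr --genome genome.fasta --vcf phased.bcf Align/
```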