From 180d0e23d9dac5ca61d5bade3da404a0ac0b66bf Mon Sep 17 00:00:00 2001 From: fenderglass Date: Tue, 8 Aug 2017 15:26:09 -0700 Subject: [PATCH] docs update --- README.md | 5 ++-- abruijn/main.py | 7 ++++-- docs/INSTALL.md | 3 ++- docs/NEWS.md | 5 ++-- docs/USAGE.md | 65 +++++++++++++++++++++++++++---------------------- 5 files changed, 49 insertions(+), 36 deletions(-) diff --git a/README.md b/README.md index 0d9712bc4..8e2957806 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ polisher module, which produces assembly of high nucleotide-level quality. Since the version 2.0, ABruijn performs additional repeat analysis step, which improves the structural accuracy of the resulting sequence. -The algorithm also produces the graph representation of the final assembly. +The algorithm also produces a graph representation of the final assembly. ABruijn has moderate memory requirements and is designed to run on a single node. Typically, assembly of a bacteria with 50x coverage takes less than half an hour @@ -30,7 +30,7 @@ See the *docs/USAGE.md* file. 
Publications ------------ -Yu Lin, Jeffrey Yuan, Mikhail Kolmogorov, Max W Shen, Pavel Pevzner, +Yu Lin, Jeffrey Yuan, Mikhail Kolmogorov, Max W Shen, Mark Chaisson and Pavel Pevzner, "Assembly of Long Error-Prone Reads Using de Bruijn Graphs", PNAS 2016 @@ -39,6 +39,7 @@ Third-party ABruijn package includes some third-party software: * libcuckoo [http://github.com/efficient/libcuckoo] +* BLASR [https://github.com/PacificBiosciences/blasr] License diff --git a/abruijn/main.py b/abruijn/main.py index 6f2656219..1ea0079e4 100755 --- a/abruijn/main.py +++ b/abruijn/main.py @@ -280,13 +280,16 @@ def _enable_logging(log_file, debug, overwrite): def main(): - def check_int_range(value, min_val, max_val): + def check_int_range(value, min_val, max_val, require_odd=False): ival = int(value) if ival < min_val or ival > max_val: raise argparse.ArgumentTypeError("value should be in " "range [{0}, {1}]".format(min_val, max_val)) + if require_odd and ival % 2 == 0: + raise argparse.ArgumentTypeError("should be an odd number") return ival + parser = argparse.ArgumentParser(description="ABruijn: assembly of long and" " error-prone reads") @@ -316,7 +319,7 @@ def check_int_range(value, min_val, max_val): choices=["pacbio", "nano", "pacbio_hi_err"], help="sequencing platform (default: pacbio)") parser.add_argument("-k", "--kmer-size", dest="kmer_size", - type=lambda v: check_int_range(v, 10, 32), + type=lambda v: check_int_range(v, 11, 31, require_odd=True), default=15, help="kmer size (default: 15)") parser.add_argument("-o", "--min-overlap", dest="min_overlap", type=lambda v: check_int_range(v, 2000, 10000), diff --git a/docs/INSTALL.md b/docs/INSTALL.md index 32165f405..2de6e6ebd 100644 --- a/docs/INSTALL.md +++ b/docs/INSTALL.md @@ -42,4 +42,5 @@ Additonally, you may install the package for the better OS integration: python setup.py install Alternatively, you can perform local user installation by adding '--user' or '--prefix' -options to the previous command. 
+options to the previous command. The system installation does not include BLASR, which +should be installed separately. diff --git a/docs/NEWS.md b/docs/NEWS.md index 3abbea4c6..00edfc994 100644 --- a/docs/NEWS.md +++ b/docs/NEWS.md @@ -1,6 +1,7 @@ ABruijn 2.0b (25 Jul 2017) -======================== +========================== + * A new repeat graph analysis module for more complete and accurate assembly -* ABruijn now outputs repeat graphs of the assemblies +* ABruijn now outputs a graph representation of the final assembly * Significant improvements in performance and reduced memory footprint * Various bugfixes diff --git a/docs/USAGE.md b/docs/USAGE.md index cbb107ce8..54fa353b1 100644 --- a/docs/USAGE.md +++ b/docs/USAGE.md @@ -24,7 +24,7 @@ Quick usage -t THREADS, --threads THREADS number of parallel threads (default: 1) -i NUM_ITERS, --iterations NUM_ITERS - number of polishing iterations (default: 2) + number of polishing iterations (default: 1) -p {pacbio,nano,pacbio_hi_err}, --platform {pacbio,nano,pacbio_hi_err} sequencing platform (default: pacbio) -k KMER_SIZE, --kmer-size KMER_SIZE @@ -77,8 +77,8 @@ FASTA format using the corresponding official tools. ABruijn was tested on the newest P6-C4 chemistry data with error rates 11-15%. Typically, a bacterial WGS project with 20x-30x+ coverage can be assembled into a single, structurally concordant contig for each chromosome. However, to get the best -nucleotide-level quaity, you might need deeper coverage. For a 55x E. coli dataset, -ABruijn makes roughly 20 errors (single nucleotide insertions/deletions). +nucleotide-level quality, you might need deeper coverage. For a 50x E. coli dataset, +ABruijn makes roughly 30 errors (single nucleotide insertions/deletions). Below are empirical error estimates for E. Coli assemblies with lower coverage: cov. errors @@ -90,8 +90,8 @@ Below are empirical error estimates for E. 
Coli assemblies with lower coverage: 25x 687 If the coverage of the bacterial dataset is significantly higher than 100x, you -might consider reducing the coverage by filtering out shorter reads - this -may reduce the running time and memory footprint without affecting the quality. +might consider decreasing the coverage by filtering out shorter reads - this +should reduce the running time and memory footprint without affecting the quality. However, for some complicated genomes (such as those enriched with mosaic tandem repeats), incorporating all available reads may be preferred for obtaining a more accurate structural assembly. @@ -101,9 +101,9 @@ incorporating all available reads may be preferred for obtaining a more accurate We performed our benchmarks with ONT 2D pass reads with error rates 13-19%. Due to the increased error rate, you might need deeper coverage to get a complete chromosome assembly (60x as in the E. coli example above). For low coverage datasets -(<30x), you might need to adjust some parameters (as described below) to get complete chromosomes. -Due to the biased error pattern, per-nucleotide accuracy is usually lower for ONT data than with -PacBio data, especially in homopolymer regions. +(<30x) or datasets with shorter read length, you might need to adjust some parameters +(as described below) to get complete chromosomes. Due to the biased error pattern, +per-nucleotide accuracy is usually lower for ONT data than with PacBio data, especially in homopolymer regions. Input Data Preparation ---------------------- @@ -135,7 +135,7 @@ so ABruijn can account for their different error patterns. ### Number of polishing iterations ABruijn first constructs a draft assembly of the genome, which is a -concatenation of a collection of raw read segments. Then, the draft assembly +concatenation of a collection of raw read segments. In the end, the draft assembly is polished into a high quality sequence. By default, ABruijn runs one polishing iteration. 
The number could be increased, which might correct a small number of additional errors (due to improvements on how reads may align to the corrected assembly; @@ -149,20 +149,19 @@ prematurely. The assembly will continue from the last previously completed step. ### Minimum overlap length -This sets a minimum overlap length for two reads to be considered truly overlapping. -Since the algithm is based on approximate overlaps (without alignment), we require -relatively long overlaps (5000 by default), which is suitable for most datasets -with coverage 30x+. However, you may decrease this parameter for better contiguity -on low-coverage datasets. You may also increase it to account for the presence of -a large number of long repetitive elements. +This sets a minimum overlap length for two reads to be considered overlapping. +Since the algorithm is based on approximate overlaps (without computing exact alignment), +we require relatively long overlaps (5000 by default), which is suitable for most datasets +with relatively good read length and coverage. However, you may decrease this parameter for better contiguity +on low-coverage datasets or datasets with shorter mean read length (such as those produced with older +PacBio chemistry). ### Kmer size This parameter controls the size of the kmers used to construct the ABruijn graph. The default kmer size (15) is suitable for most genomes under several hundreds of megabytes in size. You might want to increase it -a bit (17 or 19) for larger genomes, which will also require more memory -for processing. +a bit (17 or 19) for larger genomes (500 Mb+). The kmer size should be an odd number. ### Minimum / maximum kmer frequency @@ -175,6 +174,7 @@ and sequencing platform. We recommend that it be chosen automatically by the assembler. + Running time and memory requirements ------------------------------------ @@ -184,18 +184,16 @@ can be assembled within a day on a machine with 128Gb of memory and 20 CPUs. 
The amount of memory required scales linearly with the genome size and read coverage. A rough formula for estimating memory requirements is: -1 Gb of RAM = 1 Mb of genome x coverage / 50 +1 Gb of RAM = 1 Mb of genome x coverage / 100 For example, an assembly of 500 Mb genome with 50x coverage would require -approximately 500 Gb of memory. Below are running times and memory footprints +approximately 250 Gb of memory. Below are running times and memory footprints for different datasets. - Genome Size Coverage Wall_clock CPU_time RAM - E. Coli 4. 6Mb 50 44m 2h40m 0.5 Gb - X. Oryzae 5.1 Mb 140 2h55m 10h 5 Gb - S. Cerevisiae 12.2 Mb 120 4h50m 19h20m 7 Gb - B. Neritina* 200 Mb 30 48h5m 1400h 70 Gb - -* B. Neretina assembly also included symbiotic bacteria genomes. + Genome Size Coverage Wall_clock CPU_time RAM + E. Coli 4.6 Mb 50 25m 2h 0.5 Gb + X. Oryzae 5.1 Mb 140 1h30m 10h 5 Gb + S. Cerevisiae 12.2 Mb 120 1h50m 10h20m 7 Gb + B. Neritina meta 200 Mb 30 36h5m 600h 70 Gb Algorithm description @@ -211,10 +209,19 @@ for more detailed information. The assembly pipeline is organized as follows: * Contig assembly by read extension The resulting contig assembly is now simply a concatenation of read parts -and is error-prone. The next pipeline steps are aimed at polishing this -draft assembly into a high-quality one. +and is error-prone. ABruijn then aligns the reads to the draft contigs and +calls a rough consensus. 
Afterwards, the algorithm performs additional repeat analysis +as follows: + +* Repeat graph is reconstructed from the assembled sequence +* In this graph all repeats longer than minimum overlap are collapsed +* The algorithm resolves repeats using the read information and graph structure +* The unbranching paths in the graph are output as contigs + +Finally, ABruijn performs polishing of the resulting assembly +to correct the remaining errors: -* Alignment of all reads to the draft assembly using BLASR +* Alignment of all reads to the current assembly using BLASR * Selection of solid regions * Partition the total alignment of all reads into mini-alignments (bubbles) * Error correction of each bubble using a maximum likelihood approach