Skip to content

Commit

Permalink
update for r350
Browse files Browse the repository at this point in the history
  • Loading branch information
chhylp123 committed Jul 26, 2021
1 parent b14f894 commit 321acb2
Show file tree
Hide file tree
Showing 12 changed files with 241 additions and 130 deletions.
12 changes: 6 additions & 6 deletions CommandLines.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,16 +66,16 @@ void Print_H(hifiasm_opt_t* asm_opt)
fprintf(stderr, " -D FLOAT drop k-mers occurring >FLOAT*coverage times [%.1f]\n", asm_opt->high_factor);
fprintf(stderr, " -N INT consider up to max(-D*coverage,-N) overlaps for each oriented read [%d]\n", asm_opt->max_n_chain);
fprintf(stderr, " -r INT round of correction [%d]\n", asm_opt->number_of_round);
fprintf(stderr, " -z INT length of adapters that should be removed [%d]\n", asm_opt->adapterLen);
fprintf(stderr, " Assembly:\n");
fprintf(stderr, " -a INT round of assembly cleaning [%d]\n", asm_opt->clean_round);
fprintf(stderr, " -z INT length of adapters that should be removed [%d]\n", asm_opt->adapterLen);
fprintf(stderr, " -m INT pop bubbles of <INT in size in contig graphs [%lld]\n", asm_opt->large_pop_bubble_size);
fprintf(stderr, " -p INT pop bubbles of <INT in size in unitig graphs [%lld]\n", asm_opt->small_pop_bubble_size);
fprintf(stderr, " -n INT remove tip unitigs composed of <=INT reads [%d]\n", asm_opt->max_short_tip);
fprintf(stderr, " -x FLOAT max overlap drop ratio [%.2g]\n", asm_opt->max_drop_rate);
fprintf(stderr, " -y FLOAT min overlap drop ratio [%.2g]\n", asm_opt->min_drop_rate);
fprintf(stderr, " -i ignore saved read correction and overlaps\n");
fprintf(stderr, " -u disable post join contigs step which may improve N50\n");
fprintf(stderr, " -u disable post-join step for contigs which may improve N50\n");
fprintf(stderr, " --hom-cov INT\n");
fprintf(stderr, " homozygous read coverage [auto]\n");
fprintf(stderr, " --lowQ INT\n");
Expand All @@ -95,12 +95,12 @@ void Print_H(hifiasm_opt_t* asm_opt)
fprintf(stderr, " Trio-partition:\n");
fprintf(stderr, " -1 FILE hap1/paternal k-mer dump generated by \"yak count\" []\n");
fprintf(stderr, " -2 FILE hap2/maternal k-mer dump generated by \"yak count\" []\n");
fprintf(stderr, " -3 FILE list of hap1/paternal read names []\n");
fprintf(stderr, " -4 FILE list of hap2/maternal read names []\n");
fprintf(stderr, " -c INT lower bound of the binned k-mer's frequency [%d]\n", asm_opt->min_cnt);
fprintf(stderr, " -d INT upper bound of the binned k-mer's frequency [%d]\n", asm_opt->mid_cnt);
fprintf(stderr, " -3 FILE list of hap1/paternal read names []\n");
fprintf(stderr, " -4 FILE list of hap2/maternal read names []\n");
fprintf(stderr, " --t-occ INT\n");
fprintf(stderr, " force remove unitigs with >INT unexpected haplotype-specific reads;\n");
fprintf(stderr, " forcedly remove unitigs with >INT unexpected haplotype-specific reads;\n");
fprintf(stderr, " ignore graph topology; [%d]\n", asm_opt->trio_flag_occ_thres);

fprintf(stderr, " Purge-dups:\n");
Expand Down Expand Up @@ -131,7 +131,7 @@ void Print_H(hifiasm_opt_t* asm_opt)
fprintf(stderr, " detect misjoined unitigs of >=INT in size; 0 to disable [%lu]\n", asm_opt->misjoin_len);

fprintf(stderr, "Example: ./hifiasm -o NA12878.asm -t 32 NA12878.fq.gz\n");
fprintf(stderr, "See `man ./hifiasm.1' for detailed description of these command-line options.\n");
fprintf(stderr, "See `https://hifiasm.readthedocs.io/en/latest/' or `man ./hifiasm.1' for complete documentation.\n");
}

void init_opt(hifiasm_opt_t* asm_opt)
Expand Down
2 changes: 1 addition & 1 deletion CommandLines.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#include <pthread.h>
#include <stdint.h>

#define HA_VERSION "0.15.4-r349"
#define HA_VERSION "0.15.5-r350"

#define VERBOSE 0

Expand Down
9 changes: 4 additions & 5 deletions Purge_Dups.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5267,11 +5267,7 @@ uint32_t just_coverage, hap_cov_t *cov, uint32_t collect_p_trans, uint32_t colle
hap_alignment_struct_pip hap_buf;
long long k_mer_only, coverage_only;

if(asm_opt.pur_global_coverage != -1)
{
hap_buf.cov_threshold = asm_opt.pur_global_coverage;
}
else if(asm_opt.hom_global_coverage != -1)
if(asm_opt.hom_global_coverage != -1)
{
hap_buf.cov_threshold = (asm_opt.hom_global_coverage_set?
(((double)asm_opt.hom_global_coverage)*((double)HOM_PEAK_RATE)):(asm_opt.hom_global_coverage));
Expand Down Expand Up @@ -5325,6 +5321,9 @@ uint32_t just_coverage, hap_cov_t *cov, uint32_t collect_p_trans, uint32_t colle
}
}
if(asm_opt.hom_global_coverage == -1) asm_opt.hom_global_coverage = hap_buf.cov_threshold;
if(asm_opt.pur_global_coverage != -1) hap_buf.cov_threshold = asm_opt.pur_global_coverage;
fprintf(stderr, "[M::%s] homozygous read coverage threshold: %d\n", __func__, asm_opt.hom_global_coverage_set?
asm_opt.hom_global_coverage:(int)(((double)asm_opt.hom_global_coverage)/((double)HOM_PEAK_RATE)));
fprintf(stderr, "[M::%s] purge duplication coverage threshold: %lld\n", __func__, hap_buf.cov_threshold);
if(just_coverage) goto end_coverage;

Expand Down
42 changes: 11 additions & 31 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ cd hifiasm && make
# Run on test data (use -f0 for small datasets)
wget https://github.com/chhylp123/hifiasm/releases/download/v0.7/chr11-2M.fa.gz
./hifiasm -o test -t4 -f0 chr11-2M.fa.gz 2> test.log
awk '/^S/{print ">"$2;print $3}' test.p_ctg.gfa > test.p_ctg.fa # get primary contigs in FASTA
awk '/^S/{print ">"$2;print $3}' test.bp.p_ctg.gfa > test.p_ctg.fa # get primary contigs in FASTA

# Assemble inbred/homozygous genomes (-l0 disables duplication purging)
hifiasm -o CHM13.asm -t32 -l0 CHM13-HiFi.fa.gz 2> CHM13.asm.log
Expand Down Expand Up @@ -81,8 +81,8 @@ hifiasm -o NA12878.asm -t 32 NA12878.fq.gz
```
where `NA12878.fq.gz` provides the input reads, `-t` sets the number of CPUs in
use and `-o` specifies the prefix of output files. For this example, the
primary contigs are written to `NA12878.asm.bp.p_ctg.gfa` and alternate contigs to
`NA12878.asm.bp.a_ctg.gfa`. Since v0.15, hifiasm also produces two sets of
primary contigs are written to `NA12878.asm.bp.p_ctg.gfa`.
Since v0.15, hifiasm also produces two sets of
partially phased contigs at `NA12878.asm.bp.hap?.p_ctg.gfa`. This pair of files
can be thought to represent the two haplotypes in a diploid genome, though with
occasional switch errors. The frequency of switches is determined by the
Expand Down Expand Up @@ -116,8 +116,8 @@ In this mode, each contig is supposed to be a haplotig, which by definition
comes from one parental haplotype only. Hifiasm often puts all contigs from the
same parental chromosome in one assembly. It has cleanly separated chrX and
chrY for a human male dataset. Nonetheless, phasing across centromeres is
challenging. Users should not expect hifiasm to phase entire chromosomes at the
moment. Also, contigs from different parental chromosomes are randomly mixed as
challenging. Hifiasm is often able to phase entire chromosomes but it may fail
in rare cases. Also, contigs from different parental chromosomes are randomly mixed as
it is just not possible to phase across chromosomes with Hi-C.

Hifiasm does not perform scaffolding for now. You need to run a standalone
Expand All @@ -133,7 +133,7 @@ yak count -k31 -b37 -t16 -o pat.yak paternal.fq.gz
yak count -k31 -b37 -t16 -o mat.yak maternal.fq.gz
hifiasm -o NA12878.asm -t 32 -1 pat.yak -2 mat.yak NA12878.fq.gz
```
Here `NA12878.asm.hap1.p_ctg.gfa` and `NA12878.asm.hap2.p_ctg.gfa` give the two
Here `NA12878.asm.dip.hap1.p_ctg.gfa` and `NA12878.asm.dip.hap2.p_ctg.gfa` give the two
haplotype assemblies. In the binning mode, hifiasm does not purge haplotig
duplicates by default. Because hifiasm reuses saved overlaps, you can
generate both primary/alternate assemblies and trio binning assemblies with
Expand All @@ -145,32 +145,10 @@ The second command line will run much faster than the first.

### <a name="output"></a>Output files

For non-trio assembly, hifiasm generates the following files:

1. Haplotype-resolved raw [unitig][unitig] graph in [GFA][gfa] format
(*prefix*.r\_utg.gfa). This graph keeps all haplotype information, including
somatic mutations and recurrent sequencing errors.
2. Haplotype-resolved processed unitig graph without small bubbles
(*prefix*.p\_utg.gfa). Small bubbles might be caused by somatic mutations or noise in data,
which are not the real haplotype information.
3. Primary assembly [contig][unitig] graph (*prefix*.p\_ctg.gfa). This graph collapses different
haplotypes.
4. Alternate assembly contig graph (*prefix*.a\_ctg.gfa). This graph consists of all assemblies that
are discarded in primary contig graph.

For trio assembly, hifiasm generates the following files:

1. Haplotype-resolved raw [unitig][unitig] graph in [GFA][gfa] format
(*prefix*.r\_utg.gfa). This graph keeps all haplotype information.

2. Phased paternal/haplotype1 contig graph (*prefix*.hap1.p\_ctg.gfa). This graph keeps the phased
paternal/haplotype1 assembly.

3. Phased maternal/haplotype2 contig graph (*prefix*.hap2.p\_ctg.gfa). This graph keeps the phased
maternal/haplotype2 assembly.

Hifiasm writes error corrected reads to the *prefix*.ec.bin binary file and
Hifiasm generates different types of assemblies based on the input data.
It also writes error corrected reads to the *prefix*.ec.bin binary file and
writes overlaps to *prefix*.ovlp.source.bin and *prefix*.ovlp.reverse.bin.
For more details, please see the complete [documentation][tutorial_output].

## <a name="results"></a>Results

Expand Down Expand Up @@ -226,6 +204,8 @@ non-human ones are available [here][zenodo-nonh].
[paf]: https://github.com/lh3/miniasm/blob/master/PAF.md
[yak]: https://github.com/lh3/yak
[tutorial]: https://hifiasm.readthedocs.io/en/latest/index.html
[tutorial_output]: https://hifiasm.readthedocs.io/en/latest/interpreting-output.html#interpreting-output


## <a name="help"></a>Getting Help

Expand Down
2 changes: 1 addition & 1 deletion docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
# built documents.
#
# The short X.Y version.
version = '0.15.5-r348'
version = '0.15.5-r350'
# The full version, including alpha/beta/rc tags.
release = '0.15.5'

Expand Down
30 changes: 22 additions & 8 deletions docs/source/faq.rst
Original file line number Diff line number Diff line change
Expand Up @@ -34,19 +34,19 @@ Are diploid genomes supported?
Are polyploid genomes supported?
-------------------------------------

The ``*r_utg.gfa`` and ``*p_utg.gfa`` are lossless so that they also work for polyploid genomes. However, currently the contig-generation modules of hifiasm are designed for diploid samples, which means both the partially phased assembly and the fully-phased assembly does not directly support polyploid genomes. Please use primary assembly for polyploid samples and run multiple rounds of purging steps using third-party tools such as purge_dups.
The ``*r_utg.gfa`` and ``*p_utg.gfa`` are lossless so that they also work for polyploid genomes. However, currently the contig-generation modules of hifiasm are designed for diploid samples, which means both the partially phased assembly and the fully-phased assembly does not directly support polyploid genomes. If it is set to >2, the quality of primary assembly for polyploid genomes might be improved. Please use primary assembly for polyploid samples and run multiple rounds of purging steps using third-party tools such as purge_dups.

Why one Hi-C integrated assembly is larger than another one?
------------------------------------------------------------

For some samples like human male, the paternal haplotype should be larger than the maternal haplotype. However, if one assembly is much larger than another one, it should be the issues of hifiasm. To fix it, please set smaller value for ``-s`` (default: 0.55).

Another possibility is that hifiasm misidentifies coverage threshold for homozygous reads. For instance, hifiasm prints the following information during assembly without setting the option ``--purge-cov``:
Another possibility is that hifiasm misidentifies coverage threshold for homozygous reads. For instance, hifiasm prints the following information during assembly:
::

[M::purge_dups] purge duplication coverage threshold: 46
[M::purge_dups] purge duplication coverage threshold: 36

In this example, hifiasm identifies the coverage threshold for homozygous reads as ``46/1.25 = 36``. If it is significantly smaller than the homozygous coverage peak, hifiasm will generate two unbalanced assemblies. In this case, please set ``--purge-cov`` to homozygous coverage peak. Please note that tuning ``--purge-cov`` may affect ``*p_utg*gfa`` so that ``*hic*.bin`` should be deleted. Since v0.15.5, hifiasm can detect such changes and renew Hi-C bin files automatically.
In this example, hifiasm identifies the coverage threshold for homozygous reads as ``36``. If it is significantly smaller than the homozygous coverage peak, hifiasm will generate two unbalanced assemblies. In this case, please set ``--hom-cov`` to homozygous coverage peak. Please note that tuning ``--hom-cov`` may affect ``*p_utg*gfa`` so that ``*hic*.bin`` should be deleted. Since v0.15.5, hifiasm can detect such changes and renew Hi-C bin files automatically.



Expand All @@ -57,12 +57,12 @@ For Hi-C integrated assembly, why the assembly size of both haplotypes are much

[M::stat] # heterozygous bases: 645155110; # homozygous bases: 1495396634

If most bases of a diploid sample are homozygous, the coverage threshold is wrongly determined by hifiasm. For instance, hifiasm prints the following information during assembly without setting the option ``--purge-cov``:
If most bases of a diploid sample are homozygous, the coverage threshold is wrongly determined by hifiasm. For instance, hifiasm prints the following information during assembly:
::

[M::purge_dups] purge duplication coverage threshold: 46
[M::purge_dups] purge duplication coverage threshold: 36

In this example, hifiasm identifies the coverage threshold for homozygous reads as ``46/1.25 = 36``. If it is much smaller than homozygous coverage peak, hifiasm thinks most reads are homozygous and assign them to both assemblies, making both of them much larger than the estimated haploid genome size. In this case, please set ``--purge-cov`` to homozygous coverage peak. Please note that tuning ``--purge-cov`` may affect ``*p_utg*gfa`` so that ``*hic*.bin`` should be deleted. Since v0.15.5, hifiasm can detect such changes and renew Hi-C bin files automatically.
In this example, hifiasm identifies the coverage threshold for homozygous reads as ``36``. If it is much smaller than homozygous coverage peak, hifiasm thinks most reads are homozygous and assign them to both assemblies, making both of them much larger than the estimated haploid genome size. In this case, please set ``--hom-cov`` to homozygous coverage peak. Please note that tuning ``--hom-cov`` may affect ``*p_utg*gfa`` so that ``*hic*.bin`` should be deleted. Since v0.15.5, hifiasm can detect such changes and renew Hi-C bin files automatically.


.. _hic-iss:
Expand All @@ -80,10 +80,14 @@ Why the size of primary assembly or partially phased assembly is much larger tha
It could be because the estimated genome size is incorrect. Another possibility is that hifiasm does not perform enough purging. Setting smaller value for ``-s`` (default: 0.55) or turning ``--purge-cov`` should be helpful. See :ref:`loginter` for more details.


.. _p-hamming:

Why the hamming error rate or the swith error rate of trio-binning assembly is very high?
---------------------------------------------------------------------------------------------------------------
In rare cases, a potential issue is that a few contigs may misjoin two haplotypes. For example, half of a contig come from mother while another half come from father. Such misjoined contigs can be fixed by manually breaking. The coordinates of problematic regions can be found by A-lines in GFA file or ``yak trioeval -e`` (see `issue 37 <https://github.com/chhylp123/hifiasm/issues/37>`_ for more details). However, if there are many misjoined contigs or the switch/hamming error rate reported by ``yak trioeval`` is very high, users should check if the parental data is correct (see `issue 130 <https://github.com/chhylp123/hifiasm/issues/130#issuecomment-862347943>`_ for more details).

Another possibility is that there are some unitigs in unitig graph misjoining two haplotypes. Such problematic unitigs might be ignored by the graph-binning strategy. Set smaller value for ``--t-occ`` forcedly remove unitig including unexpected haplotype-specific reads.

Why does hifiasm stuck or crash?
-------------------------------------
In most cases, it is caused by the low quality HiFi reads. A good HiFi dataset should have a k-mer plot like `issue10 <https://github.com/chhylp123/hifiasm/issues/10#issuecomment-616213684>`_ or `issue49 <https://github.com/chhylp123/hifiasm/issues/49#issue-729106823>`_. In contrast, low quality HiFi data often lead to weird k-mer plot like `issue93 <https://github.com/chhylp123/hifiasm/issues/93#issue-852259042>`_. Such weird k-mer plots usually indicate insufficient coverage or presence of contaminants. See :ref:`loginter` for more details. If the HiFi data look fine, please raise an issue at the `issue page <https://github.com/chhylp123/hifiasm/issues>`_.
Expand All @@ -98,7 +102,7 @@ Can I generate HiFi-only assembly first, and then add Hi-C or trio data later?

What is the minimum read coverage required for hifiasm?
-------------------------------------------------------
Usually >=14x HiFi reads per haplotype. Higher coverage might be able to improve the contiguity of assembly.
Usually >=13x HiFi reads per haplotype. Higher coverage might be able to improve the contiguity of assembly.

Why the primary assembly is more contiguous than the fully-phased assemblies and the partially phased assemblies (i.e. ``*.hap*.p_ctg.gfa``)?
----------------------------------------------------------------------------------------------------------------------------------------------------
Expand All @@ -107,3 +111,13 @@ Why the primary assembly is more contiguous than the fully-phased assemblies and

When producing fully-phased assemblies and partially phased assemblies, hifiasm is designed to keep both haplotypes contiguous. It is important for many downstream applications like SV calling.

My assembly is fragmented or not contiguous enough, how do I improve it?
--------------------------------------------------------------------------

Raising ``-D`` or ``-N`` may improve the resolution of repetitive regions but takes longer time. These two options affect all types of assemblies and usually do not have a negative impact on the assembly quality. In contrast, ``--purge-max`` only affects primary assembly. Setting larger value for ``--purge-max`` makes primary assembly more contiguous but may collapse repeats or segmental duplications.

If the assembly is too fragmented, users should check if HiFi data is good enough. See `Why does hifiasm stuck or crash?`_ for details.

How do I avoid misassemblies?
--------------------------------------------------------------------------
Set smaller value for ``--purge-max``, ``-s`` and ``-O``, or use the ``-u`` option.
Loading

0 comments on commit 321acb2

Please sign in to comment.