Merge pull request #271 from pirovc/dev

* v2.0.0-rc.1 * Feature/multiple matches classify (#263) * output one instead of lca, --multiple-matches on classigy * better params * tests * more tests * docs * more docs * more docs * check overlap between target or spec and nodes, used .add from multitax (#265) * Feature/seqan330 (#266) * use SeqAn v3.3.0, GCC >= 11 * fix stopclock print * travis gcc11 12 13 * test travis * travis jammy * gcc13 and cov * remove gcc13, no travis support yet * reverse placeholder for . in files, fixed on raptor 3.0.1 (#267) * set threads to 1 for bgzf (#268) * feature/hibf-default (#269) * --filter-type instead of --hibf on build * small fix * Feature/infra docs (#270) * docs * more docs * docs * ganon2 and some better feature description
pirovc · Nov 17, 2023 · e626f89 · e626f89
2 parents 8014610 + fd8fb07
commit e626f89
Show file tree

Hide file tree

Showing 30 changed files with 683 additions and 381 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -1,31 +1,44 @@
 language: cpp
-dist: focal
+dist: jammy
 
 matrix:
   include:
-    - name: "linux gcc-8"
+    - name: "linux gcc-11"
       os: linux
       addons:
-        apt: { packages: [g++-8, parallel] }
-      env: MATRIX_EVAL="CC=gcc-8 && CXX=g++-8 && BUILD_TYPE=Release"
+        apt:
+          sources:
+            - ubuntu-toolchain-r-test
+          packages:
+            - g++-11
+            - parallel
+      env:
+        - MATRIX_EVAL="CC=gcc-11 && CXX=g++-11 && BUILD_TYPE=Release"
 
-    - name: "linux gcc-9"
+    - name: "linux gcc-12"
       os: linux
       addons:
-        apt: { packages: [g++-9, parallel] }
-      env: MATRIX_EVAL="CC=gcc-9 && CXX=g++-9 && BUILD_TYPE=Release"
+        apt:
+          sources:
+            - ubuntu-toolchain-r-test
+          packages:
+            - g++-12
+            - parallel
+      env:
+        - MATRIX_EVAL="CC=gcc-12 && CXX=g++-12 && BUILD_TYPE=Release"
 
-    - name: "linux gcc-10"
+    - name: "linux gcc-11 (coverage)"
       os: linux
       addons:
-        apt: { packages: [g++-10, parallel] }
-      env: MATRIX_EVAL="CC=gcc-10 && CXX=g++-10 && BUILD_TYPE=Release"
-
-    - name: "linux gcc-8 (coverage)"
-      os: linux
-      addons:
-        apt: { packages: [g++-8, lcov, parallel] }
-      env: MATRIX_EVAL="CC=gcc-8 && CXX=g++-8 && BUILD_TYPE=Coverage && GCOV=gcov-8"
+        apt:
+          sources:
+            - ubuntu-toolchain-r-test
+          packages:
+            - g++-11
+            - parallel
+            - lcov
+      env:
+        - MATRIX_EVAL="CC=gcc-11 && CXX=g++-11 && BUILD_TYPE=Coverage && GCOV=gcov-11"
 
 before_install:
   - eval "${MATRIX_EVAL}"

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -2,20 +2,20 @@
 # ganon
 # =============================================================================
 
-cmake_minimum_required( VERSION 3.10 FATAL_ERROR )
-project( ganon VERSION 1.9.0 LANGUAGES CXX )
+cmake_minimum_required( VERSION 3.4 FATAL_ERROR )
+project( ganon VERSION 2.0.0 LANGUAGES CXX )
 
 # -----------------------------------------------------------------------------
 # build setup
 # -----------------------------------------------------------------------------
 
-set( CMAKE_CXX_STANDARD 17 )
+set( CMAKE_CXX_STANDARD 20 )
 set( CMAKE_CXX_STANDARD_REQUIRED ON )
 set( CMAKE_CXX_EXTENSIONS OFF )
 
 set( CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR} )
 
-if( NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU" )
+if( NOT CMAKE_CXX_COMPILER_ID MATCHES "GNU" )
     message( FATAL_ERROR
         "Compiler id '${CMAKE_CXX_COMPILER_ID}' is not supported, please \
         check the documentation." )
@@ -67,7 +67,7 @@ set( CMAKE_MODULE_LINKER_FLAGS_COVERAGE "${CMAKE_MODULE_LINKER_FLAGS_DEBUG}" )
 
 add_compile_options( -Wall -Wextra -Wshadow -Wuninitialized -Wcast-align -Wunused 
     -Woverloaded-virtual -Wpedantic -Wnull-dereference -Wdouble-promotion 
-    -Wformat=2 -Wstrict-aliasing -Wunused-variable )
+    -Wformat=2 -Wstrict-aliasing -Wunused-variable -Wno-interference-size -Wno-null-dereference )
 
 add_compile_options( -Wno-shadow -Wno-old-style-cast )
 
@@ -103,21 +103,15 @@ else()
     target_include_directories( cxxopts SYSTEM INTERFACE libs/cxxopts/include )
 endif()
 
-# 3. Zlib (optional for SeqAn):
+# 3. Zlib and Bzip2:
 
-find_package( ZLIB )
+find_package( BZip2 REQUIRED )
+find_package( ZLIB REQUIRED )
 
 # 4. SeqAn3:
 
-set( SEQAN3_INCLUDE_PATH libs/seqan3/include )
-find_package (SeqAn3 3.1.0 REQUIRED HINTS libs/seqan3/build_system)
-
-add_library( seqan3 INTERFACE )
-target_include_directories( seqan3 SYSTEM INTERFACE ${SEQAN3_INCLUDE_DIRS} )
-target_link_libraries( seqan3 INTERFACE ${SEQAN3_LIBRARIES} )
-
-add_compile_options( ${SEQAN3_DEFINITIONS} )
-set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SEQAN3_CXX_FLAGS}" )
+list (APPEND CMAKE_PREFIX_PATH "${CMAKE_CURRENT_SOURCE_DIR}/libs/seqan3/build_system")
+find_package (seqan3 3.3.0 REQUIRED)
 
 # 5. Catch2:
 
@@ -143,21 +137,20 @@ endif()
 
 if( VERBOSE_CONFIG )
     message( STATUS "SeqAn3 symbols")
-    message( STATUS "  SEQAN3_VERSION   : ${SEQAN3_VERSION}" )
-    message( STATUS "  SEQAN3_DEFINITIONS   : ${SEQAN3_DEFINITIONS}" )
+    message( STATUS "  SEQAN3_VERSION       : ${SEQAN3_VERSION}" )
     message( STATUS "  SEQAN3_CXX_FLAGS     : ${SEQAN3_CXX_FLAGS}" )
+    message( STATUS "  SEQAN3_DEFINITIONS   : ${SEQAN3_DEFINITIONS}" )
     message( STATUS "  SEQAN3_INCLUDE_DIRS  : ${SEQAN3_INCLUDE_DIRS}" )
     message( STATUS "  SEQAN3_LIBRARIES     : ${SEQAN3_LIBRARIES}" )
     message( STATUS "Misc symbols")
+    get_directory_property( dirCompileOptions COMPILE_OPTIONS )
     message( STATUS "  Build type          : ${CMAKE_BUILD_TYPE}" )
     message( STATUS "  CMAKE_CXX_FLAGS     : ${CMAKE_CXX_FLAGS}" )
-    message( STATUS "  INCLUDE_DIRS        : ${INCLUDE_DIRS}" )
     message( STATUS "  CMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}" )
     message( STATUS "  CONDA               : ${CONDA}" )
-    message( STATUS "  LONGREADS           : ${LONGREADS}" )
-    get_directory_property( dirCompileOptions COMPILE_OPTIONS )
     message( STATUS "  COMPILE_OPTIONS   : ${dirCompileOptions}" )
-
+    message( STATUS "  INCLUDE_DIRS        : ${INCLUDE_DIRS}" )
+    message( STATUS "  LONGREADS           : ${LONGREADS}" )
 endif()
 
 # -----------------------------------------------------------------------------

diff --git a/README.md b/README.md
@@ -1,35 +1,32 @@
-# ganon [![GitHub release (latest by date)](https://img.shields.io/github/v/release/pirovc/ganon)](https://github.com/pirovc/ganon) [![Build Status](https://travis-ci.com/pirovc/ganon.svg?branch=master)](https://travis-ci.com/pirovc/ganon) [![codecov](https://codecov.io/gh/pirovc/ganon/branch/master/graph/badge.svg)](https://codecov.io/gh/pirovc/ganon) [![Anaconda-Server Badge](https://anaconda.org/bioconda/ganon/badges/downloads.svg)](https://anaconda.org/bioconda/ganon) [![Anaconda-Server Badge](https://anaconda.org/bioconda/ganon/badges/platforms.svg)](https://anaconda.org/bioconda/ganon) [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/ganon/README.html) [![Publication](https://img.shields.io/badge/DOI-10.1101%2F406017-blue)](https://dx.doi.org/10.1093/bioinformatics/btaa458)
+# ganon [![GitHub release (latest by date)](https://img.shields.io/github/v/release/pirovc/ganon)](https://github.com/pirovc/ganon)
 
-ganon classifies DNA sequences against large sets of genomic reference sequences efficiently. It features:
+[![Build Status](https://travis-ci.com/pirovc/ganon.svg?branch=master)](https://travis-ci.com/pirovc/ganon) [![codecov](https://codecov.io/gh/pirovc/ganon/branch/master/graph/badge.svg)](https://codecov.io/gh/pirovc/ganon) [![Anaconda-Server Badge](https://anaconda.org/bioconda/ganon/badges/downloads.svg)](https://anaconda.org/bioconda/ganon) [![Anaconda-Server Badge](https://anaconda.org/bioconda/ganon/badges/platforms.svg)](https://anaconda.org/bioconda/ganon) [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/ganon/README.html) [![Publication](https://img.shields.io/badge/DOI-10.1101%2F406017-blue)](https://dx.doi.org/10.1093/bioinformatics/btaa458)
 
-- automatic download, build and update procedures for commonly used databases (RefSeq and GenBank)
-- classification with binning and taxonomic profiling
-- multiple taxonomy integration (NCBI and GTDB) with lowest common ancestor (LCA)
-- read reassignment EM algorithm for multi-matching reads
-- hierarchical use of multiple databases
-- taxonomic and sequence abundance reports with genome size correction
-- advanced reporting and filtration of results
-- contingency table generation
+ganon2 classifies DNA sequences against large sets of genomic reference sequences efficiently. It features:
+
+- integrated download and build of any subset from RefSeq/GenBank/GTDB with incremental updates
+- NCBI and GTDB native support for taxonomic classification, custom taxonomy or no taxonomy at all
+- customizable database build for local or non-standard sequence files
+- optimized taxonomic binning and classification configurations
+- build and classify at various taxonomic levels, strain, assembly, file, sequence or custom specialization
+- hierarchical classification using several databases in one or more levels in just one run
+- EM and/or LCA algorithms to solve multiple-matching reads
+- reporting of multiple and unique matches for every read
+- reporting of sequence, taxonomic or multi-match abundances with optional genome size correction
+- advanced tree-like reports with several filter options
+- generation of contingency tables with several filters for multi-sample studies
 
 Find out more information in the user manual: https://pirovc.github.io/ganon/
 
-## Quick install with conda
+## Quick install and usage
 
 ```sh
+# Install
 conda install -c bioconda -c conda-forge ganon
-```
-
-## Basic usage
-
-### Download and Build (Archaea - complete genomes - NCBI RefSeq)
-
-```bash
+# Download and Build (Archaea - complete genomes - NCBI RefSeq)
 ganon build --db-prefix arc_cg_rs --source refseq --organism-group archaea --complete-genomes --threads 24
-```
-
-### Classify
-```bash
+# Classify
 ganon classify --db-prefix arc_cg_rs --output-prefix classify_results --paired-reads my_reads.1.fq.gz my_reads.2.fq.gz --threads 24
 ```
 
-For further examples, database guide, installation from source and more: https://pirovc.github.io/ganon/
+For further examples, database build guides, installation from source and more: https://pirovc.github.io/ganon/
diff --git a/docs/classification.md b/docs/classification.md
@@ -1,46 +1,59 @@
 # Classification
 
 `ganon classify` will match single and/or paired-end sets of reads against one or [more databases](#multiple-and-hierarchical-classification). 
-By default, parameters are optimized for **taxonomic profiling**. 
-
-Example:
+By default, parameters are optimized for **taxonomic profiling**, meaning that fewer reads will be classified but with a higher sensitivity. For example:
 
 ```bash
 ganon classify --db-prefix my_db --paired-reads reads.1.fq.gz reads.2.fq.gz --output-prefix results --threads 32
 ```
 
-`ganon report` will be automatically executed after `ganon classify` and a [report will be created `.tre`](../outputfiles/#ganon-report).
+Output files:
+
+ - `results.rep`: plain report of the run, used to further generate tree-like reports
+ - `results.tre`: tree-like report with cumulative abundances by taxonomic ranks (can be re-generated with `ganon report`)
+
+By default, `ganon classify` only writes report files. To get files with the classification of each read, use `--output-one` and/or `--output-all`. More information about output files [here](../outputfiles/#ganon-classify).
 
-ganon can perform **taxonomic profiling** and/or **binning** (one tax. assignment for each read) at a taxonomic, strain or sequence level with `ganon classify` + `ganon report`. Some guidelines are listed below, please choose the parameters according to your application:
+!!! Note
+    ganon performs **taxonomic profiling** and/or **binning** (one tax. assignment for each read) at a taxonomic, strain or sequence level. Some guidelines are listed below, please choose the parameters according to your application.
 
 ### Profiling
 
 `ganon classify` is set-up by default to perform taxonomic profiling. It uses:
 
- - strict thresholds: `--rel-cutoff 0.75` and `--rel-filter 0`
+ - strict thresholds: `--rel-cutoff 0.75` and `--rel-filter 0.1`
+ - `--min-count 0.00005` (0.005%) to exclude very low abundant taxa
+ - `--report-type abundance` to generate taxonomic abundances, correcting for genome sizes (more info [here](../reports/#report-type-report-type))
 
-`ganon report` will automatically run after classification with:
+### Binning
 
- - `--min-count 0.005` (0.5%) to exclude low abundant taxa
- - `--report-type abundance` to generate taxonomic abundances, re-distributing read counts and correcting for genome sizes
+To achieve better results for taxonomic binning or sequence classification, `ganon classify` can be configured with `--binning`, which is equivalent to:
 
-!!! Note
-    `ganon report` can be used independently from `ganon classify` with the output file `.rep`
+ - less strict thresholds: `--rel-cutoff 0.25 --rel-filter 0`
+ - `--min-count 0` reports all taxa with at least one read assigned to it
+ - `--report-type reads` will report sequence abundances instead of taxonomic abundances (more info [here](../reports/#report-type-report-type))
 
-### Binning
+!!! Tip
+    Database parameters in `ganon build` can also influence your results. Lower `--max-fp` (e.g. 0.1, 0.001) and higher `--kmer-size` (e.g. `23`, `27`) will improve sensitivity of your results at the cost of a larger database and memory usage.
+
+## Reads with multiple matches
 
-To achieve better results for taxonomic binning or sequence classification, ganon can be configured with:
+There are three ways to handle reads with multiple matches in `ganon classify`:
 
- - `--output-all` and `--output-lca` to write `.all` `.lca` files for binning results
- - less strict `--rel-cutoff` and `--rel-filter` values (e.g. `0.25` and `0.1`, respectively)
- - activate the `--reassign` to apply an EM algorithm, re-assigning reads with LCA matches to one most probable target (defined by `--level` in the build procedure). In this case, the `.all` file will be re-generated with one assignment per read.
+ - `--multiple-matches em` (default): uses an Expectation-Maximization algorithm, re-assigning reads with multiple matches to one most probable target (defined by `--level` in the build procedure).
+ - `--multiple-matches lca`: uses the Lowest Common Ancestor algorithm, re-assigning reads with multiple matches to higher common ancestors in the taxonomic tree.
+ - `--multiple-matches skip`: will not resolve multi-matching reads.
+
+!!! Tip
+    - The Expectation-Maximization can be performed independently with `ganon reassign` using the output files `.rep` and `.all`.
+    - Reports can be generated independently with `ganon report` using the output file `.rep`
 
 !!! Note
-    `ganon reassign` can be used independently from `ganon classify` with the output file `.rep` and `.all`
+    `--multiple-matches lca` paired with `--report-type abundance` or `dist` will distribute read **counts** with multiple matches to one most probable target (defined by `--level` in the build procedure), instead of a higher taxonomic rank. In this case the distribution is simply based on the number of taxa with unique matches and it is not as precise as the EM algorithm, but it will run faster since the per-read basis re-assignment can be skipped.
 
-!!! tip
-    Database parameters can also influence your results. Lower `--max-fp` (e.g. 0.1, 0.001) and higher `--kmer-size` (e.g. `23`, `27`) will improve sensitivity of your results at cost of a larger database and memory usage
+## Classifying more reads
 
+By default ganon will classify fewer reads in favour of sensitivity. To classify more reads, use less strict `--rel-cutoff` and `--rel-filter` values (e.g. `0.25` and `0`, respectively). More details [here](#cutoff-and-filter-rel-cutoff-rel-filter).
 
 ## Multiple and Hierarchical classification
 

diff --git a/docs/custom_databases.md b/docs/custom_databases.md
@@ -16,6 +16,9 @@ It is also possible to use **non-standard accessions and headers** to build cust
 !!! tip
     If you just want to build a database without any taxonomic or target information, just send the files with `--input`, use `--taxonomy skip` and choose between `--input-target file` or `sequence`.
 
+!!! warning
+    The target and specialization fields (2nd and 4th col) cannot be the same as the node (3rd col)
+
 <details>
   <summary>Examples of --input-file</summary>
   <br>
@@ -159,7 +162,7 @@ tail -n+2 genomes-all_metadata.tsv | cut -f 1,20 | xargs -P 12 -n2 sh -c 'curl -
 tail -n+2 genomes-all_metadata.tsv | cut -f 1,15  | tr ';' '\t' | awk -F"\t" '{tax="1";for(i=NF;i>1;i--){if(length($i)>3){tax=$i;break;}};print $1".fna.gz\t"$1"\t"tax}' > ganon_input_file.tsv
 
 # Build ganon database
-ganon build-custom --input-file ganon_input_file.tsv --db-prefix mgnify_human_oral_v1 --taxonomy gtdb --level leaves --hibf --threads 32
+ganon build-custom --input-file ganon_input_file.tsv --db-prefix mgnify_human_oral_v1 --taxonomy gtdb --level leaves --threads 32
 ```
 
 !!! note
@@ -222,7 +225,7 @@ awk -v db="$(realpath ${db})" '{file=db"/"substr($2,1,1)"/"$2".fna"; print ">"$1
 sort | uniq > "${db}_ganon_input_file.tsv"
 
 # Build ganon database
-ganon build-custom --input-file "${db}_ganon_input_file.tsv" --db-prefix "${db}" --hibf --level species --threads 12
+ganon build-custom --input-file "${db}_ganon_input_file.tsv" --db-prefix "${db}" --threads 12
 
 # Delete extracted files and auxiliary files
 cat "${db}_extracted_files.txt" | xargs rm