Skip to content

Commit

Permalink
Merge pull request #271 from pirovc/dev
Browse files Browse the repository at this point in the history
* v2.0.0-rc.1

* Feature/multiple matches classify (#263)

* output one instead of lca, --multiple-matches on classify

* better params

* tests

* more tests

* docs

* more docs

* more docs

* check overlap between target or spec and nodes, used .add from multitax (#265)

* Feature/seqan330 (#266)

* use SeqAn v3.3.0, GCC >= 11

* fix stopclock print

* travis gcc11 12 13

* test travis

* travis jammy

* gcc13 and cov

* remove gcc13, no travis support yet

* reverse placeholder for . in files, fixed on raptor 3.0.1 (#267)

* set threads to 1 for bgzf (#268)

* feature/hibf-default (#269)

* --filter-type instead of --hibf on build

* small fix

* Feature/infra docs (#270)

* docs

* more docs

* docs

* ganon2 and some better feature description
  • Loading branch information
pirovc authored Nov 17, 2023
2 parents 8014610 + fd8fb07 commit e626f89
Show file tree
Hide file tree
Showing 30 changed files with 683 additions and 381 deletions.
45 changes: 29 additions & 16 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,31 +1,44 @@
language: cpp
dist: focal
dist: jammy

matrix:
include:
- name: "linux gcc-8"
- name: "linux gcc-11"
os: linux
addons:
apt: { packages: [g++-8, parallel] }
env: MATRIX_EVAL="CC=gcc-8 && CXX=g++-8 && BUILD_TYPE=Release"
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-11
- parallel
env:
- MATRIX_EVAL="CC=gcc-11 && CXX=g++-11 && BUILD_TYPE=Release"

- name: "linux gcc-9"
- name: "linux gcc-12"
os: linux
addons:
apt: { packages: [g++-9, parallel] }
env: MATRIX_EVAL="CC=gcc-9 && CXX=g++-9 && BUILD_TYPE=Release"
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-12
- parallel
env:
- MATRIX_EVAL="CC=gcc-12 && CXX=g++-12 && BUILD_TYPE=Release"

- name: "linux gcc-10"
- name: "linux gcc-11 (coverage)"
os: linux
addons:
apt: { packages: [g++-10, parallel] }
env: MATRIX_EVAL="CC=gcc-10 && CXX=g++-10 && BUILD_TYPE=Release"

- name: "linux gcc-8 (coverage)"
os: linux
addons:
apt: { packages: [g++-8, lcov, parallel] }
env: MATRIX_EVAL="CC=gcc-8 && CXX=g++-8 && BUILD_TYPE=Coverage && GCOV=gcov-8"
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-11
- parallel
- lcov
env:
- MATRIX_EVAL="CC=gcc-11 && CXX=g++-11 && BUILD_TYPE=Coverage && GCOV=gcov-11"

before_install:
- eval "${MATRIX_EVAL}"
Expand Down
37 changes: 15 additions & 22 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,20 @@
# ganon
# =============================================================================

cmake_minimum_required( VERSION 3.10 FATAL_ERROR )
project( ganon VERSION 1.9.0 LANGUAGES CXX )
cmake_minimum_required( VERSION 3.4 FATAL_ERROR )
project( ganon VERSION 2.0.0 LANGUAGES CXX )

# -----------------------------------------------------------------------------
# build setup
# -----------------------------------------------------------------------------

set( CMAKE_CXX_STANDARD 17 )
set( CMAKE_CXX_STANDARD 20 )
set( CMAKE_CXX_STANDARD_REQUIRED ON )
set( CMAKE_CXX_EXTENSIONS OFF )

set( CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR} )

if( NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU" )
if( NOT CMAKE_CXX_COMPILER_ID MATCHES "GNU" )
message( FATAL_ERROR
"Compiler id '${CMAKE_CXX_COMPILER_ID}' is not supported, please \
check the documentation." )
Expand Down Expand Up @@ -67,7 +67,7 @@ set( CMAKE_MODULE_LINKER_FLAGS_COVERAGE "${CMAKE_MODULE_LINKER_FLAGS_DEBUG}" )

add_compile_options( -Wall -Wextra -Wshadow -Wuninitialized -Wcast-align -Wunused
-Woverloaded-virtual -Wpedantic -Wnull-dereference -Wdouble-promotion
-Wformat=2 -Wstrict-aliasing -Wunused-variable )
-Wformat=2 -Wstrict-aliasing -Wunused-variable -Wno-interference-size -Wno-null-dereference )

add_compile_options( -Wno-shadow -Wno-old-style-cast )

Expand Down Expand Up @@ -103,21 +103,15 @@ else()
target_include_directories( cxxopts SYSTEM INTERFACE libs/cxxopts/include )
endif()

# 3. Zlib (optional for SeqAn):
# 3. Zlib and Bzip2:

find_package( ZLIB )
find_package( BZip2 REQUIRED )
find_package( ZLIB REQUIRED )

# 4. SeqAn3:

set( SEQAN3_INCLUDE_PATH libs/seqan3/include )
find_package (SeqAn3 3.1.0 REQUIRED HINTS libs/seqan3/build_system)

add_library( seqan3 INTERFACE )
target_include_directories( seqan3 SYSTEM INTERFACE ${SEQAN3_INCLUDE_DIRS} )
target_link_libraries( seqan3 INTERFACE ${SEQAN3_LIBRARIES} )

add_compile_options( ${SEQAN3_DEFINITIONS} )
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SEQAN3_CXX_FLAGS}" )
list (APPEND CMAKE_PREFIX_PATH "${CMAKE_CURRENT_SOURCE_DIR}/libs/seqan3/build_system")
find_package (seqan3 3.3.0 REQUIRED)

# 5. Catch2:

Expand All @@ -143,21 +137,20 @@ endif()

if( VERBOSE_CONFIG )
message( STATUS "SeqAn3 symbols")
message( STATUS " SEQAN3_VERSION : ${SEQAN3_VERSION}" )
message( STATUS " SEQAN3_DEFINITIONS : ${SEQAN3_DEFINITIONS}" )
message( STATUS " SEQAN3_VERSION : ${SEQAN3_VERSION}" )
message( STATUS " SEQAN3_CXX_FLAGS : ${SEQAN3_CXX_FLAGS}" )
message( STATUS " SEQAN3_DEFINITIONS : ${SEQAN3_DEFINITIONS}" )
message( STATUS " SEQAN3_INCLUDE_DIRS : ${SEQAN3_INCLUDE_DIRS}" )
message( STATUS " SEQAN3_LIBRARIES : ${SEQAN3_LIBRARIES}" )
message( STATUS "Misc symbols")
get_directory_property( dirCompileOptions COMPILE_OPTIONS )
message( STATUS " Build type : ${CMAKE_BUILD_TYPE}" )
message( STATUS " CMAKE_CXX_FLAGS : ${CMAKE_CXX_FLAGS}" )
message( STATUS " INCLUDE_DIRS : ${INCLUDE_DIRS}" )
message( STATUS " CMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}" )
message( STATUS " CONDA : ${CONDA}" )
message( STATUS " LONGREADS : ${LONGREADS}" )
get_directory_property( dirCompileOptions COMPILE_OPTIONS )
message( STATUS " COMPILE_OPTIONS : ${dirCompileOptions}" )

message( STATUS " INCLUDE_DIRS : ${INCLUDE_DIRS}" )
message( STATUS " LONGREADS : ${LONGREADS}" )
endif()

# -----------------------------------------------------------------------------
Expand Down
43 changes: 20 additions & 23 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,35 +1,32 @@
# ganon [![GitHub release (latest by date)](https://img.shields.io/github/v/release/pirovc/ganon)](https://github.com/pirovc/ganon) [![Build Status](https://travis-ci.com/pirovc/ganon.svg?branch=master)](https://travis-ci.com/pirovc/ganon) [![codecov](https://codecov.io/gh/pirovc/ganon/branch/master/graph/badge.svg)](https://codecov.io/gh/pirovc/ganon) [![Anaconda-Server Badge](https://anaconda.org/bioconda/ganon/badges/downloads.svg)](https://anaconda.org/bioconda/ganon) [![Anaconda-Server Badge](https://anaconda.org/bioconda/ganon/badges/platforms.svg)](https://anaconda.org/bioconda/ganon) [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/ganon/README.html) [![Publication](https://img.shields.io/badge/DOI-10.1101%2F406017-blue)](https://dx.doi.org/10.1093/bioinformatics/btaa458)
# ganon [![GitHub release (latest by date)](https://img.shields.io/github/v/release/pirovc/ganon)](https://github.com/pirovc/ganon)

ganon classifies DNA sequences against large sets of genomic reference sequences efficiently. It features:
[![Build Status](https://travis-ci.com/pirovc/ganon.svg?branch=master)](https://travis-ci.com/pirovc/ganon) [![codecov](https://codecov.io/gh/pirovc/ganon/branch/master/graph/badge.svg)](https://codecov.io/gh/pirovc/ganon) [![Anaconda-Server Badge](https://anaconda.org/bioconda/ganon/badges/downloads.svg)](https://anaconda.org/bioconda/ganon) [![Anaconda-Server Badge](https://anaconda.org/bioconda/ganon/badges/platforms.svg)](https://anaconda.org/bioconda/ganon) [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/ganon/README.html) [![Publication](https://img.shields.io/badge/DOI-10.1101%2F406017-blue)](https://dx.doi.org/10.1093/bioinformatics/btaa458)

- automatic download, build and update procedures for commonly used databases (RefSeq and GenBank)
- classification with binning and taxonomic profiling
- multiple taxonomy integration (NCBI and GTDB) with lowest common ancestor (LCA)
- read reassignment EM algorithm for multi-matching reads
- hierarchical use of multiple databases
- taxonomic and sequence abundance reports with genome size correction
- advanced reporting and filtration of results
- contingency table generation
ganon2 classifies DNA sequences against large sets of genomic reference sequences efficiently. It features:

- integrated download and build of any subset from RefSeq/Genbank/GTDB with incremental updates
- NCBI and GTDB native support for taxonomic classification, custom taxonomy or no taxonomy at all
- customizable database build for local or non-standard sequence files
- optimized taxonomic binning and classification configurations
- build and classify at various taxonomic levels, strain, assembly, file, sequence or custom specialization
- hierarchical classification using several databases in one or more levels in just one run
- EM and/or LCA algorithms to solve multiple-matching reads
- reporting of multiple and unique matches for every read
- reporting of sequence, taxonomic or multi-match abundances with optional genome size correction
- advanced tree-like reports with several filter options
- generation of contingency tables with several filters for multi-sample studies

Find out more information in the user manual: https://pirovc.github.io/ganon/

## Quick install with conda
## Quick install and usage

```sh
# Install
conda install -c bioconda -c conda-forge ganon
```

## Basic usage

### Download and Build (Archaea - complete genomes - NCBI RefSeq)

```bash
# Download and Build (Archaea - complete genomes - NCBI RefSeq)
ganon build --db-prefix arc_cg_rs --source refseq --organism-group archaea --complete-genomes --threads 24
```

### Classify
```bash
# Classify
ganon classify --db-prefix arc_cg_rs --output-prefix classify_results --paired-reads my_reads.1.fq.gz my_reads.2.fq.gz --threads 24
```

For further examples, database guide, installation from source and more: https://pirovc.github.io/ganon/
For further examples, database build guides, installation from source and more: https://pirovc.github.io/ganon/
51 changes: 32 additions & 19 deletions docs/classification.md
Original file line number Diff line number Diff line change
@@ -1,46 +1,59 @@
# Classification

`ganon classify` will match single and/or paired-end sets of reads against one or [more databases](#multiple-and-hierarchical-classification).
By default, parameters are optimized for **taxonomic profiling**.

Example:
By default, parameters are optimized for **taxonomic profiling**, meaning that fewer reads will be classified but with a higher sensitivity. For example:

```bash
ganon classify --db-prefix my_db --paired-reads reads.1.fq.gz reads.2.fq.gz --output-prefix results --threads 32
```

`ganon report` will be automatically executed after `ganon classify` and a [report will be created `.tre`](../outputfiles/#ganon-report).
Output files:

- `results.rep`: plain report of the run, used to further generate tree-like reports
- `results.tre`: tree-like report with cumulative abundances by taxonomic ranks (can be re-generated with `ganon report`)

By default, `ganon classify` only writes report files. To get files with the classification of each read, use `--output-one` and/or `--output-all`. More information about output files [here](../outputfiles/#ganon-classify).

ganon can perform **taxonomic profiling** and/or **binning** (one tax. assignment for each read) at a taxonomic, strain or sequence level with `ganon classify` + `ganon report`. Some guidelines are listed below, please choose the parameters according to your application:
!!! Note
ganon performs **taxonomic profiling** and/or **binning** (one tax. assignment for each read) at a taxonomic, strain or sequence level. Some guidelines are listed below, please choose the parameters according to your application.

### Profiling

`ganon classify` is set-up by default to perform taxonomic profiling. It uses:

- strict thresholds: `--rel-cutoff 0.75` and `--rel-filter 0`
- strict thresholds: `--rel-cutoff 0.75` and `--rel-filter 0.1`
- `--min-count 0.00005` (0.005%) to exclude very low abundant taxa
- `--report-type abundance` to generate taxonomic abundances, correcting for genome sizes (more info [here](../reports/#report-type-report-type))

`ganon report` will automatically run after classification with:
### Binning

- `--min-count 0.005` (0.5%) to exclude low abundant taxa
- `--report-type abundance` to generate taxonomic abundances, re-distributing read counts and correcting for genome sizes
To achieve better results for taxonomic binning or sequence classification, `ganon classify` can be configured with `--binning`, that is the same as:

!!! Note
`ganon report` can be used independently from `ganon classify` with the output file `.rep`
- less strict thresholds: `--rel-cutoff 0.25 --rel-filter 0`
- `--min-count 0` reports all taxa with at least one read assigned to it
- `--report-type reads` will report sequence abundances instead of taxonomic abundances (more info [here](../reports/#report-type-report-type))

### Binning
!!! Tip
Database parameters in `ganon build` can also influence your results. Lower `--max-fp` (e.g. 0.1, 0.001) and higher `--kmer-size` (e.g. `23`, `27`) will improve sensitivity of your results at the cost of a larger database and memory usage.

## Reads with multiple matches

To achieve better results for taxonomic binning or sequence classification, ganon can be configured with:
There are three options to handle reads with multiple matches in `ganon classify`:

- `--output-all` and `--output-lca` to write `.all` `.lca` files for binning results
- less strict `--rel-cutoff` and `--rel-filter` values (e.g. `0.25` and `0.1`, respectively)
- activate the `--reassign` to apply an EM algorithm, re-assigning reads with LCA matches to one most probable target (defined by `--level` in the build procedure). In this case, the `.all` file will be re-generated with one assignment per read.
- `--multiple-matches em` (default): uses an Expectation-Maximization algorithm, re-assigning reads with multiple matches to one most probable target (defined by `--level` in the build procedure).
- `--multiple-matches lca`: uses the Lowest Common Ancestor algorithm, re-assigning reads with multiple matches to higher common ancestors in the taxonomic tree.
- `--multiple-matches skip`: will not resolve multi-matching reads

!!! Tip
- The Expectation-Maximization can be performed independently with `ganon reassign` using the output files `.rep` and `.all`.
- Reports can be generated independently with `ganon report` using the output file `.rep`

!!! Note
`ganon reassign` can be used independently from `ganon classify` with the output file `.rep` and `.all`
`--multiple-matches lca` paired with `--report-type abundance` or `dist` will distribute read **counts** with multiple matches to one most probable target (defined by `--level` in the build procedure), instead of a higher taxonomic rank. In this case the distribution is simply based on the number of taxa with unique matches and it is not as precise as the EM algorithm, but it will run faster since the per-read basis re-assignment can be skipped.

!!! tip
Database parameters can also influence your results. Lower `--max-fp` (e.g. 0.1, 0.001) and higher `--kmer-size` (e.g. `23`, `27`) will improve sensitivity of your results at cost of a larger database and memory usage
## Classifying more reads

By default, ganon will classify fewer reads in favour of sensitivity. To classify more reads, use less strict `--rel-cutoff` and `--rel-filter` values (e.g. `0.25` and `0`, respectively). More details [here](#cutoff-and-filter-rel-cutoff-rel-filter).

## Multiple and Hierarchical classification

Expand Down
7 changes: 5 additions & 2 deletions docs/custom_databases.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ It is also possible to use **non-standard accessions and headers** to build cust
!!! tip
If you just want to build a database without any taxonomic or target information, just send the files with `--input`, use `--taxonomy skip` and choose between `--input-target file` or `sequence`.

!!! warning
The target and specialization fields (2nd and 4th col) cannot be the same as the node (3rd col).

<details>
<summary>Examples of --input-file</summary>
<br>
Expand Down Expand Up @@ -159,7 +162,7 @@ tail -n+2 genomes-all_metadata.tsv | cut -f 1,20 | xargs -P 12 -n2 sh -c 'curl -
tail -n+2 genomes-all_metadata.tsv | cut -f 1,15 | tr ';' '\t' | awk -F"\t" '{tax="1";for(i=NF;i>1;i--){if(length($i)>3){tax=$i;break;}};print $1".fna.gz\t"$1"\t"tax}' > ganon_input_file.tsv

# Build ganon database
ganon build-custom --input-file ganon_input_file.tsv --db-prefix mgnify_human_oral_v1 --taxonomy gtdb --level leaves --hibf --threads 32
ganon build-custom --input-file ganon_input_file.tsv --db-prefix mgnify_human_oral_v1 --taxonomy gtdb --level leaves --threads 32
```

!!! note
Expand Down Expand Up @@ -222,7 +225,7 @@ awk -v db="$(realpath ${db})" '{file=db"/"substr($2,1,1)"/"$2".fna"; print ">"$1
sort | uniq > "${db}_ganon_input_file.tsv"

# Build ganon database
ganon build-custom --input-file "${db}_ganon_input_file.tsv" --db-prefix "${db}" --hibf --level species --threads 12
ganon build-custom --input-file "${db}_ganon_input_file.tsv" --db-prefix "${db}" --threads 12

# Delete extracted files and auxiliary files
cat "${db}_extracted_files.txt" | xargs rm
Expand Down
Loading

0 comments on commit e626f89

Please sign in to comment.