Merge pull request #123 from pirovc/dev

v0.2.2
pirovc · Apr 23, 2020 · a4d6bb7 · a4d6bb7
2 parents 87858b5 + 8fdc442
commit a4d6bb7
Show file tree

Hide file tree

Showing 9 changed files with 201 additions and 139 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -3,7 +3,7 @@
 # =============================================================================
 
 cmake_minimum_required( VERSION 3.10 FATAL_ERROR )
-project( ganon VERSION 0.2.1 LANGUAGES CXX )
+project( ganon VERSION 0.2.2 LANGUAGES CXX )
 
 # -----------------------------------------------------------------------------
 # build setup

diff --git a/README.md b/README.md
@@ -37,7 +37,7 @@ ganon classify --db-prefix sample_bacteria --single-reads tests/ganon-classify/d
 ### report
 
 ```
-ganon report --db-prefix sample_bacteria --rep-file sample_bateria.rep --min-matches-perc 50 --ranks all --output-report new_report.tre
+ganon report --db-prefix sample_bacteria --rep-file sample_results.rep --min-matches-perc 50 --ranks all --output-report new_report.tre
 ```
 
 ### update
@@ -90,7 +90,7 @@ This is going to download the new files (and remove the outdated ones) into the
 Updating the index based on the files of the example above:
 
 	ganon update --db-prefix ganon_db --output-db-prefix ganon_db_updated \
-	             --input-files $(find RefSeqCG_arc_bac/v2/files/*.genomic.fna.gz -type f)
+	             --input-files $(find RefSeqCG_arc_bac/v2/files/ -name *.genomic.fna.gz -type f)
 
 If `--output-db-prefix` is not set, the database files `ganon_db.*` will be overwritten with the updated version. The `find` command will only look for files `-type f` inside the `v2/files/`, ignoring symbolic links from sequences which were not changing in this version.
 
@@ -115,7 +115,7 @@ The same goes for the update process:
 	# Use generated files on ganon update
 	ganon update --db-prefix ganon_db \
 	             --output-db-prefix ganon_db_updated \
-	             --input-files $(find RefSeqCG_arc_bac/v2/files/*.genomic.fna.gz -type f) \
+	             --input-files $(find RefSeqCG_arc_bac/v2/files/ -name *.genomic.fna.gz -type f) \
 	             --seq-info-file RefSeqCG_arc_bac/v2/seqinfo.txt \
 	             --taxdump-file RefSeqCG_arc_bac/v2/{TIMESTAMP}_taxdump.tar.gz
 

diff --git a/src/ganon-classify/GanonClassify.cpp b/src/ganon-classify/GanonClassify.cpp
@@ -16,6 +16,7 @@
 #include <string>
 #include <tuple>
 #include <type_traits>
+#include <unordered_set>
 #include <vector>
 
 namespace GanonClassify
@@ -59,6 +60,8 @@ typedef std::unordered_map< std::string, Rep >  TRep;
 typedef std::unordered_map< std::string, Node > TTax;
 typedef std::map< uint32_t, std::string >       TMap;
 
+typedef std::unordered_set< std::string > TUniqueTarget;
+
 struct ReadBatches
 {
 
@@ -206,12 +209,12 @@ inline void check_unique( ReadOut& read_out_lca,
     // if kmer count is lower than expected set match to parent node
     if ( read_out_lca.matches[0].kmer_count < threshold_error_unique )
     {
-        read_out_lca.matches[0].target = tax[read_out_lca.matches[0].target].parent; // parent node
-        rep[read_out_lca.matches[0].target].lca_reads++;                             // count as lca for parent
+        read_out_lca.matches[0].target = tax.at( read_out_lca.matches[0].target ).parent; // parent node
+        rep.at( read_out_lca.matches[0].target ).lca_reads++;                             // count as lca for parent
     }
     else
     {
-        rep[read_out_lca.matches[0].target].unique_reads++; // count as unique for target
+        rep.at( read_out_lca.matches[0].target ).unique_reads++; // count as unique for target
     }
 }
 
@@ -232,10 +235,11 @@ void select_matches( TMatches&                matches,
             uint16_t maxKmerCountBin = std::max( selectedBins[binNo], selectedBinsRev[binNo] );
             // keep only the best match target/read when same targets are split in several
             // bins
-            if ( matches.count( filter.map[binNo] ) == 0 || maxKmerCountBin > matches[filter.map[binNo]] )
+
+            if ( matches.count( filter.map.at( binNo ) ) == 0 || maxKmerCountBin > matches[filter.map.at( binNo )] )
             {
                 // store match to target
-                matches[filter.map[binNo]] = maxKmerCountBin;
+                matches[filter.map.at( binNo )] = maxKmerCountBin;
                 if ( maxKmerCountBin > maxKmerCountRead )
                     maxKmerCountRead = maxKmerCountBin;
             }
@@ -323,7 +327,7 @@ uint32_t filter_matches( ReadOut&  read_out,
     { // matches[target] = kmerCount
         if ( v.second >= threshold_strata )
         { // apply strata filter
-            rep[v.first].direct_matches++;
+            rep.at( v.first ).direct_matches++;
             read_out.matches.push_back( ReadMatch{ v.first, v.second } );
         }
     }
@@ -340,7 +344,7 @@ void lca_matches( ReadOut& read_out, ReadOut& read_out_lca, uint16_t max_kmer_co
     }
 
     std::string target_lca = lca.getLCA( targets );
-    rep[target_lca].lca_reads++;
+    rep.at( target_lca ).lca_reads++;
     read_out_lca.matches.push_back( ReadMatch{ target_lca, max_kmer_count_read } );
 }
 
@@ -444,7 +448,7 @@ void classify( std::vector< Filter >&    filters,
                         }
                         else
                         {
-                            rep[read_out.matches[0].target].unique_reads++;
+                            rep.at( read_out.matches[0].target ).unique_reads++;
                         }
                     }
                     else
@@ -499,8 +503,12 @@ void write_report( TRep&       rep,
 
     for ( auto const& [target, report] : rep )
     {
-        out_rep << hierarchy_label << '\t' << target << '\t' << report.direct_matches << '\t' << report.unique_reads
-                << '\t' << report.lca_reads << '\t' << tax[target].rank << '\t' << tax[target].name << '\n';
+        if ( report.direct_matches || report.lca_reads || report.unique_reads )
+        {
+            out_rep << hierarchy_label << '\t' << target << '\t' << report.direct_matches << '\t' << report.unique_reads
+                    << '\t' << report.lca_reads << '\t' << tax.at( target ).rank << '\t' << tax.at( target ).name
+                    << '\n';
+        }
     }
 
     if ( hierarchy_last )
@@ -511,7 +519,10 @@ void write_report( TRep&       rep,
     out_rep.close();
 }
 
-void load_filters( std::vector< Filter >& filters, std::string hierarchy_label, Config& config )
+bool load_filters( std::vector< Filter >& filters,
+                   TUniqueTarget&         unique_targets,
+                   std::string            hierarchy_label,
+                   Config&                config )
 {
     uint16_t first_kmer_size = 0;
     for ( auto const& filter_config : config.parsed_hierarchy[hierarchy_label].filters )
@@ -560,10 +571,19 @@ void load_filters( std::vector< Filter >& filters, std::string hierarchy_label,
                 fields.push_back( field );
             // target <tab> binid
             map[std::stoul( fields[1] )] = fields[0];
+            unique_targets.insert( fields[0] );
+
             // std::cerr << std::stoul( fields[1] ) << " -- " << fields[0] << std::endl;
         }
         infile.close();
 
+        // Check consistency of map file and ibf file
+        if ( map.size() != filter.noOfBins )
+        {
+            std::cerr << "ERROR: .ibf and .map files are inconsistent." << std::endl;
+            return false;
+        }
+
         // tax file
         infile.open( filter_config.tax_file );
         while ( std::getline( infile, line, '\n' ) )
@@ -581,6 +601,8 @@ void load_filters( std::vector< Filter >& filters, std::string hierarchy_label,
 
         filters.push_back( Filter{ std::move( filter ), map, tax, filter_config } );
     }
+
+    return true;
 }
 
 void print_time( const StopClock& timeGanon, const StopClock& timeLoadFilters, const StopClock& timeClassPrint )
@@ -809,6 +831,19 @@ TTax merge_tax( std::vector< detail::Filter >& filters )
     }
 }
 
+void validate_targets_tax( const TUniqueTarget unique_targets, TTax& tax )
+{
+    for ( const auto& target : unique_targets )
+    {
+        if ( tax.count( target ) == 0 )
+        {
+            tax[target] = Node{ "1", "no rank", target };
+            std::cerr << "WARNING: target [" << target << "] without tax entry, setting parent node to 1 (root)"
+                      << std::endl;
+        }
+    }
+}
+
 void pre_process_lca( LCA& lca, TTax& tax )
 {
     for ( auto const& [target, node] : tax )
@@ -898,14 +933,29 @@ bool run( Config config )
 
         timeLoadFilters.start();
         std::vector< detail::Filter > filters;
-        detail::load_filters( filters, hierarchy_label, config );
+        detail::TUniqueTarget         unique_targets;
+        bool                          loaded = detail::load_filters( filters, unique_targets, hierarchy_label, config );
+        if ( !loaded )
+        {
+            return false;
+        }
         timeLoadFilters.stop();
 
+        // merge repeated elements
+        detail::TTax tax = detail::merge_tax( filters );
+
         // Reports
         detail::TRep rep;
 
-        // merge repeated elements
-        detail::TTax tax = detail::merge_tax( filters );
+        // initialize entries for the report with all possible taxa (to direct access it in multiple threads)
+        for ( const auto& it : tax )
+        {
+            rep[it.first]; // initialize using [] due to std::atomic being not movable/copyable
+        }
+
+        // validate targets and tax
+        // if target not found in tax, add node target with parent = "1" (root)
+        detail::validate_targets_tax( unique_targets, tax );
 
         // pre-processing of nodes
         LCA lca;
@@ -990,6 +1040,7 @@ bool run( Config config )
         {
             task.get();
         }
+
         // Inform that no more reads are going to be pushed
         classified_all_queue.notify_push_over();
         classified_lca_queue.notify_push_over();
@@ -1003,7 +1054,6 @@ bool run( Config config )
             detail::write_report(
                 rep, tax, stats, hierarchy_config.output_file_rep, hierarchy_label, hierarchy_first, hierarchy_last );
         }
-
         // Wait here until all files are written
         for ( auto&& task : write_tasks )
         {

diff --git a/src/ganon/ganon.py b/src/ganon/ganon.py
@@ -7,7 +7,7 @@
 
 def main(arguments=None):
 
-    version = '0.2.1'
+    version = '0.2.2'
 
     ####################################################################################################