Merge pull request #195 from pirovc/dev

ganon version 1.1.2
pirovc · Jan 28, 2022 · fced6f2 · fced6f2
2 parents 28c94f4 + c07c443
commit fced6f2
Show file tree

Hide file tree

Showing 6 changed files with 31 additions and 16 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -3,7 +3,7 @@
 # =============================================================================
 
 cmake_minimum_required( VERSION 3.10 FATAL_ERROR )
-project( ganon VERSION 1.1.1 LANGUAGES CXX )
+project( ganon VERSION 1.1.2 LANGUAGES CXX )
 
 # -----------------------------------------------------------------------------
 # build setup

diff --git a/LICENSE b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2021 Vitor C. Piro
+Copyright (c) 2022 Vitor C. Piro pirovc.github.io
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

diff --git a/setup.py b/setup.py
@@ -13,11 +13,10 @@ def read(filename):
 
 setup(
     name="ganon",
-    version="1.1.1",
+    version="1.1.2",
     url="https://www.github.com/pirovc/ganon",
     license='MIT',
     author="Vitor C. Piro",
-    author_email="pirovc@posteo.net",
     description="ganon is a k-mer based read classification tool which uses Interleaved Bloom Filters in conjunction with a taxonomic clustering and a k-mer counting-filtering scheme.",
     long_description=read("README.md"),
     package_dir={'': 'src'},

diff --git a/src/ganon/build_update.py b/src/ganon/build_update.py
@@ -500,10 +500,14 @@ def retrieve_seqinfo(seqinfo, tmp_output_folder, input_files, cfg):
         for acc2txid in seq_info_mode:
             dowloaded_acc2txid_files.append(get_accession2taxid(acc2txid, tmp_output_folder, cfg.quiet))
 
-        print_log("Parsing accession2taxid files", cfg.quiet)     
+        print_log("Parsing accession2taxid files", cfg.quiet)
         count_acc2txid = parse_acc2txid(seqinfo, dowloaded_acc2txid_files)
         for acc2txid_file, cnt in count_acc2txid.items():
             print_log(" - " + str(cnt) + " entries found in the " + acc2txid_file.split("/")[-1] + " file", cfg.quiet)
+
+        # filter out taxids not found
+        seqinfo.dropna(subset=['taxid'])
+
         # Check if retrieved taxids are the same as number of inputs, reset counter
         if seqinfo.size() < seqid_total_count:
             print_log(" - could not retrieve taxid for " + str(seqid_total_count - seqinfo.size()) + " accessions", cfg.quiet)
@@ -516,6 +520,7 @@ def retrieve_seqinfo(seqinfo, tmp_output_folder, input_files, cfg):
             parse_eutils(seqinfo, tmp_output_folder, cfg.path_exec['get_seq_info'], cfg.quiet, skip_len_taxid=True, get_assembly=True)
             print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n", cfg.quiet)
 
+
 def get_accession2taxid(acc2txid, tmp_output_folder, quiet):
     tx = time.time()
     acc2txid_file = acc2txid + ".accession2taxid.gz"
@@ -526,21 +531,32 @@ def get_accession2taxid(acc2txid, tmp_output_folder, quiet):
     print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n", quiet)
     return acc2txid_file
 
+
 def parse_acc2txid(seqinfo, acc2txid_files):
     count_acc2txid = {}
     unique_seqids = set(seqinfo.get_seqids())
     for acc2txid in acc2txid_files:
-        tmp_seqid_taxids = pd.read_csv(acc2txid, sep='\t', header=None, skiprows=1, usecols=[1,2], names=['seqid','taxid'], converters={'seqid':lambda x: x if x in unique_seqids else ""}, dtype={'taxid': 'str'})
-        tmp_seqid_taxids = tmp_seqid_taxids[tmp_seqid_taxids['seqid']!=""] #keep only seqids used
-        tmp_seqid_taxids = tmp_seqid_taxids[tmp_seqid_taxids['taxid']!="0"] # filter out taxid==0
+        tmp_seqid_taxids = pd.read_csv(acc2txid,
+                                       sep='\t',
+                                       header=None,
+                                       skiprows=1,
+                                       usecols=[1, 2],
+                                       names=['seqid', 'taxid'],
+                                       index_col='seqid',
+                                       converters={'seqid': lambda x: x if x in unique_seqids else None},
+                                       dtype={'taxid': 'str'})
+        tmp_seqid_taxids = tmp_seqid_taxids[tmp_seqid_taxids.index.notnull()] # keep only seqids used
+        tmp_seqid_taxids = tmp_seqid_taxids[tmp_seqid_taxids['taxid'] != "0"]  # filter out taxid==0
+
         # save count to return
-        count_acc2txid[acc2txid] = tmp_seqid_taxids.shape[0]    
-        # merge taxid retrieved based on seqid
-        seqinfo.join(tmp_seqid_taxids, "taxid")
+        count_acc2txid[acc2txid] = tmp_seqid_taxids.shape[0]
+        # merge taxid retrieved based on seqid (if any)
+        if count_acc2txid[acc2txid]:
+            seqinfo.join(tmp_seqid_taxids, "taxid")
         del tmp_seqid_taxids
         # if all seqids were already found, there is no need to parse the remaining files
-        if sum(count_acc2txid.values()) == len(unique_seqids): 
-            break 
+        if sum(count_acc2txid.values()) == len(unique_seqids):
+            break
 
     return count_acc2txid
 

diff --git a/src/ganon/config.py b/src/ganon/config.py
@@ -4,7 +4,7 @@
 
 class Config:
 
-    version = '1.1.1'
+    version = '1.1.2'
     path_exec = {'build': "", 'classify': "", 'get_seq_info': ""}
     empty = False
 
@@ -26,7 +26,7 @@ def __init__(self, which: str=None, **kwargs):
         build_group_important.add_argument('-r', '--rank',            type=str,            metavar='', default='species', help='Target taxonomic rank for classification [species,genus,...]. use "leaves" to use the leaf taxonomic node assigned to each sequence as targets. To use assembly, strain or further specializations, check --specialization. Default: species')
         build_group_important.add_argument('-m', '--max-filter-size', type=float,          metavar='',                    help='Given an approx. upper limit in Megabytes (MB) for filter/memory usage. When using --window-size, filter may be significantly smaller after build depending on the level of similarity of your input sequences. [Mutually exclusive --bin-length]')
         build_group_important.add_argument('-f', '--max-fp',          type=float,          metavar='', default=0.05,      help='Max. false positive rate for bloom filters [Mutually exclusive --filter-size]. Default: 0.05')
-        
+
         build_group_filter = build_parser.add_argument_group('filter arguments')
         build_group_filter.add_argument('-k', '--kmer-size',             type=int,      metavar='', default=19,        help='The k-mer size to split sequences. Default: 19')
         build_group_filter.add_argument('-w', '--window-size',           type=int,      metavar='', default=0,         help='The window-size to build filter with minimizers. 0 to turn it off. Default: 0')

diff --git a/src/ganon/seqinfo.py b/src/ganon/seqinfo.py
@@ -41,7 +41,7 @@ def dropna(self, subset):
         self.seqinfo.dropna(subset=subset, inplace=True)
 
     def join(self, df, field):
-        self.seqinfo[field] = self.seqinfo.join(df.set_index('seqid'), on="seqid", how="left", rsuffix="_tojoin")[field+"_tojoin"]        
+        self.seqinfo[field] = self.seqinfo.join(df, on="seqid", how="left", rsuffix="_tojoin")[field+"_tojoin"]
 
     def parse_seq_info_file(self, seq_info_file, use_specialization: bool=False):
         self.seqinfo = pd.read_csv(seq_info_file,