Skip to content

Commit

Permalink
Merge pull request #195 from pirovc/dev
Browse files Browse the repository at this point in the history
ganon version 1.1.2
  • Loading branch information
pirovc authored Jan 28, 2022
2 parents 28c94f4 + c07c443 commit fced6f2
Show file tree
Hide file tree
Showing 6 changed files with 31 additions and 16 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# =============================================================================

cmake_minimum_required( VERSION 3.10 FATAL_ERROR )
project( ganon VERSION 1.1.1 LANGUAGES CXX )
project( ganon VERSION 1.1.2 LANGUAGES CXX )

# -----------------------------------------------------------------------------
# build setup
Expand Down
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
MIT License

Copyright (c) 2021 Vitor C. Piro
Copyright (c) 2022 Vitor C. Piro pirovc.github.io

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
3 changes: 1 addition & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,10 @@ def read(filename):

setup(
name="ganon",
version="1.1.1",
version="1.1.2",
url="https://www.github.com/pirovc/ganon",
license='MIT',
author="Vitor C. Piro",
author_email="pirovc@posteo.net",
description="ganon is a k-mer based read classification tool which uses Interleaved Bloom Filters in conjunction with a taxonomic clustering and a k-mer counting-filtering scheme.",
long_description=read("README.md"),
package_dir={'': 'src'},
Expand Down
34 changes: 25 additions & 9 deletions src/ganon/build_update.py
Original file line number Diff line number Diff line change
Expand Up @@ -500,10 +500,14 @@ def retrieve_seqinfo(seqinfo, tmp_output_folder, input_files, cfg):
for acc2txid in seq_info_mode:
dowloaded_acc2txid_files.append(get_accession2taxid(acc2txid, tmp_output_folder, cfg.quiet))

print_log("Parsing accession2taxid files", cfg.quiet)
print_log("Parsing accession2taxid files", cfg.quiet)
count_acc2txid = parse_acc2txid(seqinfo, dowloaded_acc2txid_files)
for acc2txid_file, cnt in count_acc2txid.items():
print_log(" - " + str(cnt) + " entries found in the " + acc2txid_file.split("/")[-1] + " file", cfg.quiet)

# filter out taxids not found
seqinfo.dropna(subset=['taxid'])

# Check if retrieved taxids are the same as number of inputs, reset counter
if seqinfo.size() < seqid_total_count:
print_log(" - could not retrieve taxid for " + str(seqid_total_count - seqinfo.size()) + " accessions", cfg.quiet)
Expand All @@ -516,6 +520,7 @@ def retrieve_seqinfo(seqinfo, tmp_output_folder, input_files, cfg):
parse_eutils(seqinfo, tmp_output_folder, cfg.path_exec['get_seq_info'], cfg.quiet, skip_len_taxid=True, get_assembly=True)
print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n", cfg.quiet)


def get_accession2taxid(acc2txid, tmp_output_folder, quiet):
tx = time.time()
acc2txid_file = acc2txid + ".accession2taxid.gz"
Expand All @@ -526,21 +531,32 @@ def get_accession2taxid(acc2txid, tmp_output_folder, quiet):
print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n", quiet)
return acc2txid_file


def parse_acc2txid(seqinfo, acc2txid_files):
    """Fill in taxids on ``seqinfo`` from one or more accession2taxid tables.

    Each file is read as a tab-separated table whose first row is skipped;
    the 2nd and 3rd columns are taken as ``seqid`` and ``taxid``
    (presumably the accession.version and taxid columns of NCBI
    accession2taxid dumps -- confirm against the downloader).

    Args:
        seqinfo: object exposing ``get_seqids()`` (iterable of sequence ids)
            and ``join(df, field)`` (merges a seqid-indexed frame into it).
        acc2txid_files: iterable of paths readable by ``pandas.read_csv``.

    Returns:
        dict mapping each parsed file path to the number of requested
        seqids found in it with a taxid other than "0". Files that come
        after every requested seqid has been found are not parsed and do
        not appear in the dict.
    """
    count_acc2txid = {}
    unique_seqids = set(seqinfo.get_seqids())
    # Distinct seqids resolved so far. Summing the per-file counts instead
    # would count a seqid present in two files twice and could stop parsing
    # before every requested seqid has actually been seen.
    found_seqids = set()

    for acc2txid in acc2txid_files:
        # The converter blanks out (None) every seqid that was not requested
        # while the file is being parsed, so unused accessions are never
        # kept as strings in memory.
        tmp_seqid_taxids = pd.read_csv(acc2txid,
                                       sep='\t',
                                       header=None,
                                       skiprows=1,
                                       usecols=[1, 2],
                                       names=['seqid', 'taxid'],
                                       index_col='seqid',
                                       converters={'seqid': lambda x: x if x in unique_seqids else None},
                                       dtype={'taxid': 'str'})
        tmp_seqid_taxids = tmp_seqid_taxids[tmp_seqid_taxids.index.notnull()]  # keep only seqids used
        tmp_seqid_taxids = tmp_seqid_taxids[tmp_seqid_taxids['taxid'] != "0"]  # filter out taxid==0

        # save count to return
        count_acc2txid[acc2txid] = tmp_seqid_taxids.shape[0]
        # merge taxid retrieved based on seqid (if any)
        if count_acc2txid[acc2txid]:
            seqinfo.join(tmp_seqid_taxids, "taxid")
            found_seqids.update(tmp_seqid_taxids.index)
        # release the parsed table before reading the next file
        del tmp_seqid_taxids

        # if all seqids were already found, no need to parse the remaining files
        if len(found_seqids) == len(unique_seqids):
            break

    return count_acc2txid

Expand Down
4 changes: 2 additions & 2 deletions src/ganon/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

class Config:

version = '1.1.1'
version = '1.1.2'
path_exec = {'build': "", 'classify': "", 'get_seq_info': ""}
empty = False

Expand All @@ -26,7 +26,7 @@ def __init__(self, which: str=None, **kwargs):
build_group_important.add_argument('-r', '--rank', type=str, metavar='', default='species', help='Target taxonomic rank for classification [species,genus,...]. use "leaves" to use the leaf taxonomic node assigned to each sequence as targets. To use assembly, strain or further specializations, check --specialization. Default: species')
build_group_important.add_argument('-m', '--max-filter-size', type=float, metavar='', help='Given an approx. upper limit in Megabytes (MB) for filter/memory usage. When using --window-size, filter may be significantly smaller after build depending on the level of similarity of your input sequences. [Mutually exclusive --bin-length]')
build_group_important.add_argument('-f', '--max-fp', type=float, metavar='', default=0.05, help='Max. false positive rate for bloom filters [Mutually exclusive --filter-size]. Default: 0.05')

build_group_filter = build_parser.add_argument_group('filter arguments')
build_group_filter.add_argument('-k', '--kmer-size', type=int, metavar='', default=19, help='The k-mer size to split sequences. Default: 19')
build_group_filter.add_argument('-w', '--window-size', type=int, metavar='', default=0, help='The window-size to build filter with minimizers. 0 to turn it off. Default: 0')
Expand Down
2 changes: 1 addition & 1 deletion src/ganon/seqinfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def dropna(self, subset):
self.seqinfo.dropna(subset=subset, inplace=True)

def join(self, df, field):
    """Merge column ``field`` of ``df`` into ``self.seqinfo`` by seqid.

    Args:
        df: DataFrame indexed by seqid and carrying a ``field`` column.
        field: name of the column to merge; it must already exist in
            ``self.seqinfo`` (the ``_tojoin`` suffix is only applied to
            overlapping column names).

    Values coming from ``df`` take precedence. Rows whose seqid is absent
    from ``df`` keep their current value, so calling this repeatedly with
    partial tables accumulates results instead of erasing earlier ones.
    """
    incoming = self.seqinfo.join(df, on="seqid", how="left", rsuffix="_tojoin")[field + "_tojoin"]
    # fall back to the existing value wherever the incoming table has no entry
    self.seqinfo[field] = incoming.combine_first(self.seqinfo[field])

def parse_seq_info_file(self, seq_info_file, use_specialization: bool=False):
self.seqinfo = pd.read_csv(seq_info_file,
Expand Down

0 comments on commit fced6f2

Please sign in to comment.