From 2ae9b108c67147dad445e50840a6ca59e1bf5689 Mon Sep 17 00:00:00 2001 From: David Sundell Date: Thu, 22 Jul 2021 10:37:53 +0200 Subject: [PATCH 01/15] README update --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8991fcf..bbbb8d5 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ New to FlexTaxD? --> see [wiki](https://github.com/FOI-Bioinformatics/flextaxd/w Supported input formats in version later than v0.4.2: For details see [Formats-wiki](https://github.com/FOI-Bioinformatics/flextaxd/wiki/Formats) * NCBI * TSV -* MPI-style +* MP-style * GTDB * QIIME * SILVA @@ -20,7 +20,7 @@ Supported database build programs * krakenuniq * centrifuge -The flextaxd (flextaxd) script allows customization of databases from NCBI, QIIME or CanSNPer source formats and supports export functions into NCBI formatted names and nodes.dmp files as well as a standard tab separated file (or a selected separation). +The flextaxd (flextaxd) script allows customization of databases from several sources (See supported source formats above) and supports export functions into NCBI formatted names and nodes.dmp files as well as a standard tab separated file (or a selected separation). ### Create database * Create your database from source files --taxonomy_file * and select --taxonomy_type [supported formats] From 4f114bbafd8fc8eef3acbd3c2da08e2d9a6d4a9b Mon Sep 17 00:00:00 2001 From: David Sundell Date: Fri, 29 Apr 2022 09:45:48 +0200 Subject: [PATCH 02/15] Option to use a file with GCF/GCA names to download files added --- flextaxd/create_databases.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/flextaxd/create_databases.py b/flextaxd/create_databases.py index cd6792f..2fb99dd 100644 --- a/flextaxd/create_databases.py +++ b/flextaxd/create_databases.py @@ -91,6 +91,7 @@ def __init__(self, message,expression=""): download_opts.add_argument('-p', '--processes',metavar="",type=int, default = 8, help="Use multiple cores for downloading genomes and kraken if -kp is not set") download_opts.add_argument('--download', action='store_true', help="Download additional sequences") download_opts.add_argument('-r', '--representative', action='store_true', help="Download GTDB representative genomes") + download_opts.add_argument('--download_file', default=False, help="Download genomes from file (path to file)") download_opts.add_argument('--rep_path', metavar="URL", default=latest_genome_reps, help="Specify GTDB representative version URL full path") download_opts.add_argument('--force_download', action='store_true', help="Download sequences from genbank if not in refseq (WARNING: might include genome withdrawals)") download_opts.add_argument('--genomes_path', metavar="",default=None, help='path to genomes') @@ -125,6 +126,8 @@ def __init__(self, message,expression=""): if not os.path.exists(args.database): raise FileNotFoundError("No database file could be found, please provide a FlexTaxD database to run FlexTaxD!") + if args.create_db and not args.genomes_path: + raise InputError("genomes_path parameter was not given") if not os.path.exists(args.genomes_path): raise FileNotFoundError("Directory {path} does not exist".format(path=args.genomes_path)) @@ -186,16 +189,19 @@ def __init__(self, message,expression=""): process_directory_obj = process_directory(args.database) genomes, missing = process_directory_obj.process_folder(args.genomes_path) ''' 2. Download missing files''' - if args.download or args.representative: + if args.download or args.representative or args.download_file: download = dynamic_import("modules", "DownloadGenomes") download_obj = download(args.processes,outdir=args.outdir,force=args.force_download,download_path=args.genomes_path) - new_genome_path, missing = download_obj.run(missing,representative=args.representative,url=args.rep_path) - if not new_genome_path: - still_missing = missing - if len(still_missing) > 0: print("Not able to download: {nr}".format(nr=len(still_missing))) + if args.download_file: + download_obj.download_from_file(args.download_file) else: - new_genomes, missing = process_directory_obj.process_folder(new_genome_path) - genomes += new_genomes + new_genome_path, missing = download_obj.run(missing,args.rep_path) + if not new_genome_path: + still_missing = missing + if len(still_missing) > 0: print("Not able to download: {nr}".format(nr=len(still_missing))) + else: + new_genomes, missing = process_directory_obj.process_folder(new_genome_path) + genomes += new_genomes else: if len(missing) > 0: logger.info("Genome annotations with no matching source: {nr}".format(nr=len(missing))) From e0e703e0faff647662b65375b02552875c917648 Mon Sep 17 00:00:00 2001 From: David Sundell Date: Fri, 29 Apr 2022 09:58:38 +0200 Subject: [PATCH 03/15] Function to download genomes from file added --- flextaxd/modules/DownloadGenomes.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/flextaxd/modules/DownloadGenomes.py b/flextaxd/modules/DownloadGenomes.py index 470424c..eb67b97 100644 --- a/flextaxd/modules/DownloadGenomes.py +++ b/flextaxd/modules/DownloadGenomes.py @@ -172,6 +172,16 @@ def download_files(self,files): logger.error("Could not stop all subprocesses check your process manager and end them manually!") return self.not_downloaded + def download_from_file(self,inputfile,exclude="",all=False): + '''Download files with ncbis download script using an accession input file''' + if not all: + exclude = " ".join([exclude,"--exclude-gff3 --exclude-protein --exclude-rna --exclude-genomic-cds"]) + args = ["genome", "accession" ,"--inputfile", input_file, exclude"] + p = Popen(args, stdout=PIPE,cwd=self.location) + (output, err) = p.communicate() + p_status = p.wait() + return self.location + def run(self, files, representative=False,url=""): '''Download list of GCF and or GCA files from NCBI or download represenative genomes From 1d1915548371cab537dea700cb9383b3c2b58dc3 Mon Sep 17 00:00:00 2001 From: David Sundell Date: Fri, 29 Apr 2022 09:59:49 +0200 Subject: [PATCH 04/15] Reference value added when storing genomes (refseq/genbank) --- flextaxd/modules/ReadTaxonomy.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/flextaxd/modules/ReadTaxonomy.py b/flextaxd/modules/ReadTaxonomy.py index 019aa74..90e39ea 100644 --- a/flextaxd/modules/ReadTaxonomy.py +++ b/flextaxd/modules/ReadTaxonomy.py @@ -141,16 +141,23 @@ def read_nodes(self, treefile=False): self.length +=1 self.database.commit() - def parse_genomeid2taxid(self,genomeid2taxid): + def parse_genomeid2taxid(self,genomeid2taxid,reference=False): '''Parse file that annotates genome_id´s to nodes in the tree''' nodeDict = self.database.get_nodes() + _ref = reference with self.zopen(genomeid2taxid,"rt") as f: headers = f.readline().strip().split("\t") for row in f: if row.strip() != "": ## If there are trailing empty lines in the file - genomeid,taxid = row.strip().split("\t") try: - self.database.add_genome(genome=genomeid.strip(),_id=nodeDict[taxid.strip()]) + genomeid,taxid = row.strip().split("\t") + except: + if not _ref: ## override if there is a reference in file, and use given ref + genomeid,taxid,reference = row.strip().split("\t") + else: + genomeid,taxid,override = row.strip().split("\t") + try: + self.database.add_genome(genome=genomeid.strip(),_id=nodeDict[taxid.strip()],reference=reference) except KeyError: logger.warning("# WARNING: {taxid} not found in the database".format(taxid=taxid)) self.database.commit() From ad706a54c9bffbe171d7082309cf4b239262fc13 Mon Sep 17 00:00:00 2001 From: David Sundell Date: Fri, 29 Apr 2022 10:02:10 +0200 Subject: [PATCH 05/15] Function to define/store source added (refseq/genbank) --- flextaxd/modules/ReadTaxonomyNCBI.py | 4 +- flextaxd/modules/ReadTaxonomyQIIME.py | 72 ++++++++++++--------------- 2 files changed, 35 insertions(+), 41 deletions(-) diff --git a/flextaxd/modules/ReadTaxonomyNCBI.py b/flextaxd/modules/ReadTaxonomyNCBI.py index 7364f28..d4176c2 100644 --- a/flextaxd/modules/ReadTaxonomyNCBI.py +++ b/flextaxd/modules/ReadTaxonomyNCBI.py @@ -83,7 +83,7 @@ def parse_genebank_file(self,filepath,filename): self.refseqid_to_GCF[refseqid] = genebankid return - def parse_genomeid2taxid(self, genomes_path,annotation_file): + def parse_genomeid2taxid(self, genomes_path,annotation_file,reference="refseq"): '''To allow NCBI databases to be build from scratch the sequences names needs to be stored in the database, this function parses the accession2taxid file from NCBI to speed up the function and reduce the amount of stored datata only sequences in input genomes_path will be fetched @@ -116,7 +116,7 @@ def parse_genomeid2taxid(self, genomes_path,annotation_file): logger.info("Error on first line in annotation file, check format!") try: genebankid = self.refseqid_to_GCF[refseqid] - self.database.add_genome(genome=genebankid,_id=taxid.decode("utf-8")) + self.database.add_genome(genome=genebankid,_id=taxid.decode("utf-8"),reference=reference) annotated_genome.add(refseqid) except KeyError: pass diff --git a/flextaxd/modules/ReadTaxonomyQIIME.py b/flextaxd/modules/ReadTaxonomyQIIME.py index e0d0a25..42d7189 100644 --- a/flextaxd/modules/ReadTaxonomyQIIME.py +++ b/flextaxd/modules/ReadTaxonomyQIIME.py @@ -4,23 +4,22 @@ Read QIIME formatted taxonomy files and holds a dictionary with taxonomy tree and name translation ''' -from .ReadTaxonomy import ReadTaxonomy,InputError +from .ReadTaxonomy import ReadTaxonomy from .database.DatabaseConnection import DatabaseFunctions import logging logger = logging.getLogger(__name__) class ReadTaxonomyQIIME(ReadTaxonomy): """docstring for ReadTaxonomyQIIME.""" - def __init__(self, taxonomy_file=False, names_dmp=False, database=False, verbose=False, taxid_base=1,skip_annotation=False): - super(ReadTaxonomyQIIME, self).__init__(database=database,verbose=verbose) - #self.database = DatabaseFunctions(database,verbose=verbose) + def __init__(self, taxonomy_file=False, names_dmp=False, database=False, verbose=False, taxid_base=1,**kwargs): + super(ReadTaxonomyQIIME, self).__init__(taxonomy_file=taxonomy_file, database=database,verbose=verbose,**kwargs) + self.database = DatabaseFunctions(database,verbose=verbose) self.input = taxonomy_file self.names = {} self.taxid_base = taxid_base - #self.taxonomy = {} + self.taxonomy = {} self.length = 0 self.ids = 0 - self.skip_annotation = skip_annotation self.levelDict = { "n": "no rank", "sk": "superkingdom", @@ -45,8 +44,8 @@ def __init__(self, taxonomy_file=False, names_dmp=False, database=False, verbose oth_id = self.add_node("Other") unc_id = self.add_node("Unclassified") - self.add_rank("n",qiime=True) - self.add_rank("sk",qiime=True) + self.add_rank("n") + self.add_rank("sk") ## Add basic links self.add_link(child=rootid, parent=rootid,rank="n") self.add_link(child=coid, parent=rootid,rank="n") @@ -68,7 +67,7 @@ def parse_tree(self,tree,current_i=0): if description.strip() == "": return False if level not in self.taxonomy: - self.add_rank(level,qiime=True) + self.add_rank(level) if current_i == len(tree)-1: '''Top parent reached return top parent id''' try: @@ -93,59 +92,54 @@ def parse_description(self,tree,current_i): '''Retrieve node description from QIIME formatted tree''' current_level = tree[current_i] level, description = current_level.split("__") + '''Fix greengenes bug with empty end nodes''' + if description == "": + level, description = self.parse_description(tree,current_i+1) return level,description - def qiime_to_tree(self, sep="\t"): + + def qiime_to_tree(self, sep="\t",reference=False): '''Read the qiime format file and parse out the relation tree (nodes.dmp)''' self.sep = sep self.tree = set() self.missed = 0 self.errors = 0 self.added = 0 + _ref = reference + refDict = {"RS":"refseq","GB":"genbank"} ## Refdict for GTDB formatted sources taxid_start = self.taxid_base - raiseWarning = False - if self.skip_annotation: - logger.info("Skip annotation is true, annotations in QIIME input file will be ignored.") with open(self.input) as f: '''Each row defines a genome annotation file connected to a tree level''' for row in f: if row.strip() != "": ## If there are trailing empty lines in the file data = row.strip().split("\t") - if self.skip_annotation: - genome_id = False - elif len(data) != 2: ## If the QIIME file is annotated it will contain two columns - if len(data) > 2: - raise InputError("The input format is not correct for QIIME. THe QIIME source format requires one [QIIME-tree] or two columns [gene_id\\tQIIME-tree]") - raiseWarning = True - genome_id = False - else: ## Annotation is given in column one - try: - if data[0].startswith(("RS","GB")): - '''GTDB genome annotations contain one additional annotation to their genome names eg. RS_, this function removes this''' - genome_id = data[0].split("_",1)[1].strip() ## Genome ID - else: - genome_id = data[0].strip() - #print(genome_id) - except IndexError: - logger.debug("Row {row} could not be parsed".format(row=data)) - self.errors +=1 + try: + if data[0].startswith(("RS","GB")): + '''GTDB genome annotations contain one additional annotation to their genome names eg. RS_, this function removes this''' + reference,genome_id = data[0].split("_",1) ## Genome ID + genome_id = genome_id.strip() + if not _ref: + reference = refDict[reference] + else: + genome_id = data[0].strip() + '''Greengenes adaption, empty levels''' + #print(genome_id) + except IndexError: + logger.debug("Row {row} could not be parsed".format(row=data)) + self.errors +=1 ### Walk through tree and make sure all nodes back to root are annotated! taxonomy = list(reversed(data[-1].split(";"))) taxonomy_i = self.parse_tree(taxonomy) - if taxonomy_i and genome_id: - test = self.database.add_genome(genome=genome_id,_id=taxonomy_i) + if taxonomy_i: + test = self.database.add_genome(genome=genome_id,_id=taxonomy_i,reference=reference) if not str(test).startswith("UNIQUE"): self.added +=1 - elif not genome_id: ## No genome id suggest a QIIME file with no annotation - pass else: logger.debug("Warning taxonomy: {taxonomy} could not be parsed!!") self.missed +=1 self.database.commit() self.length = self.taxid_base - taxid_start - if raiseWarning: logger.warning("Warning no genomeid2taxid file given and your QIIME formatted file does not seem to contain annotations!") - if not self.skip_annotation: - logger.info("Genomes added to database: {genomes}".format(genomes=self.added)) - logger.debug("Genomes not added to database {missed} errors {errors}".format(missed=self.missed,errors=self.errors)) + logger.info("Genomes added to database: {genomes}".format(genomes=self.added)) + logger.debug("Genomes not added to database {missed} errors {errors}".format(missed=self.missed,errors=self.errors)) logger.info("New taxonomy ids assigned {taxidnr}".format(taxidnr=self.length)) From 6d4836fb1e2d0aafe6640a46c2228f55b0db6d07 Mon Sep 17 00:00:00 2001 From: David Sundell Date: Fri, 29 Apr 2022 10:04:16 +0200 Subject: [PATCH 06/15] dump genomes function added --- flextaxd/modules/WriteTaxonomy.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/flextaxd/modules/WriteTaxonomy.py b/flextaxd/modules/WriteTaxonomy.py index 204b648..d00e786 100644 --- a/flextaxd/modules/WriteTaxonomy.py +++ b/flextaxd/modules/WriteTaxonomy.py @@ -10,7 +10,7 @@ class WriteTaxonomy(object): """docstring for WriteTaxonomy.""" - def __init__(self, path, database=".taxonomydb",separator="\t|\t",minimal=False,prefix="names,nodes",desc=False,dbprogram=None): + def __init__(self, path, database=".taxonomydb",separator="\t|\t",minimal=False,prefix="names,nodes",desc=False,dbprogram=None,dump_genomes=False): super(WriteTaxonomy, self).__init__() self.database = DatabaseFunctions(database) logging.debug("Write settings: ") @@ -39,6 +39,26 @@ def __init__(self, path, database=".taxonomydb",separator="\t|\t",minimal=False, self.link_order = False ## Default print is NCBI structure with child in the first column logging.debug("NCBI structure (child first): {parent}".format(parent=self.link_order)) + def dump_genomes(self): + '''Write the list of annotated genomes to a file''' + with open('{}{}.dmp'.format(self.path,"genomes"),"w") as outputfile: + genomes = self.get_all('genomes', 'genome,reference', sort="reference") + for genome in genomes: + print(*genome, sep="\t", end="\n", file=outputfile) + return + + def dump_genome_annotations(self, sort="reference"): + '''Dump all genomes, including their taxonomy reference (will work as input file for genomeid2taxid)''' + select = "genome,name,reference" + QUERY = "SELECT {select} FROM genomes JOIN nodes ON nodes.id=genomes.id".format(select=select) + if sort: + QUERY += " ORDER BY {col} DESC".format(col=sort) + logging.debug(QUERY) + with open('{}{}.dmp'.format(self.path,"genomes"),"w") as outputfile: + genomes = self.database.query(QUERY).fetchall() + for genome in genomes: + print(*genome, sep="\t", end="\n", file=outputfile) + return def set_separator(self,sep): self.separator=sep @@ -63,8 +83,10 @@ def set_prefix(self,prefix): logging.debug("Update output prefix nodes:{nodes} names:{names} ".format(nodes=self.prefix[1],names=self.prefix[0])) return self.prefix - def get_all(self, table, select="*"): + def get_all(self, table, select="*", sort=False): QUERY = "SELECT {select} FROM {table}".format(select=select, table=table) + if sort: + QUERY += " ORDER BY {col} DESC".format(col=sort) logging.debug(QUERY) return self.database.query(QUERY).fetchall() From fb8980725a7e26c97b9cbed7a63d1969ae95aef2 Mon Sep 17 00:00:00 2001 From: David Sundell Date: Fri, 29 Apr 2022 10:05:43 +0200 Subject: [PATCH 07/15] reference field added to flextaxd genomes table --- flextaxd/modules/ProcessDirectory.py | 8 ++++---- flextaxd/modules/database/CreateDatabase.py | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/flextaxd/modules/ProcessDirectory.py b/flextaxd/modules/ProcessDirectory.py index 37b6fa9..37f31fb 100644 --- a/flextaxd/modules/ProcessDirectory.py +++ b/flextaxd/modules/ProcessDirectory.py @@ -23,6 +23,7 @@ def __init__(self, database,limit=False): self.genome_names = [] self.genome_path_dict = {} self.files = [] + #logger.debug(self.genome_id_dict) def get_genome_names(self): ''' @@ -73,10 +74,6 @@ def is_gcf_gca(self,fname,debug=False): ''' try: GCX,END,REST = fname.split("_",2) ## If a name contains anything after the GCF number remove this if split by _ - if GCX in ["GB","RS"]: - return False ## GTDB stype input treat as non GCF - #GCX = END - #END,REST = REST.split("_",1) #remove GB or RS definitions if debug: logger.debug("[{} {} {}]".format(GCX,END,REST)) NUM,version = END.split(".",1) @@ -162,6 +159,7 @@ def walk_directory(self,folder_path): if not folder_path: raise IOError("Parameter --genomes_path was not set".format(folder_path)) logger.info("Process genome path ({path})".format(path=folder_path)) + logger.debug("Extensions valid: ({f})".format(f=self.ext)) for root, dirs, files in os.walk(folder_path,followlinks=True): for file in files: fname = file.strip(".gz") ## remove gz if present @@ -177,6 +175,8 @@ def walk_directory(self,folder_path): logger.info("Processed {count} genomes".format(count=count)) self.files = list(set(self.files)) self.genome_names = list(set(self.genome_names)) + if len(self.genome_names) == 0: + logger.info("# WARNING: No genomes from the input folder was added! Are your genomes annotated properly in the database?") return self.files, self.genome_names def process_folder(self,folder_path): diff --git a/flextaxd/modules/database/CreateDatabase.py b/flextaxd/modules/database/CreateDatabase.py index 905b324..ea22ddf 100644 --- a/flextaxd/modules/database/CreateDatabase.py +++ b/flextaxd/modules/database/CreateDatabase.py @@ -41,6 +41,7 @@ def __init__(self, verbose=False): self.sql_create_genomes_table = """ CREATE TABLE IF NOT EXISTS genomes ( id integer NOT NULL, genome VARCHAR(255) NOT NULL UNIQUE, + reference VARCHAR(255), FOREIGN KEY (id) REFERENCES nodes (id) ); """ From ba815ed9064ff640237ee296905dcf016696c654 Mon Sep 17 00:00:00 2001 From: David Sundell Date: Fri, 29 Apr 2022 10:10:04 +0200 Subject: [PATCH 08/15] Modifications required for reference field in table genomes --- .../modules/database/DatabaseConnection.py | 21 ++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/flextaxd/modules/database/DatabaseConnection.py b/flextaxd/modules/database/DatabaseConnection.py index 8dfab23..4788a53 100644 --- a/flextaxd/modules/database/DatabaseConnection.py +++ b/flextaxd/modules/database/DatabaseConnection.py @@ -350,7 +350,7 @@ def check_parent(self): raise TreeError("Node: {node} has more than one parent!".format(node=name)) '''Get functions of class''' - def get_all(self, database=False, table=False): + def get_all(self, database=False, table=False,sort=False): '''Get full table from table ------ @@ -360,6 +360,8 @@ def get_all(self, database=False, table=False): if not table: raise Exception("Table was not supplied to DatabaseFuction get_all()") QUERY = '''SELECT * FROM {table}'''.format(table) + if sort: + QUERY += " ORDER BY {col} ASC".format(sort=sort) return self.query(QUERY).fetchall() def get_taxid_base(self): @@ -503,7 +505,7 @@ def add_link(self, child=None, parent=None, rank=1, table="tree"): logger.debug("link added: child {}, parent {}, rank {} ".format(child,parent,rank)) return self.insert(info, table="tree") - def add_genome(self, genome, _id=False): + def add_genome(self, genome, _id=False,reference=False): '''Add genome annotation to nodes Returns @@ -513,8 +515,12 @@ def add_genome(self, genome, _id=False): info = { "genome": genome } + if _id: info["id"] = _id + if reference: + info["reference"] = reference + logger.debug(info) return self.insert(info, table="genomes") def add_links(self,links, table="tree",hold=False): @@ -799,6 +805,15 @@ def get_id(self,name): raise NameError("Name not found in the database! {name}".format(name=name)) return res + def update_table(self,data,table): + '''Add annotation to table + + Returns + ------ + see update responses + ''' + return self.update(data, table=table) + def update_genome(self,data): '''Add genome annotation to nodes @@ -806,4 +821,4 @@ def update_genome(self,data): ------ see update responses ''' - return self.update(data, table="genomes") + return self.update_table(data, table="genomes") From 3eafae2a6d89de40f8c443cf7fbe3545685ca5c6 Mon Sep 17 00:00:00 2001 From: David Sundell Date: Fri, 29 Apr 2022 10:12:32 +0200 Subject: [PATCH 09/15] Added option for selected prefered source (refseq/genbank) on download. Added option to export annotated genomes to file. Preparation for an update node name function, however not yet functional --- flextaxd/custom_taxonomy_databases.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/flextaxd/custom_taxonomy_databases.py b/flextaxd/custom_taxonomy_databases.py index 6feaabe..7209652 100644 --- a/flextaxd/custom_taxonomy_databases.py +++ b/flextaxd/custom_taxonomy_databases.py @@ -20,7 +20,7 @@ each column (not |). ''' -__version__ = "0.4.2" +__version__ = "0.4.3" __author__ = "David Sundell" __credits__ = ["David Sundell"] __license__ = "GPLv3" @@ -121,10 +121,12 @@ def __init__(self, message,expression=""): mod_opts.add_argument('-md', '--mod_database', metavar="",default=False, help="Database file containing modifications") mod_opts.add_argument('-gt', '--genomeid2taxid', metavar="", default=False, help="File that lists which node a genome should be assigned to") mod_opts.add_argument('-gp', '--genomes_path', metavar="",default=None, help='Path to genome folder is required when using NCBI_taxonomy as source') + #mod_opts.add_argument('-un', '--update_names', metavar="",default=None, help='Update node names using old to new name file.') mod_opts.add_argument('-p', '--parent',metavar="", default=False, help="Parent from which to add (replace see below) branch") mod_opts.add_argument('--replace', action='store_true', help="Add if existing children of parents should be removed!") mod_opts.add_argument('--clean_database', action='store_true', help="Clean up database from unannotated nodes") mod_opts.add_argument('--skip_annotation', action='store_true', help="Do not automatically add annotation when creating GTDB database") + mod_opts.add_argument('--refdatabase', metavar="", default=False, help="For download command, give value of expected source, default (refseq)") out_opts = parser.add_argument_group('output_opts', "Output options") @@ -132,6 +134,8 @@ def __init__(self, message,expression=""): out_opts.add_argument("--dump_prefix", metavar="", default="names,nodes", help="change dump prefix reqires two names default(names,nodes)") out_opts.add_argument('--dump_sep', metavar="", default="\t|\t", help="Set output separator default(NCBI) also adds extra trailing columns for kraken") out_opts.add_argument('--dump_descriptions', action='store_true', default=False, help="Dump description names instead of database integers") + out_opts.add_argument('--dump_genomes', action='store_true', default=False, help="Print list of genomes (and source) to file") + out_opts.add_argument('--dump_genome_annotations', action='store_true', default=False, help="Add genome taxid annotation to genomes dump") vis_opts = parser.add_argument_group('vis_opts', "Visualisation options") vis_opts.add_argument('--visualise_node', metavar='', default=False, help="Visualise tree from selected node") @@ -261,6 +265,16 @@ def __init__(self, message,expression=""): modify_obj = modify_module(database=args.database,clean_database=args.clean_database,taxid_base=args.taxid_base) modify_obj.clean_database(ncbi=ncbi) + '''Dump option, export list of genomes, added in flextaxd version 0.4.2''' + if args.dump_genomes: + logger.info("Dump list of genomes") + write_module = dynamic_import("modules", "WriteTaxonomy") + write_obj = write_module(args.outdir, database=args.database,prefix=args.dump_prefix,separator=args.dump_sep,minimal=args.dump_mini,desc=args.dump_descriptions,dbprogram=args.dbprogram,dump_genomes=True) + if args.dump_genome_annotations: + write_obj.dump_genome_annotations() + else: + write_obj.dump_genomes() + ''' 0. Create taxonomy database (if it does not exist)''' if args.taxonomy_file: if not os.path.exists(args.database) or force: @@ -304,6 +318,11 @@ def __init__(self, message,expression=""): modify_obj = modify_module(database=args.database, update_genomes=True,taxid_base=args.taxid_base) modify_obj.update_annotations(genomeid2taxid=args.genomeid2taxid) + if args.update_names: + modify_module = dynamic_import("modules", "ModifyTree") + modify_obj = modify_module(database=args.database, update_node_names=True,taxid_base=args.taxid_base) + modify_obj.update_node_names(args.update_names) + if (args.mod_file or args.mod_database) and args.clean_database: modify_module = dynamic_import("modules", "ModifyTree") modify_obj = modify_module(database=args.database,clean_database=args.clean_database,taxid_base=args.taxid_base) From 08f5a8b2e1ec3c3dcacea139bcaa7eb5fdc7d35e Mon Sep 17 00:00:00 2001 From: David Sundell Date: Fri, 29 Apr 2022 10:13:25 +0200 Subject: [PATCH 10/15] Preparation for an update node name function, however not yet available --- flextaxd/modules/ModifyTree.py | 44 ++++++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/flextaxd/modules/ModifyTree.py b/flextaxd/modules/ModifyTree.py index 2c9c571..acb176d 100644 --- a/flextaxd/modules/ModifyTree.py +++ b/flextaxd/modules/ModifyTree.py @@ -55,7 +55,7 @@ def __init__(self, message): class ModifyTree(object): """docstring for ModifyTree.""" - def __init__(self, database=".taxonomydb", mod_database=False, mod_file=False, clean_database=False,update_genomes=False, separator="\t",verbose=False,parent=False,replace=False,**kwargs): + def __init__(self, database=".taxonomydb", mod_database=False, mod_file=False, clean_database=False,update_genomes=False, update_node_names=False,separator="\t",verbose=False,parent=False,replace=False,**kwargs): super(ModifyTree, self).__init__() self.verbose = verbose logger.info("Modify Tree") @@ -92,7 +92,7 @@ def __init__(self, database=".taxonomydb", mod_database=False, mod_file=False, c self.modsource = self.parse_modification(self.moddb,"database") elif mod_file: self.modsource = self.parse_modification(mod_file,"file") - elif update_genomes or clean_database: + elif update_genomes or clean_database or update_node_names: pass else: raise InputError("No modification source could be found both mod_database and mod_file file are empty!") @@ -295,21 +295,27 @@ def parse_modification(self, input,modtype="database"): '''Get all genomes annotated to new nodes in existing database''' return True - def update_annotations(self, genomeid2taxid): + def update_annotations(self, genomeid2taxid, reference=False): '''Function that adds annotation of genome ids to nodes''' logger.info("Update genome to taxid annotations using {genomeid2taxid}".format(genomeid2taxid=genomeid2taxid)) + _ref = reference update = { "set_column": "id", "where_column": "genome", "set_value": "", "where": "" } + #if _ref: + # update[""] updated = 0 added = 0 with open(genomeid2taxid) as f: for row in f: try: - genome,name = row.strip().split(self.sep) + if len(row.strip().split(self.sep)) > 2: + genome,name,reference = row.strip().split(self.sep) + else: + genome,name = row.strip().split(self.sep) except ValueError: genome,name = row.strip().split(" ") logger.debug("genome: {genome}, name: {name}".format(genome=genome,name=name)) @@ -386,6 +392,36 @@ def update_genomes(self): self.taxonomydb.commit() return + def update_node_names(self, refdict): + '''Function that adds annotation of genome ids to nodes''' + logger.info("Update node names in the database from {refdict}".format(refdict=refdict)) + update = { + "set_column": "name", + "where_column": "name", + "set_value": "", + "where": "" + } + updated = 0 + added = 0 + with open(refdict) as f: + for row in f: + try: + old_name,name = row.strip().split(self.sep) + except ValueError: + old_name,name = row.strip().split(" ") + logger.debug("old_name: {old_name}, name: {name}".format(old_name=old_name,name=name)) + ## If no exception occured add old_name + update["set_value"] = name + update["where"] = old_name.strip() + res = self.taxonomydb.update_table(update,table="nodes") + if self.taxonomydb.rowcount()!=0: + if res: + updated += 1 + self.taxonomydb.commit() + gid = self.taxonomydb.get_genomes() + logger.info("{updated} annotations were updated!".format(added=added, updated=updated)) + return + def clean_database(self, ncbi=False): '''Function that removes all node and node paths without annotation''' logger.info("Fetch annotated nodes") From ae8175be58abf23c0ab445c3a5e2113268d8b435 Mon Sep 17 00:00:00 2001 From: David Sundell Date: Fri, 29 Apr 2022 11:01:42 +0200 Subject: [PATCH 11/15] Function not ready --- flextaxd/custom_taxonomy_databases.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/flextaxd/custom_taxonomy_databases.py b/flextaxd/custom_taxonomy_databases.py index 7209652..a83d06b 100644 --- a/flextaxd/custom_taxonomy_databases.py +++ b/flextaxd/custom_taxonomy_databases.py @@ -318,10 +318,10 @@ def __init__(self, message,expression=""): modify_obj = modify_module(database=args.database, update_genomes=True,taxid_base=args.taxid_base) modify_obj.update_annotations(genomeid2taxid=args.genomeid2taxid) - if args.update_names: - modify_module = dynamic_import("modules", "ModifyTree") - modify_obj = modify_module(database=args.database, update_node_names=True,taxid_base=args.taxid_base) - modify_obj.update_node_names(args.update_names) + # if args.update_names: + # modify_module = dynamic_import("modules", "ModifyTree") + # modify_obj = modify_module(database=args.database, update_node_names=True,taxid_base=args.taxid_base) + # modify_obj.update_node_names(args.update_names) if (args.mod_file or args.mod_database) and args.clean_database: modify_module = dynamic_import("modules", "ModifyTree") From 97aa70d70fa915b4b2a8b5938bf922845348ac73 Mon Sep 17 00:00:00 2001 From: David Sundell Date: Fri, 29 Apr 2022 11:02:11 +0200 Subject: [PATCH 12/15] Bug introduced reading GTDB formatted files --- flextaxd/modules/ReadTaxonomyQIIME.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flextaxd/modules/ReadTaxonomyQIIME.py b/flextaxd/modules/ReadTaxonomyQIIME.py index 42d7189..7452341 100644 --- a/flextaxd/modules/ReadTaxonomyQIIME.py +++ b/flextaxd/modules/ReadTaxonomyQIIME.py @@ -44,8 +44,8 @@ def __init__(self, taxonomy_file=False, names_dmp=False, database=False, verbose oth_id = self.add_node("Other") unc_id = self.add_node("Unclassified") - self.add_rank("n") - self.add_rank("sk") + self.add_rank("n",qiime=True) + self.add_rank("sk",qiime=True) ## Add basic links self.add_link(child=rootid, parent=rootid,rank="n") self.add_link(child=coid, parent=rootid,rank="n") @@ -67,7 +67,7 @@ def parse_tree(self,tree,current_i=0): if description.strip() == "": return False if level not in self.taxonomy: - self.add_rank(level) + self.add_rank(level,qiime=True) if current_i == len(tree)-1: '''Top parent reached return top parent id''' try: From cef23c34c5102c79c797627ea5a5c428f78ef947 Mon Sep 17 00:00:00 2001 From: David Sundell Date: Tue, 3 May 2022 16:03:55 +0200 Subject: [PATCH 13/15] Error cleaning NCBI structure, then merging with no previously annotated genomes --- flextaxd/modules/ModifyTree.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flextaxd/modules/ModifyTree.py b/flextaxd/modules/ModifyTree.py index acb176d..bbe8407 100644 --- a/flextaxd/modules/ModifyTree.py +++ b/flextaxd/modules/ModifyTree.py @@ -440,7 +440,7 @@ def clean_database(self, ncbi=False): logger.info("Parents added: {an}".format(an=len(self.annotated_nodes)-an)) if ncbi: logger.info("Keep main nodes of the NCBI taxonomy (parents on level 3 and above)") - self.keep = set(self.taxonomydb.get_children([1],maxdepth=1)) #set([self.nodeDict[node] for node in self.taxonomydb.get_children([1],maxdepth=1)]) + self.keep = set(self.taxonomydb.get_children([1],maxdepth=2)) #set([self.nodeDict[node] for node in self.taxonomydb.get_children([1],maxdepth=1)]) logger.info("Adding root levels {nlev}".format(nlev=len(self.keep-self.annotated_nodes))) self.annotated_nodes |= self.keep '''Get all links related to an annotated node and its parents''' From 0c5c0e1b4d57547fbb9282f5768fc5263ce7fbdd Mon Sep 17 00:00:00 2001 From: David Sundell Date: Wed, 13 Jul 2022 10:01:44 +0200 Subject: [PATCH 14/15] When merging databases keep information about a nodes previous level (if new database does not contain that information) --- flextaxd/modules/ModifyTree.py | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/flextaxd/modules/ModifyTree.py b/flextaxd/modules/ModifyTree.py index bbe8407..758319e 100644 --- a/flextaxd/modules/ModifyTree.py +++ b/flextaxd/modules/ModifyTree.py @@ -63,6 +63,8 @@ def __init__(self, database=".taxonomydb", mod_database=False, mod_file=False, c self.taxonomy = {} self.names = {} + self.keep_rank = {} ## Place holder for parent rank + self.parent=parent self.replace = replace self.ncbi_order = True @@ -163,8 +165,16 @@ def _parse_new_links(self, parent=None,child=None,rank="no rank"): self.new_nodes.add(parent_i) child_i = self.get_id(child) self.new_nodes.add(child_i) - - rank_i = self.add_rank(rank) + '''If node has no level (no rank), check if parent database had a classified level. If so default add level''' + try: + level = self.parent_levels[int(child_i)] + logger.info("Rank kept for {node}, {rank}".format(node=child, rank=level)) + except: + level = False + if not level: + rank_i = self.add_rank(rank) + else: + rank_i = level self.new_links.add((parent_i,child_i,rank_i)) return @@ -263,17 +273,15 @@ def parse_modification(self, input,modtype="database"): if modtype == "database": modparent = self.moddb.get_parent(self.moddb.get_id(self.parent)) logger.info("{n} existing links to {parent} ({parentlinks}) ({modparent})".format(n=len(self.existing_links),parent=self.parent,parentlinks=parentlinks,modparent=modparent)) - + self.parent_levels = self.keep_levels(self.existing_links | parentlinks | set(self.taxonomydb.get_links((self.existing_nodes & self.new_nodes)))) if modtype == "database": self.mod_genomes = self.database_mod(input,self.parent) elif modtype == "file": self.file_mod(input) else: raise InputError("Wrong modification input database or file must be supplied") - ### get links from current database self.old_nodes = self.existing_nodes - self.new_nodes - logger.info("nodes:") logger.info("old: {old}".format(old=len(self.old_nodes))) logger.info("new: {new}".format(new=len(self.new_nodes))) @@ -284,7 +292,6 @@ def parse_modification(self, input,modtype="database"): self.overlapping_links = self.existing_links & self.new_links ## (links existing in both networks) self.old_links = self.existing_links - self.new_links - logger.info("links:") logger.info("old: {old}".format(old=len(self.old_links))) logger.info("new: {new}".format(new=len(self.new_nodes))) @@ -440,7 +447,7 @@ def clean_database(self, ncbi=False): logger.info("Parents added: {an}".format(an=len(self.annotated_nodes)-an)) if ncbi: logger.info("Keep main nodes of the NCBI taxonomy (parents on level 3 and above)") - self.keep = set(self.taxonomydb.get_children([1],maxdepth=2)) #set([self.nodeDict[node] for node in self.taxonomydb.get_children([1],maxdepth=1)]) + self.keep = set(self.taxonomydb.get_children([1],maxdepth=3)) #set([self.nodeDict[node] for node in self.taxonomydb.get_children([1],maxdepth=1)]) logger.info("Adding root levels {nlev}".format(nlev=len(self.keep-self.annotated_nodes))) self.annotated_nodes |= self.keep '''Get all links related to an annotated node and its parents''' @@ -470,6 +477,14 @@ def clean_database(self, ncbi=False): self.taxonomydb.query("vacuum") logger.info("Database is cleaned!") + def keep_levels(self, links): + parent_levels = {} + for c,p,l in links: + if self.rank[l] != 1: + parent_levels[p] = l + print(parent_levels) + return parent_levels + def update_database(self): '''Update the database file''' if self.replace: @@ -482,6 +497,7 @@ def update_database(self): logger.info("Replace tree, deleting all nodes downstream of selected parent!") if len(self.old_links | self.non_overlapping_old_links-set(self.parent_link)) > 0: logger.debug("Delete links no longer valid!") + '''Add function to keep taxonomy level from previous database (belongs to parent)''' self.taxonomydb.delete_links((self.old_links | self.non_overlapping_old_links)-set(self.parent_link)) if len(self.old_nodes): logger.debug("Delete nodes!") From d7580cce73861e291e9a0c801b213e4fd51a5171 Mon Sep 17 00:00:00 2001 From: David Sundell Date: Wed, 13 Jul 2022 10:03:28 +0200 Subject: [PATCH 15/15] Bugfix root id --- flextaxd/modules/ReadTaxonomyQIIME.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/flextaxd/modules/ReadTaxonomyQIIME.py b/flextaxd/modules/ReadTaxonomyQIIME.py index 7452341..46a528c 100644 --- a/flextaxd/modules/ReadTaxonomyQIIME.py +++ b/flextaxd/modules/ReadTaxonomyQIIME.py @@ -13,7 +13,7 @@ class ReadTaxonomyQIIME(ReadTaxonomy): """docstring for ReadTaxonomyQIIME.""" def __init__(self, taxonomy_file=False, names_dmp=False, database=False, verbose=False, taxid_base=1,**kwargs): super(ReadTaxonomyQIIME, self).__init__(taxonomy_file=taxonomy_file, database=database,verbose=verbose,**kwargs) - self.database = DatabaseFunctions(database,verbose=verbose) + #self.database = DatabaseFunctions(database,verbose=verbose) # Not nessesary opens in parent class self.input = taxonomy_file self.names = {} self.taxid_base = taxid_base @@ -35,7 +35,7 @@ def __init__(self, taxonomy_file=False, names_dmp=False, database=False, verbose } self.set_qiime(True) ### Add root name these manual nodes are required when parsing the GTDB database! - rootid = self.add_node("root") ## Allways set in ReadTaxonomy + #rootid = self.add_node("root") ## Allways set in ReadTaxonomy coid = self.add_node("cellular organisms") bac_id = self.add_node("Bacteria") Euk_id = self.add_node("Eukaryota") @@ -47,14 +47,14 @@ def __init__(self, taxonomy_file=False, names_dmp=False, database=False, verbose self.add_rank("n",qiime=True) self.add_rank("sk",qiime=True) ## Add basic links - self.add_link(child=rootid, parent=rootid,rank="n") - self.add_link(child=coid, parent=rootid,rank="n") + self.add_link(child=self.root, parent=self.root,rank="n") + self.add_link(child=coid, parent=self.root,rank="n") self.add_link(child=bac_id, parent=coid,rank="sk") self.add_link(child=Euk_id, parent=coid,rank="sk") self.add_link(child=arc_id, parent=coid,rank="sk") - self.add_link(child=vir_id, parent=rootid,rank="sk") - self.add_link(child=oth_id, parent=rootid,rank="n") - self.add_link(child=unc_id, parent=rootid, rank="n") + self.add_link(child=vir_id, parent=self.root,rank="sk") + self.add_link(child=oth_id, parent=self.root,rank="n") + self.add_link(child=unc_id, parent=self.root, rank="n") def parse_taxonomy(self): '''Parse taxonomy information'''