-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathget_ranks.py
130 lines (86 loc) · 2.76 KB
/
get_ranks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import logging
import string
import sys
from common import __version__, __email__, __author__
LOG = logging.getLogger(__name__)
__all__ = []
def read_kegg_org(file):
"""
read br08610.keg
:param file:
:return:
"""
r = {}
taxon = ""
level = "A"
levels = {k: n for n, k in enumerate(string.ascii_uppercase)}
for line in open(file):
line = line.strip()
tag = line[0]
if tag not in levels:
continue
if "TAX:" in line:
taxon = line.split("TAX:")[-1].split("]")[0]
level = tag
continue
if levels[tag] - levels[level] == 1:
if taxon:
# print("%s\t %s" % (line.split()[1], taxon))
org = line.split()[1]
if not org.isdigit():
r[org] = taxon
else:
taxon = ""
return r
def read_taxon(file):
r = {}
for line in open(file):
if line.startswith("#"):
continue
line = line.rstrip("\n")
taxon_id = line.split()[0]
r[taxon_id] = line
return r
def org2taxon(org, taxon):
r = {}
LOG.info("reading KEGG Organisms taxon from %r" % org)
org = read_kegg_org(org)
LOG.info("reading NCBI taxon ranks from %r" % taxon)
taxon = read_taxon(taxon)
LOG.info("process KEGG Organisms ranks")
for o, t in org.items():
if t in taxon:
r[o] = taxon[t]
else:
LOG.info("taxon_id %r not in taxon file" % t)
return r
def set_args():
args = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
description="""
get KEGG organism classification by taxon id
version: %s
contact: %s <%s>\
""" % (__version__, " ".join(__author__), __email__))
args.add_argument("--keg", metavar="FILE", required=True,
help="The htex file of KEGG Organisms in the NCBI Taxonomy, usually named as 'br08610.keg'")
args.add_argument("--taxon", metavar="FILE", required=True,
help="NCBI Taxonomy file, taxon_id, rank information separated with tab")
args.add_argument("--out", metavar="FILE", default="KEGG.ranks", help="output file (default: KEGG.ranks)")
return args.parse_args()
def main():
logging.basicConfig(
stream=sys.stderr,
level=logging.INFO,
format="[%(levelname)s] %(message)s"
)
args = set_args()
org_dict = org2taxon(args.keg, args.taxon)
LOG.info("output result to %r" % args.out)
with open(args.out, "w") as fh:
for k, v in sorted(org_dict.items()):
fh.write("%s\t%s\n" % (k, v))
if __name__ == "__main__":
main()