-
Notifications
You must be signed in to change notification settings - Fork 2
/
process_go.py
305 lines (233 loc) · 10.2 KB
/
process_go.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
import os
import re
import sys
import gzip
from urlparse import urlsplit
from ConfigParser import SafeConfigParser
from go import go
from slugify import slugify
from utils import build_tags_dictionary
# Import and set logger
import logging
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
GO_NAMESPACE_MAP = {
'biological_process': 'BP',
'molecular_function': 'MF',
'cellular_component': 'CC',
}
DB_REMAP = {
'FB': 'FLYBASE',
'WB': 'WormBase',
}
def get_filtered_annotations(assoc_file, accepted_evcodes=None,
remove_leading_gene_id=None,
use_symbol=None, tax_id=None):
"""
This function reads in the association file and returns a list of
annotations. Only annotations that have evidence codes in
'accepted_evcodes' (if accepted_evcodes is not None) and annotations
that do not have details == 'NOT' will be included in this list.
Arguments:
assoc_file -- A string. Location of the GO association file to be
read in.
accepted_evcodes -- A list of evidence codes (e.g. ['EXP', 'IDA', 'IPI'])
to filter the annotations by.
remove_leading_gene_id -- True or False value. For some organisms, such
as Mouse, there is a leading tag on the gene IDs column in the gene
association file. This tag is just a duplicate of the type of gene
identifier (already present in the identifier type (xrdb) column) and
should be removed to get the pure gene ID (e.g. to get "99668" as
opposed to "MGI:99668").
Returns:
annotations -- A list of all the annotations that meet the desired
criteria. Each annotation in the list will be a tuple, which will
contain: (<crossrefDB>, <crossrefID>, <goid>, <refstring>, <date>)
"""
if assoc_file.endswith('.gz'):
assoc_fh = gzip.open(assoc_file, 'r')
else:
assoc_fh = open(assoc_file, 'r')
annotations = []
for line in assoc_fh:
if line.startswith('!'):
continue
toks = line.strip().split('\t')
(xrdb, xrid, details, goid, refstring, ev_code, taxon, date) = (
toks[0], toks[1], toks[3], toks[4], toks[5], toks[6],
toks[12].split(':')[1], toks[13])
if tax_id and (tax_id != taxon):
continue
if remove_leading_gene_id:
xrid = xrid.split(':')[1]
if xrdb in DB_REMAP:
xrdb = DB_REMAP[xrdb]
if use_symbol:
xrdb = 'Symbol'
if toks[0] == 'UniProtKB':
xrid = toks[2]
# These next few lines are needed for processing
# Arabidopsis annotations
if xrdb == 'TAIR':
tair_regex = re.compile('AT[0-9MC]G[0-9][0-9][0-9][0-9][0-9]')
first_alias = toks[10].split('|')[0]
if tair_regex.match(toks[2]):
xrid = toks[2]
elif tair_regex.match(toks[9]):
xrid = toks[9]
elif tair_regex.match(first_alias):
xrid = first_alias
if details == 'NOT':
continue
if accepted_evcodes is not None and (ev_code not in accepted_evcodes):
continue
annotation = (xrdb, xrid, goid, refstring, date)
annotations.append(annotation)
return annotations
def create_go_term_title(go_term):
"""
Small function to create the GO term title in the desired
format: GO-<GO_NAMESPACE>-<GO integer ID>:<GO term full name>
Example: GO-BP-0000280:nuclear division
Arguments:
go_term -- This is a go_term object from the go() class (go.go)
Returns:
title -- A string of the GO term's title in the desired format.
"""
go_id = go_term.go_id.split(':')[1]
namespace = GO_NAMESPACE_MAP[go_term.get_namespace()]
title = 'GO' + '-' + namespace + '-' + go_id + ':' + go_term.full_name
return title
def create_go_term_abstract(go_term, evlist=None):
"""
Function to create the GO term abstract in the desired
format.
Arguments:
go_term -- This is a go_term object from the go() class (go.go)
evlist -- A list of evidence codes we will use to filter the annotations
Returns:
abstract -- A string of the GO term's abstract in the desired format.
"""
evclause = ''
if evlist is not None:
evclause = ' Only annotations with evidence coded as '
if len(evlist) == 1:
evclause = evclause + evlist[0]
else:
evclause = evclause + ', '.join(evlist[:-1]) + ' or ' + evlist[-1]
evclause = evclause + ' are included.'
if go_term.description:
description = go_term.description + ' Annotations are propagated ' + \
'through transitive closure as recommended by the GO ' + \
'Consortium.' + evclause
else:
logger.info("No description on term %s", go_term)
# TODO: What do we put as description if go_term.description == None?
return description
def process_go_terms(species_ini_file, base_download_folder):
"""
Function to read in config INI file and run the other functions to
process GO terms.
"""
species_file = SafeConfigParser()
species_file.read(species_ini_file)
if not species_file.has_section('GO'):
logger.error('Species INI file has no GO section, which is needed'
' to run the process_go_terms function.')
sys.exit(1)
organism = species_file.get('species_info', 'SCIENTIFIC_NAME')
sd_folder = species_file.get('species_info', 'SPECIES_DOWNLOAD_FOLDER')
taxonomy_id = species_file.get('species_info', 'TAXONOMY_ID')
obo_url = urlsplit(species_file.get('GO', 'GO_OBO_URL'))
obo_filename = os.path.basename(obo_url.path)
obo_file = os.path.join(base_download_folder, obo_filename)
# Get whatever is saved in the ASSOC_FILE_URLS option minus any
# whitespace characters.
assoc_file_urls = re.sub(
r'\s', '', species_file.get('GO', 'ASSOC_FILE_URLS'))
# Convert this line into a list of urls, splitting by comma (','),
# and then make a list with the filenames in each of these urls.
assoc_file_url_list = [urlsplit(x) for x in assoc_file_urls.split(',')]
assoc_filenames = [os.path.basename(x.path) for x in assoc_file_url_list]
assoc_files = [os.path.join(sd_folder, 'GO', x) for x in assoc_filenames]
evcodes = species_file.get('GO', 'EVIDENCE_CODES')
evcodes = re.sub(r'\s', '', evcodes).split(',')
use_symbol = None
if species_file.has_option('GO', 'USE_SYMBOL'):
use_symbol = species_file.getboolean('GO', 'USE_SYMBOL')
remove_leading_gene_id = False
if species_file.has_option('GO', 'REMOVE_LEADING_GENE_ID'):
remove_leading_gene_id = species_file.getboolean(
'GO', 'REMOVE_LEADING_GENE_ID')
annotations = []
for assoc_file in assoc_files:
new_annotations = get_filtered_annotations(
assoc_file, evcodes,
remove_leading_gene_id=remove_leading_gene_id,
use_symbol=use_symbol, tax_id=taxonomy_id)
annotations.extend(new_annotations)
gene_ontology = go()
loaded_obo_bool = gene_ontology.load_obo(obo_file)
if loaded_obo_bool is False:
logger.error('GO OBO file could not be loaded.')
for annotation in annotations:
(xrdb, xrid, goid, refstring, date) = annotation
refs = refstring.split('|')
pub = None
for ref in refs:
# Check if publication source is PubMed (PMID).
# Otherwise, keep pub as None.
if ref.startswith('PMID:'):
pub = ref.split(':')[1]
gene_ontology.add_annotation(go_id=goid, gid=xrid, ref=pub,
date=date, xdb=xrdb, direct=True)
gene_ontology.populated = True
gene_ontology.propagate()
GO_terms = []
tags_dictionary = None
if species_file.has_option('GO', 'TAG_MAPPING_FILE'):
tag_mapping_file = species_file.get('GO', 'TAG_MAPPING_FILE')
go_id_column = species_file.getint('GO', 'GO_ID_COLUMN')
go_name_column = species_file.getint('GO', 'GO_NAME_COLUMN')
tag_column = species_file.getint('GO', 'TAG_COLUMN')
header = species_file.getboolean('GO', 'TAG_FILE_HEADER')
tags_dictionary = build_tags_dictionary(
tag_mapping_file, go_id_column, go_name_column, tag_column, header)
for (term_id, term) in gene_ontology.go_terms.iteritems():
if not term.annotations:
continue
go_term = {}
go_term['title'] = create_go_term_title(term)
go_term['abstract'] = create_go_term_abstract(term, evcodes)
go_term['organism'] = organism
go_term['slug'] = slugify(term_id + '-' + organism)
go_term['annotations'] = {}
go_term_xrdb = None
for annotation in term.annotations:
if annotation.gid not in go_term['annotations']:
# If annotation.gid is not already a key in the dictionary,
# make it one and initialize list. Else, the key and list
# already exist.
go_term['annotations'][annotation.gid] = []
if annotation.ref is not None:
try:
go_term['annotations'][annotation.gid].append(
int(annotation.ref))
except ValueError:
logger.error('Pubmed ID %s for GO term %s could not be '
'converted to an integer.', annotation.ref,
term_id)
if annotation.xdb is not None:
if go_term_xrdb and go_term_xrdb != annotation.xdb:
logger.info("There is more than one xrdb for annotations "
"in this GO term (%s and %s). Only the first "
"one will be saved in this GO term's 'xrdb' "
"field.", go_term_xrdb, annotation.xdb)
else:
go_term_xrdb = annotation.xdb
go_term['xrdb'] = go_term_xrdb
if go_term['annotations']:
if tags_dictionary and term_id in tags_dictionary:
go_term['tags'] = tags_dictionary[term_id]['gs_tags']
GO_terms.append(go_term)
return GO_terms