# RetrieveGene.py
import pandas as pd
from allensdk.api.queries.rma_api import RmaApi
import allensdk.core.json_utilities as json_utilities
import numpy as np
import time
import csv # For saving string data to csv
#-------------------------------------------------------------------------------
# Globals:
#-------------------------------------------------------------------------------
# INPUT file names:
structIDSource = 'structureIDs.csv' # (alternative input: 'layerIDs.csv')
entrezSource = 'geneEntrezID.csv'
# OUTPUT file names:
structInfoFilename = 'structureInfo.csv'
allDataFilename = 'bigDataFrame.csv'
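# Input format note (an assumption, not verified against the original data
# files): both inputs are read with np.genfromtxt(...,delimiter=','), so they
# are expected to be flat comma-separated lists of numeric IDs, e.g.:
#   structureIDs.csv  ->  22,95,385
#   geneEntrezID.csv  ->  20604,12345
# (example IDs are illustrative; 22 and 20604 appear elsewhere in this script)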
#-------------------------------------------------------------------------------
# graph_id = 1 is the "Mouse Brain Atlas"
# (cf. http://help.brain-map.org/display/api/Atlas+Drawings+and+Ontologies)
mouse_graph_id = 1
mouse_product_id = 1
json_file_name = 'unionizes.json'
#-------------------------------------------------------------------------------
def download_mouse_unionizes(geneEntrezID,structureIDs,doReduced=False,writeOut=False):
# Given genes and set of structures, retrieves expression data from Allen API
# Can set geneEntrezID to empty to retrieve for all genes
# Can set geneEntrezID to be a single number (array length 1) for a single gene
# Can set geneEntrezID to an array to retrieve for multiple genes
# doReduced aims to speed up the process by only retrieving bare-bones info
# Convert list of structure IDs to a comma-separated string:
string_structureIDs = ','.join(str(sid) for sid in structureIDs)
# Set defaults:
if doReduced:
print("Only retrieving a reduced subset of results from the queries")
optionsString = "[only$eq'structures.id,data_sets.id,data_sets.plane_of_section_id,genes.entrez_id,structure_unionizes.expression_energy,structure_unionizes.expression_density']"
else:
optionsString = None
# Set criteria :
if len(geneEntrezID)==0:
# Get for all genes
print("Get for all genes")
# cf. Full query: http://api.brain-map.org/api/v2/data/query.xml?
# criteria=model::StructureUnionize,rma::criteria,structure[id$eq22][graph_id$eq1],
# section_data_set[failed$eqFalse](products[id$eq1]),
# rma::include,section_data_set,structure,section_data_set(genes)
criteria = [
"structure[id$in%s][graph_id$eq%d]," % (string_structureIDs, mouse_graph_id),
"section_data_set[failed$eqfalse](products[id$eq%d])" % (mouse_product_id)
]
    elif len(geneEntrezID)==1:
        # Get for a single gene:
        print("Get for the single gene: entrez_id = %d" % geneEntrezID[0])
        criteria = [
            "structure[id$in%s][graph_id$eq%d]," % (string_structureIDs, mouse_graph_id),
            "section_data_set[failed$eqfalse](products[id$eq%d],genes[entrez_id$eq%d])" % (mouse_product_id, geneEntrezID[0])
        ]
    elif len(geneEntrezID)>1:
        # Get for multiple genes:
        print("Getting expression results for %u genes" % len(geneEntrezID))
        # Use the geneEntrezID argument (not the global geneEntrezIDs):
        string_geneEntrezIDs = ','.join(str(eid) for eid in geneEntrezID)
        criteria = [
            "structure[id$in%s][graph_id$eq%d]," % (string_structureIDs, mouse_graph_id),
            "section_data_set[failed$eqfalse](products[id$eq%d],genes[entrez_id$in%s])" % (mouse_product_id, string_geneEntrezIDs)
        ]
# Query the API:
expressionData = QueryAPI(model='StructureUnionize',criteriaString="".join(criteria),
includeString="section_data_set,structure,section_data_set(genes)",
optionsString=optionsString,writeOut=writeOut)
return expressionData
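# Example usage (a sketch; left commented out since it runs a live API query.
# Entrez ID 20604 and structure ID 22 are borrowed from examples elsewhere in
# this script):
#   unionizes = download_mouse_unionizes([20604],[22],doReduced=True)
#   df = unionizes_to_dataframe(unionizes)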
#-------------------------------------------------------------------------------
def QueryAPI(model,criteriaString,includeString=None,optionsString=None,writeOut=False):
# Send a query to the Allen API, and assemble results
# Initiate RMA API for Allen data retrieval
api = RmaApi()
# Settings for retrieval
rows = []
blockSize = 2000
done = False
startRow = 0
# for i in range(0, total_rows, blockSize):
while not done:
print("Row %d, attempting to retrieve %d rows..." % (startRow, blockSize))
tot_rows = len(rows)
# apiQueryPartial = partial(api.model_query,model=model,criteria=criteriaString,
# startRow=startRow,num_rows=blockSize)
rows += api.model_query(model=model,
criteria=criteriaString,
include=includeString,
options=optionsString,
start_row=startRow,
num_rows=blockSize)
numRows = len(rows) - tot_rows # additional rows retrieved on running the query
startRow += numRows
print("%d rows retrieved." % numRows)
# Check if we're at the end of the road
if numRows == 0 or numRows < blockSize:
done = True
# Write out the results as they come in, if requested
if writeOut:
json_utilities.write(json_file_name, rows)
print("Wrote to %s" % json_file_name)
return rows
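# For reference, a QueryAPI call assembles (via allensdk's RmaApi) an RMA query
# along the lines of the full-query example above, i.e. roughly:
#   http://api.brain-map.org/api/v2/data/query.json?criteria=
#     model::<model>,rma::criteria,<criteriaString>,rma::include,<includeString>
# with start_row/num_rows appended as paging options. (The URL shape is
# indicative only; see the Allen RMA documentation for the exact format.)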
#-------------------------------------------------------------------------------
def unionizes_to_dataframe(unionizes):
fdata = []
for unionize in unionizes:
fdata.append({
'structure_id': unionize['structure_id'],
'expression_energy': unionize['expression_energy'],
'expression_density': unionize['expression_density'],
'data_set_id': unionize['section_data_set']['id'],
'plane_of_section_id': unionize['section_data_set']['plane_of_section_id'],
# 'gene_acronym': unionize['section_data_set']['genes'][0]['acronym'],
# 'gene_name': unionize['section_data_set']['genes'][0]['name'],
# 'gene_entrez': unionize['section_data_set']['genes'][0]['entrez_id'],
})
return pd.DataFrame.from_records(fdata)
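# Each unionize record returned by the API is expected to look roughly like
# (fields abridged, values illustrative):
#   {'structure_id': 22, 'expression_energy': 1.23, 'expression_density': 0.04,
#    'section_data_set': {'id': 12345, 'plane_of_section_id': 1,
#                         'genes': [{'acronym': ..., 'name': ..., 'entrez_id': ...}]}}
# so the dataframe built above has one row per (structure, section data set) pair.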
#-------------------------------------------------------------------------------
def SaveListCSV(stringList,fileName):
    # Write a list of strings as a single row of a csv file:
    with open(fileName,'w',newline='') as resultFile:
        wr = csv.writer(resultFile, dialect='excel')
        wr.writerow(stringList)
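# Example (currently unused in this script; file name is hypothetical):
#   SaveListCSV(['id','name','acronym'],'fields.csv')  # writes one csv row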
#-------------------------------------------------------------------------------
def GetStructureDetails(structureIDs,filterFields,graph_id=1):
# Get structure info for structureIDs given from Allen API
# (mouse brain atlas)
print("\n---Downloading all mouse brain structures using the Allen Institute API...\n")
# Convert list of structure IDs to a comma-separated string:
string_structureIDs = ','.join(str(sid) for sid in structureIDs)
# Get the structure info by querying Allen API:
structs = QueryAPI(model="Structure",criteriaString=("[id$in%s][graph_id$eq%u]" \
% (string_structureIDs,graph_id))) # (Mouse Brain Atlas)
print("%u mouse-brain structures retrieved!\n" % len(structs))
# Filter to only certain fields, specified as filterFields:
    structsFilt = []
    for s in structs:
        structsFilt.append({keep_key: s[keep_key] for keep_key in filterFields})
return structsFilt
# # Build a dict from structure id to structure and identify each node's direct descendants:
# structHash = {} # dict
# for s in structs:
# s['num_children'] = 0 # add this field
# s['structure_id_path'] = [int(sid) for sid in s['structure_id_path'].split('/') if sid != '']
# s['acronym'] = s['acronym'].rstrip() # some acronyms have trailing whitespace (e.g., 'SUM'); remove
# structHash[s['id']] = s
#
# # Iterate through to add the 'num_children' field by incrementing the 'num_children'
# # field given structures with parents:
# for sid,s in structHash.iteritems():
# if len(s['structure_id_path']) > 1:
# parentId = s['structure_id_path'][-2]
# structHash[parentId]['num_children'] += 1
#-------------------------------------------------------------------------------
# Read in structure IDs from csv -> save detailed structure info to file
#-------------------------------------------------------------------------------
# (cast to int so the IDs format cleanly into the RMA criteria strings)
structureIDs = np.genfromtxt(structIDSource,delimiter=',').astype(int)
print("Retrieved %d structures from %s..." % (len(structureIDs),structIDSource))
# Get details of the structures:
filterFields = ['id','name','acronym','color_hex_triplet']
structs = GetStructureDetails(structureIDs,filterFields,1)
# To dataframe:
df_struct = pd.DataFrame.from_records(structs)
# Save as a .csv file:
df_struct.to_csv(structInfoFilename)
#---------------------------------------------------------------------------
# RETRIEVING GENES ONE AT A TIME
#---------------------------------------------------------------------------
# Genes are retrieved one at a time: requesting all genes in a single query
# makes the request too long.
# Read in gene entrez IDs from csv (cast to int for clean formatting):
geneEntrezIDs = np.genfromtxt(entrezSource,delimiter=',').astype(int)
print("Read in %d genes from %s..." % (len(geneEntrezIDs), entrezSource))
# geneEntrezIDs = np.concatenate((geneEntrezIDs[0:1],[20604.]),axis=0)
unionizes = []
startTime = time.time()
for ind, geneEntrezID in enumerate(geneEntrezIDs):
unionizes_gid = download_mouse_unionizes([geneEntrezID],structureIDs,
doReduced=True,writeOut=False)
unionizes = unionizes + unionizes_gid
# df_i = unionizes_to_dataframe(unionizes_gid)
# df = pd.concat([df,df_i])
    timeSoFar = time.time() - startTime
    # Estimate remaining time from the average time per gene so far:
    timeRemaining = timeSoFar/(ind+1)*(len(geneEntrezIDs)-(ind+1))/60
    print("We're at %d/%d, %f min remaining" % (ind+1,len(geneEntrezIDs),timeRemaining))
totalTime = (time.time() - startTime)/60.
print("Took %f min total" % totalTime)
# Make a data frame of the datasets downloaded:
df = unionizes_to_dataframe(unionizes)
df = df.sort_values(by=['structure_id','data_set_id'])
df.to_csv(allDataFilename)
print("Saved data frame to %s" % allDataFilename)
print(df)
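# (bigDataFrame.csv is long-format, one row per structure/data-set pair, with
#  columns: structure_id, expression_energy, expression_density, data_set_id,
#  plane_of_section_id, plus the pandas index.)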
# df = pd.read_csv(allDataFilename)
#-------------------------------------------------------------------------------
# RETRIEVING GENES ALL TOGETHER IN ONE QUERY
#-------------------------------------------------------------------------------
# Another option would be to retrieve everything in one query, and then filter
# back to the matching genes...? (25924 rows per structure...):
# No need to write to a json file as you go (slows down)
# Only retrieve the necessary fields (doReduced)
# unionizes = download_mouse_unionizes([],structureIDs,doReduced=True,writeOut=False)
#-------------------------------------------------------------------------------
# Write the agglomerated datasets to file
# json_utilities.write(json_file_name, unionizes)
# print("Wrote %d unionizes to %s" % (len(unionizes),json_file_name))
# Read in previously-downloaded results:
# unionizes = json_utilities.read(json_file_name)
# To dataframe and then filter to include only genes in our geneEntrezIDs list:
# dfFull = unionizes_to_dataframe(unionizes)
# dfFull.shape[0]
# # Now we need to filter to include only genes in our geneEntrezIDs list:
# # (actually this filtering is not necessary) -- potentially a nice check that we only retrieved
# # entrez_ids corresponding to section datasets with expression (22,000)
# df = dfFull.copy()
# df = df[df['gene_entrez'].isin(geneEntrezIDs)]
# df.shape[0]
# len(geneEntrezIDs)
# # (so something like 3,000 duplicates?)
# df = df.sort(['structure_id','data_set_id'])
# df.tail
# df_filter = df.drop_duplicates()
# df['data_set_id']
# # Save out just the gene information:
# dfGenes = df[]
#
# # Save data summary as a .csv file:
# csv_file_name = 'filteredDataFrame.csv'
# df.to_csv(csv_file_name)
# Compute the mean expression energy for each structure_id:
# gb = df.groupby(['gene_entrez','structure_id'])['expression_energy'].mean()
# print(gb)
# gb.pivot(index="structure_id", columns="gene_entrez", values="expression_energy")
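# The loop below pivots the long-format dataframe into a structures x datasets
# matrix per expression measure; the result is expected to look roughly like
# (values illustrative):
#   data_set_id    12345   67890
#   structure_id
#   22              1.23    0.98
#   95              0.45     NaN
# (NaN where a data set has no unionize record for that structure.)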
print("Writing expression energy and density matrices out")
for expressionType in ['expression_energy','expression_density']:
table = pd.pivot_table(df, values=expressionType, index="structure_id",
columns="data_set_id", aggfunc=np.mean)
# table = pd.pivot_table(df, values='expression_energy', index="structure_id",
# columns="gene_entrez", aggfunc=np.mean)
print(table)
    # Save the mean expression values to a .csv output file:
    fileName = ("%s_%ux%u.csv" % (expressionType, table.shape[0], table.shape[1]))
    table.to_csv(fileName,na_rep='NaN')
    print("Saved %s of %u datasets over %u structures to %s" \
            % (expressionType,table.shape[1],table.shape[0],fileName))
# Also output the structure IDs (rows) and section data set IDs (columns):
structIDs = pd.DataFrame(list(table.axes[0]))
structIDs.to_csv('structIDs_Rows.csv', index=False, header=False)
dataSetIDs = pd.DataFrame(list(table.axes[1]))
dataSetIDs.to_csv('dataSetIDs_Columns.csv', index=False, header=False)
# Summary to screen
# gdf = gb.agg({'data_set_id': pd.Series.nunique})
# print(gdf)
# if __name__ == "__main__": main()