-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMagicToDataFrames.py
291 lines (247 loc) · 9.08 KB
/
MagicToDataFrames.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
# coding: utf-8
# # Unpack MagIC database text file into Pandas dataframes
#
# This notebook should be saved as MagicToDataFrames.py if any changes are made. The python is then imported into the PmagPy_AnalysisFromWebCall notebook.
# Code is modified from code at https://github.com/PmagPy/PmagPy in ipmag.py and pmag.py modules (Licensed under a 3-clause BSD license. See [license.txt](https://github.com/ltauxe/PmagPy/blob/master/license.txt) for details.).
# Modified and adapted by Stephen M. Richard 2019-06-06
# In[ ]:
#imports
import pandas as pd
import requests
# In[ ]:
def _magic_record(keys, values, method_col):
    """Pair one tab-split data line with the column names of its table.

    Returns the record as a dict, or None when the line holds more than one
    value beyond the number of columns and therefore cannot be aligned.
    """
    if len(values) > len(keys) + 1:
        return None
    # Short lines are padded with empty strings; a single surplus value (as
    # found in magic_search_results.txt) is dropped because zip() stops at
    # the last column name.
    padded = values + [""] * (len(keys) - len(values))
    record = dict(zip(keys, padded))
    if method_col in record:
        # method codes are colon-delimited; remove stray spaces around each
        record[method_col] = ":".join(
            code.strip() for code in record[method_col].split(":"))
    return record


def magicToDataframe(data_model=3., txt=""):
    """
    Unpack the text of a MagIC database file into pandas dataframes.

    The text is read as a sequence of tables separated by lines containing
    ">>>>". Each table consists of a header line whose second tab-delimited
    field is the table name, a line of tab-delimited column names, and then
    one tab-delimited line per record. Every table that has at least one
    record is handed to magic_writeDict, which stores it in the result under
    the lower-cased table name. If a table name occurs more than once, the
    later table replaces the earlier one.

    Parameters
    ----------
    data_model : float, default 3.
        MagIC data model version. 2.5 selects "magic_method_codes" as the
        method-code column; any other value selects "method_codes".
    txt : str, default ""
        contents of a MagIC-format file
        (useful for downloading a MagIC file directly from earthref)

    Returns
    -------
    dict
        table name -> pandas dataframe. Empty when txt is empty or holds no
        table with records.
    """
    framedict = {}
    method_col = "magic_method_codes" if data_model == 2.5 else "method_codes"
    if not txt:
        print("no input text provided")
        return framedict

    # Keep the non-blank lines only. A trailing carriage return (CRLF line
    # endings) is removed so it neither hides a separator line nor ends up
    # in the last column of a record.
    lines = []
    for raw in txt.split("\n"):
        raw = raw.rstrip("\r")
        if raw.strip():
            lines.append(raw)

    total = len(lines)
    pos = 0  # index of the next line to consume
    while pos < total:
        header = lines[pos]
        pos += 1
        if ">>>>" in header:
            continue
        fields = header.split('\t')
        if len(fields) < 2:
            # Not a "<tab-type>\t<table name>" header: report it and move on
            # to the next separator instead of failing with an IndexError.
            print('WARNING: problem in file with line: ')
            print(header)
            print('skipping....')
            while pos < total and ">>>>" not in lines[pos]:
                pos += 1
            continue
        file_type = fields[1].lower()

        if pos >= total:
            break  # header on the last line: no column names follow
        key_line = lines[pos]
        pos += 1
        if ">>>>" in key_line:
            continue  # skip empty tables
        keys = key_line.split('\t')
        if keys[0].startswith('.'):
            keys = key_line.replace('.', '').split('\t')
            keys.append('RecNo')  # cludge for new MagIC download format

        # Collect records up to the next separator or the end of the text.
        # The separator is left for the outer loop to skip, so a table that
        # has column names but no records is never parsed as data.
        records = []
        while pos < total and ">>>>" not in lines[pos]:
            line = lines[pos]
            pos += 1
            record = _magic_record(keys, line.split('\t'), method_col)
            if record is None:
                print('WARNING: problem in file with line: ')
                print(line)
                print('skipping....')
            else:
                records.append(record)

        if records:
            magic_writeDict(framedict, records, file_type)
    return framedict
# In[ ]:
def download_from_magic(con_id):
    """
    Download a MagIC contribution and unpack it into pandas dataframes.

    The downloaded text is passed to magicToDataframe. If the result has a
    non-empty 'contribution' table, one citation line per row is printed,
    built from whichever of the 'contributor', 'author' and 'reference'
    columns are present.

    Parameters
    ----------
    con_id : string
        either a MagIC contribution id, i.e. 12366,
        or a URL that will get a MagIC contribution

    Returns
    -------
    dict or None
        dictionary with a key for each section of the MagIC data document
        and a pandas dataframe holding the table for that section as value.
        None when the request raises a connection/timeout error or the
        response status is not ok.
    """
    con_ref = str(con_id)
    if 'https://' in con_ref or 'http://' in con_ref:
        theURL = con_ref
    else:
        theURL = 'https://earthref.org/MagIC/download/{}/'.format(con_id)
    try:
        res = requests.get(theURL)
    except (requests.exceptions.ConnectTimeout, requests.exceptions.ConnectionError,
            requests.exceptions.ReadTimeout):
        print("-W- Could not connect to MagIC")
        return
    if not res.ok:
        print("-W- Could not connect to MagIC -- check your requested contribution id ({})".format(con_id))
        return

    print("CALL magicToDataframe")
    resultframedict = magicToDataframe(txt=res.text)

    # Print some citation information for the source. The lookup uses .get()
    # because a download without a contribution table has no such key.
    thecontribs = resultframedict.get('contribution')
    if thecontribs is not None and not thecontribs.empty:
        thekeys = list(thecontribs)  # column names of the contribution table
        for _, row in thecontribs.iterrows():
            # format() rather than '+' so a non-string cell cannot raise
            if 'contributor' in thekeys:
                contrib = 'contributor: {}; '.format(row['contributor'])
            else:
                contrib = ''
            if 'author' in thekeys:
                auth = 'author: {}; '.format(row['author'])
            else:
                auth = ''
            if 'reference' in thekeys:
                refer = ' reference: {}'.format(row['reference'])
            else:
                refer = ''
            print(contrib + auth + refer)
    return resultframedict
# In[ ]:
def magic_writeDict(framedict, Recs, file_type):
    """
    Store a list of MagIC records in framedict as a pandas dataframe.

    Parameters
    ----------
    framedict : dict
        the dictionary of frames, modified in place. The key is file_type,
        the value is the pandas dataframe constructed from Recs. An existing
        entry with the same key is replaced.
    Recs : list of dict
        list of dictionaries in MagIC format, one per record
    file_type : str
        MagIC table type (e.g., specimens)

    Returns
    -------
    bool
        True if the table was stored, False if Recs was empty (framedict is
        then left unchanged).

    Effects
    -------
    prints a one-line summary with the table name and the record count
    """
    if not Recs:
        # No output file exists in this adaptation, so report the table name.
        print('No records to write to data table {}'.format(file_type))
        return False
    framedict[file_type] = pd.DataFrame(Recs)
    print('data table: ' + file_type + '; ' + str(len(Recs)) + ' records')
    return True
# #change from markdown to code for testing
#
#
# outputdict = {}
# outputdict = download_from_magic(16619)
# print("output: ", outputdict)
# #change from markdown to code for testing
#
#
# if not(outputdict['contribution'].empty):
# thecontribs=outputdict['contribution']
# thekeys=(list(thecontribs))
# for index, row in thecontribs.iterrows():
# if 'contributor' in thekeys:
# contrib = 'contributor: ' + row['contributor'] + '; '
# else:
# contrib = ''
# if 'author' in thekeys:
# auth = 'author: ' + row['author'] + '; '
# else:
# auth = ''
#
# if 'reference' in thekeys:
# refer = ' reference: ' + row['reference']
# else:
# refer = ''
# print(contrib + auth + refer)
#
#
#
#
# In[ ]:
# In[ ]: