forked from gbif/rs.gbif.org
-
Notifications
You must be signed in to change notification settings - Fork 1
/
sync-extensions.py
executable file
·340 lines (303 loc) · 11.4 KB
/
sync-extensions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
#!/usr/bin/env python3
#
# Registry Updater for extensions and vocabularies
#
import datetime
import json
import os
import sys
import traceback
import urllib.request
from string import Template
from xml.etree.ElementTree import ElementTree
# Root of the local rs.gbif.org checkout: the current working directory with
# a trailing slash, so the script must be started from the repository root.
RS_BASE=os.getcwd()+"/"
# XML namespace of the Dublin Core attributes (title, description, subject,
# issued, URI) read from each definition's root element in parseUrl.
NS_DC="http://purl.org/dc/terms/"
# XML namespace whose <extension> root tag identifies an extension document;
# parseUrl treats any other root element as a vocabulary.
NS_EXT="http://rs.gbif.org/extension/"
# Default issued date: sorts before any real date, used by getIssuedDate
# when a definition has no issued date.
MIN_DATE = datetime.date(datetime.MINYEAR, 1, 1)
# Environment selectors passed as `env` to writeExtensions / writeVocabs.
PRODUCTION = 0
SANDBOX = 1
# Templates for the HTML extension list.
# Header: opens the document and the page chrome. Placeholders filled in by
# processUrls: $environment (description of the target environment) and
# $last_updated (formatted generation timestamp).
HTML_EXTENSION_TEMPLATE_HEADER = Template("""
<!DOCTYPE HTML>
<html>
<head>
<meta charset="utf-8"/>
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>GBIF Registered Extensions</title>
<link rel="stylesheet" type="text/css" href="/style/human.css"/>
<style type="text/css">
.definition {
background: rgba(67%, 67%, 67%, 50%);
padding: 0.5em 0 0 0.5em;
}
.definition:target {
background: rgba(90%, 100%, 100%, 50%);
outline: 2px solid rgba(90%, 100%, 100%, 100%);
}
.definition .title {
margin-left: 2rem;
font-weight: bold;
font-size: 1.2em;
}
.definition .body {
background: rgba(100%, 100%, 100%, 95%);
margin-top: 0.5em;
padding-top: 0.5em;
margin-left: 2em;
padding-left: 1em;
}
.definition .details {
color: grey;
font-size: 0.8em;
}
</style>
</head>
<body>
<nav>
<ul>
<li><img src="/style/logo-gbif-white.svg" width="67" height="28"/></li>
<li><a href="/">Repository of Schemas</a></li>
</ul>
</nav>
<header>
<img src="/style/logo-gbif.svg" alt="GBIF" width="115" height="46"/>
<h1>Registered Extensions</h1>
</header>
<p>The following extensions are the latest versions of those registered with GBIF for $environment use.</p>
<p>The list was last updated at $last_updated, and is also available in <a href="extensions.json">JSON format</a>.</p>
""")
# Entry: one block per latest extension in the HTML list. Placeholders:
# $identifier (also used as the element id, so it can be targeted by a URL
# fragment), $url, $title, $description, $name, $namespace, $issued, $subject.
HTML_EXTENSION_TEMPLATE_ENTRY = Template("""
<div id="$identifier" class="definition">
<div class="title">
<a href="$url">$title</a>
</div>
<div class="body">
<p>$description</p>
<table class="details">
<tr><th>Name</th><td>$name</td></tr>
<tr><th>Namespace</th><td>$namespace</td></tr>
<tr><th>RowType</th><td>$identifier</td></tr>
<tr><th>Issued</th><td>$issued</td></tr>
<tr><th>Keywords</th><td>$subject</td></tr>
</table>
</div>
</div>
""")
# Footer: closes the <body> and <html> elements opened by the header
# template. It has no placeholders.
HTML_EXTENSION_TEMPLATE_FOOTER = Template("""
</body>
</html>
""")
# Contents of the Apache HTTPD ".asis" files written by processUrls: a
# 303 redirect to $location, labelled with $contentType.
ASIS_TEMPLATE = Template("""Status: 303 See Other
Location: $location
Content-Type: $contentType
See <a href="$location">$location</a>.
""")
class Extension:
    """Metadata describing one extension definition.

    All fields start unset; parseUrl populates them from the attributes of
    the XML document's root element.
    """

    def __init__(self):
        # Attributes are created in a fixed order because the instance
        # __dict__ is what processUrls dumps to the JSON list.
        self.identifier = self.url = self.title = None
        self.description = self.subject = self.issued = None
        self.isLatest = False
        self.namespace = self.name = None

    def __repr__(self):
        shown = (
            self.identifier,
            self.issued,
            self.isLatest,
            self.title,
            self.description,
            self.subject,
        )
        return "EXT {} Issued:{} (latest={}) >>{}<< {} [{}]".format(*shown)
class Vocabulary:
    """Metadata describing one vocabulary (thesaurus) definition.

    All fields start unset; parseUrl populates them from the attributes of
    the XML document's root element.
    """

    def __init__(self):
        # Attributes are created in a fixed order because the instance
        # __dict__ is what processUrls dumps to the JSON list.
        self.identifier = self.url = self.title = None
        self.description = self.subject = self.issued = None
        self.isLatest = False
        self.namespace = self.name = None

    def __repr__(self):
        shown = (
            self.identifier,
            self.issued,
            self.isLatest,
            self.title,
            self.description,
            self.subject,
        )
        return "VOC {} Issued:{} (latest={}) >>{}<< {} [{}] ".format(*shown)
def writeExtensions(env, dir, urls):
    """Write ``extensions.json`` and ``extensions.html`` for the given URLs.

    env  -- PRODUCTION or SANDBOX; forwarded to processUrls.
    dir  -- output directory path, including its trailing slash.
    urls -- URLs of the extension definitions to process.

    Both files are opened with ``with`` so they are closed even when
    processUrls raises or exits, and as UTF-8 so the HTML matches the
    ``charset="utf-8"`` it declares regardless of the system locale.
    """
    with open(dir + 'extensions.json', 'w', encoding='utf-8') as json_out, \
            open(dir + 'extensions.html', 'w', encoding='utf-8') as html_out:
        processUrls(json_out, html_out, urls, env, 'extensions')
def writeVocabs(env, dir, urls):
    """Write ``vocabularies.json`` for the given vocabulary URLs.

    env  -- PRODUCTION or SANDBOX; forwarded to processUrls.
    dir  -- output directory path, including its trailing slash.
    urls -- URLs of the vocabulary definitions to process.

    No HTML list is produced for vocabularies. The file is opened with
    ``with`` so it is closed even when processUrls raises or exits.
    """
    with open(dir + 'vocabularies.json', 'w', encoding='utf-8') as json_out:
        processUrls(json_out, None, urls, env, 'thesauri')
def _mark_latest(objects):
    """Set isLatest on the first object seen for each identifier.

    `objects` must already be ordered newest first, so the first occurrence
    of an identifier is its most recently issued definition.
    """
    seen = set()
    for obj in objects:
        if obj.identifier is not None and obj.identifier not in seen:
            seen.add(obj.identifier)
            obj.isLatest = True
        else:
            print("The extension or vocabulary with URL %s issued %s is deprecated or superseded by one in production" % (obj.url, obj.issued))


def _write_asis_redirect(obj, env):
    """Write the Apache HTTPD .asis file redirecting obj.identifier to obj.url.

    Allows redirects like https://rs.gbif.org/terms/1.0/Distribution →
    https://rs.gbif.org/extension/gbif/1.0/distribution_2022-02-02.xml
    """
    if env != PRODUCTION and not obj.identifier.startswith("http://rs.gbif.org/sandbox/"):
        # Never let a sandbox run write into the production tree.
        print("Refusing to create "+obj.identifier+" from the sandbox, adding sandbox/ to path")
        path = RS_BASE + obj.identifier.replace("http://rs.gbif.org/", "sandbox/")
    else:
        path = RS_BASE + obj.identifier.replace("http://rs.gbif.org/", "")
    # Some identifiers end with /, index.asis will be served
    if path.endswith("/"):
        path = path + "index"
    asisFile = path + ".asis"
    os.makedirs(os.path.dirname(asisFile), exist_ok=True)
    with open(asisFile, 'w') as asis:
        asis.write(ASIS_TEMPLATE.substitute(
            location=obj.url.replace("http://rs.gbif.org", "https://rs.gbif.org"),
            contentType='text/xml'))


def _html_entry_fields(obj):
    """Return the HTML-escaped substitution values for one list entry."""
    # Imported here because processUrls has a parameter named `html`.
    from html import escape
    values = dict(
        identifier=obj.identifier,
        url=obj.url.replace("http://rs.gbif", "https://rs.gbif"),
        title=obj.title,
        description=obj.description,
        name=obj.name,
        namespace=obj.namespace,
        issued=obj.issued,
        subject=obj.subject,
    )
    # The values come from XML attributes, possibly of externally hosted
    # documents, so they are escaped before being placed in markup. str()
    # mirrors what Template.substitute would do with None or a date.
    return {key: escape(str(value)) for key, value in values.items()}


def processUrls(fp, html, urls, env, rootElement):
    """Retrieve a list of objects by their URL, sort them by their issue
    date, update each object indicating if it is the latest issued or
    not, and write each object to the JSON file.

    fp          -- writable text file receiving the JSON document.
    html        -- writable text file receiving the HTML list, or None to
                   skip the HTML output.
    urls        -- URLs of the XML definitions to load with parseUrl.
    env         -- PRODUCTION or SANDBOX; selects the wording of the HTML
                   header and where .asis redirect files are written.
    rootElement -- name of the single top-level key of the JSON document.

    Returns the list of parsed objects, newest first. Objects without an
    identifier are reported and dropped.
    """
    allObjects = []
    for url in urls:
        print("Processing %s" % url)
        obj = parseUrl(url)
        if obj is None:
            continue
        if obj.identifier is not None:
            allObjects.append(obj)
        else:
            print("Missing identifier in %s. Ignore" % url)

    # sort by issued date, starting with newest (stable for equal dates)
    allObjects.sort(key=getIssuedDate, reverse=True)
    _mark_latest(allObjects)

    fp.write('{"%s":[\n' % rootElement)
    if html:
        # Written before the loop so that an empty list still yields a
        # complete document rather than a footer without a header.
        # NOTE(review): %e (space-padded day) is a platform strftime
        # extension and is not available on every OS.
        html.write(HTML_EXTENSION_TEMPLATE_HEADER.substitute(
            last_updated=datetime.datetime.today().strftime("%H:%M:%S on %e %B %Y"),
            environment=('production' if (env == PRODUCTION) else 'development (sandbox)')
        ))

    for index, obj in enumerate(allObjects):
        if index:
            fp.write(',\n')

        if (obj.isLatest and obj.identifier.startswith("http://rs.gbif.org/")
                and "#" not in obj.identifier):
            _write_asis_redirect(obj, env)

        # Write HTML extensions list
        if obj.isLatest and html:
            html.write(HTML_EXTENSION_TEMPLATE_ENTRY.substitute(_html_entry_fields(obj)))

        # Write JSON extensions list
        # name and namespace are used in the HTML list, but not the JSON.
        # They are filtered out of a copy so the returned objects stay intact.
        record = {key: value for key, value in obj.__dict__.items()
                  if key not in ("name", "namespace")}
        json.dump(record, fp, default=json_serial, indent=2)

    fp.write('\n]}')
    if html:
        html.write(HTML_EXTENSION_TEMPLATE_FOOTER.substitute())
    return allObjects
def getIssuedDate(x):
    """Sort key: the object's issued date, or MIN_DATE when it has none."""
    issued = x.issued
    if issued:
        return issued
    return MIN_DATE
def json_serial(obj):
    """JSON serializer for objects not serializable by default JSON code.

    For datetime.date objects (including datetime.datetime), return the ISO
    format, e.g. yyyy-mm-dd.

    Raises TypeError for any other type, as the ``default`` hook of
    json.dump is expected to, instead of returning None and silently
    writing ``null``.
    """
    if isinstance(obj, datetime.date):
        return obj.isoformat()
    raise TypeError("Object of type %s is not JSON serializable" % type(obj).__name__)
def parseUrl(url):
    """Download the XML document at a given URL. Parse the XML and
    construct either an Extension or Vocabulary depending on the
    contents of the XML document. At the end, return the object
    constructed.

    URLs beginning http://rs.gbif.org are instead retrieved
    relative to this script.

    Any failure to fetch or parse the document is reported on stdout with
    a traceback and terminates the program with exit status 1, so the
    function either returns an object or does not return at all.
    """
    try:
        latestUrl = url.replace('http://rs.gbif.org/', "file://"+RS_BASE)
        tree = ElementTree()
        # The context manager closes the response; no explicit close needed.
        with urllib.request.urlopen(latestUrl) as response:
            tree.parse(response)
        doc = tree.getroot()

        if doc.tag == "{%s}extension" % NS_EXT:
            obj = Extension()
            obj.identifier = doc.attrib.get('rowType')
        else:
            obj = Vocabulary()
            obj.identifier = doc.attrib.get('{%s}URI' % NS_DC)

        obj.namespace = doc.attrib.get('namespace')
        obj.name = doc.attrib.get('name')
        obj.url = url
        obj.title = doc.attrib.get('{%s}title' % NS_DC)
        obj.description = doc.attrib.get('{%s}description' % NS_DC)
        obj.subject = doc.attrib.get('{%s}subject' % NS_DC)

        # convert YYYY-MM-DD string date into datetime.date object
        strDate = doc.attrib.get('{%s}issued' % NS_DC)
        if strDate is not None:
            obj.issued = datetime.datetime.strptime(strDate, "%Y-%m-%d").date()
        return obj
    except Exception:
        # Exception (not a bare except) so Ctrl-C and SystemExit propagate
        # instead of being reported as a parse failure.
        print("Oops, can't parse URL %s" % url)
        print("-"*60)
        traceback.print_exc(file = sys.stdout)
        print("-"*60)
        # sys.exit rather than the exit() helper, which the site module
        # only provides for interactive use.
        sys.exit(1)
def listExtensions(basedir, baseurl):
    """Recursively collect the URLs of all extension XML files under basedir.

    basedir -- directory to scan.
    baseurl -- URL corresponding to basedir, ending with "/".

    Entries whose name starts with "." are skipped, sub-directories are
    descended into, and every file ending in ".xml" (case-insensitive) is
    returned as baseurl plus its path relative to basedir.

    Directory entries are visited in sorted order: os.listdir returns them
    in arbitrary order, which would otherwise make the result (and the
    tie-breaking between definitions sharing an issue date) depend on the
    filesystem.
    """
    urls = []
    print("WALK DIR "+basedir)
    for fn in sorted(os.listdir(basedir)):
        if fn.startswith("."):
            continue
        p = os.path.join(basedir, fn)
        if os.path.isdir(p):
            # Built from the joined path so basedir need not end with "/".
            urls.extend(listExtensions(p + "/", baseurl + fn + "/"))
        elif fn.lower().endswith(".xml"):
            url = baseurl + fn
            print(" found extension at "+url)
            urls.append(url)
    return urls
def listExternal(basedir):
    """Return the parsed contents of ``external.json`` found in basedir.

    basedir -- directory path, including its trailing slash.

    The file is read as UTF-8 and closed as soon as it has been parsed.
    The callers in this script concatenate the result with lists of URLs.
    """
    with open(basedir + "external.json", encoding="utf-8") as fp:
        return json.load(fp)
def listVocabularies(basedir, baseurl):
    """Recursively collect the URLs of all vocabulary XML files under basedir.

    basedir -- directory to scan.
    baseurl -- URL corresponding to basedir, ending with "/".

    Entries whose name starts with "." are skipped, sub-directories are
    descended into, and every file ending in ".xml" (case-insensitive) is
    returned as baseurl plus its path relative to basedir.

    Directory entries are visited in sorted order: os.listdir returns them
    in arbitrary order, which would otherwise make the result (and the
    tie-breaking between definitions sharing an issue date) depend on the
    filesystem.
    """
    urls = []
    print("WALK DIR "+basedir)
    for fn in sorted(os.listdir(basedir)):
        if fn.startswith("."):
            continue
        p = os.path.join(basedir, fn)
        if os.path.isdir(p):
            # Built from the joined path so basedir need not end with "/".
            urls.extend(listVocabularies(p + "/", baseurl + fn + "/"))
        elif fn.lower().endswith(".xml"):
            url = baseurl + fn
            print(" found vocabulary at "+url)
            urls.append(url)
    return urls
if __name__ == "__main__":
    # Regenerates the production lists first, then the sandbox lists. The
    # sandbox lists include everything from production plus the sandbox-only
    # definitions.
    print("LOCATED RS.GBIF.ORG FILESYSTEM AT: "+RS_BASE)

    print("UPDATE PRODUCTION EXTENSION FILE")
    externalProd = listExternal(RS_BASE+"extension/")
    urlsCore = listExtensions(RS_BASE+"core/", "http://rs.gbif.org/core/")
    urlsExt = listExtensions(RS_BASE+"extension/", "http://rs.gbif.org/extension/")
    productionExtensions = urlsCore + urlsExt + externalProd
    writeExtensions(PRODUCTION, RS_BASE, productionExtensions)

    print("UPDATE PRODUCTION VOCABULARY FILE")
    urlsVoc = listVocabularies(RS_BASE+"vocabulary/", "http://rs.gbif.org/vocabulary/")
    writeVocabs(PRODUCTION, RS_BASE, urlsVoc)

    print("UPDATE SANDBOX EXTENSION FILE")
    externalDev = listExternal(RS_BASE+"sandbox/extension/")
    urlsSandbox = listExtensions(RS_BASE+"sandbox/extension/", "http://rs.gbif.org/sandbox/extension/")
    urlsSandboxCore = listExtensions(RS_BASE+"sandbox/core/", "http://rs.gbif.org/sandbox/core/")
    # List order matters: it decides which definition wins when several
    # share an identifier and an issue date.
    sandboxExtensions = (urlsCore + urlsExt + urlsSandbox
                         + externalProd + externalDev + urlsSandboxCore)
    writeExtensions(SANDBOX, RS_BASE+"sandbox/", sandboxExtensions)

    print("UPDATE SANDBOX VOCABULARY FILE")
    urlsVoc2 = listVocabularies(RS_BASE+"sandbox/vocabulary/", "http://rs.gbif.org/sandbox/vocabulary/")
    writeVocabs(SANDBOX, RS_BASE+"sandbox/", urlsVoc + urlsVoc2)