forked from ucldc/nuxeo_merritt
-
Notifications
You must be signed in to change notification settings - Fork 0
/
merritt-atom.py
413 lines (329 loc) · 16.3 KB
/
merritt-atom.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys, os
import argparse
from lxml import etree
from pynux import utils
from datetime import datetime
import dateutil.tz
import pprint
import urlparse
from deepharvest.deepharvest_nuxeo import DeepHarvestNuxeo
from os.path import expanduser
import codecs
import json
import requests
import boto
import logging
""" Given the Nuxeo document path for a collection folder, publish ATOM feed for objects for Merritt harvesting. """
pp = pprint.PrettyPrinter()
ATOM_NS = "http://www.w3.org/2005/Atom"
DC_NS = "http://purl.org/dc/elements/1.1/"
NX_NS = "http://www.nuxeo.org/ecm/project/schemas/tingle-california-digita/ucldc_schema"
NS_MAP = {None: ATOM_NS,
"nx": NX_NS,
"dc": DC_NS}
# we want to implement this mapping in the Registry:
MERRITT_ID_MAP = {'asset-library/UCM': 'ark:/13030/m5b58sn8',
'asset-library/UCSF/School_of_Dentistry_130': 'ark:/13030/m5xp9hp7',
'asset-library/UCSF/A_History_of_UCSF': 'ark:/13030/m5sx8rx2',
'asset-library/UCSF/30th_General_Hospital': 'ark:/13030/m5p58150',
'asset-library/UCSF/Day_Robert_L_Collection': 'ark:/13030/m5dn6hr0',
'asset-library/UCSF/Photograph_collection': 'ark:/13030/m5jd78gq',
'asset-library/UCSF/JapaneseWoodblocks': 'ark:/13030/m58w5s1p',
'asset-library/UCB/UCB\ EDA': 'ark:/13030/m500292r',
'asset-library/UCR': 'ark:/13030/m5qg11t8',
'asset-library/UCSC': 'ark:/13030/m5kq0912'}
REGISTRY_API_BASE = 'https://registry.cdlib.org/api/v1/'
BUCKET = 'static.ucldc.cdlib.org/merritt' # FIXME put this in a conf file
FEED_BASE_URL = 'https://s3.amazonaws.com/{}/'.format(BUCKET)
'''
# following is mapping from Adrian. All are in Nuxeo except for UCSF Library Legacy Tobacco Documents Library
ark:/13030/m5b58sn8 University of California, Merced Library Nuxeo collections
ark:/13030/m52c19rr UCSF Library Legacy Tobacco Documents Library
ark:/13030/m5xp9hp7 UCSF Library School of Dentistry 130th Anniversary
ark:/13030/m5sx8rx2 UCSF Library A History of UCSF
ark:/13030/m5p58150 UCSF Library 30th General Hospital
ark:/13030/m5dn6hr0 UCSF Library Robert L. Day Image Collection
ark:/13030/m5jd78gq UCSF Library Photograph Collection
ark:/13030/m58w5s1p UCSF Library Japanese Woodblock Print Collection
ark:/13030/m500292r UC Berkeley Environmental Design Archives Nuxeo collections
ark:/13030/m5qg11t8 UC Riverside Nuxeo collections
ark:/13030/m5kq0912 UC Santa Cruz Nuxeo collections
'''
class MerrittAtom():
    """ Publish an ATOM feed of a Nuxeo collection's objects for Merritt harvesting.

    Given a UCLDC registry collection ID, looks up the collection's Nuxeo
    path (via the registry API) and Merritt collection ARK, builds ATOM
    <entry> elements for each object, and stashes the finished feed file
    in an S3 bucket.
    """

    def __init__(self, collection_id, pynuxrc=''):
        # collection_id -- UCLDC registry collection ID
        # pynuxrc       -- optional path to a pynux rc file with Nuxeo credentials
        self.logger = logging.getLogger(__name__)
        # Prefer an explicitly supplied rc file; otherwise fall back to
        # ~/.pynuxrc when it exists.
        # NOTE(review): if neither source is available, self.nx is never
        # assigned and later method calls will raise AttributeError.
        if pynuxrc:
            self.nx = utils.Nuxeo(rcfile=open(pynuxrc,'r'))
        elif not(pynuxrc) and os.path.isfile(expanduser('~/.pynuxrc')):
            self.nx = utils.Nuxeo(rcfile=open(expanduser('~/.pynuxrc'),'r'))
        self.collection_id = collection_id
        self.path = self._get_nuxeo_path()
        self.merritt_id = self.get_merritt_id(self.path)
        self.atom_file = self._get_filename(self.collection_id)
        if not self.atom_file:
            raise ValueError("Could not create filename for ATOM feed based on collection id: {}".format(self.collection_id))
        # Public URL at which the published feed will be available on S3.
        self.s3_url = "{}{}".format(FEED_BASE_URL, self.atom_file)

    def get_merritt_id(self, path):
        ''' given the Nuxeo path, get corresponding Merritt collection ID '''
        merritt_id = None
        path = path.lstrip('/')
        # Walk the path upward one directory at a time, stopping once only
        # a single segment remains, and remember any match found in
        # MERRITT_ID_MAP. Because shallower prefixes are checked later,
        # the shallowest matching prefix is what ends up being returned.
        while len(path.split('/')) > 1:
            if path in MERRITT_ID_MAP:
                merritt_id = MERRITT_ID_MAP[path]
            path = os.path.dirname(path)
        return merritt_id

    def _get_nuxeo_path(self):
        ''' given ucldc registry collection ID, get Nuxeo path for collection '''
        url = "{}collection/{}/?format=json".format(REGISTRY_API_BASE, self.collection_id)
        res = requests.get(url)
        res.raise_for_status()
        md = json.loads(res.text)
        # The registry stores the Nuxeo folder path in 'harvest_extra_data'.
        nuxeo_path = md['harvest_extra_data']
        return nuxeo_path

    def _get_filename(self, collection_id):
        ''' given Collection ID, get a friendly filename for the ATOM feed '''
        filename = 'ucldc_collection_{}.atom'.format(collection_id)
        return filename

    def _extract_nx_metadata(self, uid):
        ''' extract Nuxeo metadata we want to post to the ATOM feed

        Returns a dict with keys: creator (list of names), title, date
        (first date or None), id (ucldc_schema:identifier), collection
        (first collection value or None).
        '''
        raw_metadata = self.nx.get_metadata(uid=uid)
        metadata = {}

        # creator
        creators = raw_metadata['properties']['ucldc_schema:creator']
        metadata['creator'] = [creator['name'] for creator in creators]

        # title
        metadata['title'] = raw_metadata['title']

        # date -- only the first listed date is published in the feed
        dates = raw_metadata['properties']['ucldc_schema:date']
        dates = [date['date'] for date in dates]
        metadata['date'] = dates[0] if dates else None

        # nuxeo id
        metadata['id'] = raw_metadata['properties']['ucldc_schema:identifier']

        # nuxeo collection
        metadata['collection'] = raw_metadata['properties']['ucldc_schema:collection'][0] if raw_metadata['properties']['ucldc_schema:collection'] else None

        return metadata

    def _construct_entry(self, uid, is_parent):
        ''' construct ATOM feed entry element for a given nuxeo doc

        is_parent -- True for top-level objects; parents additionally get a
                     link to their deep-harvest media.json.
        '''
        nx_metadata = self._extract_nx_metadata(uid)
        entry = etree.Element(etree.QName(ATOM_NS, "entry"))
        entry = self._populate_entry(entry, nx_metadata, uid, is_parent)
        return entry

    def _add_atom_elements(self, doc):
        ''' add atom feed elements to document '''
        # Each element is inserted at position 0, so they end up in reverse
        # of the order added here: id, title, author.

        # recommended ATOM feed elements
        feed_author = etree.Element(etree.QName(ATOM_NS, "author"))
        feed_author.text = "UC Libraries Digital Collection"
        doc.insert(0, feed_author)

        # required ATOM feed elements
        feed_title = etree.Element(etree.QName(ATOM_NS, "title"))
        feed_title.text = "UCLDC Metadata Feed" # FIXME get campus name from registry API?
        doc.insert(0, feed_title)

        feed_id = etree.Element(etree.QName(ATOM_NS, "id"))
        feed_id.text = self.s3_url
        doc.insert(0, feed_id)

        return doc

    def _add_feed_updated(self, doc, updated):
        ''' add feed updated '''
        feed_updated = etree.Element(etree.QName(ATOM_NS, "updated"))
        feed_updated.text = updated
        doc.insert(0, feed_updated)

    def _add_collection_alt_link(self, doc, path):
        ''' add elements related to Nuxeo collection info to document '''
        collection_metadata = self.nx.get_metadata(path=path)
        collection_title = collection_metadata['title']
        collection_uid = collection_metadata['uid']
        collection_uri = self.get_object_view_url(collection_uid)

        feed_link_alt = etree.Element(etree.QName(ATOM_NS, "link"), rel="alternate", href=collection_uri, title=collection_title)
        doc.insert(0, feed_link_alt)

        return doc

    def _add_paging_info(self, doc):
        ''' add rel links for paging '''
        # this is just dumb for now -- the feed is a single page, so
        # self/first/last all point at the same URL.
        last_link = etree.Element(etree.QName(ATOM_NS, "link"), rel="last", href=self.s3_url)
        doc.insert(0, last_link)

        first_link = etree.Element(etree.QName(ATOM_NS, "link"), rel="first", href=self.s3_url)
        doc.insert(0, first_link)

        self_link = etree.Element(etree.QName(ATOM_NS, "link"), rel="self", href=self.s3_url)
        doc.insert(0, self_link)

    def _add_merritt_id(self, doc, merritt_collection_id):
        ''' add Merritt ID '''
        merritt_id = etree.Element(etree.QName(ATOM_NS, "merritt_collection_id"))
        merritt_id.text = merritt_collection_id
        doc.insert(0, merritt_id)

    def _populate_entry(self, entry, metadata, nxid, is_parent):
        ''' get <entry> element for a given set of object metadata '''
        # atom id (URI)
        nuxeo_object_view_url = self.get_object_view_url(nxid)
        atom_id = etree.SubElement(entry, etree.QName(ATOM_NS, "id"))
        atom_id.text = nuxeo_object_view_url

        # atom title
        atom_title = etree.SubElement(entry, etree.QName(ATOM_NS, "title"))
        atom_title.text = metadata["title"]

        # atom updated -- stamped with "now"; the last entry processed also
        # becomes the feed-level <updated> value via self.last_update.
        atom_updated = etree.SubElement(entry, etree.QName(ATOM_NS, "updated"))
        atom_updated.text = datetime.now(dateutil.tz.tzutc()).isoformat()
        self.last_update = atom_updated.text

        # atom author
        atom_author = etree.SubElement(entry, etree.QName(ATOM_NS, "author"))
        atom_author.text = "UC Libraries Digital Collection"

        # atom links - Merritt is reading the component objects from here
        full_metadata_url = self.get_full_metadata(nxid)
        link_md = etree.SubElement(entry, etree.QName(ATOM_NS, "link"), rel="alternate", href=full_metadata_url, type="application/xml", title="Full metadata for this object from Nuxeo")

        # Only parent objects carry a pointer to their deep-harvest
        # media.json structure.
        if is_parent:
            media_json_url = self.get_media_json_url(nxid)
            link_media_json = etree.SubElement(entry, etree.QName(ATOM_NS, "link"), rel="alternate", href=media_json_url, type="application/json", title="Deep Harvest metadata for this object")

        # NOTE(review): nxpath is assigned below but never used.
        nx_metadata = self.nx.get_metadata(uid=nxid)
        nxpath = nx_metadata['path']
        nuxeo_file_download_url = self.get_object_download_url(nx_metadata)
        link_object_file = etree.SubElement(entry, etree.QName(ATOM_NS, "link"), rel="alternate", href=nuxeo_file_download_url, title="Main content file") # FIXME add content_type

        # one <link> per auxiliary (attachment/extra) file
        aux_file_urls = self.get_aux_file_urls(nx_metadata)
        for af in aux_file_urls:
            link_aux_file = etree.SubElement(entry, etree.QName(ATOM_NS, "link"), rel="alternate", href=af, title="Auxiliary file")

        # dc creator
        for creator_name in metadata['creator']:
            dc_creator = etree.SubElement(entry, etree.QName(DC_NS, "creator"))
            dc_creator.text = creator_name

        # dc title
        dc_title = etree.SubElement(entry, etree.QName(DC_NS, "title"))
        dc_title.text = metadata['title']

        # dc date
        dc_date = etree.SubElement(entry, etree.QName(DC_NS, "date"))
        dc_date.text = metadata['date']

        # dc identifier (a.k.a. local identifier) - Nuxeo ID
        nuxeo_identifier = etree.SubElement(entry, etree.QName(DC_NS, "identifier"))
        nuxeo_identifier.text = nxid

        # UCLDC identifier (a.k.a. local identifier) - ucldc_schema:identifier -- this will be the ARK if we have it
        if metadata['id']:
            ucldc_identifier = etree.SubElement(entry, etree.QName(NX_NS, "identifier"))
            ucldc_identifier.text = metadata['id']

        # UCLDC collection identifier
        ucldc_collection_id = etree.SubElement(entry, etree.QName(NX_NS, "collection"))
        ucldc_collection_id.text = metadata['collection']

        return entry

    def _write_feed(self, doc):
        ''' publish feed

        Serializes the feed tree to self.atom_file. NOTE(review): under
        Python 2, tostring() returns a byte string, so mode "w" is fine;
        under Python 3 this would need mode "wb".
        '''
        feed = etree.ElementTree(doc)
        feed_string = etree.tostring(feed, pretty_print=True, encoding='utf-8', xml_declaration=True)

        with open(self.atom_file, "w") as f:
            f.write(feed_string)

    def _s3_stash(self):
        """ Stash file in S3 bucket.

        Creates the bucket if it does not exist; uploads (or re-uploads)
        the feed file with an application/xml content type.
        """
        # NOTE(review): bucketpath is assigned but never used.
        s3_url = 's3://{}/{}'.format(BUCKET, self.atom_file)
        bucketpath = BUCKET.strip("/")
        bucketbase = BUCKET.split("/")[0]

        parts = urlparse.urlsplit(s3_url)
        mimetype = 'application/xml'

        conn = boto.connect_s3()

        try:
            bucket = conn.get_bucket(bucketbase)
        except boto.exception.S3ResponseError:
            bucket = conn.create_bucket(bucketbase)
            self.logger.info("Created S3 bucket {}".format(bucketbase))

        if not(bucket.get_key(parts.path)):
            key = bucket.new_key(parts.path)
            key.set_metadata("Content-Type", mimetype)
            key.set_contents_from_filename(self.atom_file)
            msg = "created {0}".format(s3_url)
            self.logger.info(msg)
        else:
            key = bucket.get_key(parts.path)
            key.set_metadata("Content-Type", mimetype)
            key.set_contents_from_filename(self.atom_file)
            msg = "re-uploaded {}".format(s3_url)
            self.logger.info(msg)

    def get_object_view_url(self, nuxeo_id):
        """ Get object view URL """
        parts = urlparse.urlsplit(self.nx.conf["api"])
        url = "{}://{}/Nuxeo/nxdoc/default/{}/view_documents".format(parts.scheme, parts.netloc, nuxeo_id)
        return url

    def get_full_metadata(self, nuxeo_id):
        """ Get full metadata via Nuxeo API """
        parts = urlparse.urlsplit(self.nx.conf["api"])
        url = '{}://{}/Merritt/{}.xml'.format(parts.scheme, parts.netloc, nuxeo_id)
        return url

    def get_object_download_url(self, metadata):
        ''' given the full metadata for an object, get file download url

        Returns None when the object has no main content file.
        '''
        try:
            file_content = metadata['properties']['file:content']
        except KeyError:
            raise KeyError("Nuxeo object metadata does not contain 'properties/file:content' element. Make sure 'X-NXDocumentProperties' provided in pynux conf includes 'file'")

        if file_content is None:
            return None
        else:
            url = file_content['data']
            # make available via basic auth -- the capitalized /Nuxeo/
            # endpoint is the basic-auth-protected mirror of /nuxeo/
            url = url.replace('/nuxeo/', '/Nuxeo/')
            return url

    def get_media_json_url(self, nuxeo_id):
        """ Get media.json (deep harvest) url """
        # https://s3.amazonaws.com/static.ucldc.cdlib.org/media_json/002130a5-e171-461b-a41b-28ab46af9652-media.json
        url = "https://s3.amazonaws.com/static.ucldc.cdlib.org/media_json/{}-media.json".format(nuxeo_id)
        return url

    def get_aux_file_urls(self, metadata):
        ''' get auxiliary file urls

        Collects download URLs for "attachment" (files:files) and
        "extra_file" (extra_files:file) blobs, rewritten to the
        basic-auth /Nuxeo/ endpoint.
        '''
        urls = []

        # get any "attachment" files
        if metadata['properties']['files:files']:
            attachments = metadata['properties']['files:files']
            for attachment in attachments:
                url = attachment['file']['data']
                url = url.replace('/nuxeo/', '/Nuxeo/')
                urls.append(url)

        # get any "extra_file" files
        if metadata['properties']['extra_files:file']:
            for extra_file in metadata['properties']['extra_files:file']:
                url = extra_file['blob']['data']
                url = url.replace('/nuxeo/', '/Nuxeo/')
                urls.append(url)

        return urls
def main(argv=None):
parser = argparse.ArgumentParser(description='Create ATOM feed for a given Nuxeo folder for Merritt harvesting')
parser.add_argument("collection", help="UCLDC Registry Collection ID")
parser.add_argument("--pynuxrc", help="rc file for use by pynux")
if argv is None:
argv = parser.parse_args()
collection_id = argv.collection
if argv.pynuxrc:
ma = MerrittAtom(collection_id, argv.pynuxrc)
else:
ma = MerrittAtom(collection_id)
print "atom_file: {}".format(ma.atom_file)
print "ma.path: {}".format(ma.path)
if argv.pynuxrc:
dh = DeepHarvestNuxeo(ma.path, '', pynuxrc=argv.pynuxrc)
else:
dh = DeepHarvestNuxeo(ma.path, '')
print "Nuxeo path: {}".format(ma.path)
print "Fetching Nuxeo docs. This could take a while if collection is large..."
documents = dh.fetch_objects()
# create root
root = etree.Element(etree.QName(ATOM_NS, "feed"), nsmap=NS_MAP)
# add entries
for document in documents:
nxid = document['uid']
print "working on document: {} {}".format(nxid, document['path'])
# parent
entry = ma._construct_entry(nxid, True)
print "inserting entry for parent object {} {}".format(nxid, document['path'])
root.insert(0, entry)
# children
component_entries = [ma._construct_entry(c['uid'], False) for c in dh.fetch_components(document)]
for ce in component_entries:
print "inserting entry for component: {} {}".format(nxid, document['path'])
root.insert(0, ce)
# add header info
print "Adding header info to xml tree"
ma._add_merritt_id(root, ma.merritt_id)
ma._add_paging_info(root)
ma._add_collection_alt_link(root, ma.path)
ma._add_atom_elements(root)
ma._add_feed_updated(root, ma.last_update)
ma._write_feed(root)
print "Feed written to file: {}".format(ma.atom_file)
ma._s3_stash()
print "Feed stashed on s3: {}".format(ma.s3_url)
if __name__ == "__main__":
sys.exit(main())