Skip to content

Commit

Permalink
New upload version method for dvdata allows on the fly metadata changes
Browse files Browse the repository at this point in the history
  • Loading branch information
plesubc committed Aug 8, 2023
1 parent cf8d0d9 commit 314a45d
Show file tree
Hide file tree
Showing 3 changed files with 101 additions and 39 deletions.
2 changes: 1 addition & 1 deletion src/dataverse_utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,5 @@
'''
from dataverse_utils.dataverse_utils import *

VERSION = (0,9,2)
VERSION = (0,10,2)
__version__ = '.'.join([str(x) for x in VERSION])
96 changes: 96 additions & 0 deletions src/dataverse_utils/dvdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,56 @@ def __init__(self, pid: str,
self['file_info'] = self['orig_json']['files']
self['file_ids'] = [x['dataFile'].get('id') for x in self['orig_json']['files']]
self['file_persistentIds'] = self._get_file_pids()
self['target_version'] = None
if not self['target_version']:
self['target_version'] = Study.get_version(url)

@classmethod
def get_version(cls, url: str, timeout: int = 100) -> float:
    '''
    Returns a float representing a Dataverse version number.

    Each dot-separated component of the version string is divided by
    10**(3*position) and the results are summed, ie roughly:
    float(f'{major_version}.{minor_version:03d}{patch:03d}')
    so version 5.9.2 would be (approximately) 5.009002.
    Anything after a '-' in a component (eg. the '-SP' suffix) is ignored.

    url : str
        URL of base Dataverse instance. eg: 'https://abacus.library.ubc.ca'
    timeout : int
        Request timeout in seconds

    Raises requests.exceptions.HTTPError if the version endpoint
    responds with an error status; the original exception is re-raised
    after logging so its message and response are preserved.
    '''
    ver = requests.get(f'{url}/api/info/version', timeout=timeout)
    try:
        ver.raise_for_status()
    except requests.exceptions.HTTPError as exc:
        # Lazy %-formatting so the URL really ends up in the log message
        LOGGER.error('Error getting version for %s', url)
        LOGGER.exception(exc)
        LOGGER.exception(traceback.format_exc())
        # Bare raise keeps the original HTTPError (message + response)
        raise
    #Scholars Portal version is formatted as v5.13.9-SP, so. . .
    parts = ver.json()['data']['version'].strip('v ').split('.')
    parts = [part.split('-')[0] for part in parts]
    #it's 3*pos in case for some reason we hit, say v5.99.99 and there's more before v6.
    return sum(int(num) / 10 ** (3 * pos) for pos, num in enumerate(parts))

def set_version(self, url: str, timeout: int = 100) -> None:
    '''
    Looks up the version of the *target* Dataverse instance, stores it
    (a float, as returned by Study.get_version) in self['target_version']
    *AND* applies the metadata fixes that version requires.

    url : str
        URL of *target* Dataverse instance
    timeout : int
        request timeout in seconds
    '''
    target = Study.get_version(url, timeout)
    self['target_version'] = target
    # (minimum version, fix to apply) pairs, run in this order
    version_fixes = (
        (5.010, self.fix_licence),
        (5.013, self.production_location),
    )
    for minimum, apply_fix in version_fixes:
        if target >= minimum:
            apply_fix()

def _orig_json(self) -> dict:
'''
Expand Down Expand Up @@ -78,6 +128,52 @@ def _get_file_pids(self)->list:
return None
return pids

######
#JSON metadata fixes for different versions
######
def fix_licence(self) -> None:
    '''
    With Dataverse v5.10+, a licence type of 'NONE' is now forbidden.
    Now, as per
    <https://guides.dataverse.org/en/5.14/api/sword.html?highlight=invalid%20license>,
    non-standard licences may be replaced with None.

    Also fills in an empty or missing 'termsOfUse' with 'Not available'.
    Missing 'license' / 'termsOfUse' keys are tolerated rather than
    raising KeyError.

    This function edits the same Study object *in place*, so returns nothing.
    '''
    dataset_version = self['upload_json']['datasetVersion']
    if dataset_version.get('license') == 'NONE':
        dataset_version['license'] = None

    if not dataset_version.get('termsOfUse'):
        #This shouldn't happen, but UBC has datasets from the early 1970s
        dataset_version['termsOfUse'] = 'Not available'

def production_location(self) -> None:
    '''
    Changes "multiple" to True where typeName == 'productionPlace' in
    Study['upload_json'], wrapping the existing value in a list.
    Changes are done *in place*.
    This change came into effect with Dataverse v5.13

    Only the first 'productionPlace' field is examined, and it is left
    untouched if it is already flagged as multiple.
    '''
    #Target shape:
    #{'typeName': 'productionPlace', 'multiple': True, 'typeClass': 'primitive',
    #'value': ['Vancouver, BC', 'Ottawa, ON']}
    fields = (self['upload_json']['datasetVersion']
              ['metadataBlocks']['citation']['fields'])
    for field in fields:
        if field['typeName'] != 'productionPlace':
            continue
        # Working on the field itself (not a truthiness-tested index) means
        # a productionPlace in position 0 is handled as well.
        if not field['multiple']:
            field['multiple'] = True
            field['value'] = [field['value']]
        break

class File(dict):
'''
Class representing a file on a Dataverse instance
Expand Down
42 changes: 4 additions & 38 deletions src/dataverse_utils/scripts/dv_study_migrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import dataverse_utils
import dataverse_utils.dvdata

VERSION = (0, 4, 0)
VERSION = (0, 4, 1)
__version__ = '.'.join([str(x) for x in VERSION])

def parsley() -> argparse.ArgumentParser():
Expand Down Expand Up @@ -123,41 +123,7 @@ def remove_target_files(record:dataverse_utils.dvdata.Study, timeout:int=100):
file = sys.stderr)
sys.exit()

def fix_licence(badstudy: dataverse_utils.dvdata.Study) -> None:
    '''
    With Dataverse v5.10+, a licence type of 'NONE' is now forbidden.
    Now, as per <https://guides.dataverse.org/en/5.14/api/sword.html?highlight=invalid%20license>,
    non-standard licences may be replaced with None.

    Also fills in an empty or missing 'termsOfUse' with 'Not available'.
    Missing 'license' / 'termsOfUse' keys are tolerated rather than
    raising KeyError.

    This function edits the same Study object *in place*, so returns nothing.
    '''
    dataset_version = badstudy['upload_json']['datasetVersion']
    if dataset_version.get('license') == 'NONE':
        dataset_version['license'] = None

    if not dataset_version.get('termsOfUse'):
        #This shouldn't happen, but UBC has datasets from the early 1970s
        dataset_version['termsOfUse'] = 'Not available'

def check_dv_ver(url: str, timeout=100) -> bool:
    '''
    Returns True if Dataverse >= v5.10. Useful if there are issues with licence fields.

    url : str
        Base URL of the Dataverse instance to query
    timeout : int
        Request timeout in seconds

    Prints a message to stderr and exits (sys.exit) if the version
    endpoint responds with an HTTP error status.
    '''
    ver = requests.get(f'{url}/api/info/version', timeout=timeout)
    try:
        ver.raise_for_status()
    except requests.exceptions.HTTPError:
        # f-string (not r-string) so the URL is actually shown
        print(f'Error getting version for {url}', file=sys.stderr)
        sys.exit()
    #Versions may carry a suffix, eg. v5.13.9-SP; drop anything after '-'
    parts = ver.json()['data']['version'].strip('v ').split('.')
    parts = [part.split('-')[0] for part in parts]
    #it's 3*pos in case for some reason we hit, say v5.99.99 and there's more before v6.
    version = sum(int(num) / 10 ** (3 * pos) for pos, num in enumerate(parts))
    # 5.010 is really v5.10.0, ie 5 + .010 + 000000
    return version >= 5.010

def main():
'''
Expand All @@ -169,9 +135,9 @@ def main():

studs = [dataverse_utils.dvdata.Study(x, args.source_url, args.source_key)
for x in args.pids]
if check_dv_ver(args.target_url):
for bad in studs:
fix_licence(bad)
for stud in studs:
stud.set_version(args.target_url)

if args.collection:
for stud in studs:
upload = requests.post(f'{args.target_url}/api/dataverses/{args.collection}/datasets',
Expand Down

0 comments on commit 314a45d

Please sign in to comment.