From 3d617f09770c26819ec19fd87563732f50ea015f Mon Sep 17 00:00:00 2001
From: Matthew Templeton
Date: Thu, 4 Apr 2024 15:58:14 -0400
Subject: [PATCH 01/13] Refactored run.py

	modified:   config.py
	modified:   requirements.txt
	modified:   run.py
---
 config.py        |   2 +-
 requirements.txt |   4 +-
 run.py           | 227 +++++++++++++++++++++++++++++------------------
 3 files changed, 144 insertions(+), 89 deletions(-)

diff --git a/config.py b/config.py
index 30bb4e1..9665c37 100644
--- a/config.py
+++ b/config.py
@@ -1,4 +1,4 @@
-LOG_LEVEL="WARN"
+LOGGING_LEVEL="DEBUG"
 LOG_STDOUT=True
 
 _API_TOKEN=None
diff --git a/requirements.txt b/requirements.txt
index e4cbec2..ad98e9b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
-git+https://github.com/adsabs/ADSIngestParser@v0.9.13
-git+https://github.com/adsabs/ADSIngestEnrichment@v0.9.7
+git+https://github.com/adsabs/ADSIngestParser@v0.9.16
+git+https://github.com/seasidesparrow/ADSIngestEnrichment@wiley_spie.20240402
 adsputils==1.5.2
 habanero==0.7.4
 namedentities==1.9.4
diff --git a/run.py b/run.py
index 0059847..52f3b48 100644
--- a/run.py
+++ b/run.py
@@ -9,7 +9,8 @@
 from adsingestp.parsers.elsevier import ElsevierParser
 from adsingestp.parsers.adsfeedback import ADSFeedbackParser
 from adsingestp.parsers.copernicus import CopernicusParser
-from adsputils import setup_logging
+from adsingestp.parsers.wiley import WileyParser
+from adsputils import load_config, setup_logging
 from datetime import datetime, timedelta
 from glob import iglob
 
@@ -20,9 +21,18 @@
     'elsevier': ElsevierParser(),
     'feedback': ADSFeedbackParser(),
     'copernicus': CopernicusParser(),
+    'wiley': WileyParser(),
 }
 
-logger = setup_logging('manual-parser')
+proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), "./"))
+conf = load_config(proj_home=proj_home)
+logger = setup_logging(
+    "run.py",
+    proj_home=proj_home,
+    level=conf.get("LOGGING_LEVEL", "INFO"),
+    attach_stdout=conf.get("LOG_STDOUT", False),
+)
+
 
 
 def get_args():
@@ -81,8 +91,8 @@ def get_args():
                         '--file_type',
                         dest='file_type',
                         action='store',
-                        default='jats',
-                        help='Type of input file: jats, dc, cr, nlm, elsevier, feedback')
+                        default=None,
+                        help='Type of input file: jats, dc, cr, nlm, elsevier, feedback, copernicus, wiley')
 
     parser.add_argument('-w',
                         '--write_refs',
@@ -116,7 +126,6 @@ def get_args():
     args = parser.parse_args()
     return args
 
-
 def create_tagged(rec=None, args=None):
     try:
         xlator = translator.Translator()
@@ -137,99 +146,145 @@ def create_refs(rec=None, args=None):
     except Exception as err:
         logger.warning("Unable to write references: %s" % err)
 
+def write_record(record, args):
+    if args.output_file:
+        tagged = create_tagged(rec=record, args=args)
+        if tagged:
+            with open(args.output_file, "a") as fout:
+                fout.write("%s\n" % tagged)
+            if args.write_refs:
+                create_refs(rec=record, args=args)
+        else:
+            raise Exception("Tagged record not generated.")
+    else:
+        raise Exception("Output_file not defined, no place to write records to!")
+
+
+def parse_record(rec):
+    pdata = rec.get('data', None)
+    ptype = rec.get('type', None)
+    filename = rec.get('name', None)
+    parser = PARSER_TYPES.get(ptype, None)
+    write_file = hasbody.has_body(pdata)
+    parsedrecord = None
+    if not parser:
+        logger.error("No parser available for file_type '%s'." % ptype)
+    else:
+        try:
+            parser.__init__()
+            if ptype == 'nlm':
+                parsedrecord = parser.parse(pdata, bsparser='lxml-xml')
+            else:
+                parsedrecord = parser.parse(pdata)
+            if parsedrecord:
+                if filename:
+                    if not parsedrecord.get("recordData", {}).get("loadLocation", None):
+                        parsedrecord["recordData"]["loadLocation"] = filename
+                if not write_file:
+                    parsedrecord["recordData"]["loadLocation"] = None
+            else:
+                raise Exception("Null body returned by parser!")
+        except Exception as err:
+            logger.warning("Error parsing record (%s): %s" % (filename,err))
+    return parsedrecord
 
 
-def main():
-
-    args = get_args()
-    rawDataList = []
-    ingestDocList = []
-    # This route processes data from user-input files
-    if args.proc_path:
-        infiles = iglob(args.proc_path, recursive=True)
-        if infiles and args.proc_since:
-            dtime = timedelta(days=int(args.proc_since))
-            today = datetime.today()
-            infiles_since = [x for x in infiles if ((today - datetime.fromtimestamp(os.path.getmtime(x))) <= dtime)]
-            infiles = infiles_since
-        for f in infiles:
+def process_record(rec, args):
+    try:
+        parsedRecord = parse_record(rec)
+        if not parsedRecord:
+            logger.error("Parsing yielded no data for %s" % rec.get("name", None))
+        else:
             try:
-                with open(f, 'r') as fin:
-                    output = {'data': fin.read(),
-                              'name': f,
-                              'type': args.file_type}
-                rawDataList.append(output)
+                write_record(parsedRecord, args)
             except Exception as err:
-                logger.warning("Failed to import %s: %s" % (f, err))
+                logger.error("Classic tagger did not generate a tagged record for %s" % f)
+            else:
+                logger.debug("Successfully processed %s with %s" % (rec.get("name", None), str(args)))
+    except Exception as err:
+        logger.error("Error parsing and processing record %s: %s" % (rec.get("name", None), err))
 
-    # This route fetches data from Crossref via the Habanero module
-    elif args.fetch_doi:
-        try:
-            getdoi = doiharvest.DoiHarvester(doi=args.fetch_doi)
-            output = {'data': getdoi.get_record(),
-                      'type': 'cr'}
-            rawDataList.append(output)
-        except Exception as err:
-            logger.warning("Failed to fetch DOI %s: %s" % (args.fetch_doi,err))
-
-    elif args.fetch_doi_list:
-        try:
-            with open(args.fetch_doi_list, 'r') as fin:
-                for l in fin.readlines():
-                    fetch_doi = l.strip()
-                    getdoi = None
-                    output = None
+def process_filepath(args):
+    if args.proc_path:
+        logger.debug("Finding files in path %s ..." % args.proc_path)
+        infiles = [x for x in iglob(args.proc_path, recursive=True)]
+        if not infiles:
+            logger.warning("No files found in path %s." % args.proc_path)
+        else:
+            count = len(infiles)
+            print("haha %s" % str(count))
+            logger.debug("Found %s files." % count)
+            if args.proc_since:
+                dtime = timedelta(days=int(args.proc_since))
+                today = datetime.today()
+                infiles_since = [x for x in infiles if ((today - datetime.fromtimestamp(os.path.getmtime(x))) <= dtime)]
+                infiles = infiles_since
+                if not infiles:
+                    logger.error("No files more recent than %s days old" % str(args.proc_since))
+                else:
+                    nfiles = len(infiles)
+                    logger.info("There were %s files found in path." % str(nfiles))
+                    for f in infiles:
+                        inputRecord = {}
                         try:
-                            getdoi = doiharvest.DoiHarvester(doi=fetch_doi)
-                            output = {'data': getdoi.get_record(),
-                                      'type': 'cr'}
-                            rawDataList.append(output)
+                            with open(f, 'r') as fin:
+                                inputRecord = {'data': fin.read(),
+                                               'name': f,
+                                               'type': args.file_type}
                         except Exception as err:
-                            logger.warning("Failed to fetch DOI %s: %s" % (fetch_doi,err))
-        except Exception as err:
-            logger.error("Failed to read %s: %s" % (args.fetch_doi_list, err))
-
-    # Now process whatever raw records you have
-    for rec in rawDataList:
-        pdata = rec.get('data', None)
-        ptype = rec.get('type', None)
-        filename = rec.get('name', None)
-        parser = PARSER_TYPES.get(ptype, None)
-        write_file = hasbody.has_body(pdata)
-        if parser:
+                            logger.warning("Failed to read input file %s: %s" % (f, err))
+                        else:
+                            process_record(inputRecord, args)
+    else:
+        logger.warning("Null processing path given, nothing processed.")
+
+
+def process_doilist(doilist, args):
+    if doilist:
+        ptype = args.file_type
+        if not ptype:
+            ptype = 'cr'
+        for d in doilist:
             try:
-                parser.__init__()
-                parsedrecord = None
-                if ptype == 'nlm':
-                    parsedrecord = parser.parse(pdata, bsparser='lxml-xml')
-                else:
-                    parsedrecord = parser.parse(pdata)
-                if parsedrecord:
-                    if filename:
-                        if not parsedrecord.get("recordData", {}).get("loadLocation", None):
-                            parsedrecord["recordData"]["loadLocation"] = filename
-                    if not write_file:
-                        parsedrecord["recordData"]["loadLocation"] = None
-                    ingestDocList.append(parsedrecord)
-                else:
-                    raise Exception("Null body returned by parser!")
+                getdoi = doiharvest.DoiHarvester(doi=d)
+                inputRecord = {'data': getdoi.get_record(),
+                               'name': d,
+                               'type': ptype}
             except Exception as err:
-                logger.warning("Error parsing record (%s): %s" % (filename,err))
-        else:
-            logger.error("No parser available for file_type '%s'." % args.file_type)
+                logger.warning("Failed to fetch doi %s: %s" % (d, err))
+            else:
+                process_record(inputRecord, args)
+    else:
+        logger.warning("No DOIs provided, nothing processed.")
 
-    if ingestDocList:
-        if args.output_file:
-            with open(args.output_file, 'a') as fout:
-                for d in ingestDocList:
-                    tagged = create_tagged(rec=d, args=args)
-                    if tagged:
-                        fout.write("%s\n" % tagged)
-                    else:
-                        logger.info("Tagged record not written.")
-                    if args.write_refs:
-                        create_refs(rec=d, args=args)
+
+def main():
+    args = get_args()
+    rawDataList = []
+    ingestDocList = []
+
+    logger.debug("Initiating parsing with the following arguments: %s" % str(args))
+
+    if args.proc_path and not args.file_type:
+        fileTypeList = PARSER_TYPES.keys()
+        logger.error("You need to provide a filetype from this list: %s" % str(fileTypeList))
+    else:
+        # This route processes data from user-input files
+        if args.proc_path:
+            process_filepath(args)
+
+        # This route fetches data from Crossref via the Habanero module
+        elif (args.fetch_doi or args.fetch_doi_list):
+            doiList = None
+            if args.fetch_doi:
+                doiList = [args.fetch_doi]
+            elif args.fetch_doi_list:
+                doiList = []
+                with open(args.fetch_doi_list, 'r') as fin:
+                    for l in fin.readlines():
+                        doiList.append(l.strip())
+            process_doilist(doiList, args)
 
 
 if __name__ == '__main__':

From c10ccf91eca3973f8581a62e826f2b80cc216f93 Mon Sep 17 00:00:00 2001
From: Matthew Templeton
Date: Fri, 5 Apr 2024 07:13:59 -0400
Subject: [PATCH 02/13] updating logger debug to info for a few cases

	modified:   run.py
---
 run.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/run.py b/run.py
index 52f3b48..78112cb 100644
--- a/run.py
+++ b/run.py
@@ -1,7 +1,6 @@
 import argparse
 import json
 import os
-from adsmanparse import translator, doiharvest, classic_serializer, hasbody
 from adsenrich.references import ReferenceWriter
 from adsingestp.parsers.crossref import CrossrefParser
 from adsingestp.parsers.jats import JATSParser
@@ -10,6 +9,7 @@
 from adsingestp.parsers.adsfeedback import ADSFeedbackParser
 from adsingestp.parsers.copernicus import CopernicusParser
 from adsingestp.parsers.wiley import WileyParser
+from adsmanparse import translator, doiharvest, classic_serializer, hasbody
 from adsputils import load_config, setup_logging
 from datetime import datetime, timedelta
 from glob import iglob
@@ -207,24 +207,23 @@ def process_record(rec, args):
 
 def process_filepath(args):
     if args.proc_path:
-        logger.debug("Finding files in path %s ..." % args.proc_path)
+        logger.info("Finding files in path %s ..." % args.proc_path)
         infiles = [x for x in iglob(args.proc_path, recursive=True)]
         if not infiles:
             logger.warning("No files found in path %s." % args.proc_path)
         else:
-            count = len(infiles)
-            print("haha %s" % str(count))
-            logger.debug("Found %s files." % count)
+            logger.info("Found %s files." % count)
             if args.proc_since:
+                logger.info("Checking file ages...")
                 dtime = timedelta(days=int(args.proc_since))
                 today = datetime.today()
                 infiles_since = [x for x in infiles if ((today - datetime.fromtimestamp(os.path.getmtime(x))) <= dtime)]
                 infiles = infiles_since
                 if not infiles:
-                    logger.error("No files more recent than %s days old" % str(args.proc_since))
+                    logger.error("No files more recent than %s days old!" % str(args.proc_since))
                 else:
                     nfiles = len(infiles)
-                    logger.info("There were %s files found in path." % str(nfiles))
+                    logger.info("There were %s files found to process" % str(nfiles))
                     for f in infiles:
                         inputRecord = {}
                         try:

From e971281d0ecfad1f7aaadf8e5a29d7b0973bf749 Mon Sep 17 00:00:00 2001
From: Matthew Templeton
Date: Thu, 25 Apr 2024 10:12:18 -0400
Subject: [PATCH 03/13] Bump ingest parser to 0.9.17

	modified:   requirements.txt
---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index ad98e9b..2739bcf 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-git+https://github.com/adsabs/ADSIngestParser@v0.9.16
+git+https://github.com/adsabs/ADSIngestParser@v0.9.17
 git+https://github.com/seasidesparrow/ADSIngestEnrichment@wiley_spie.20240402
 adsputils==1.5.2
 habanero==0.7.4

From d5d9b99562e1f83e7cd08acaecd5c090ffb3fb71 Mon Sep 17 00:00:00 2001
From: Matthew Templeton
Date: Thu, 25 Apr 2024 11:36:22 -0400
Subject: [PATCH 04/13] Adding util to suppress unwanted titles

	renamed:    adsmanparse/hasbody.py -> adsmanparse/utils.py
	modified:   config.py
	modified:   run.py
---
 adsmanparse/{hasbody.py => utils.py} |  9 ++++++++
 config.py                            | 34 +++++++++++++++++++++++++++++-
 run.py                               |  6 +++--
 3 files changed, 46 insertions(+), 3 deletions(-)
 rename adsmanparse/{hasbody.py => utils.py} (59%)

diff --git a/adsmanparse/hasbody.py b/adsmanparse/utils.py
similarity index 59%
rename from adsmanparse/hasbody.py
rename to adsmanparse/utils.py
index 5617d98..538537c 100644
--- a/adsmanparse/hasbody.py
+++ b/adsmanparse/utils.py
@@ -1,3 +1,4 @@
+import re
 from bs4 import BeautifulSoup
 
 def has_body(data):
@@ -14,3 +15,11 @@ def has_body(data):
         if body:
             return True
     return False
+
+
+def suppress_title(record, suppressed_titles):
+    title = record.get('title', {}).get('textEnglish', None)
+    if title:
+        for dtitle in suppressed_titles:
+            if re.search(dtitle, title, flags=re.IGNORECASE):
+                return True
diff --git a/config.py b/config.py
index 9665c37..7d73a1f 100644
--- a/config.py
+++ b/config.py
@@ -1,5 +1,37 @@
-LOGGING_LEVEL="DEBUG"
+LOGGING_LEVEL="INFO"
 LOG_STDOUT=True
 
 _API_TOKEN=None
 _API_URL=None
+
+DEPRECATED_TITLES=[
+    r'^Index$',
+    r'^Author\sIndex',
+    r'^Materials\sIndex',
+    r'^Masthead',
+    r'^Editorial\sBoard',
+    r'^Board\sof\sEditors',
+    r'^Editors\scontinued',
+    r'^Subject\sFields\sof\sEditors',
+    r'^Diary$',
+    r'^Keyword\sListing',
+    r'^Keyword\sIndex',
+    r'^Issue\sInformation',
+    r'^In\sthis\sIssue',
+    r'^Contents$',
+    r'^Contents\slist$',
+    r'^Table\sof\scontents',
+    r'^\s*$',
+    r'Information\sfor\sauthors',
+    r'^[OI][BF]C',
+    r'advertisement',
+    r'Front\scover',
+    r'Back\scover',
+    r'Blank\spage',
+    r'^Subject\sIndex',
+    r'^Publications\sReceived',
+    r'^Forthcoming\sPapers',
+    r'^Outside\sFront\sCover',
+    r'^Inside\sBack\sCover',
+    r'^Editorial\sAdvisory\sBoard',
+]
diff --git a/run.py b/run.py
index 78112cb..33ca265 100644
--- a/run.py
+++ b/run.py
@@ -9,7 +9,7 @@
 from adsingestp.parsers.adsfeedback import ADSFeedbackParser
 from adsingestp.parsers.copernicus import CopernicusParser
 from adsingestp.parsers.wiley import WileyParser
-from adsmanparse import translator, doiharvest, classic_serializer, hasbody
+from adsmanparse import translator, doiharvest, classic_serializer, utils
 from adsputils import load_config, setup_logging
 from datetime import datetime, timedelta
 from glob import iglob
@@ -165,7 +165,7 @@ def parse_record(rec):
     ptype = rec.get('type', None)
     filename = rec.get('name', None)
     parser = PARSER_TYPES.get(ptype, None)
-    write_file = hasbody.has_body(pdata)
+    write_file = utils.has_body(pdata)
     parsedrecord = None
     if not parser:
         logger.error("No parser available for file_type '%s'." % ptype)
@@ -177,6 +177,8 @@ def parse_record(rec):
             else:
                 parsedrecord = parser.parse(pdata)
             if parsedrecord:
+                if utils.suppress_title(parsedrecord, conf.get("DEPRECATED_TITLES", [])):
+                    raise Exception("Warning: article matches a suppressed title.")
                 if filename:
                     if not parsedrecord.get("recordData", {}).get("loadLocation", None):
                         parsedrecord["recordData"]["loadLocation"] = filename

From d3791e26fc8e209cf3f7b311ee53e2657357b1f7 Mon Sep 17 00:00:00 2001
From: Matthew Templeton
Date: Thu, 25 Apr 2024 11:37:43 -0400
Subject: [PATCH 05/13] fixes "count" infiles bug

	modified:   run.py
---
 run.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/run.py b/run.py
index 33ca265..fc817c2 100644
--- a/run.py
+++ b/run.py
@@ -214,7 +214,7 @@ def process_filepath(args):
         if not infiles:
             logger.warning("No files found in path %s." % args.proc_path)
         else:
-            logger.info("Found %s files." % count)
+            logger.info("Found %s files." % len(infiles))
             if args.proc_since:
                 logger.info("Checking file ages...")
                 dtime = timedelta(days=int(args.proc_since))

From 91df13a2cc4a722aebf21b5689d81899f339534e Mon Sep 17 00:00:00 2001
From: Matthew Templeton
Date: Thu, 25 Apr 2024 11:43:44 -0400
Subject: [PATCH 06/13] Forcing removal of parsed record

	modified:   run.py
---
 run.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/run.py b/run.py
index fc817c2..9b6f617 100644
--- a/run.py
+++ b/run.py
@@ -178,6 +178,7 @@ def parse_record(rec):
                 parsedrecord = parser.parse(pdata)
             if parsedrecord:
                 if utils.suppress_title(parsedrecord, conf.get("DEPRECATED_TITLES", [])):
+                    parsedrecord = None
                     raise Exception("Warning: article matches a suppressed title.")
                 if filename:
                     if not parsedrecord.get("recordData", {}).get("loadLocation", None):

From e642c3e4cc84c166283cbc7be83582fa110eacec Mon Sep 17 00:00:00 2001
From: Matthew Templeton
Date: Wed, 29 May 2024 15:18:45 -0400
Subject: [PATCH 07/13] Updates ingestparser to v0.9.19

	modified:   requirements.txt
---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 2739bcf..5d84109 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-git+https://github.com/adsabs/ADSIngestParser@v0.9.17
+git+https://github.com/adsabs/ADSIngestParser@v0.9.19
 git+https://github.com/seasidesparrow/ADSIngestEnrichment@wiley_spie.20240402
 adsputils==1.5.2
 habanero==0.7.4

From 46005c163e1fc851668dbb452338ebcd7d590659 Mon Sep 17 00:00:00 2001
From: Matthew Templeton
Date: Wed, 29 May 2024 15:22:06 -0400
Subject: [PATCH 08/13] updates config with additional deprecated titles

	modified:   config.py
---
 config.py | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/config.py b/config.py
index 7d73a1f..2e8aaef 100644
--- a/config.py
+++ b/config.py
@@ -6,32 +6,61 @@
 
 DEPRECATED_TITLES=[
     r'^Index$',
+    r'^Index\sto\svolume',
+    r'^Index\sto\sadverstiser',
+    r'^Index\sof\sauthors',
+    r'^Instructions\sto\sauthors',
     r'^Author\sIndex',
     r'^Materials\sIndex',
     r'^Masthead',
     r'^Editorial\sBoard',
+    r'^Editors\/Editorial\sboard',
     r'^Board\sof\sEditors',
     r'^Editors\scontinued',
     r'^Subject\sFields\sof\sEditors',
     r'^Diary$',
+    r'^Graphical\sContents',
+    r'^Cover$',
+    r'^Abstract$',
+    r'^Abstracts$',
+    r'^Patent\sreport$',
     r'^Keyword\sListing',
     r'^Keyword\sIndex',
     r'^Issue\sInformation',
     r'^In\sthis\sIssue',
+    r'^Instructions\sfor\sauthors',
+    r'^List\sof\sContents',
+    r'^Calendar$',
     r'^Contents$',
     r'^Contents\slist$',
+    r'^Contents\scontinued',
+    r'^Contents\sof\svolume',
+    r'^Contents:\sGraphical\sAbstracts',
+    r'^Other\sContents',
+    r'^Graphical\sabstract',
     r'^Table\sof\scontents',
     r'^\s*$',
     r'Information\sfor\sauthors',
     r'^[OI][BF]C',
+    r'Forthcoming\smeeting',
     r'advertisement',
     r'Front\scover',
     r'Back\scover',
     r'Blank\spage',
     r'^Subject\sIndex',
+    r'Software\ssurvey\ssection',
+    r'^Patents\sAlert',
+    r'^Guide\sfor\sAuthors',
     r'^Publications\sReceived',
     r'^Forthcoming\sPapers',
+    r'^Forthcoming\sArticles',
+    r'^Volume\scontents',
+    r'^Forthcoming\sregular\sarticles',
     r'^Outside\sFront\sCover',
     r'^Inside\sBack\sCover',
+    r'^Title\sPage',
+    r'^Title\sand\seditorial\sboard',
+    r'^Title\/Ed\sboard',
+    r'^Title\sEditorial\sBoard',
     r'^Editorial\sAdvisory\sBoard',
 ]

From cda32c576e1fd0d256d281e9c12044a09afc8f18 Mon Sep 17 00:00:00 2001
From: Matthew Templeton
Date: Wed, 26 Jun 2024 08:31:10 -0400
Subject: [PATCH 09/13] Update requirements for parser/enrichment

	modified:   requirements.txt
---
 requirements.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 5d84109..ffbd4cd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
-git+https://github.com/adsabs/ADSIngestParser@v0.9.19
-git+https://github.com/seasidesparrow/ADSIngestEnrichment@wiley_spie.20240402
+git+https://github.com/adsabs/ADSIngestParser@v0.9.20
+git+https://github.com/adsabs/ADSIngestEnrichment@v0.9.8
 adsputils==1.5.2
 habanero==0.7.4
 namedentities==1.9.4

From 6b51c4fd44dc3d173c8e9b8e6b94676de00bb62f Mon Sep 17 00:00:00 2001
From: Matthew Templeton
Date: Wed, 26 Jun 2024 16:47:56 -0400
Subject: [PATCH 10/13] test translator with NASA PDS publisher string

	modified:   adsmanparse/translator.py
---
 adsmanparse/translator.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/adsmanparse/translator.py b/adsmanparse/translator.py
index ac5c06e..a0999c4 100644
--- a/adsmanparse/translator.py
+++ b/adsmanparse/translator.py
@@ -349,7 +349,10 @@ def _get_copyright(self):
 
     def _special_handling(self, bibstem=None):
         # Special data handling rules on a per-bibstem basis
-        if bibstem == 'MPEC':
+        if bibstem in ['pds..data','pdss.data']:
+            pubstring = "NASA Planetary Data System " + self.data.get("publisherIDs", {}).get("Identifier","")
+
+        elif bibstem == 'MPEC':
             # To do:
             # - reparse title into Circular no. and title
             # - remove MPC Staff as author

From c019d1573a6424a1d6692f1a4b72d835254c1e4a Mon Sep 17 00:00:00 2001
From: Matthew Templeton
Date: Wed, 26 Jun 2024 17:50:09 -0400
Subject: [PATCH 11/13] fix for nasa.pds publication field

	modified:   adsmanparse/translator.py
---
 adsmanparse/translator.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/adsmanparse/translator.py b/adsmanparse/translator.py
index a0999c4..3208038 100644
--- a/adsmanparse/translator.py
+++ b/adsmanparse/translator.py
@@ -349,8 +349,10 @@ def _get_copyright(self):
 
     def _special_handling(self, bibstem=None):
         # Special data handling rules on a per-bibstem basis
-        if bibstem in ['pds..data','pdss.data']:
-            pubstring = "NASA Planetary Data System " + self.data.get("publisherIDs", {}).get("Identifier","")
+        if bibstem == "pds..data" or bibstem == "pdss.data":
+            uri = self.data.get("publisherIDs", {})[0].get("Identifier", "")
+            pubstring = "NASA Planetary Data System %s" % uri
+            self.output["publication"] = pubstring
 
         elif bibstem == 'MPEC':
             # To do:

From 7987999fd20b9b89dbc7bae9e6c96a48d273307c Mon Sep 17 00:00:00 2001
From: Matthew Templeton
Date: Wed, 26 Jun 2024 18:57:36 -0400
Subject: [PATCH 12/13] Bumps adsenrich to v0.9.9

	modified:   requirements.txt
---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index ffbd4cd..2ee17d9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 git+https://github.com/adsabs/ADSIngestParser@v0.9.20
-git+https://github.com/adsabs/ADSIngestEnrichment@v0.9.8
+git+https://github.com/adsabs/ADSIngestEnrichment@v0.9.9
 adsputils==1.5.2
 habanero==0.7.4
 namedentities==1.9.4

From 736e1e93e99468255ad7931f6b683d478bae1853 Mon Sep 17 00:00:00 2001
From: Matthew Templeton
Date: Thu, 27 Jun 2024 10:36:38 -0400
Subject: [PATCH 13/13] Formatting for %J field of PDS data sets

	modified:   adsmanparse/translator.py
---
 adsmanparse/translator.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/adsmanparse/translator.py b/adsmanparse/translator.py
index 3208038..fd87ead 100644
--- a/adsmanparse/translator.py
+++ b/adsmanparse/translator.py
@@ -350,8 +350,11 @@ def _get_copyright(self):
     def _special_handling(self, bibstem=None):
         # Special data handling rules on a per-bibstem basis
         if bibstem == "pds..data" or bibstem == "pdss.data":
-            uri = self.data.get("publisherIDs", {})[0].get("Identifier", "")
-            pubstring = "NASA Planetary Data System %s" % uri
+            urn = ""
+            for ident in self.data.get("publisherIDs", []):
+                if ident.get("Identifier", "")[0:3] == "urn":
+                    urn = ident.get("Identifier", "")
+            pubstring = "NASA Planetary Data System, %s" % urn
             self.output["publication"] = pubstring
 
         elif bibstem == 'MPEC':