diff --git a/adsmanparse/translator.py b/adsmanparse/translator.py
index ac5c06e..fd87ead 100644
--- a/adsmanparse/translator.py
+++ b/adsmanparse/translator.py
@@ -349,7 +349,15 @@ def _get_copyright(self):
 
     def _special_handling(self, bibstem=None):
         # Special data handling rules on a per-bibstem basis
-        if bibstem == 'MPEC':
+        if bibstem == "pds..data" or bibstem == "pdss.data":
+            urn = ""
+            for ident in self.data.get("publisherIDs", []):
+                if ident.get("Identifier", "")[0:3] == "urn":
+                    urn = ident.get("Identifier", "")
+            pubstring = "NASA Planetary Data System, %s" % urn
+            self.output["publication"] = pubstring
+
+        elif bibstem == 'MPEC':
             # To do:
             # - reparse title into Circular no. and title
             # - remove MPC Staff as author
diff --git a/adsmanparse/hasbody.py b/adsmanparse/utils.py
similarity index 59%
rename from adsmanparse/hasbody.py
rename to adsmanparse/utils.py
index 5617d98..538537c 100644
--- a/adsmanparse/hasbody.py
+++ b/adsmanparse/utils.py
@@ -1,3 +1,4 @@
+import re
 from bs4 import BeautifulSoup
 
 def has_body(data):
@@ -14,3 +15,11 @@ def has_body(data):
         if body:
             return True
     return False
+
+
+def suppress_title(record, suppressed_titles):
+    title = record.get('title', {}).get('textEnglish', None)
+    if title:
+        for dtitle in suppressed_titles:
+            if re.search(dtitle, title, flags=re.IGNORECASE):
+                return True
diff --git a/config.py b/config.py
index 30bb4e1..2e8aaef 100644
--- a/config.py
+++ b/config.py
@@ -1,5 +1,66 @@
-LOG_LEVEL="WARN"
+LOGGING_LEVEL="INFO"
 LOG_STDOUT=True
 
 _API_TOKEN=None
 _API_URL=None
+
+DEPRECATED_TITLES=[
+    r'^Index$',
+    r'^Index\sto\svolume',
+    r'^Index\sto\sadverstiser',
+    r'^Index\sof\sauthors',
+    r'^Instructions\sto\sauthors',
+    r'^Author\sIndex',
+    r'^Materials\sIndex',
+    r'^Masthead',
+    r'^Editorial\sBoard',
+    r'^Editors\/Editorial\sboard',
+    r'^Board\sof\sEditors',
+    r'^Editors\scontinued',
+    r'^Subject\sFields\sof\sEditors',
+    r'^Diary$',
+    r'^Graphical\sContents',
+    r'^Cover$',
+    r'^Abstract$',
+    r'^Abstracts$',
+    r'^Patent\sreport$',
+    r'^Keyword\sListing',
+    r'^Keyword\sIndex',
+    r'^Issue\sInformation',
+    r'^In\sthis\sIssue',
+    r'^Instructions\sfor\sauthors',
+    r'^List\sof\sContents',
+    r'^Calendar$',
+    r'^Contents$',
+    r'^Contents\slist$',
+    r'^Contents\scontinued',
+    r'^Contents\sof\svolume',
+    r'^Contents:\sGraphical\sAbstracts',
+    r'^Other\sContents',
+    r'^Graphical\sabstract',
+    r'^Table\sof\scontents',
+    r'^\s*$',
+    r'Information\sfor\sauthors',
+    r'^[OI][BF]C',
+    r'Forthcoming\smeeting',
+    r'advertisement',
+    r'Front\scover',
+    r'Back\scover',
+    r'Blank\spage',
+    r'^Subject\sIndex',
+    r'Software\ssurvey\ssection',
+    r'^Patents\sAlert',
+    r'^Guide\sfor\sAuthors',
+    r'^Publications\sReceived',
+    r'^Forthcoming\sPapers',
+    r'^Forthcoming\sArticles',
+    r'^Volume\scontents',
+    r'^Forthcoming\sregular\sarticles',
+    r'^Outside\sFront\sCover',
+    r'^Inside\sBack\sCover',
+    r'^Title\sPage',
+    r'^Title\sand\seditorial\sboard',
+    r'^Title\/Ed\sboard',
+    r'^Title\sEditorial\sBoard',
+    r'^Editorial\sAdvisory\sBoard',
+]
diff --git a/requirements.txt b/requirements.txt
index e4cbec2..2ee17d9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
-git+https://github.com/adsabs/ADSIngestParser@v0.9.13
-git+https://github.com/adsabs/ADSIngestEnrichment@v0.9.7
+git+https://github.com/adsabs/ADSIngestParser@v0.9.20
+git+https://github.com/adsabs/ADSIngestEnrichment@v0.9.9
 adsputils==1.5.2
 habanero==0.7.4
 namedentities==1.9.4
diff --git a/run.py b/run.py
index 0059847..9b6f617 100644
--- a/run.py
+++ b/run.py
@@ -1,7 +1,6 @@
 import argparse
 import json
 import os
-from adsmanparse import translator, doiharvest, classic_serializer, hasbody
 from adsenrich.references import ReferenceWriter
 from adsingestp.parsers.crossref import CrossrefParser
 from adsingestp.parsers.jats import JATSParser
@@ -9,7 +8,9 @@
 from adsingestp.parsers.elsevier import ElsevierParser
 from adsingestp.parsers.adsfeedback import ADSFeedbackParser
 from adsingestp.parsers.copernicus import CopernicusParser
-from adsputils import setup_logging
+from adsingestp.parsers.wiley import WileyParser
+from adsmanparse import translator, doiharvest, classic_serializer, utils
+from adsputils import load_config, setup_logging
 from datetime import datetime, timedelta
 from glob import iglob
 
@@ -20,9 +21,18 @@
     'elsevier': ElsevierParser(),
     'feedback': ADSFeedbackParser(),
     'copernicus': CopernicusParser(),
+    'wiley': WileyParser(),
 }
 
-logger = setup_logging('manual-parser')
+proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), "./"))
+conf = load_config(proj_home=proj_home)
+logger = setup_logging(
+    "run.py",
+    proj_home=proj_home,
+    level=conf.get("LOGGING_LEVEL", "INFO"),
+    attach_stdout=conf.get("LOG_STDOUT", False),
+)
+
 
 
 def get_args():
@@ -81,8 +91,8 @@ def get_args():
                         '--file_type',
                         dest='file_type',
                         action='store',
-                        default='jats',
-                        help='Type of input file: jats, dc, cr, nlm, elsevier, feedback')
+                        default=None,
+                        help='Type of input file: jats, dc, cr, nlm, elsevier, feedback, copernicus, wiley')
 
     parser.add_argument('-w',
                         '--write_refs',
@@ -116,7 +126,6 @@ def get_args():
     args = parser.parse_args()
     return args
 
-
 def create_tagged(rec=None, args=None):
     try:
         xlator = translator.Translator()
@@ -137,99 +146,147 @@ def create_refs(rec=None, args=None):
     except Exception as err:
         logger.warning("Unable to write references: %s" % err)
 
+def write_record(record, args):
+    if args.output_file:
+        tagged = create_tagged(rec=record, args=args)
+        if tagged:
+            with open(args.output_file, "a") as fout:
+                fout.write("%s\n" % tagged)
+            if args.write_refs:
+                create_refs(rec=record, args=args)
+        else:
+            raise Exception("Tagged record not generated.")
+    else:
+        raise Exception("Output_file not defined, no place to write records to!")
+
+
+def parse_record(rec):
+    pdata = rec.get('data', None)
+    ptype = rec.get('type', None)
+    filename = rec.get('name', None)
+    parser = PARSER_TYPES.get(ptype, None)
+    write_file = utils.has_body(pdata)
+    parsedrecord = None
+    if not parser:
+        logger.error("No parser available for file_type '%s'." % ptype)
+    else:
+        try:
+            parser.__init__()
+            if ptype == 'nlm':
+                parsedrecord = parser.parse(pdata, bsparser='lxml-xml')
+            else:
+                parsedrecord = parser.parse(pdata)
+            if parsedrecord:
+                if utils.suppress_title(parsedrecord, conf.get("DEPRECATED_TITLES", [])):
+                    parsedrecord = None
+                    raise Exception("Warning: article matches a suppressed title.")
+                if filename:
+                    if not parsedrecord.get("recordData", {}).get("loadLocation", None):
+                        parsedrecord["recordData"]["loadLocation"] = filename
+                    if not write_file:
+                        parsedrecord["recordData"]["loadLocation"] = None
+            else:
+                raise Exception("Null body returned by parser!")
+        except Exception as err:
+            logger.warning("Error parsing record (%s): %s" % (filename, err))
+    return parsedrecord
 
-def main():
-
-    args = get_args()
-    rawDataList = []
-    ingestDocList = []
-    # This route processes data from user-input files
-    if args.proc_path:
-        infiles = iglob(args.proc_path, recursive=True)
-        if infiles and args.proc_since:
-            dtime = timedelta(days=int(args.proc_since))
-            today = datetime.today()
-            infiles_since = [x for x in infiles if ((today - datetime.fromtimestamp(os.path.getmtime(x))) <= dtime)]
-            infiles = infiles_since
-        for f in infiles:
+def process_record(rec, args):
+    try:
+        parsedRecord = parse_record(rec)
+        if not parsedRecord:
+            logger.error("Parsing yielded no data for %s" % rec.get("name", None))
+        else:
             try:
-                with open(f, 'r') as fin:
-                    output = {'data': fin.read(),
-                              'name': f,
-                              'type': args.file_type}
-                rawDataList.append(output)
+                write_record(parsedRecord, args)
             except Exception as err:
-                logger.warning("Failed to import %s: %s" % (f, err))
+                logger.error("Classic tagger did not generate a tagged record for %s" % rec.get("name", None))
+            else:
+                logger.debug("Successfully processed %s with %s" % (rec.get("name", None), str(args)))
    except Exception as err:
+        logger.error("Error parsing and processing record %s: %s" % (rec.get("name", None), err))
 
-    # This route fetches data from Crossref via the Habanero module
-    elif args.fetch_doi:
-        try:
-            getdoi = doiharvest.DoiHarvester(doi=args.fetch_doi)
-            output = {'data': getdoi.get_record(),
-                      'type': 'cr'}
-            rawDataList.append(output)
-        except Exception as err:
-            logger.warning("Failed to fetch DOI %s: %s" % (args.fetch_doi,err))
-    elif args.fetch_doi_list:
-        try:
-            with open(args.fetch_doi_list, 'r') as fin:
-                for l in fin.readlines():
-                    fetch_doi = l.strip()
-                    getdoi = None
-                    output = None
+def process_filepath(args):
+    if args.proc_path:
+        logger.info("Finding files in path %s ..." % args.proc_path)
+        infiles = [x for x in iglob(args.proc_path, recursive=True)]
+        if not infiles:
+            logger.warning("No files found in path %s." % args.proc_path)
+        else:
+            logger.info("Found %s files." % len(infiles))
+            if args.proc_since:
+                logger.info("Checking file ages...")
+                dtime = timedelta(days=int(args.proc_since))
+                today = datetime.today()
+                infiles_since = [x for x in infiles if ((today - datetime.fromtimestamp(os.path.getmtime(x))) <= dtime)]
+                infiles = infiles_since
+                if not infiles:
+                    logger.error("No files more recent than %s days old!" % str(args.proc_since))
+                else:
+                    nfiles = len(infiles)
+                    logger.info("There were %s files found to process" % str(nfiles))
+            for f in infiles:
+                inputRecord = {}
                 try:
-                        getdoi = doiharvest.DoiHarvester(doi=fetch_doi)
-                        output = {'data': getdoi.get_record(),
-                                  'type': 'cr'}
-                        rawDataList.append(output)
+                    with open(f, 'r') as fin:
+                        inputRecord = {'data': fin.read(),
+                                       'name': f,
+                                       'type': args.file_type}
                 except Exception as err:
-                        logger.warning("Failed to fetch DOI %s: %s" % (fetch_doi,err))
-        except Exception as err:
-            logger.error("Failed to read %s: %s" % (args.fetch_doi_list, err))
-
-    # Now process whatever raw records you have
-    for rec in rawDataList:
-        pdata = rec.get('data', None)
-        ptype = rec.get('type', None)
-        filename = rec.get('name', None)
-        parser = PARSER_TYPES.get(ptype, None)
-        write_file = hasbody.has_body(pdata)
-        if parser:
+                    logger.warning("Failed to read input file %s: %s" % (f, err))
+                else:
+                    process_record(inputRecord, args)
+    else:
+        logger.warning("Null processing path given, nothing processed.")
+
+
+def process_doilist(doilist, args):
+    if doilist:
+        ptype = args.file_type
+        if not ptype:
+            ptype = 'cr'
+        for d in doilist:
             try:
-                parser.__init__()
-                parsedrecord = None
-                if ptype == 'nlm':
-                    parsedrecord = parser.parse(pdata, bsparser='lxml-xml')
-                else:
-                    parsedrecord = parser.parse(pdata)
-                if parsedrecord:
-                    if filename:
-                        if not parsedrecord.get("recordData", {}).get("loadLocation", None):
-                            parsedrecord["recordData"]["loadLocation"] = filename
-                        if not write_file:
-                            parsedrecord["recordData"]["loadLocation"] = None
-                    ingestDocList.append(parsedrecord)
-                else:
-                    raise Exception("Null body returned by parser!")
+                getdoi = doiharvest.DoiHarvester(doi=d)
+                inputRecord = {'data': getdoi.get_record(),
+                               'name': d,
+                               'type': ptype}
             except Exception as err:
-                logger.warning("Error parsing record (%s): %s" % (filename,err))
-        else:
-            logger.error("No parser available for file_type '%s'." % args.file_type)
+                logger.warning("Failed to fetch doi %s: %s" % (d, err))
+            else:
+                process_record(inputRecord, args)
+    else:
+        logger.warning("No DOIs provided, nothing processed.")
 
-    if ingestDocList:
-        if args.output_file:
-            with open(args.output_file, 'a') as fout:
-                for d in ingestDocList:
-                    tagged = create_tagged(rec=d, args=args)
-                    if tagged:
-                        fout.write("%s\n" % tagged)
-                    else:
-                        logger.info("Tagged record not written.")
-                    if args.write_refs:
-                        create_refs(rec=d, args=args)
+def main():
+    args = get_args()
+    rawDataList = []
+    ingestDocList = []
+
+    logger.debug("Initiating parsing with the following arguments: %s" % str(args))
+
+    if args.proc_path and not args.file_type:
+        fileTypeList = PARSER_TYPES.keys()
+        logger.error("You need to provide a filetype from this list: %s" % str(fileTypeList))
+    else:
+        # This route processes data from user-input files
+        if args.proc_path:
+            process_filepath(args)
+
+        # This route fetches data from Crossref via the Habanero module
+        elif (args.fetch_doi or args.fetch_doi_list):
+            doiList = None
+            if args.fetch_doi:
+                doiList = [args.fetch_doi]
+            elif args.fetch_doi_list:
+                doiList = []
+                with open(args.fetch_doi_list, 'r') as fin:
+                    for l in fin.readlines():
+                        doiList.append(l.strip())
+            process_doilist(doiList, args)
 
 
 if __name__ == '__main__':
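
For context on the translator change above: the new pds..data / pdss.data branch builds a publication string from the record's publisherIDs. Below is a minimal standalone sketch of that logic only, not the Translator API itself; the identifier values are invented.

# Standalone restatement of the new PDS branch in _special_handling():
# the last "urn:..." publisher identifier wins, since the loop never breaks.
data = {
    "publisherIDs": [
        {"Identifier": "doi:10.17189/example"},              # hypothetical value
        {"Identifier": "urn:nasa:pds:example_bundle::1.0"},  # hypothetical value
    ]
}

urn = ""
for ident in data.get("publisherIDs", []):
    if ident.get("Identifier", "")[0:3] == "urn":
        urn = ident.get("Identifier", "")

print("NASA Planetary Data System, %s" % urn)
# NASA Planetary Data System, urn:nasa:pds:example_bundle::1.0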
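
The other behavior worth illustrating is title suppression: parse_record() now drops any record whose title matches one of the DEPRECATED_TITLES patterns in config.py, via utils.suppress_title(). A minimal sketch, assuming the adsmanparse and adsputils packages are installed and the config.py shown above sits in the working directory; the sample record is made up.

from adsmanparse import utils
from adsputils import load_config

conf = load_config(proj_home=".")  # assumes config.py with DEPRECATED_TITLES is in "."

record = {"title": {"textEnglish": "Editorial Board"}}  # hypothetical parsed record
if utils.suppress_title(record, conf.get("DEPRECATED_TITLES", [])):
    print("suppressed: title matches a deprecated-title pattern")

On the command line, the rework also means --file_type must be given whenever --proc_path is used, since the default of 'jats' has been removed; an invocation for the new Wiley parser would look something like: python run.py --proc_path "/data/wiley/**/*.xml" --file_type wiley (plus whatever output and reference-writing options are needed; the path here is illustrative).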