
Commit 4897dd8
Merge pull request #58 from seasidesparrow/refactor_run.20240403
Refactor run.20240403
seasidesparrow committed Jun 27, 2024
2 parents c8b2c97 + 736e1e9 commit 4897dd8
Showing 5 changed files with 226 additions and 91 deletions.
10 changes: 9 additions & 1 deletion adsmanparse/translator.py
@@ -349,7 +349,15 @@ def _get_copyright(self):

def _special_handling(self, bibstem=None):
# Special data handling rules on a per-bibstem basis
if bibstem == 'MPEC':
if bibstem == "pds..data" or bibstem == "pdss.data":
urn = ""
for ident in self.data.get("publisherIDs", []):
if ident.get("Identifier", "")[0:3] == "urn":
urn = ident.get("Identifier", "")
pubstring = "NASA Planetary Data System, %s" % urn
self.output["publication"] = pubstring

elif bibstem == 'MPEC':
# To do:
# - reparse title into Circular no. and title
# - remove MPC Staff as author
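A minimal standalone sketch (not part of the diff) of what the new pds..data / pdss.data branch builds; the record payload below is made up for illustration:

    data = {"publisherIDs": [
        {"Identifier": "doi:10.17189/example"},         # hypothetical non-URN identifier, skipped
        {"Identifier": "urn:nasa:pds:example_bundle"},  # hypothetical URN, picked up
    ]}
    urn = ""
    for ident in data.get("publisherIDs", []):
        if ident.get("Identifier", "")[0:3] == "urn":
            urn = ident.get("Identifier", "")
    print("NASA Planetary Data System, %s" % urn)
    # -> NASA Planetary Data System, urn:nasa:pds:example_bundle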
9 changes: 9 additions & 0 deletions adsmanparse/hasbody.py → adsmanparse/utils.py
@@ -1,3 +1,4 @@
import re
from bs4 import BeautifulSoup

def has_body(data):
@@ -14,3 +15,11 @@ def has_body(data):
if body:
return True
return False


def suppress_title(record, suppressed_titles):
title = record.get('title', {}).get('textEnglish', None)
if title:
for dtitle in suppressed_titles:
if re.search(dtitle, title, flags=re.IGNORECASE):
return True
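A minimal usage sketch (outside the diff) for suppress_title, with a made-up record and an abbreviated pattern list; the function returns True on a case-insensitive match and otherwise falls through, returning None:

    record = {"title": {"textEnglish": "Author Index to Volume 12"}}   # hypothetical parser output
    patterns = [r"^Author\sIndex", r"^Contents$"]                      # abbreviated pattern list
    print(suppress_title(record, patterns))                            # -> True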
63 changes: 62 additions & 1 deletion config.py
@@ -1,5 +1,66 @@
LOG_LEVEL="WARN"
LOGGING_LEVEL="INFO"
LOG_STDOUT=True

_API_TOKEN=None
_API_URL=None

DEPRECATED_TITLES=[
r'^Index$',
r'^Index\sto\svolume',
r'^Index\sto\sadvertiser',
r'^Index\sof\sauthors',
r'^Instructions\sto\sauthors',
r'^Author\sIndex',
r'^Materials\sIndex',
r'^Masthead',
r'^Editorial\sBoard',
r'^Editors\/Editorial\sboard',
r'^Board\sof\sEditors',
r'^Editors\scontinued',
r'^Subject\sFields\sof\sEditors',
r'^Diary$',
r'^Graphical\sContents',
r'^Cover$',
r'^Abstract$',
r'^Abstracts$',
r'^Patent\sreport$',
r'^Keyword\sListing',
r'^Keyword\sIndex',
r'^Issue\sInformation',
r'^In\sthis\sIssue',
r'^Instructions\sfor\sauthors',
r'^List\sof\sContents',
r'^Calendar$',
r'^Contents$',
r'^Contents\slist$',
r'^Contents\scontinued',
r'^Contents\sof\svolume',
r'^Contents:\sGraphical\sAbstracts',
r'^Other\sContents',
r'^Graphical\sabstract',
r'^Table\sof\scontents',
r'^\s*$',
r'Information\sfor\sauthors',
r'^[OI][BF]C',
r'Forthcoming\smeeting',
r'advertisement',
r'Front\scover',
r'Back\scover',
r'Blank\spage',
r'^Subject\sIndex',
r'Software\ssurvey\ssection',
r'^Patents\sAlert',
r'^Guide\sfor\sAuthors',
r'^Publications\sReceived',
r'^Forthcoming\sPapers',
r'^Forthcoming\sArticles',
r'^Volume\scontents',
r'^Forthcoming\sregular\sarticles',
r'^Outside\sFront\sCover',
r'^Inside\sBack\sCover',
r'^Title\sPage',
r'^Title\sand\seditorial\sboard',
r'^Title\/Ed\sboard',
r'^Title\sEditorial\sBoard',
r'^Editorial\sAdvisory\sBoard',
]
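To illustrate how run.py is expected to use this list (a sketch, not part of the commit): the parsed record's English title is checked against conf.get("DEPRECATED_TITLES", []) via utils.suppress_title, which applies each pattern with re.search and re.IGNORECASE, so anchored patterns only suppress titles that begin with the phrase:

    import re

    patterns = [r'^Index$', r'^Contents$', r'Back\scover']   # abbreviated copy of the list above
    titles = ["Index", "Index of refraction measurements", "Journal back cover"]   # made-up titles
    for t in titles:
        hit = any(re.search(p, t, flags=re.IGNORECASE) for p in patterns)
        print(t, "->", "suppressed" if hit else "kept")
    # Index -> suppressed; Index of refraction measurements -> kept; Journal back cover -> suppressed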
4 changes: 2 additions & 2 deletions requirements.txt
@@ -1,5 +1,5 @@
git+https://github.com/adsabs/ADSIngestParser@v0.9.13
git+https://github.com/adsabs/ADSIngestEnrichment@v0.9.7
git+https://github.com/adsabs/ADSIngestParser@v0.9.20
git+https://github.com/adsabs/ADSIngestEnrichment@v0.9.9
adsputils==1.5.2
habanero==0.7.4
namedentities==1.9.4
231 changes: 144 additions & 87 deletions run.py
@@ -1,15 +1,16 @@
import argparse
import json
import os
from adsmanparse import translator, doiharvest, classic_serializer, hasbody
from adsenrich.references import ReferenceWriter
from adsingestp.parsers.crossref import CrossrefParser
from adsingestp.parsers.jats import JATSParser
from adsingestp.parsers.datacite import DataciteParser
from adsingestp.parsers.elsevier import ElsevierParser
from adsingestp.parsers.adsfeedback import ADSFeedbackParser
from adsingestp.parsers.copernicus import CopernicusParser
from adsputils import setup_logging
from adsingestp.parsers.wiley import WileyParser
from adsmanparse import translator, doiharvest, classic_serializer, utils
from adsputils import load_config, setup_logging
from datetime import datetime, timedelta
from glob import iglob

@@ -20,9 +21,18 @@
'elsevier': ElsevierParser(),
'feedback': ADSFeedbackParser(),
'copernicus': CopernicusParser(),
'wiley': WileyParser(),
}

logger = setup_logging('manual-parser')
proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), "./"))
conf = load_config(proj_home=proj_home)
logger = setup_logging(
"run.py",
proj_home=proj_home,
level=conf.get("LOGGING_LEVEL", "INFO"),
attach_stdout=conf.get("LOG_STDOUT", False),
)


def get_args():

@@ -81,8 +91,8 @@ def get_args():
'--file_type',
dest='file_type',
action='store',
default='jats',
help='Type of input file: jats, dc, cr, nlm, elsevier, feedback')
default=None,
help='Type of input file: jats, dc, cr, nlm, elsevier, feedback, copernicus, wiley')

parser.add_argument('-w',
'--write_refs',
@@ -116,7 +126,6 @@ def get_args():
args = parser.parse_args()
return args


def create_tagged(rec=None, args=None):
try:
xlator = translator.Translator()
@@ -137,99 +146,147 @@ def create_refs(rec=None, args=None):
except Exception as err:
logger.warning("Unable to write references: %s" % err)

def write_record(record, args):
if args.output_file:
tagged = create_tagged(rec=record, args=args)
if tagged:
with open(args.output_file, "a") as fout:
fout.write("%s\n" % tagged)
if args.write_refs:
create_refs(rec=record, args=args)
else:
raise Exception("Tagged record not generated.")
else:
raise Exception("Output_file not defined, no place to write records to!")


def parse_record(rec):
pdata = rec.get('data', None)
ptype = rec.get('type', None)
filename = rec.get('name', None)
parser = PARSER_TYPES.get(ptype, None)
write_file = utils.has_body(pdata)
parsedrecord = None
if not parser:
logger.error("No parser available for file_type '%s'." % ptype)
else:
try:
parser.__init__()
if ptype == 'nlm':
parsedrecord = parser.parse(pdata, bsparser='lxml-xml')
else:
parsedrecord = parser.parse(pdata)
if parsedrecord:
if utils.suppress_title(parsedrecord, conf.get("DEPRECATED_TITLES", [])):
parsedrecord = None
raise Exception("Warning: article matches a suppressed title.")
if filename:
if not parsedrecord.get("recordData", {}).get("loadLocation", None):
parsedrecord["recordData"]["loadLocation"] = filename
if not write_file:
parsedrecord["recordData"]["loadLocation"] = None
else:
raise Exception("Null body returned by parser!")
except Exception as err:
logger.warning("Error parsing record (%s): %s" % (filename,err))
return parsedrecord

def main():

args = get_args()
rawDataList = []
ingestDocList = []

# This route processes data from user-input files
if args.proc_path:
infiles = iglob(args.proc_path, recursive=True)
if infiles and args.proc_since:
dtime = timedelta(days=int(args.proc_since))
today = datetime.today()
infiles_since = [x for x in infiles if ((today - datetime.fromtimestamp(os.path.getmtime(x))) <= dtime)]
infiles = infiles_since
for f in infiles:
def process_record(rec, args):
try:
parsedRecord = parse_record(rec)
if not parsedRecord:
logger.error("Parsing yielded no data for %s" % rec.get("name", None))
else:
try:
with open(f, 'r') as fin:
output = {'data': fin.read(),
'name': f,
'type': args.file_type}
rawDataList.append(output)
write_record(parsedRecord, args)
except Exception as err:
logger.warning("Failed to import %s: %s" % (f, err))
logger.error("Classic tagger did not generate a tagged record for %s" % f)
else:
logger.debug("Successfully processed %s with %s" % (rec.get("name", None), str(args)))
except Exception as err:
logger.error("Error parsing and processing record %s: %s" % (rec.get("name", None), err))

# This route fetches data from Crossref via the Habanero module
elif args.fetch_doi:
try:
getdoi = doiharvest.DoiHarvester(doi=args.fetch_doi)
output = {'data': getdoi.get_record(),
'type': 'cr'}
rawDataList.append(output)
except Exception as err:
logger.warning("Failed to fetch DOI %s: %s" % (args.fetch_doi,err))

elif args.fetch_doi_list:
try:
with open(args.fetch_doi_list, 'r') as fin:
for l in fin.readlines():
fetch_doi = l.strip()
getdoi = None
output = None
def process_filepath(args):
if args.proc_path:
logger.info("Finding files in path %s ..." % args.proc_path)
infiles = [x for x in iglob(args.proc_path, recursive=True)]
if not infiles:
logger.warning("No files found in path %s." % args.proc_path)
else:
logger.info("Found %s files." % len(infiles))
if args.proc_since:
logger.info("Checking file ages...")
dtime = timedelta(days=int(args.proc_since))
today = datetime.today()
infiles_since = [x for x in infiles if ((today - datetime.fromtimestamp(os.path.getmtime(x))) <= dtime)]
infiles = infiles_since
if not infiles:
logger.error("No files more recent than %s days old!" % str(args.proc_since))
else:
nfiles = len(infiles)
logger.info("There were %s files found to process" % str(nfiles))
for f in infiles:
inputRecord = {}
try:
getdoi = doiharvest.DoiHarvester(doi=fetch_doi)
output = {'data': getdoi.get_record(),
'type': 'cr'}
rawDataList.append(output)
with open(f, 'r') as fin:
inputRecord = {'data': fin.read(),
'name': f,
'type': args.file_type}
except Exception as err:
logger.warning("Failed to fetch DOI %s: %s" % (fetch_doi,err))
except Exception as err:
logger.error("Failed to read %s: %s" % (args.fetch_doi_list, err))

# Now process whatever raw records you have
for rec in rawDataList:
pdata = rec.get('data', None)
ptype = rec.get('type', None)
filename = rec.get('name', None)
parser = PARSER_TYPES.get(ptype, None)
write_file = hasbody.has_body(pdata)
if parser:
logger.warning("Failed to read input file %s: %s" % (f, err))
else:
process_record(inputRecord, args)
else:
logger.warning("Null processing path given, nothing processed.")


def process_doilist(doilist, args):
if doilist:
ptype = args.file_type
if not ptype:
ptype = 'cr'
for d in doilist:
try:
parser.__init__()
parsedrecord = None
if ptype == 'nlm':
parsedrecord = parser.parse(pdata, bsparser='lxml-xml')
else:
parsedrecord = parser.parse(pdata)
if parsedrecord:
if filename:
if not parsedrecord.get("recordData", {}).get("loadLocation", None):
parsedrecord["recordData"]["loadLocation"] = filename
if not write_file:
parsedrecord["recordData"]["loadLocation"] = None
ingestDocList.append(parsedrecord)
else:
raise Exception("Null body returned by parser!")
getdoi = doiharvest.DoiHarvester(doi=d)
inputRecord = {'data': getdoi.get_record(),
'name': d,
'type': ptype}
except Exception as err:
logger.warning("Error parsing record (%s): %s" % (filename,err))
else:
logger.error("No parser available for file_type '%s'." % args.file_type)
logger.warning("Failed to fetch doi %s: %s" % (d, err))
else:
process_record(inputRecord, args)
else:
logger.warning("No DOIs provided, nothing processed.")


if ingestDocList:
if args.output_file:
with open(args.output_file, 'a') as fout:
for d in ingestDocList:
tagged = create_tagged(rec=d, args=args)
if tagged:
fout.write("%s\n" % tagged)
else:
logger.info("Tagged record not written.")
if args.write_refs:
create_refs(rec=d, args=args)
def main():
args = get_args()
rawDataList = []
ingestDocList = []

logger.debug("Initiating parsing with the following arguments: %s" % str(args))

if args.proc_path and not args.file_type:
fileTypeList = PARSER_TYPES.keys()
logger.error("You need to provide a filetype from this list: %s" % str(fileTypeList))
else:
# This route processes data from user-input files
if args.proc_path:
process_filepath(args)

# This route fetches data from Crossref via the Habanero module
elif (args.fetch_doi or args.fetch_doi_list):
doiList = None
if args.fetch_doi:
doiList = [args.fetch_doi]
elif args.fetch_doi_list:
doiList = []
with open(args.fetch_doi_list, 'r') as fin:
for l in fin.readlines():
doiList.append(l.strip())
process_doilist(doiList, args)


if __name__ == '__main__':
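A minimal end-to-end sketch (not part of the commit) of the refactored flow for one input file, assuming run.py is importable and that args carries the attributes get_args() defines; the file name is hypothetical:

    from argparse import Namespace
    from run import process_record   # assumes run.py is on the import path

    # Hypothetical arguments; get_args() would normally build these from the CLI.
    args = Namespace(output_file="articles.tag", write_refs=False, file_type="jats")

    with open("example_article.xml", "r") as fin:   # hypothetical JATS input
        rec = {"data": fin.read(), "name": "example_article.xml", "type": "jats"}

    process_record(rec, args)   # parse, apply title suppression, write the tagged record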
