
Commit 4897dd8
Merge pull request #58 from seasidesparrow/refactor_run.20240403
Refactor run.20240403
seasidesparrow committed Jun 27, 2024
2 parents c8b2c97 + 736e1e9 commit 4897dd8
Showing 5 changed files with 226 additions and 91 deletions.
10 changes: 9 additions & 1 deletion adsmanparse/translator.py
@@ -349,7 +349,15 @@ def _get_copyright(self):

def _special_handling(self, bibstem=None):
# Special data handling rules on a per-bibstem basis
if bibstem == 'MPEC':
if bibstem == "pds..data" or bibstem == "pdss.data":
urn = ""
for ident in self.data.get("publisherIDs", []):
if ident.get("Identifier", "")[0:3] == "urn":
urn = ident.get("Identifier", "")
pubstring = "NASA Planetary Data System, %s" % urn
self.output["publication"] = pubstring

elif bibstem == 'MPEC':
# To do:
# - reparse title into Circular no. and title
# - remove MPC Staff as author
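A minimal standalone sketch (not part of the diff) of what the new pds..data / pdss.data branch builds; the record payload below is made up for illustration:

    data = {"publisherIDs": [
        {"Identifier": "doi:10.17189/example"},         # hypothetical non-URN identifier, skipped
        {"Identifier": "urn:nasa:pds:example_bundle"},  # hypothetical URN, picked up
    ]}
    urn = ""
    for ident in data.get("publisherIDs", []):
        if ident.get("Identifier", "")[0:3] == "urn":
            urn = ident.get("Identifier", "")
    print("NASA Planetary Data System, %s" % urn)
    # -> NASA Planetary Data System, urn:nasa:pds:example_bundle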
9 changes: 9 additions & 0 deletions adsmanparse/hasbody.py → adsmanparse/utils.py
@@ -1,3 +1,4 @@
import re
from bs4 import BeautifulSoup

def has_body(data):
@@ -14,3 +15,11 @@ def has_body(data):
if body:
return True
return False


def suppress_title(record, suppressed_titles):
title = record.get('title', {}).get('textEnglish', None)
if title:
for dtitle in suppressed_titles:
if re.search(dtitle, title, flags=re.IGNORECASE):
return True
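A minimal usage sketch (outside the diff) for suppress_title, with a made-up record and an abbreviated pattern list; the function returns True on a case-insensitive match and otherwise falls through, returning None:

    record = {"title": {"textEnglish": "Author Index to Volume 12"}}   # hypothetical parser output
    patterns = [r"^Author\sIndex", r"^Contents$"]                      # abbreviated pattern list
    print(suppress_title(record, patterns))                            # -> True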
63 changes: 62 additions & 1 deletion config.py
@@ -1,5 +1,66 @@
LOG_LEVEL="WARN"
LOGGING_LEVEL="INFO"
LOG_STDOUT=True

_API_TOKEN=None
_API_URL=None

DEPRECATED_TITLES=[
r'^Index$',
r'^Index\sto\svolume',
r'^Index\sto\sadvertiser',
r'^Index\sof\sauthors',
r'^Instructions\sto\sauthors',
r'^Author\sIndex',
r'^Materials\sIndex',
r'^Masthead',
r'^Editorial\sBoard',
r'^Editors\/Editorial\sboard',
r'^Board\sof\sEditors',
r'^Editors\scontinued',
r'^Subject\sFields\sof\sEditors',
r'^Diary$',
r'^Graphical\sContents',
r'^Cover$',
r'^Abstract$',
r'^Abstracts$',
r'^Patent\sreport$',
r'^Keyword\sListing',
r'^Keyword\sIndex',
r'^Issue\sInformation',
r'^In\sthis\sIssue',
r'^Instructions\sfor\sauthors',
r'^List\sof\sContents',
r'^Calendar$',
r'^Contents$',
r'^Contents\slist$',
r'^Contents\scontinued',
r'^Contents\sof\svolume',
r'^Contents:\sGraphical\sAbstracts',
r'^Other\sContents',
r'^Graphical\sabstract',
r'^Table\sof\scontents',
r'^\s*$',
r'Information\sfor\sauthors',
r'^[OI][BF]C',
r'Forthcoming\smeeting',
r'advertisement',
r'Front\scover',
r'Back\scover',
r'Blank\spage',
r'^Subject\sIndex',
r'Software\ssurvey\ssection',
r'^Patents\sAlert',
r'^Guide\sfor\sAuthors',
r'^Publications\sReceived',
r'^Forthcoming\sPapers',
r'^Forthcoming\sArticles',
r'^Volume\scontents',
r'^Forthcoming\sregular\sarticles',
r'^Outside\sFront\sCover',
r'^Inside\sBack\sCover',
r'^Title\sPage',
r'^Title\sand\seditorial\sboard',
r'^Title\/Ed\sboard',
r'^Title\sEditorial\sBoard',
r'^Editorial\sAdvisory\sBoard',
]
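To illustrate how run.py is expected to use this list (a sketch, not part of the commit): the parsed record's English title is checked against conf.get("DEPRECATED_TITLES", []) via utils.suppress_title, which applies each pattern with re.search and re.IGNORECASE, so anchored patterns only suppress titles that begin with the phrase:

    import re

    patterns = [r'^Index$', r'^Contents$', r'Back\scover']   # abbreviated copy of the list above
    titles = ["Index", "Index of refraction measurements", "Journal back cover"]   # made-up titles
    for t in titles:
        hit = any(re.search(p, t, flags=re.IGNORECASE) for p in patterns)
        print(t, "->", "suppressed" if hit else "kept")
    # Index -> suppressed; Index of refraction measurements -> kept; Journal back cover -> suppressed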
4 changes: 2 additions & 2 deletions requirements.txt
@@ -1,5 +1,5 @@
git+https://github.com/adsabs/ADSIngestParser@v0.9.13
git+https://github.com/adsabs/ADSIngestEnrichment@v0.9.7
git+https://github.com/adsabs/ADSIngestParser@v0.9.20
git+https://github.com/adsabs/ADSIngestEnrichment@v0.9.9
adsputils==1.5.2
habanero==0.7.4
namedentities==1.9.4
231 changes: 144 additions & 87 deletions run.py
@@ -1,15 +1,16 @@
import argparse
import json
import os
from adsmanparse import translator, doiharvest, classic_serializer, hasbody
from adsenrich.references import ReferenceWriter
from adsingestp.parsers.crossref import CrossrefParser
from adsingestp.parsers.jats import JATSParser
from adsingestp.parsers.datacite import DataciteParser
from adsingestp.parsers.elsevier import ElsevierParser
from adsingestp.parsers.adsfeedback import ADSFeedbackParser
from adsingestp.parsers.copernicus import CopernicusParser
from adsputils import setup_logging
from adsingestp.parsers.wiley import WileyParser
from adsmanparse import translator, doiharvest, classic_serializer, utils
from adsputils import load_config, setup_logging
from datetime import datetime, timedelta
from glob import iglob

@@ -20,9 +21,18 @@
'elsevier': ElsevierParser(),
'feedback': ADSFeedbackParser(),
'copernicus': CopernicusParser(),
'wiley': WileyParser(),
}

logger = setup_logging('manual-parser')
proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), "./"))
conf = load_config(proj_home=proj_home)
logger = setup_logging(
"run.py",
proj_home=proj_home,
level=conf.get("LOGGING_LEVEL", "INFO"),
attach_stdout=conf.get("LOG_STDOUT", False),
)


def get_args():

@@ -81,8 +91,8 @@ def get_args():
'--file_type',
dest='file_type',
action='store',
default='jats',
help='Type of input file: jats, dc, cr, nlm, elsevier, feedback')
default=None,
help='Type of input file: jats, dc, cr, nlm, elsevier, feedback, copernicus, wiley')

parser.add_argument('-w',
'--write_refs',
@@ -116,7 +126,6 @@ def get_args():
args = parser.parse_args()
return args


def create_tagged(rec=None, args=None):
try:
xlator = translator.Translator()
@@ -137,99 +146,147 @@ def create_refs(rec=None, args=None):
except Exception as err:
logger.warning("Unable to write references: %s" % err)

def write_record(record, args):
if args.output_file:
tagged = create_tagged(rec=record, args=args)
if tagged:
with open(args.output_file, "a") as fout:
fout.write("%s\n" % tagged)
if args.write_refs:
create_refs(rec=record, args=args)
else:
raise Exception("Tagged record not generated.")
else:
raise Exception("Output_file not defined, no place to write records to!")


def parse_record(rec):
pdata = rec.get('data', None)
ptype = rec.get('type', None)
filename = rec.get('name', None)
parser = PARSER_TYPES.get(ptype, None)
write_file = utils.has_body(pdata)
parsedrecord = None
if not parser:
logger.error("No parser available for file_type '%s'." % ptype)
else:
try:
parser.__init__()
if ptype == 'nlm':
parsedrecord = parser.parse(pdata, bsparser='lxml-xml')
else:
parsedrecord = parser.parse(pdata)
if parsedrecord:
if utils.suppress_title(parsedrecord, conf.get("DEPRECATED_TITLES", [])):
parsedrecord = None
raise Exception("Warning: article matches a suppressed title.")
if filename:
if not parsedrecord.get("recordData", {}).get("loadLocation", None):
parsedrecord["recordData"]["loadLocation"] = filename
if not write_file:
parsedrecord["recordData"]["loadLocation"] = None
else:
raise Exception("Null body returned by parser!")
except Exception as err:
logger.warning("Error parsing record (%s): %s" % (filename,err))
return parsedrecord

def main():

args = get_args()
rawDataList = []
ingestDocList = []

# This route processes data from user-input files
if args.proc_path:
infiles = iglob(args.proc_path, recursive=True)
if infiles and args.proc_since:
dtime = timedelta(days=int(args.proc_since))
today = datetime.today()
infiles_since = [x for x in infiles if ((today - datetime.fromtimestamp(os.path.getmtime(x))) <= dtime)]
infiles = infiles_since
for f in infiles:
def process_record(rec, args):
try:
parsedRecord = parse_record(rec)
if not parsedRecord:
logger.error("Parsing yielded no data for %s" % rec.get("name", None))
else:
try:
with open(f, 'r') as fin:
output = {'data': fin.read(),
'name': f,
'type': args.file_type}
rawDataList.append(output)
write_record(parsedRecord, args)
except Exception as err:
logger.warning("Failed to import %s: %s" % (f, err))
logger.error("Classic tagger did not generate a tagged record for %s" % f)
else:
logger.debug("Successfully processed %s with %s" % (rec.get("name", None), str(args)))
except Exception as err:
logger.error("Error parsing and processing record %s: %s" % (rec.get("name", None), err))

# This route fetches data from Crossref via the Habanero module
elif args.fetch_doi:
try:
getdoi = doiharvest.DoiHarvester(doi=args.fetch_doi)
output = {'data': getdoi.get_record(),
'type': 'cr'}
rawDataList.append(output)
except Exception as err:
logger.warning("Failed to fetch DOI %s: %s" % (args.fetch_doi,err))

elif args.fetch_doi_list:
try:
with open(args.fetch_doi_list, 'r') as fin:
for l in fin.readlines():
fetch_doi = l.strip()
getdoi = None
output = None
def process_filepath(args):
if args.proc_path:
logger.info("Finding files in path %s ..." % args.proc_path)
infiles = [x for x in iglob(args.proc_path, recursive=True)]
if not infiles:
logger.warning("No files found in path %s." % args.proc_path)
else:
logger.info("Found %s files." % len(infiles))
if args.proc_since:
logger.info("Checking file ages...")
dtime = timedelta(days=int(args.proc_since))
today = datetime.today()
infiles_since = [x for x in infiles if ((today - datetime.fromtimestamp(os.path.getmtime(x))) <= dtime)]
infiles = infiles_since
if not infiles:
logger.error("No files more recent than %s days old!" % str(args.proc_since))
else:
nfiles = len(infiles)
logger.info("There were %s files found to process" % str(nfiles))
for f in infiles:
inputRecord = {}
try:
getdoi = doiharvest.DoiHarvester(doi=fetch_doi)
output = {'data': getdoi.get_record(),
'type': 'cr'}
rawDataList.append(output)
with open(f, 'r') as fin:
inputRecord = {'data': fin.read(),
'name': f,
'type': args.file_type}
except Exception as err:
logger.warning("Failed to fetch DOI %s: %s" % (fetch_doi,err))
except Exception as err:
logger.error("Failed to read %s: %s" % (args.fetch_doi_list, err))

# Now process whatever raw records you have
for rec in rawDataList:
pdata = rec.get('data', None)
ptype = rec.get('type', None)
filename = rec.get('name', None)
parser = PARSER_TYPES.get(ptype, None)
write_file = hasbody.has_body(pdata)
if parser:
logger.warning("Failed to read input file %s: %s" % (f, err))
else:
process_record(inputRecord, args)
else:
logger.warning("Null processing path given, nothing processed.")


def process_doilist(doilist, args):
if doilist:
ptype = args.file_type
if not ptype:
ptype = 'cr'
for d in doilist:
try:
parser.__init__()
parsedrecord = None
if ptype == 'nlm':
parsedrecord = parser.parse(pdata, bsparser='lxml-xml')
else:
parsedrecord = parser.parse(pdata)
if parsedrecord:
if filename:
if not parsedrecord.get("recordData", {}).get("loadLocation", None):
parsedrecord["recordData"]["loadLocation"] = filename
if not write_file:
parsedrecord["recordData"]["loadLocation"] = None
ingestDocList.append(parsedrecord)
else:
raise Exception("Null body returned by parser!")
getdoi = doiharvest.DoiHarvester(doi=d)
inputRecord = {'data': getdoi.get_record(),
'name': d,
'type': ptype}
except Exception as err:
logger.warning("Error parsing record (%s): %s" % (filename,err))
else:
logger.error("No parser available for file_type '%s'." % args.file_type)
logger.warning("Failed to fetch doi %s: %s" % (d, err))
else:
process_record(inputRecord, args)
else:
logger.warning("No DOIs provided, nothing processed.")


if ingestDocList:
if args.output_file:
with open(args.output_file, 'a') as fout:
for d in ingestDocList:
tagged = create_tagged(rec=d, args=args)
if tagged:
fout.write("%s\n" % tagged)
else:
logger.info("Tagged record not written.")
if args.write_refs:
create_refs(rec=d, args=args)
def main():
args = get_args()
rawDataList = []
ingestDocList = []

logger.debug("Initiating parsing with the following arguments: %s" % str(args))

if args.proc_path and not args.file_type:
fileTypeList = PARSER_TYPES.keys()
logger.error("You need to provide a filetype from this list: %s" % str(fileTypeList))
else:
# This route processes data from user-input files
if args.proc_path:
process_filepath(args)

# This route fetches data from Crossref via the Habanero module
elif (args.fetch_doi or args.fetch_doi_list):
doiList = None
if args.fetch_doi:
doiList = [args.fetch_doi]
elif args.fetch_doi_list:
doiList = []
with open(args.fetch_doi_list, 'r') as fin:
for l in fin.readlines():
doiList.append(l.strip())
process_doilist(doiList, args)


if __name__ == '__main__':
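A minimal end-to-end sketch (not part of the commit) of the refactored flow for one input file, assuming run.py is importable and that args carries the attributes get_args() defines; the file name is hypothetical:

    from argparse import Namespace
    from run import process_record   # assumes run.py is on the import path

    # Hypothetical arguments; get_args() would normally build these from the CLI.
    args = Namespace(output_file="articles.tag", write_refs=False, file_type="jats")

    with open("example_article.xml", "r") as fin:   # hypothetical JATS input
        rec = {"data": fin.read(), "name": "example_article.xml", "type": "jats"}

    process_record(rec, args)   # parse, apply title suppression, write the tagged record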
