pubConvCrawler

#!/usr/bin/env python

# first load the standard libraries from python
# we require at least python 2.6 (print_function and collections.namedtuple are used below)
#from sys import *
from __future__ import print_function
import sys

# load default python packages
import logging, optparse, os, glob, zipfile, types, gzip, shutil, codecs, collections, copy, urlparse
from os.path import *

# add <scriptDir>/lib/ to package search path
progFile = os.path.abspath(sys.argv[0])
progDir  = os.path.dirname(progFile)
pubToolsLibDir = os.path.join(progDir, "lib")
sys.path.insert(0, pubToolsLibDir)

# now load our own libraries
import maxCommon, maxTables, maxRun
import pubGeneric, pubStore, pubConf, pubXml, pubPubmed, pubCrawlLib, pubImgLoad

# === CONSTANTS ===================================
# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
parser = optparse.OptionParser(
"""usage: %prog [options] <inDir> <outDir> - convert the output from crawler runs to pubTools format.

inDir can either be the crawling output directory or a directory that contains
many crawling output directories.

articleIds are simply minId (defined in pubConf) + PMID

example:
pubConvCrawler /hive/data/outside/pubs/crawler /hive/data/inside/pubs/text/crawler/
""")

# All options only have a long form. optparse filters out empty option strings,
# so no "" placeholder for the short form is needed.
parser.add_option("--auto", dest="auto", action="store_true",
    help="set input and output dirs based on pubConf directories")
parser.add_option("--finish", dest="finish", action="store_true",
    help="finish up after you fixed a failed cluster batch")
parser.add_option("--img", dest="doImages", action="store_true",
    help="also extract images from PDF files into sqlite databases (adding thumbnails and MD5 hashes)")
# chunkSize is the number of articles written into each chunk file (one cluster
# job is submitted per chunk), not the number of chunks.
parser.add_option("--chunkSize", dest="chunkSize", type="int", action="store",
    default=pubConf.chunkArticleCount,
    help="number of articles per chunk, default (from pubConf) is %default")
pubGeneric.addGeneralOptions(parser)
(options, args) = parser.parse_args()

# ==== FUNCTIONs =====
#def unicodeList(list):
    #" convert ints to unicode in list and return new list "
    #newL = []
    #for e in list:
        #if type(e)!=types.UnicodeType:
            #newL.append(unicode(e))
        #else:
            #newL.append(e)
    #return newL
            
#def iterArticleSubdirRows(inDir):
    #" search all subdirs of inDir for articleMeta.tab files and yield their rows "
    #if isfile(join(inDir, "articles.db")):
        #subDirs = [inDir]
    #else:
        #subDirs = [join(inDir, s) for s in os.listdir(inDir)]

    #procCount = 0
    #for subPath in subDirs:
        ## not using the sqlite files anymore, slurping them was too slow
        #fname = join(subPath, "articleMeta.tab")
        #if not isfile(fname):
            #continue
        #if isfile(pubCrawlLib.getLockFname(subPath)):
            #logging.info("%s contains a lock file, crawling is ongoing, skipping dir" % subPath)
            #continue
            
        #logging.info("Found %s" % fname)
        #for row in maxCommon.iterTsvRows(fname):
            #procCount += 1
            #yield subPath, row
    #assert(procCount>0)

def splitCrawlerMeta(crawlDir, jobDir, minId, updateId, chunkSize, donePmids, tabExt):
    """
    Split the crawler's articleMeta.tab file(s) into chunk files of at most
    chunkSize articles each and write them to jobDir.

    crawlDir is either a crawl directory itself (it contains articleMeta.tab)
    or a directory whose subdirectories are crawl directories. Subdirectories
    with ".tmp" in their path or without an articleMeta.tab are skipped.

    jobDir is deleted if it exists and then created again.

    Every output row gets these fields set:
    - fileDir: absolute path of <crawl directory>/files
    - size and offset: empty strings
    - publisher: name of the crawl directory, if the field is missing or empty
    - pii: empty string, if the field is missing (older crawler files)
    - articleId: minId + PMID

    Articles whose PMID (as int) is in donePmids and articles whose PMID was
    already seen during this run are skipped.

    Chunk files are named <updateId>_<5-digit chunk index><tabExt>.

    Returns a tuple (chunkNames, pmids): the list of chunk names without tabExt
    and the set of PMIDs that were written, as they appear in the input rows.

    Raises ValueError if chunkSize is smaller than 1 and AssertionError if a
    row has unexpected missing or excess fields.
    """
    if chunkSize < 1:
        raise ValueError("chunkSize must be at least 1, got %r" % (chunkSize,))

    # headers for our output files
    allFields = list(pubStore.articleFields) # copy, as we extend it below
    allFields.extend(pubCrawlLib.addHeaders)
    # this is a special field required for the conversion
    allFields.append("fileDir")
    allFieldSet = set(allFields)

    RowRec = collections.namedtuple("crawlRec", allFields)

    if isdir(jobDir):
        logging.info("Deleting temporary dir %s" % jobDir)
        shutil.rmtree(jobDir)

    logging.info("Creating directory %s" % jobDir)
    os.makedirs(jobDir)

    maxCommon.mustBeEmptyDir(jobDir)

    if isfile(join(crawlDir, "articleMeta.tab")):
        logging.info("Directory %s contains articleMeta.tab, treated as crawl directory" % crawlDir)
        subDirs = [crawlDir]
    else:
        logging.info("Directory %s does not contain articleMeta.tab, looking in subdirs" % crawlDir)
        # sorted, so chunk contents and the choice among duplicate PMIDs do not
        # depend on the order in which the file system lists the directories
        subDirs = sorted(glob.glob(join(crawlDir, "*")))

    artCount = 0 # number of articles written so far, over all chunks
    usedArticleIds = set()
    pmids = set()
    chunkNames = []
    artFh = None

    try:
        for subPath in subDirs:
            if ".tmp" in subPath or not isdir(subPath):
                continue
            artPath = join(subPath, "articleMeta.tab")
            if not isfile(artPath):
                logging.info("%s not found, skipping" % artPath)
                continue
            logging.info("Found metadata in %s" % subPath)

            # the same for all rows of this crawl directory
            fileDir = join(os.path.abspath(subPath), "files")
            dirPublisher = basename(subPath)

            for row in maxCommon.iterTsvRows(artPath):
                rowDict = row._asdict()

                # keep directory of files, so that the converter can find the PDFs
                rowDict["fileDir"] = fileDir
                rowDict["size"] = ""
                rowDict["offset"] = ""

                # make sure we have a publisher field, older crawlers didn't save this field
                if rowDict.get("publisher", "") == "":
                    rowDict["publisher"] = dirPublisher

                missFields = allFieldSet - set(rowDict)
                # older files don't always have the pii field
                if missFields == set(["pii"]):
                    rowDict["pii"] = ""
                elif missFields:
                    msg = "missing fields in %s: %s" % (artPath, sorted(missFields))
                    logging.error(msg)
                    raise AssertionError(msg)

                excessFields = set(rowDict) - allFieldSet
                if excessFields:
                    msg = "excess fields in %s: %s" % (artPath, sorted(excessFields))
                    logging.error(msg)
                    raise AssertionError(msg)

                # create unique article Id based on pmid
                pmid = int(row.pmid)
                if pmid in donePmids:
                    logging.log(5, "Skipping article %s, PMID is already done" % row.pmid)
                    continue
                newArticleId = minId + pmid
                if newArticleId in usedArticleIds:
                    logging.warning("Skipping article %s, pmid seen before" % row.externalId)
                    continue
                usedArticleIds.add(newArticleId)
                pmids.add(row.pmid)
                rowDict["articleId"] = str(newArticleId)

                # start a new chunk every chunkSize articles
                if artCount % chunkSize == 0:
                    if artFh is not None:
                        artFh.close()
                    chunkId = str(updateId)+"_%05d" % len(chunkNames)
                    artFname = os.path.join(jobDir, chunkId+tabExt)
                    artFh = codecs.open(artFname, "w")
                    artFh.write("\t".join(allFields)+"\n")
                    logging.debug("Writing to chunk %s" % (artFname))
                    chunkNames.append(chunkId)

                # RowRec puts the values into the order of the header line
                newRow = RowRec(**rowDict)

                # write data to files, encoded as utf8
                artFh.write("\t".join([x.encode("utf8") for x in newRow]))
                artFh.write("\n")
                artCount += 1
    finally:
        # the cluster jobs read these files, so make sure the last one is flushed
        if artFh is not None:
            artFh.close()

    return chunkNames, pmids

#def toUnicode(var):
    #" force variable to unicode, somehow "
    #if type(var)==type(1):
        #var = unicode(var)
    #if var==None:
        #var = "NotSpecified"
    #elif type(var)==type(unicode()):
        #pass
    #else:
        #try:
            #var = var.decode("utf8")
        #except UnicodeDecodeError, msg:
            #logging.debug("Could not decode %s as utf8, error msg %s" % (var, msg))
            #var = var.decode("latin1")
    #return var

def finishUp(outDir):
    """
    Finish a conversion run whose results are in <outDir>/build.

    Reads the update info saved in the build directory, moves the "articles",
    "files" and "img" files to outDir with pubStore.moveFiles, calls
    pubStore.updateSqlite on outDir, appends the update (update Id, last
    article Id and the new PMIDs) with pubStore.appendToUpdatesTxt and finally
    removes the build directory.
    """
    tmpDir = join(outDir, "build")
    updateId, lastArticleId, newPmids = pubGeneric.loadUpdateInfo(tmpDir)

    pubStore.moveFiles(tmpDir, outDir, nameList=["articles", "files", "img"])

    # the PMIDs are passed on as strings
    pmidStrs = list(map(str, newPmids))

    pubStore.updateSqlite(outDir)
    # mark PMIDs as done
    pubStore.appendToUpdatesTxt(outDir, updateId, lastArticleId, pmidStrs)

    # everything has been moved over, the build directory is not needed anymore
    shutil.rmtree(tmpDir)

def submitJobs(inDir, outDir, useCluster, chunkSize, doImages):
    """
    Split the crawler article info (article info + additional fields) in inDir
    into chunks, submit one "convertJob" per chunk and wait for all jobs.

    All results are written to <outDir>/build, finishUp() moves them to outDir.

    inDir      - crawl directory or directory of crawl directories
    outDir     - directory with the results of previous runs, if any
    useCluster - currently not used, the head node is taken from options.cluster
    chunkSize  - number of articles per chunk and cluster job
    doImages   - if true, the jobs are started with --img

    Raises an Exception if <outDir>/build already exists, because this means
    that a previous run crashed or is still ongoing.
    """
    outDir = abspath(outDir)

    # write all results to a tmp dir first
    buildDir = join(outDir, "build")
    if isdir(buildDir):
        text = "%s already exists. Looks like a previous run crashed or is ongoing. Do rm -rf %s if you are sure no conversion is going on now" % (buildDir, buildDir)
        logging.error(text)
        # include the text, so that the reason is also part of the traceback
        raise Exception(text)

    # get already done PMIDs if we're not running for the first time
    logging.info("Looking for already converted files in %s", outDir)
    minId = pubConf.identifierStart["crawler"]
    updateId, _firstArticleId, pmids = pubStore.parseUpdatesTab(outDir, minArticleId=minId)
    donePmids = set(int(x) for x in pmids)
    logging.info("Found %d PMIDs that are already done" % (len(donePmids)))

    runner = pubGeneric.makeClusterRunner(__file__, maxJob=pubConf.convertMaxJob, headNode=options.cluster, outDir=outDir)

    # creating the job file directory also creates buildDir
    tmpIndexDir = join(buildDir, "jobfiles.tmp")
    tabExt = ".crawlArticles.tab"
    chunkIds, newPmids = splitCrawlerMeta(inDir, tmpIndexDir, minId, \
        updateId, chunkSize, donePmids, tabExt)
    pubGeneric.saveUpdateInfo(buildDir, updateId, "0", newPmids ) # for the "finishUp" function

    for chunkId in chunkIds:
        # every job converts one chunk into a .files.gz and an .articles.gz file
        inChunkFname = join(tmpIndexDir, chunkId+tabExt)
        outFname = join(buildDir, "%s.files.gz" % (chunkId))
        outArtFname = join(buildDir, "%s.articles.gz" % (chunkId))
        command = "%s %s convertJob {check in exists %s} {check out exists+ %s} {check out exists+ %s}" % \
            (sys.executable, progFile, inChunkFname, outFname, outArtFname)
        if doImages:
            command += " --img"
        runner.submit(command)
    logging.info("Now converting %d articles" % len(newPmids))
    runner.finish(wait=True)



def findFiles(zipDir, eIssn, pmid):
    """
    Find the files <zipDir>/files/<pmid>.* and sort them into main and supp.

    Returns a dict with up to three keys:
    - "suppl": list of paths of all files with ".S" in their file name
    - "main.html": path (a string, not a list) of the file ending with "html"
    - "main.pdf": path (a string, not a list) of the file ending with "pdf"
    If there are several main files of one type, the last one in sorted order
    is kept.

    eIssn is not used.

    Raises AssertionError for a file that is none of the above.
    """
    zipGlob = join(zipDir, "files", pmid+".*")
    logging.debug("Looking for files %s" % zipGlob)
    paths = collections.defaultdict(list)
    # sorted, so that the result does not depend on the file system order
    for fname in sorted(glob.glob(zipGlob)):
        # only look at the file name: the directory can contain ".S", too
        baseName = basename(fname)
        if ".S" in baseName:
            paths["suppl"].append(fname)
        elif baseName.endswith("html"):
            paths["main.html"] = fname
        elif baseName.endswith("pdf"):
            paths["main.pdf"] = fname
        else:
            # explicit raise: a plain assert is removed when running with -O
            raise AssertionError("unexpected file type: %s" % fname)
    return paths

def convSaveFile(artDict, fileDicts, externalId, fileType, url, mimeType, fname, fileId, imgDb):
    """
    Try to convert the file fname to ascii and append the result to fileDicts.

    artDict    - article dict, its "articleId" is logged and it is passed to
                 pubImgLoad.loadImages together with the file
    fileDicts  - list, the converted file dict is appended to it
    externalId - external ID of the article, copied to the file dict
    fileType   - "main.html", "main.pdf" or "supp", copied to the file dict
    url        - source URL of the file, stored without its query part
    mimeType   - mime type of the file, can be None
    fname      - path of the file on disk
    fileId     - ID of the file, copied to the file dict
    imgDb      - if not None, images of PDF files are loaded into it

    Returns True if the file was converted and appended, False if the file
    does not exist or cannot be converted.
    """
    logging.debug("Converting file %s, articleId %s" % (fname, artDict["articleId"]))

    if not isfile(fname):
        logging.error("corrupt crawler output: File %s was not found" % fname)
        return False

    # the main files come in as "main.html" and "main.pdf", so compare only the prefix
    if fileType.startswith("main"):
        fileDesc = "main text"
    else:
        fileDesc = "supplemental file"
    ext = splitext(basename(fname))[1]
    if ext!="":
        fileDesc = fileDesc + " (%s)" % ext

    fileDict = pubStore.createEmptyFileDict(desc=fileDesc)
    fileDict["externalId"] = externalId
    fileDict["fileType"] = fileType
    urlParts = list(urlparse.urlparse(url))
    urlParts[4] = "" # remove ?q=bla part, "query"
    fileDict["url"] = urlparse.urlunparse(urlParts)
    fileDict["mimeType"] = mimeType
    # binary mode, as these can be PDFs, and close the file right after reading
    with open(fname, "rb") as fh:
        fileDict["content"] = fh.read()
    fileDict["locFname"] = fname
    fileDict["fileId"] = fileId

    if mimeType=="application/pdf" and imgDb is not None:
        pubImgLoad.loadImages(imgDb, artDict, fileDict)

    fileDict = pubGeneric.toAsciiEscape(fileDict)
    if fileDict is None:
        logging.warning("Cannot convert %s, skipping" % fname)
        return False

    fileDicts.append(fileDict)
    return True


def convertOneChunk(inFname, outFname, outArtFname, doImages):
    """
    Convert one crawler meta file and its html/pdf/supplemental files to
    pubtools format, using a pubStore.PubWriterFile opened on outFname.

    inFname     - chunk file written by splitCrawlerMeta
    outFname    - name of the output file for the writer
    outArtFname - only used in a log message here
                  NOTE(review): presumably the writer derives the name of the
                  articles file from outFname - confirm in pubStore
    doImages    - if true, also load images of PDF files into an image db

    Articles without any convertible main text (html or pdf) are skipped.
    """
    # input fields:
    # articleId  externalId   mainHtmlUrl mainPdfUrl suppUrls
    # mainHtmlFile mainPdfFile suppFiles  landingUrl
    # output fields:
    # "fileId", # numerical ID of the file: its article (ID * 1000)+some count (for suppl files)
    # "externalId", # copy of external ID, for quick greps and mallet
    # "articleId", # numerical ID of the article
    # "url", # the url where the file is located, can also be elsevier://, pmcftp:// etc
    # "desc", # a description of the file, e.g. main text, html title or supp file description
    # "fileType", # "main.html", "main.pdf" or "supp"
    # "time", # time/day of conversion from PDF/html/etc
    # "mimeType", # mimetype of original file before to-text-conversion
    # "content" # the data from this file (newline => \a, tab => space, cr => space, \m ==> \a)

    logging.info("Reading %s and writing to %s and %s" % (inFname, outFname, outArtFname))
    writer = pubStore.PubWriterFile(outFname)

    logging.debug("Reading %s" % inFname)
    logging.debug("Writing to %s" % outFname)

    imgDb = None
    if doImages:
        imgDb = pubImgLoad.openImgDbForArticles(outFname)

    # parse & write to output
    for row in maxCommon.iterTsvRows(inFname):
        # create article dict from crawlDict
        crawlDict = row._asdict()
        artDict = pubStore.createEmptyArticleDict()
        for key in artDict.keys():
            if key!="publisher":
                artDict[key] = unicode(crawlDict[key])
        # the publisher is the name of the crawl directory, the parent of fileDir
        artDict["publisher"] = basename(dirname(row.fileDir))

        fileDicts = []

        articleId = row.articleId
        logging.debug("Converting all files for article %s" % str(crawlDict))
        # file IDs are articleId*1000 + a running number, starting with 1
        fileId = int(articleId)*1000
        pmid = row.externalId.replace("PMID", "")
        wasWrittenHtml, wasWrittenPdf = False, False
        # convert the main text
        if row.mainHtmlFile!="":
            fname = join(row.fileDir, pmid+".main.html")
            fileId += 1
            wasWrittenHtml = convSaveFile(artDict, fileDicts, row.externalId, "main.html", row.mainHtmlUrl, \
                "text/html", fname, fileId, imgDb)
        if row.mainPdfFile!="":
            fname = join(row.fileDir, pmid+".main.pdf")
            fileId += 1
            wasWrittenPdf = convSaveFile(artDict, fileDicts, row.externalId, "main.pdf", row.mainPdfUrl, \
                "application/pdf", fname, fileId, imgDb)

        if not wasWrittenPdf and not wasWrittenHtml:
            logging.warning("Neither PDF nor HTML main text found, skipping article %s" % articleId)
            continue

        # convert supplemental files
        if row.suppFiles!="":
            # special case for a dozen AACR urls: they contain ",_"
            suppUrls = row.suppUrls.replace(",_", "_").split(",")
            suppFnames = row.suppFiles.split(",")
            if len(suppUrls)!=len(suppFnames):
                logging.error("%s unequal number of commas, compared to %s: ignoring suppUrls" % \
                    (suppUrls, suppFnames))
                suppUrls = [""]*len(suppFnames)

            for suppFile, suppUrl in zip(suppFnames, suppUrls):
                # continue the numbering after the main files, so that the
                # first supp file does not get the ID of the last main file
                fileId += 1
                fname = join(row.fileDir, suppFile)
                logging.debug("Converting supp file %s" % (suppFile))
                convSaveFile(artDict, fileDicts, row.externalId, "supp", suppUrl, None, \
                    fname, fileId, imgDb)

        writer.writeDocs(artDict, fileDicts)

    # check imgDb and not the global options: it is only open if doImages was set
    if imgDb is not None:
        pubImgLoad.addIndexes(imgDb)
    writer.close()


def main(args):
    """
    Run one conversion job if the first argument is "convertJob" (this is how
    the cluster jobs are started), otherwise run the whole conversion:
    submit the jobs, unless --finish was specified, and finish up.
    """
    pubGeneric.setupLogging(progFile, options)

    isClusterJob = len(args)!=0 and args[0]=="convertJob"
    if isClusterJob:
        # this is a cluster job -> just convert one chunk
        _jobName, inFile, outFile, outArtFname = args
        convertOneChunk(inFile, outFile, outArtFname, options.doImages)
        return

    if options.auto:
        inDir, outDir = pubConf.defaultInOutDirs("crawler")
    else:
        inDir, outDir = args

    maxCommon.mustExistDir(outDir, makeDir=True)

    # with --finish, the jobs have already been run, only the last step is missing
    if not options.finish:
        submitJobs(inDir, outDir, options.cluster, options.chunkSize, options.doImages)
    finishUp(outDir)


# ----------- MAIN --------------
# without arguments and without --auto there is nothing to do: show the help and fail
if not args and not options.auto:
    parser.print_help()
    # sys.exit and not exit(): exit() is only added by the site module and
    # is not available when it is not loaded, e.g. with python -S
    sys.exit(1)

main(args)