pubRunAnnot

#!/usr/bin/env python

# first load the standard libraries from python
# we require at least python 2.7
#from sys import *
from __future__ import print_function
import sys
# Refuse to run on interpreters older than 2.7 and tell the user how to get a
# newer one. sys.exit() is used rather than the bare exit(): the latter is only
# injected by the site module for interactive sessions and does not exist when
# site is not loaded (e.g. "python -S").
if sys.version_info[:2] < (2, 7):
    print("Sorry, this program requires at least python 2.7")
    print("You can download a more current python version from python.org and compile it")
    print("into your homedir (or anywhere) with 'configure --prefix ~/python27'; make;")
    print("then run this program by specifying your own python executable like this: ")
    print("   ~/python27/bin/python <scriptFile>")
    print("or add ~/python27/bin to your PATH before /usr/bin")
    sys.exit(1)

# load default python packages
import logging, optparse, os, collections, tarfile, mimetypes, tempfile, \
    copy, shutil, glob, time, string
from os.path import *

# put <scriptDir>/lib/ at the very front of the package search path so the
# modules shipped next to this script are found first
progFile = os.path.abspath(sys.argv[0])
progDir = os.path.split(progFile)[0]
pubToolsLibDir = os.path.join(progDir, "lib")
sys.path[:0] = [pubToolsLibDir]

# now load our own libraries
import maxRun, pubStore, pubConf, pubGeneric, pubAlg, maxCommon
from maxCommon import *

# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
# The usage text below is what --help prints; it also documents the positional
# arguments <algorithmName> <in> <out> and the trailing <opt>=<val> settings.
parser = optparse.OptionParser(usage="""usage: %prog [options] <algorithmName> <in> <out> <opt1>=<val1> <opt2>=<val2> ...- run an algorithm on a directory of fulltext files and write results to out directory

<algorithmName> can be a script from the scripts directory, like "protSearch.py". (Extension optional)
By default, only a variable "headers" and a function "annotateFile" is needed in the script. If
the script contains classes to separate different functions, you need to add the class name, separated by
a ":", like "dnaSearch.py:Annotate"

Use the option -o to run plain text files through the annotators.

Otherwise, <in> is a directory of tab-sep text files created by pubConvXXX converters.
It can be relative to pubConf.datasetDir.
pubRunAnnot will then:
- search <in> for fulltext chunks (*.articles.gz and *.files.gz)
- for each chunk submit a cluster or local job to write results to <out>

""")

# --- local / debugging modes: both end up in testMode() ---
parser.add_option(
    "-o", "--onlyText",
    action="store_true", dest="onlyText",
    help="Run plain text files through the annotator. <in> can be a single text file or a directory with .txt files. Uses only a single thread.")
parser.add_option(
    "-t", "--test",
    action="store_true", dest="test",
    help="run locally in single process, not on cluster or multi processes, to see error messages, dump annotations to stdout")
# disabled option, kept for reference:
#parser.add_option("", "--realTest", dest="realTest", action="store_true", help="spawn processes on local machine, like a cluster run, but error messages are visible, write annotations to final output files")

# --- output contents ---
parser.add_option(
    "-a", "--addFields",
    action="append", dest="addFields", default=[],
    help="add article fields to output files, e.g. doi or title. Option can be used several times")

# --- cluster resources ---
parser.add_option(
    "-l", "--limit",
    action="store", type="int", dest="limitJobs",
    help="limit jobs to X concurrent jobs on the cluster")
parser.add_option(
    "-r", "--ram",
    action="store", type="int", dest="ram",
    help="request x GB of ram for the jobs on the cluster")

# --- output directory handling ---
parser.add_option(
    "", "--cat",
    action="store_true", dest="concat",
    help="write output to <out> and concat all files to <out>.tab when jobs are finished")
parser.add_option(
    "-k", "--keepOldFiles",
    action="store_true", dest="keepOldFiles",
    help="do not wipe the output dir before running the jobs")

# options shared by all pub* tools are registered by pubGeneric
pubGeneric.addGeneralOptions(parser)
options, args = parser.parse_args()

# ==== FUNCTIONs =====
def checkCleanDir(outDir, keepOldFiles):
    """Make sure outDir exists as a directory and, unless asked not to, empty it.

    outDir       -- path of the output directory; created (including missing
                    parents) when it does not exist yet
    keepOldFiles -- if true, whatever is already in outDir is left untouched

    Raises Exception if outDir (ignoring trailing slashes) is an existing file.

    All files in outDir are deleted, whatever their extension. Sub-directories
    are skipped with a warning: os.remove() cannot delete directories and would
    otherwise abort the clean-up half-way through.
    """
    if isfile(outDir.rstrip("/")):
        raise Exception("%s already exists and is a file" % outDir)
    if not isdir(outDir):
        logging.info("Creating dir %s" % outDir)
        os.makedirs(outDir)

    oldFnames = os.listdir(outDir)
    if keepOldFiles or not oldFnames:
        return

    # the listing is not filtered by extension, so do not call these ".gz files"
    logging.info("Directory %s contains %d old files" % (outDir, len(oldFnames)))
    logging.info("Waiting for 3 secs, then deleting them")
    # presumably a grace period so the user can still abort before the wipe
    time.sleep(3)
    pm = maxCommon.ProgressMeter(len(oldFnames))
    for oldFname in oldFnames:
        oldPath = join(outDir, oldFname)
        # symlinks (also those pointing to directories) can be unlinked with
        # os.remove(); only real directories have to be skipped
        if isdir(oldPath) and not os.path.islink(oldPath):
            logging.warning("Not deleting %s: is a directory" % oldPath)
        else:
            os.remove(oldPath)
        # count skipped entries too, the meter was sized with all entries
        pm.taskCompleted()

def testMode():
    """Run the annotator in this process on the input and then exit.

    Reads the module-level names set up by the main section: options, inName,
    alg and paramDict. The output name handed to pubAlg.runAnnotate is always
    "stdout". With --onlyText the input name is read through
    pubStore.PubReaderTest; otherwise every *.articles.gz chunk of the first
    (comma-separated) input directory is read through pubStore.PubReaderFile.

    Never returns: ends the program with exit status 0.
    """
    # python2-only: site removes sys.setdefaultencoding at startup, reload(sys)
    # brings it back so the default encoding can be switched to utf-8
    reload(sys)
    sys.setdefaultencoding('utf-8')
    outName = "stdout"
    if options.onlyText:
        # NOTE(review): the --onlyText help also promises directory input; the
        # name is passed to PubReaderTest unchanged - confirm it handles that
        inFiles = [inName]
    else:
        # only the first of several comma-separated inputs is used here
        inDir = pubConf.resolveTextDir(inName.split(",")[0])
        inFiles = glob.glob(join(inDir, "*.articles.gz"))

    for inFname in inFiles:
        logging.info("Running on %s" % inFname)
        if options.onlyText:
            # options.onlyText is just a boolean flag, the file name is inFname
            logging.info("Running only on file %s" % inFname)
            reader = pubStore.PubReaderTest(inFname)
        else:
            reader = pubStore.PubReaderFile(inFname)
        pubAlg.runAnnotate(reader, alg, paramDict, outName)
    sys.exit(0)
# ----------- MAIN --------------
# at least <algorithmName> <in> <out> are required
if len(args)<3:
    parser.print_help()
    sys.exit(1)

pubGeneric.setupLogging(progFile, options)

algName, inName, outName = args[:3]

# A "=" in <in> or <out> means an <opt>=<val> setting ended up in the place of
# a positional argument. Report it explicitly: a bare assert gives no message
# and is stripped under "python -O", after which the setting would be used as
# a directory name.
if "=" in outName:
    parser.error("<out> must not contain '=' (got %r): is a positional argument missing?" % outName)
if "=" in inName:
    parser.error("<in> must not contain '=' (got %r): is a positional argument missing?" % inName)

# everything after the three positional arguments is an <opt>=<val> setting
paramStrings = args[3:]
paramDict = pubGeneric.stringListToDict(paramStrings)

# values that name an existing file are replaced by the absolute path
# (iterate over a snapshot, the dict is updated inside the loop)
for k,v in list(paramDict.items()):
    if isfile(v):
        paramDict[k]=abspath(v)

# Algorithms whose name starts with "java" (jython) are not loaded here, so
# their existence is not checked, "alg" is not set for them and algType keeps
# its default. All other algorithms are loaded; if the loaded object has a
# "processRow" attribute the job is run as a processRow job.
algType = "annotate"
isJavaAlg = algName.startswith("java")
if not isJavaAlg:
    alg = pubAlg.getAlg(algName, "Annotate") # makes sure that algName exists
    if "processRow" in dir(alg):
        algType = "processRow"


# settings handed to every annotator in addition to the user's <opt>=<val> pairs
paramDict["startAnnotId"] = 0
paramDict["addFields"] = options.addFields

if options.test or options.onlyText:
    # testMode() needs the loaded algorithm object; fail with a clear message
    # instead of a NameError on the unset "alg"
    if isJavaAlg:
        parser.error("--test and --onlyText need a loaded algorithm, but '%s' is a java algorithm and is not loaded by this program" % algName)
    # does not return, exits the program when done
    testMode()

inNames = inName.split(",")

# processRow jobs are submitted for a single input only. Check this before the
# output directory is wiped, and with an explicit error rather than an assert
# (asserts are stripped under "python -O").
if algType != "annotate" and len(inNames) != 1:
    parser.error("processRow algorithms accept exactly one input, got %d: %s" % (len(inNames), inName))

checkCleanDir(outName, options.keepOldFiles)

# no job limit unless -l/--limit was given
maxJobs = None
if options.limitJobs:
    maxJobs = options.limitJobs

# make sure two runs with different algs and input data names have different 
# batch directories
inBaseNames = [basename(x.rstrip("/")) for x in inNames]
batchBase = algType+"_"+algName.replace(":","")+"_"+"-".join(inBaseNames)+"_"+outName
runner = pubGeneric.makeClusterRunner(batchBase, maxJob=maxJobs, maxRam=options.ram)

if algType == "annotate":
    pubAlg.annotate(algName, inNames, paramDict, outName, runner=runner, addFields=options.addFields, concat=options.concat)
else:
    logging.debug("running a processRow job")
    pubAlg.submitProcessRow(runner, algName, inNames[0], outName, paramDict)