-
Notifications
You must be signed in to change notification settings - Fork 21
/
pubGetMedline
executable file
·119 lines (97 loc) · 4.71 KB
/
pubGetMedline
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/usr/bin/env python
# first load the standard libraries from python
#from sys import *
import sys, time
import logging, optparse, os, glob, urllib2, tempfile, shutil, csv, re, collections, ftplib
import subprocess
from os.path import *
# add <scriptDir>/lib/ to package search path
# progFile is the absolute path of the running script, taken from sys.argv[0];
# it is also passed to pubGeneric.setupLogging() in main().
progFile = os.path.abspath(sys.argv[0])
progDir = os.path.dirname(progFile)
pubToolsLibDir = os.path.join(progDir, "lib")
# inserted at position 0 so that this directory is searched before the other sys.path entries
sys.path.insert(0, pubToolsLibDir)
# these imports come after the sys.path change above
# (presumably the modules live in <scriptDir>/lib/ -- confirm)
import pubGeneric, pubConf, maxXml, maxCommon, util
# === CONSTANTS ===
# host name of the FTP server; used for the directory listings and to build the lftp URLs
medlineSrv = 'ftp.ncbi.nlm.nih.gov'
# server directory that is listed for the baseline files
baselinePath= '/pubmed/baseline'
# server directory that is listed for the update files
updatePath= '/pubmed/updatefiles'
# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
# The usage text below is shown by parser.print_help(); main() prints it when
# no <outDir> argument is given.
parser = optparse.OptionParser("""usage: %prog [options] <outDir> - update current Medline copy from Medline ftp server. optionally download full baseline medline. Uses lftp.
Example:
pubGetMedline /hive/data/outside/pubs/medline
Creates a file download.log with information what has been added/deleted.
If a new baseline comes in from NCBI (december):
- Renames the output directory by attaching the old year
- creates a new output directory
- downloads the new baseline and all updates
""")
# -d/--debug: stored as options.debug; the options object is handed to pubGeneric.setupLogging() in main()
parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
#parser.add_option("", "--parse", dest="parse", action="store_true", help="for debugging, just parse one single xml file", default=None)
# -p/--parallelConnections: stored as options.connCount (int, default 8);
# optparse replaces %default in the help text with the default value
parser.add_option("-p", "--parallelConnections", dest="connCount", action="store", type="int", help="use X number of connections, default %default", default=8)
# --maxFiles: stored as options.maxFiles (int); the default None means "no limit"
# in downloadBaseline() and downloadUpdates()
parser.add_option("--maxFiles", dest="maxFiles", action="store", type="int", help="only download this number of article files from MEDLINE for testing purposes", default=None)
# The command line is parsed at module load time; `options` and `args` are
# module-level globals that main() reads.
(options, args) = parser.parse_args()
# ==== FUNCTIONs =====
def downloadBaseline(outDir, connCount, maxFiles):
    """ Download the current Medline baseline files into outDir.

    Lists baselinePath on medlineSrv, keeps only the entries ending in
    ".xml.gz" (reduced to their base names) and passes them to
    pubGeneric.lftpGet.

    outDir    -- local target directory, passed through to lftpGet
    connCount -- number of connections, passed through to lftpGet
    maxFiles  -- if not None, only the first maxFiles names of the listing are used

    Returns the number of file names handed to lftpGet. This is the length
    of the request list, computed independently of what lftpGet does.
    """
    ftpFnames = pubGeneric.getFtpDir(medlineSrv, baselinePath)
    baselineFnames = [basename(fname) for fname in ftpFnames if fname.endswith(".xml.gz")]
    if maxFiles is not None:
        baselineFnames = baselineFnames[:maxFiles]
    logging.info("Found %d filenames on ftp server", len(baselineFnames))

    # Must be a plain string: a trailing comma after this expression would
    # silently turn the URL into a 1-tuple.
    # baselinePath already starts with "/", so the URL has "//" after the
    # host name; that is kept unchanged.
    medlineUrl = "ftp://" + medlineSrv + "/" + baselinePath
    pubGeneric.lftpGet(medlineUrl, outDir, baselineFnames, connCount)
    return len(baselineFnames)
def ftpBaselineEqLocal(outDir):
    """ Determine the baseline version on the FTP server and in outDir.

    Despite the name, this does not return a boolean: it returns both
    versions and leaves the comparison to the caller.

    A version is what remains of the first nine characters of a file name
    after removing "medline", e.g. "medline12..." gives "12".

    Returns a tuple (ftpVer, localVer). Returns (None, None) if outDir
    contains no *.gz files, so that the two values compare equal.
    Raises IndexError if the FTP listing contains no .xml.gz file.
    """
    prefixLen = 9 # medline12.....

    ftpFnames = pubGeneric.getFtpDir(medlineSrv, baselinePath)
    # Same filtering as in downloadBaseline(): only look at the .xml.gz
    # entries and strip any directory part, so that an unrelated entry
    # at the top of the listing cannot be mistaken for a version.
    ftpFnames = [basename(fname) for fname in ftpFnames if fname.endswith(".xml.gz")]
    if not ftpFnames:
        raise IndexError("no .xml.gz files found in %s on %s" % (baselinePath, medlineSrv))
    ftpVer = ftpFnames[0][:prefixLen].replace("medline", "")

    existFnames = glob.glob(outDir+"/*.gz")
    if not existFnames:
        # nothing local to compare against
        return None, None

    localVer = basename(existFnames[0])[:prefixLen].replace("medline", "")
    logging.debug("%s %s", ftpVer, localVer)
    return ftpVer, localVer
def downloadUpdates(outDir, connCount, maxFiles):
    """ Download updates from Medline into outDir.

    Lists updatePath on medlineSrv, keeps only the entries ending in
    ".xml.gz" (reduced to their base names) and passes them to
    pubGeneric.lftpGet.

    outDir    -- local target directory, passed through to lftpGet
    connCount -- number of connections, passed through to lftpGet
    maxFiles  -- if not None, only the first maxFiles names of the listing are used

    Returns the number of file names handed to lftpGet. This is the length
    of the request list, computed independently of what lftpGet does.
    """
    logging.debug("Downloading updates")
    ftpFnames = pubGeneric.getFtpDir(medlineSrv, updatePath)
    updateFnames = [basename(fname) for fname in ftpFnames if fname.endswith(".xml.gz")]
    if maxFiles is not None:
        updateFnames = updateFnames[:maxFiles]
    logging.info("Found %d updates on ftp server", len(updateFnames))

    # Must be a plain string: a trailing comma after this expression would
    # silently turn the URL into a 1-tuple.
    # updatePath already starts with "/", so the URL has "//" after the
    # host name; that is kept unchanged.
    medlineUrl = "ftp://" + medlineSrv + "/" + updatePath
    pubGeneric.lftpGet(medlineUrl, outDir, updateFnames, connCount)
    return len(updateFnames)
# ----------- MAIN --------------
# Entry point. Reads the module-level `args` and `options` produced by
# parser.parse_args(), checks whether the baseline version on the FTP server
# differs from the one in <outDir>, rotates the directory if it does, and
# downloads files via downloadBaseline() / downloadUpdates().
# NOTE(review): the indentation of this function was lost in this copy of the
# file. The statements from the "Merry Christmas!" warning down to the creation
# of the flag file use localVer/bakDir and presumably form the body of the
# version check. Whether the baseline download that follows is also part of
# that branch cannot be determined from here -- confirm against the upstream file.
def main():
# without an <outDir> argument: print the usage text and exit with status 1
if args==[]:
parser.print_help()
# NOTE(review): bare exit() is the builtin installed by the site module;
# sys is already imported, so sys.exit(1) would not depend on site
exit(1)
outDir = args[0]
# presumably aborts if outDir does not exist -- verify in maxCommon
maxCommon.mustExist(outDir)
pubGeneric.setupLogging(progFile, options)
# redundant: outDir already holds args[0] from the assignment above
outDir = args[0]
# remaining download budget; None means no limit
maxFiles = options.maxFiles
# both values are None when outDir contains no *.gz files, so the comparison
# below treats a directory without .gz files as "version unchanged"
ftpVer, localVer = ftpBaselineEqLocal(outDir)
if not ftpVer==localVer:
logging.warn("Merry Christmas!")
logging.warn("It's December! And the Medline baseline version (full year) has changed")
# the old directory is kept under <outDir>.20<localVer>
bakDir = outDir+".20"+localVer
logging.info("Creating a backup of %s by renaming it to %s" % (outDir, bakDir))
shutil.move(outDir, bakDir)
logging.info("Creating a new directory %s" % outDir)
os.makedirs(outDir)
logging.warn("YOU NEED TO DELETE THE TEXTDIRS AND ALL RELATED MYSQL TABLES!")
# empty marker file in the new directory, named after the previous (local) version
flagFname = join(outDir, "newVersion.20"+localVer)
logging.info("Creating flag file %s to show that this version has changed" % flagFname)
# NOTE(review): the file object is never closed explicitly
open(flagFname, "w").write("")
logging.info("getting new baseline now")
downloadedCount = downloadBaseline(outDir, options.connCount, maxFiles)
# the baseline file names count against the --maxFiles budget left for the updates
if maxFiles is not None:
maxFiles -= downloadedCount
downloadedCount = downloadUpdates(outDir, options.connCount, maxFiles)
# NOTE(review): maxFiles is not read again after this point
if maxFiles is not None:
maxFiles -= downloadedCount
# called unconditionally when the file is executed (there is no __name__ guard)
main()