This repository has been archived by the owner on Mar 18, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpdfu
executable file
·276 lines (232 loc) · 9.07 KB
/
pdfu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
#!/usr/bin/env python
""" pdfu
creates a PDF from a URL to an EAD XML file
"""
from __future__ import unicode_literals
import sys
import os
import inspect
import argparse
import tempfile
import urllib2
import urllib
import logging
import shutil
import time
import resource
def main(argv=None):
    """Command-line entry point: turn an EAD XML file into a PDF.

    Parses the arguments, applies the warnings/tempdir/skipmodify options,
    puts this script's folder and its ``oac-ead-to-pdf`` subfolder on
    ``sys.path``, points ``CLASSPATH`` at the jar and classes under
    ``oac-ead-to-pdf/javalib``, activates the ``ve`` virtualenv when its
    ``activate_this.py`` sits next to the script, configures logging, checks
    that a ``java`` executable can be found and finally calls ``createPDF``.

    Args:
        argv: ``None`` to read the process command line, a list/tuple of
            command-line strings to parse, or an already parsed namespace
            object, which is used as-is.

    Raises:
        ValueError: if ``--loglevel`` does not name a logging level.
        RuntimeError: if no ``java`` executable can be found.
    """
    parser = argparse.ArgumentParser(
        description='takes an EAD file and turn it into a PDF'
    )
    parser.add_argument('url', nargs=1,
                        help="URL or path to source EAD XML file")
    parser.add_argument('outfile', nargs=1, help="name for new PDF")
    parser.add_argument('-t', '--tempdir', required=False)
    parser.add_argument('-w', '--warnings', default=False,
                        help="show python warnings supressed by default",
                        required=False, action='store_true')
    parser.add_argument('-s', '--skipmodify', default=False,
                        required=False, action='store_true')
    parser.add_argument('--loglevel', default='ERROR', required=False)

    # None -> parse sys.argv; a list/tuple -> parse those strings.  Anything
    # else is taken to be an already parsed namespace, as before.
    if argv is None or isinstance(argv, (list, tuple)):
        argv = parser.parse_args(argv)

    if not argv.warnings:
        # suppress warnings
        # http://stackoverflow.com/a/2047600/1763984
        import warnings
        warnings.simplefilter("ignore", DeprecationWarning)

    if argv.tempdir:
        tempfile.tempdir = argv.tempdir

    modify_pdf_monkeypatch = None
    if argv.skipmodify:
        modify_pdf_monkeypatch = skip_modify_pdf

    # Info: http://stackoverflow.com/a/6098238/1763984
    # realpath() will make your script run, even if you symlink it :)
    script_dir = os.path.split(inspect.getfile(inspect.currentframe()))[0]
    cmd_folder = os.path.realpath(os.path.abspath(script_dir))
    if cmd_folder not in sys.path:
        sys.path.insert(0, cmd_folder)

    # make the modules in the oac-ead-to-pdf subfolder importable
    pdf_dir = os.path.realpath(os.path.abspath(
        os.path.join(script_dir, "oac-ead-to-pdf")
    ))
    sys.path.insert(0, pdf_dir)
    os.environ['CLASSPATH'] = u''.join([
        pdf_dir,
        '/javalib/lib/saxonb-8.9.jar:',
        pdf_dir,
        '/javalib/classes',
    ])

    # activate virtualenv
    # http://stackoverflow.com/a/14792407/1763984
    activate_this_file = u''.join([cmd_folder, "/ve/bin/activate_this.py"])
    if os.path.isfile(activate_this_file):
        execfile(activate_this_file, dict(__file__=activate_this_file))

    # set debugging level
    numeric_level = getattr(logging, argv.loglevel.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % argv.loglevel)
    logging.basicConfig(level=numeric_level, )

    if not which('java'):
        logging.getLogger('PDFU').error("no java found")
        # a bare ``raise`` has nothing to re-raise here; name the failure
        raise RuntimeError("no java found")

    # call the function that does the work
    createPDF(argv.url[0], argv.outfile[0], cmd_folder, modify_pdf_monkeypatch)
# http://mail.python.org/pipermail/python-dev/2008-January/076194.html
# http://stackoverflow.com/a/2375450/1763984
# http://wiki.zope.org/zope2/MonkeyPatch/
# http://stackoverflow.com/a/2375443/1763984
def skip_modify_pdf(self, PDFfile, docInfo):
    """No-op stand-in for a ``modify_pdf`` method.

    Only records, at INFO level on the ``PDFU`` logger, that modification
    of ``PDFfile`` was skipped.  ``self`` and ``docInfo`` are accepted so
    the signature matches the method being replaced, but are not used.
    """
    message = "monkeypatch _SKIP_ MODIFY PDF FOR : %s" % PDFfile
    logging.getLogger('PDFU').info(message)
def createPDF(url, outfile, cmd_folder, modify_pdf_monkeypatch):
    """Download the EAD at ``url``, render it to PDF and deliver ``outfile``.

    On success a one-line ``OK ...`` summary with sizes, timings and the
    ``ru_maxrss`` value from ``getrusage`` is printed.  The temporary
    download directory is removed whether or not the run succeeds.

    Args:
        url: URL or local path of the source EAD XML file.
        outfile: destination handed to ``moveIt``.
        cmd_folder: folder that contains ``oac-ead-to-pdf/oac4_to_pdf.xslt``.
        modify_pdf_monkeypatch: optional replacement installed as
            ``pdf_gen.PostProcessor_OACEAD.modify_pdf``; falsy to leave the
            original method in place.

    Raises:
        IOError: if ``downloadChunks`` reports a failed download.
        RuntimeError: if the generated PDF is empty.
    """
    import pdf_gen
    if modify_pdf_monkeypatch:
        pdf_gen.PostProcessor_OACEAD.modify_pdf = modify_pdf_monkeypatch

    logger = logging.getLogger('PDFU')
    start_time = original_start_time = time.time()

    # download XML file
    downloaded = downloadChunks(url)
    if not downloaded:
        # downloadChunks signals a failed download by returning False
        raise IOError("could not download %s" % url)
    (inputfile, tdir, _) = downloaded
    download_time = time.time() - start_time

    try:
        start_time = time.time()
        xslt = u''.join([cmd_folder, '/oac-ead-to-pdf/oac4_to_pdf.xslt'])
        odir = u''.join(['subdir=', tdir])
        generator = pdf_gen.OAC_EADtoPDFGenerator(xslt)
        assert generator
        # The outcome lists are not inspected; success is judged from the
        # size of the PDF written next to the input file.
        (completed, timeouts, errors, skipped) = generator.pdf_gen_file(
            inputfile,
            timeoutSecs=86400,  # 24 hours
            outdir_option=odir,
            force=True,
        )
        generator_time = time.time() - start_time

        start_time = time.time()
        path_to_pdf_file = u''.join([os.path.splitext(inputfile)[0], '.pdf'])
        pdf_size = os.stat(path_to_pdf_file).st_size
        ead_size = os.stat(inputfile).st_size
        maxrss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        if pdf_size == 0:
            logger.error("generated pdf is empty")
            # a bare ``raise`` has nothing to re-raise here; name the failure
            raise RuntimeError("generated pdf is empty")
        logger.info(
            "good job, outfile \"%s\" is not zero sized" % (outfile)
        )
        moveIt(path_to_pdf_file, outfile)
    except Exception:
        # do not leave the temporary directory behind when the run fails
        shutil.rmtree(tdir, ignore_errors=True)
        raise
    shutil.rmtree(tdir)
    move_time = time.time() - start_time
    total_time = time.time() - original_start_time

    # single parenthesised argument: valid as a Python 2 print statement too
    print(("OK url={0} ead_size={1} pdf_size={2} download_time={3}"
           " generator_time={4} move_time={5}"
           " total_time={6} maxrss={7}").format(
        url,
        ead_size,
        pdf_size,
        download_time,
        generator_time,
        move_time,
        total_time,
        maxrss,
    ))
# def wat(var):
# for v in var:
# print type(v)
# print dir(v)
# print v
# pp.pprint(v)
def downloadChunks(url):
    """Copy ``url`` into a fresh temporary directory, one chunk at a time.

    ``urllib.urlopen`` is used so that ``url`` may also be a plain local
    file path.  Based on https://gist.github.com/gourneau/1430932

    Args:
        url: URL or local path of the file to fetch.

    Returns:
        A tuple ``(file, temp_path, baseFile)``: the full path of the
        downloaded copy, the temporary directory that holds it, and the
        base name taken from ``url``.  ``False`` is returned after an HTTP
        or URL error has been printed.  On any failure the temporary
        directory is removed again.
    """
    baseFile = os.path.basename(url)
    temp_path = tempfile.mkdtemp(prefix="pdfu")
    logging.getLogger('PDFU').info("temp path %s" % temp_path)
    local_path = os.path.join(temp_path, baseFile)
    CHUNK = 256 * 10240
    error_message = None
    try:
        req = urllib.urlopen(url)  # urllib works with normal file paths
        try:
            # urllib.urlopen hands back the error page instead of raising
            # on HTTP error statuses, so check the status explicitly;
            # getcode() is None for non-HTTP sources such as local files.
            status = req.getcode()
            if status is not None and status >= 400:
                error_message = "HTTP Error: %s %s" % (status, url)
            else:
                with open(local_path, 'wb') as fp:
                    shutil.copyfileobj(req, fp, CHUNK)
        finally:
            req.close()
    except urllib2.HTTPError as e:
        error_message = "HTTP Error: %s %s" % (e.code, url)
    except urllib2.URLError as e:
        error_message = "URL Error: %s %s" % (e.reason, url)
    except Exception:
        # unexpected failure: clean up, then let the caller see it
        shutil.rmtree(temp_path, ignore_errors=True)
        raise
    if error_message is not None:
        shutil.rmtree(temp_path, ignore_errors=True)
        print(error_message)
        return False
    return local_path, temp_path, baseFile
# use it like this
# downloadChunks("http://localhost/a.zip")
# http://stackoverflow.com/a/377028/1763984
def which(program):
    """Locate an executable, in the manner of the shell's ``which``.

    Args:
        program: a bare command name, or a path with a directory part.

    Returns:
        ``program`` itself when it has a directory part and names an
        executable file; otherwise the first ``<dir>/<program>`` along the
        search path that is an executable file; ``None`` when nothing
        matches.
    """
    def is_exe(fpath):
        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)

    if os.path.dirname(program):
        # an explicit path is checked as given, never searched for
        return program if is_exe(program) else None

    # fall back to os.defpath when PATH is unset instead of raising KeyError
    search_path = os.environ.get("PATH", os.defpath)
    for directory in search_path.split(os.pathsep):
        candidate = os.path.join(directory.strip('"'), program)
        if is_exe(candidate):
            return candidate
    return None
def moveIt(place1, place2):
    """Deliver the local file ``place1`` to ``place2``.

    Args:
        place1: path of the existing local file.
        place2: destination; values starting with ``s3://`` are handed to
            ``s3move``, anything else is treated as a local path.
    """
    if place2.startswith("s3://"):
        s3move(place1, place2)
    else:
        # os.rename fails when the temporary directory and the destination
        # are on different filesystems; shutil.move copies and deletes then
        shutil.move(place1, place2)
def s3move(place1, place2):
    """Upload the local file ``place1`` to the ``s3://`` URL ``place2``.

    The URL's host part names the bucket and its path part is used as the
    key name.  After the upload the key's ACL is set to ``public-read``.
    The local file itself is not removed here.
    """
    import boto
    import urlparse

    # e.g. SplitResult(scheme='s3', netloc='test.pdf', path='/dkd',
    #                  query='', fragment='')
    target = urlparse.urlsplit(place2)
    connection = boto.connect_s3()
    # TODO; add a --create-bucket option that will use s3.create_bucket
    destination_key = connection.get_bucket(target.netloc).new_key(target.path)
    destination_key.set_contents_from_filename(place1)
    destination_key.set_acl('public-read')
# Entry point when executed as a script; importing the module (for example
# into a REPL for debugging) does not run anything.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)
"""
Copyright (c) 2014, Regents of the University of California
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of the University of California nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
"""