-
-
Notifications
You must be signed in to change notification settings - Fork 16
/
pdfcompare.py
executable file
·1679 lines (1529 loc) · 70.4 KB
/
pdfcompare.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#! /usr/bin/python3
# -*- coding: UTF-8 -*-
#
# pdfcompare.py -- command line tool to show search or compare results in a PDF
#
# (c) 2012-2022 Juergen Weigert juergen@fabmail.org
# Distribute under GPL-2.0 or ask
#
# 2012-03-16, V0.1 jw - initial draught: argparse, pdftohtml-xml, font.metrics
# 2012-03-20, V0.2 jw - all is there, but the coordinate systems of my overlay
# does not match. Sigh.
# 2013-01-12, V0.3 jw - support encrypted files added,
# often unencrypted is actually encrypted with key=''
# - coordinate transformation from xml to pdf canvas added
# - refactored: xml2wordlist, xml2fontinfo, create_mark
# - added experimental zap_letter_spacing()
# 2013-01-13, V0.4 jw - added class DecoratedWord
# - option --compare works!
# 2013-01-14, V0.5 jw - added xmlfile2wordlist, textfile2wordlist. fixed -e
# - added option --mark A,D,C
# 2013-01-15, V0.6 jw - added anno_popup() and friends. Horrible hack.
# - Added option --no-anno
# - With -c: added line counting to xml input
# top/center/bottom indicator for pdf.
# 2013-01-16, V0.7 jw - minor bugfixing.
# - Added Changemarks by: page_watermark().
# - /Creator /Producer /ModDate writing.
# 2013-01-17, V0.8 jw - added opcodes_post_proc() to make replace operations
# more the human friendly.
# - calling compressContentStreams() unless --no-compression
# - higher --transparency is now more transparent, not less.
# - delete markers at end of previous text, not start of next.
# 2013-01-18, V0.9 jw - hit statistics +-~= added
# - page break indicators in catwords() added
# - new option -F -first-page added.
# - line wrapping for okular popups, if over 60 chars.
# - added page break markers in annotations.
# - smaller files: we merge once, and help ourselves with /Annots.
# 2013-01-21, V1.0 jw - added option --features popup(aka anno),highlight,changebar
# - removed option --no-anno in favour of option --features.
# - improved option --search-colors to handle all 4 colors.
# - fixed features default.
# - made -c optional.
# - added -n --no-output. Return value compatible with /bin/cmp.
# 2013-01-23, V1.1 jw - backwards and forwards navigation code added for easily
# finding next and previous page with changes. Using
# page_ref_magic as a placeholder with dummy self refs.
# - implemented proper relocation and merge in mergeAnnots().
# - some off-by-one errors fixed with first_page, last_page.
# - some python3 porting: print(), key in dict, isinstance()
# 2013-01-24, V1.1a jw - new --feature margin; ignore a certain margin on the
# pages option -M added. Unfinished.
# 2013-01-28, V1.1b jw - no more subtracting from NoneType
# - bugfix in mergeAnnots()
# - factored out highlight_height and reduced from
# 1.4 to 1.2 for less overlap.
# 2013-01-29, V1.2 jw - added bbox_overlap(), bbox_inside() in_bbox_interpolated()
# to implement margin feature.
# 2013-01-31, V1.3 jw - added hyphenation merge inside opcodes_post_proc().
# - added option strict, to suppress zap_letter_spacing() and
# to prevent hyphenation merge.
# - removed needless parameter tag from markword()
# - added an implementation of --spell using hunspell in pipe()
# 2013-02-02, V1.4 jw - New hunspell.py module created, and incorporated.
# The earlier implementation used a premature pipe protocol.
# 2013-02-03, V1.5 jw - Added a trivial --log implementation.
# - sorted command line options alphabetically.
# 2013-02-09, V1.5.1 jw - added test/python3.sh --
# cannot test much, too many modules missing.
# 2013-03-26, V1.6 jw - added experimental opcodes_find_moved()
# Although theoretically quadratic runtime, this
# contributes less than 2 seconds runtime to
# 490 pages sleha: 371.254u 1.104s 6:20.49 97.8%
# - long delete popups truncated.
# 2013-03-28, V1.6.1 jw - new option --below added, helps with obscure crashes in pyPDF.
# 2013-04-29, V1.6.2 jw - COPYING file added, sr#173525 was declined
# - Distinction between mediabox and cropbox implemented, so that
# changebars and navigation is not outside the visible area.
# - option --leftside added.
# 2013-05-07, V1.6.3 jw - Not-strict improved: better ignore hyphenation change
# and dotted lines. Debugging "mergeAnnots failed: page_idx 18
# out of range. Have 10" - no help.
# 2013-05-07, V1.6.4 jw - Allow pdftohtml to produce slightly invalid xml.
# We compensate using a fallback that erases all <a...> ... </a>
# tags. Debugging "mergeAnnots failed: page_idx 18 out of range.
# Have 10" - no help. New mergeAnnotsRelocate() solves this.
# passing -l -f to pdftohtml, but this has no effect on speed
# later on. Strange.
# 2014-01-07, V1.6.5 jw - manually merged https://github.com/jnweiger/pdfcompare/pull/4
# hope, I did not break too much...
# 2014-11-07, V1.6.6 jw - hint added for hunspell use: add word.
# 2015-04-18, V1.6.7 jw - fall back to pyPdf from PyPDF2, for Ubuntu 14.04 LTS
# 2015-04-19, V1.6.8 jw - popup pN[tcb]: source location descriptors optional.
# No normal user expects or understands them.
# No navigation marks per default. They are often broken, and often
# useless due to page number changes. Include in -f to enable.
# 2022-07-22, V1.6.9 jw - suggest pip install pypdf instead of fallback. Drop dependency on python-pypdf.
# Support python3 on Ubuntu 20.04
#
# osc in devel:languages:python python-pypdf >= 1.13+20130112
# need fix from https://bugs.launchpad.net/pypdf/+bug/242756
# osc in devel:languages:python python-reportlab
# osc in devel:languages:python python-pygame
# osc in X11:common:Factory poppler-tools
#
# needs module difflib from python-base
#
# Feature request:
# - poppler-tools:/usr/bin/pdftohtml -xml should report a rotation angle,
# if text is not left-to-right. And it should report spacing adjustments
# within a string.
#
# TODOs: see extra file TODO.md
#
# further References:
# http://www.aclweb.org/anthology-new/W/W12/W12-3211.pdf
# https://github.com/elacin/PDFExtract/
# http://svn.apache.org/repos/asf/pdfbox/trunk/
# Compatibility for older Python versions
from __future__ import with_statement
from __future__ import print_function
from __future__ import division
__VERSION__ = '1.6.9'
try:
# python2
from cStringIO import StringIO
except ImportError:
# python3, breaks python2-reportlab
from io import StringIO
try:
    # Ubuntu 15.x
    from PyPDF2 import PdfFileWriter, PdfFileReader, generic as Pdf
except ImportError:
    # Ubuntu 14.04 LTS
    # from pyPdf import PdfFileWriter, PdfFileReader, generic as Pdf
    #
    # The module-level 'import sys' only happens further down in this file,
    # so sys must be imported here; otherwise the exit below dies with a
    # NameError instead of showing the installation hint.
    import sys
    print("ERROR: pyPDF2 not found. Retry after running: sudo pip3 install pypdf pygame\n")
    sys.exit(1)
from reportlab.pdfgen import canvas
from reportlab.lib.colors import Color
import urllib # used when normal encode fails.
import re, time
from pprint import pprint
try:
    # Python 2 and Python 3 up to 3.8: explicit C implementation.
    import xml.etree.cElementTree as ET
except ImportError:
    # xml.etree.cElementTree was removed in Python 3.9. The plain
    # ElementTree module picks the C accelerator automatically there.
    import xml.etree.ElementTree as ET
import sys, os, subprocess
import argparse
import pygame.font as PGF
from difflib import SequenceMatcher
# FIXME: class Hunspell should be loaded as a module
# import HunspellPure
# I fail to understand the standard encode() decode() methods.
# But the codecs module always does what I mean.
import codecs
# allow debug printing into less:
# On Python 2 only, stdout is wrapped in a UTF-8 writer.
if sys.version_info[0] < 3:
    # this breaks in python3
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)
# Debug switch. The functions in this file test it both as a flag
# ('if debug:') and as a level ('if debug > 1:').
debug = False
# Placeholder text written into navigation links by page_changemarks();
# mergeAnnotsRelocate() matches it followed by a page number.
page_ref_magic = "675849302 to page " # a token we use to patch the page objects.
# Replacement text that mergeAnnotsRelocate() writes instead of the magic token.
page_ref_plain = "to page " # this will be visible as a popup on navigation marks.
# Factor applied to the text height when drawing highlight rectangles.
highlight_height = 1.2 # some fonts cause too much overlap with 1.4
# 1.2 is often not enough to look symmetric.
# Selects the popup text format used by anno_popup() inside page_changemarks().
anno_popup_src_loc_ref = False # False: 'chg: bla' True: 'chg:p1t: bla'
# from pdfminer.fontmetrics import FONT_METRICS
# FONT_METRICS['Helvetica'][1]['W']
# 944
def mergeAnnotsRelocate(dest_p, src_p, first_page=0):
    """Append the annotations of page src_p to page dest_p, relocating page links.

    dest_p     -- page object of the destination document; its "/Parent"
                  entry is used to look up the "/Kids" list of page objects.
    src_p      -- page object carrying the "/Annots" to be merged.
    first_page -- offset added to the page number found in a link before
                  indexing into the "/Kids" list.

    Annotations whose "/Contents" is page_ref_magic followed by a number get
    their "/Contents" rewritten to page_ref_plain plus that number, and their
    "/Dest" pointed at the referenced page of dest_p's document.
    Returns None; dest_p and the annotation objects are modified in place.
    """
    ### Links to Pages look like this:
    # <<
    # /Contents (to page 1)
    # /Dest [ 14 0 R /Fit ]
    # /Type /Annot
    # /Rect [ 571.46880 0 595.28000 11.90560 ]
    # /Border [ 0 0 0 ]
    # /Subtype /Link
    # >>
    ### Whereas URL links have an /A object but no /Contents or /Dest.
    #
    ###
    # This is a hack:
    # while writing navigation marks, we only have one page canvas at a time.
    # Therefore we create self references there, and get
    # "/Dest": [IndirectObject(33,0), "/Fit"]
    # entries where the IndirectObject() points to the src_p itself.
    # We replace this IndirectObject() with the one of the referenced page
    # from dest_p's stream.
    # First we fetch the list of all IndirectObject()s for all the pages
    # in the dest stream.
    if "/Parent" in dest_p:
        # FIXME: this is not always an array of all pages.
        # Tanja's book_sleha.pdf has only len(pages_a) = 10, although we have 490 pages.
        pages_a = dest_p["/Parent"].getObject().get("/Kids", [])
    else:
        pages_a = []
        print("mergeAnnots Warning: no /Parent in dest_p")

    if "/Annots" not in src_p:
        return  # nothing to merge

    annots = src_p["/Annots"]
    # Raw string: "\d" in a plain string literal is an invalid escape sequence.
    # The magic token is escaped so that it is always matched literally.
    magic_re = re.compile(re.escape(page_ref_magic) + r"(\d+)$")
    for a in annots:
        o = a.getObject()   # a is an IndirectObject()
        if "/Contents" not in o:
            continue        # e.g. URL links, see above
        if debug > 1: pprint(["mergeAnnots old:", o])
        m = magic_re.match(o["/Contents"])
        if not m:
            print("mergeAnnots failed: page_ref_magic not found: '%s'" % o["/Contents"])
            continue
        p_nr = m.group(1)
        page_idx = int(p_nr) + first_page
        o[Pdf.NameObject("/Contents")] = Pdf.createStringObject(page_ref_plain+p_nr)
        if len(pages_a) > page_idx:
            o["/Dest"][0] = pages_a[page_idx]
            if debug > 1: pprint(["mergeAnnots new:", o])
        elif debug:
            # FIXME: len(pages_a) can be much shorter than number of pages...
            print("mergeAnnots failed: page_idx %d out of range. Have %d" %
                  (page_idx, len(pages_a)))

    if "/Annots" in dest_p:
        if debug:
            print("mergeAnnots: append %d+%d" % (len(dest_p["/Annots"]), len(annots)))
        dest_p["/Annots"].extend(annots)
    else:
        dest_p[Pdf.NameObject("/Annots")] = annots
def page_changemarks(canvas, mediabox, cropbox, marks, page_idx, trans=0.5, leftside=None, cb_x=None, cb_w=0.007, min_w=0.01, ext_w=0.05, features='C,H,A,N'):
    """Draw the change marks of one page onto a reportlab canvas.

    canvas   -- canvas to draw on.
    mediabox -- page box [x1, y1, x2, y2]; entries are cast with float().
                Used to scale the marks.
    cropbox  -- page box in the same format; used to place the changebar
                and the navigation marks inside the visible area.
    marks    -- description of the page in xml coordinates, e.g.
                { h:1188, w:918, x:0, y:0, rect: [{x,y,w,h,t,c,o},...],
                  nr:1, nav_fwd:9, nav_bwd:4, nav_c:[r,g,b] }
    page_idx -- not used inside this function.
    trans    -- alpha value for fill colors; stroke colors use half of it.
    leftside -- if true, navigation marks (and the changebar, unless cb_x
                is given) are placed near the left edge.
    cb_x     -- horizontal changebar position as a fraction of the page width.
    cb_w     -- changebar width as a fraction of the page width.
    min_w    -- marks narrower than this fraction of the mediabox width are
                drawn as a delete marker.
    ext_w    -- length of the delete marker extenders, same unit as min_w.
    features -- comma-separated list; only the first letter of each entry
                counts: A or P (popup), H (highlight), C (changebar),
                N (navigation). Empty entries are ignored.

    Returns None.
    """
    # cb_x=0.98 changebar near right edge
    # cb_x=0.02 changebar near left edge
    # min_w=0.05: each mark is min 5% of the page width wide. If not we add extenders.
    if cb_x is None:
        if leftside:
            cb_x = 0.02
        else:
            cb_x = 0.98

    # Skip empty entries (features='' or 'H,,C'); indexing x[0] on an empty
    # string would raise IndexError.
    features = [x.strip()[0].upper() for x in features.split(',') if x.strip()]
    anno = ('A' in features) or ('P' in features)   # P: aka popup
    highlight = 'H' in features
    changebar = 'C' in features
    navigation = 'N' in features

    # mediabox [0, 0, 612, 792], list of 4x float or FloatObject
    # FloatObject does not support arithmetics with float. Needs casting. Sigh.
    # marks = { h:1188, w:918, x:0, y:0, rect: [{x,y,w,h,t},...], nr:1, nav_fwd:9, nav_bwd:4 }
    def mx2c(x):
        # xml x coordinate -> canvas x, scaled by the mediabox width
        return (0.0+x*float(mediabox[2])/marks['w'])

    def cx2c(x):
        # xml x coordinate -> canvas x, scaled by the cropbox width
        return (0.0+x*float(cropbox[2])/marks['w'])

    def w2c(w):
        # xml width -> canvas width
        return (0.0+w*float(mediabox[2])/marks['w'])

    def y2c(y):
        # xml y coordinate -> canvas y; the result is measured from the
        # opposite edge (mediabox height minus the scaled value)
        return (0.0+float(mediabox[3])-y*float(mediabox[3])/marks['h'])

    def h2c(h):
        # xml height -> canvas height
        return (0.0+h*float(mediabox[3])/marks['h'])

    def nav_mark_fwd(canv, target_page, radius=5):
        # wedge at the bottom of the page, linking forward to target_page
        w = float(cropbox[2])   # not MediaBox!
        r = w * radius * 0.01   # percent of page width
        if leftside:
            x = 0
        else:
            x = w-r-r
        canv.wedge(x,-r, x+r+r,r, 45,90, fill=1, stroke=0)  # bottom
        dest = "jump_"+str(canv.getPageNumber())
        canv.linkAbsolute(page_ref_magic+str(target_page), dest, (x,0, x+r+r,r))
        if debug:
            print("nav_mark_fwd: %s + %s, dest=%s" % (page_ref_magic, target_page, dest))

    def nav_mark_bwd(canv, target_page, radius=5):
        # wedge at the top of the page, linking backward to target_page
        w = float(cropbox[2])   # not MediaBox!
        h = float(cropbox[3])
        r = w * radius * 0.01   # percent of page width
        if leftside:
            x = 0
        else:
            x = w-r-r
        canv.wedge(x,h-r, x+r+r,h+r, 225,90, fill=1, stroke=0)  # top
        dest = "jump_"+str(canv.getPageNumber())
        canv.linkAbsolute(page_ref_magic+str(target_page), dest, (x,h, x+r+r,h-r))

    def anno_popup(canv, x,y, w,h, mark):
        # We misuse linkURL() as this is the only annotation, that a) can be written with reportlab() and b)
        # works both in acroread and okular. HACK: For acroread, we include .: at the beginning, this prevents
        # file://full_path expansion on strings that do not look like urls.
        text = mark.get('t', '.') + ':'
        if 'o' in mark:
            if isinstance(mark['o'], list):
                if anno_popup_src_loc_ref:
                    text += mark['o'][1]+': '+ mark['o'][0]
                else:
                    text += ' '+mark['o'][0]
            else:
                text += ' '+mark['o']
        # need ascii here. anything else triggers
        # UnicodeDecodeError: 'utf16' codec can't decode bytes in position 5484-5485: illegal UTF-16 surrogate
        # from File "/usr/lib/python2.7/site-packages/pyPdf/generic.py", line 248, in createStringObject
        try:
            # this can fail with: 'ascii' codec can't decode byte 0xe2
            text = text.encode('ascii', errors='replace')
        except UnicodeError:
            # I cannot even print a failed message here, it dies in self.encode()
            try:
                from urllib.parse import quote_plus     # Python 3
            except ImportError:
                from urllib import quote_plus           # Python 2
            text = quote_plus(text)
        canv.linkURL(text, (x, y, x+w, y+h), relative=0)   # , Border="[ 1 1 1 ]")

    # NOTE(review): cb_x and cb_w are scaled by the xml page width (marks['w']),
    # min_w and ext_w by the mediabox width, but min_w is then compared with
    # xml widths below -- confirm the intended units.
    cb_x = (cb_x-0.5*cb_w) * marks['w']
    cb_w = cb_w * marks['w']
    min_w = min_w * float(mediabox[2])
    ext_w = ext_w * float(mediabox[2])  # extenders, if needed

    if navigation:
        canvas.setFillColor(Color(marks['nav_c'][0],marks['nav_c'][1],marks['nav_c'][2], alpha=trans))
        # dummy bookmarks for self references, to be relocated by mergeAnnots()
        canvas.bookmarkPage("jump_%s" % canvas.getPageNumber())
        if 'nav_bwd' in marks: nav_mark_bwd(canvas, marks['nav_bwd'], 2)
        if 'nav_fwd' in marks: nav_mark_fwd(canvas, marks['nav_fwd'], 2)

    canvas.setFont('Helvetica',5)
    ### a testing grid
    if debug > 1:
        for x in range(0,13):
            for y in range(0,50):
                canvas.drawString(50*x,20*y,'.(%d,%d)' % (50*x,20*y))
    if debug: canvas.setFont('Helvetica',16)

    for m in marks['rect']:
        canvas.setFillColor(Color(m['c'][0],m['c'][1],m['c'][2], alpha=trans))
        canvas.setStrokeColor(Color(m['c'][0],m['c'][1],m['c'][2], alpha=0.5*trans))
        # m = {'h': 23, 'c': [1,0,1], 't': 'equ', 'w': 76.56716417910448, 'x': 221.0, 'y': 299}
        (x,y,w,h) = (m['x'], m['y'], m['w'], m['h'])
        if w < min_w:
            # normally happens with m['t'] == 'del'
            if debug > 1: print("min_w:%s (%s)" % (min_w, w))
            if highlight:
                # delete marker: two horizontal and one vertical bar.
                canvas.rect(mx2c(x-ext_w),y2c(y+0.2*h), w2c(w+2*ext_w),h2c(0.2*h), fill=1, stroke=0)
                canvas.rect(mx2c(x-ext_w),y2c(y-(highlight_height-0.2)*h), w2c(w+2*ext_w),h2c(0.2*h), fill=1, stroke=0)
                x = x - (0.5 * (min_w-w))
                canvas.rect(mx2c(x),y2c(y),w2c(min_w),h2c(h*(highlight_height-0.2)), fill=1, stroke=0)
            if anno:
                anno_popup(canvas, mx2c(x),y2c(y), w2c(min_w),h2c(h*highlight_height), m)
        else:
            # multiply height h with (highlight_height -- ca 1.2) to add some top
            # padding, similar to the bottom padding that is automatically added
            # due to descenders extending across the font baseline.
            if highlight:
                if m['t'] == 'spl':
                    canvas.rect(mx2c(x),y2c(y), w2c(w),h2c(h*0.2), fill=1, stroke=0)  # underline only
                else:
                    canvas.rect(mx2c(x),y2c(y), w2c(w),h2c(h*highlight_height), fill=1, stroke=0)
            if anno:
                anno_popup(canvas, mx2c(x),y2c(y), w2c(w),h2c(h*highlight_height), m)

        if changebar:
            # need the cropbox coordinate system to find the visible right edge
            canvas.rect(cx2c(cb_x), y2c(y),w2c(cb_w), h2c(h*highlight_height), fill=1, stroke=1)
        if debug > 1:
            canvas.drawString(cx2c(x),y2c(y),'.(%d,%d)%s(%d,%d)' % (mx2c(x),y2c(y),m['t'],x,y))
            pprint(m)
            return      # shortcut, only the first word of the page
def page_watermark(canv, box, argv, color=[1,0,1], trans=0.5, p_w=None, p_h=None, margins=None, features='W,B'):
    """Draw the ignored page margins and/or a small watermark line on a canvas.

    canv     -- canvas to draw on.
    box      -- page box [x1, y1, x2, y2]; box[2] and box[3] are used as
                page width and height (not the MediaBox!).
    argv     -- list of command line words shown in the watermark text.
                Words that contain a '/' after at least four non-blank
                characters are shortened to '...' plus their last path part.
    color    -- [r, g, b] of the watermark text. Only read, never modified.
    trans    -- alpha value passed to Color().
    p_w, p_h -- page width and height in the unit of the margin values;
                needed whenever margins are drawn.
    margins  -- dict with the keys 'n', 'e', 'w', 's' (margin sizes) and
                'c' ([r, g, b] fill color), or None.
    features -- comma-separated list; only the first letter of each entry
                counts: W (watermark), B (border margins). Empty entries
                are ignored.

    Returns None.
    """
    # Skip empty entries (features='' or 'W,,B'); indexing x[0] on an empty
    # string would raise IndexError.
    features = [x.strip()[0].upper() for x in features.split(',') if x.strip()]
    f_bordermargins = 'B' in features
    f_watermark = 'W' in features

    if f_bordermargins and margins:
        w = float(box[2])   # not MediaBox!
        h = float(box[3])
        # scale the margins from page units to canvas units
        m_n = margins['n']*h/p_h
        m_e = margins['e']*w/p_w
        m_w = margins['w']*w/p_w
        m_s = margins['s']*h/p_h
        canv.setFillColor(Color(margins['c'][0],margins['c'][1],margins['c'][2], alpha=trans))
        # north and south span the full width; east and west fill the rest
        if m_n: canv.rect(0,h-m_n, w,m_n, fill=1, stroke=0)
        if m_s: canv.rect(0,0, w,m_s, fill=1, stroke=0)
        if m_e: canv.rect(w-m_e,m_s, m_e,h-m_n-m_s, fill=1, stroke=0)
        if m_w: canv.rect(0,m_s, m_w,h-m_n-m_s, fill=1, stroke=0)

    if f_watermark:
        canv.setFont('Helvetica',5)
        # Raw string: "\S" in a plain string literal is an invalid escape sequence.
        long_path = re.compile(r"\S\S\S\S+(/.*?)$")
        av = []
        for arg in argv:
            m = long_path.match(arg)
            if m: arg = "..."+m.group(1)
            av.append(arg)
        text = "Highlights added by (V" + __VERSION__ + "): " + " ".join(av)
        canv.setFillColor(Color(color[0],color[1],color[2], alpha=trans))
        canv.drawString(15,10,text)
# import xml.etree.ElementTree as pET
# class RelaxedXMLParser(pET.XMLParser):
# """
# We need to handle misplaced closing tags gracefully:
# <i>Scanning Issue<a href="http://support.novell.com/">s</i>and</a>
# CAUTION: this approach is not safe. The misplaced tags may
# be at the border of a feed data buffer. We have no way to look ahead
# or behind in this interface.
# It fails miserably, when the <a ...> tag crosses buffer boundaries.
# """
# def feed(self,data):
# dlen = len(data)
# # data = re.sub("</?i>","", data) # <i> only trigger the issue
# data = re.sub("(<a.*?>|</a>)","", data) # <a...> </a> appear to be misplaced.
# # print >>sys.stderr, "FEEED", dlen, len(data), "XX "+ data[0:100] + " XX"
# super(RelaxedXMLParser,self).feed(data)
def pdf2xml(parser, infile, key='', firstpage=None, lastpage=None):
    """ Convert a pdf file to a dom tree, using pdftohtml -xml.

        The work is done by do_pdf2xml(). It is first called in strict mode;
        if that yields None, a message is printed and it is called a second
        time with relaxed=True. This compensates for a bug in pdftohtml -xml
        yielding invalid xml.
        The first parameter, parser, is only passed through; it is used for
        calling exit() with proper messages.
    """
    options = dict(key=key, firstpage=firstpage, lastpage=lastpage)
    tree = do_pdf2xml(parser, infile, relaxed=False, **options)
    if tree is not None:
        return tree
    print(" pdf2xml retrying more relaxed ...")
    return do_pdf2xml(parser, infile, relaxed=True, **options)
def do_pdf2xml(parser, infile, key='', firstpage=None, lastpage=None, relaxed=False):
    """ Read a pdf file with pdftohtml and parse the resulting xml into a dom tree.

        parser    -- only used for calling exit() and format_usage() to
                     report failures.
        infile    -- name of the pdf file.
        key       -- user password, passed as -upw if not empty.
        firstpage -- passed as -f if not None.
        lastpage  -- passed as -l if not None.
        relaxed   -- if true, all <a ...> and </a> tags are removed from the
                     xml before parsing.

        Returns the parsed tree. When parsing fails, None is returned in
        strict mode, and parser.exit() is called in relaxed mode.
        CAUTION: this uses pdftohtml -xml, which may return invalid xml. A workaround
        for some cases is provided, if relaxed=True.
    """
    from io import BytesIO      # local import; the pipe delivers bytes

    pdftohtml_cmd = ["pdftohtml", "-q", "-i", "-nodrm", "-nomerge", "-stdout", "-xml"]
    # str(): page numbers may arrive as int, but both the argument list
    # and the messages below need strings.
    if firstpage is not None:
        pdftohtml_cmd += ["-f", str(firstpage)]
    if lastpage is not None:
        pdftohtml_cmd += ["-l", str(lastpage)]
    if key:
        pdftohtml_cmd += ["-upw", key]
    cmdline = " ".join(pdftohtml_cmd + [infile])

    try:
        # os.popen2() does not exist in Python 3; subprocess works in both.
        child = subprocess.Popen(pdftohtml_cmd + [infile], stdout=subprocess.PIPE)
    except Exception as e:
        print(cmdline)
        parser.exit("pdftohtml -xml failed: " + cmdline + ": " + str(e))
        return None     # only reached if parser.exit() returns

    try:
        if relaxed:
            data = child.stdout.read()
            data = re.sub(br"(<a.*?>|</a>)", b"", data)    # <a...> </a> appear to be misplaced.
            dom = ET.parse(BytesIO(data))
        else:
            dom = ET.parse(child.stdout)
    except Exception as e:
        print(cmdline)
        if relaxed:
            parser.exit("pdftohtml -xml failed.\nET.parse: " + str(e) + ")\n\n" + parser.format_usage())
        return None
    finally:
        # close our end of the pipe and reap the child in every case
        child.stdout.close()
        child.wait()
    print("pdf2xml done")
    return dom
class DecoratedWord(list):
    """A word plus its context, compared and hashed by the word alone.

       Usage in pdfcompare is:
       word[0] is the word itself; word[1] is a longer string, where word[0] is
       found; word[2] is the index position into word[1], and word[3] is a set of
       attributes, as follows:

       Elements of word[3] are:
       {'f': '3', 's':'stem', 'h': '10', 'l': 'b', 'p': 2, 'w': '151', 'x': '540', 'y': '1209'}
       Where f is the font index; l is the location on the page as in t(op),
       c(enter), b(ottom); p is the physical page number; x,y,w,h define the
       bounding box of word[1]; and s is a substring (without digits or
       punctuation) used for spell checking.

       Equality, inequality and hash only look at word[0].
    """
    def __eq__(self, other):
        return self[0] == other[0]

    def __ne__(self, other):
        # Without this, '!=' would fall back to list.__ne__ and compare all
        # elements, so that two words could be both == and != at once.
        return not self.__eq__(other)

    def __hash__(self):
        return hash(self[0])
def xmlfile2wordlist(fname):
    """ Read the xml file fname and return its words as a list of DecoratedWord.

        Works well with xml from pdftohtml -xml.
        Each word comes from the text of an element and carries the
        attributes {'e': element counter, 'l': number of lines read so far}.
    """
    wl = []
    elementcount = 0

    ## line number counting idea from
    ## http://bytes.com/topic/python/answers/535191-elementtree-line-numbers-iterparse
    class FileWrapperLineNo:
        """Feeds the parser one line per read() call and counts the lines."""
        def __init__(self, source):
            self.source = source
            self.lineno = 0

        def read(self, size):
            # size is ignored on purpose: always hand out exactly one line
            s = self.source.readline()
            self.lineno += 1
            return s

    # Binary mode lets the xml parser do the decoding instead of depending
    # on the locale. The with statement closes the file in every case.
    with open(fname, 'rb') as xmlfile:
        f = FileWrapperLineNo(xmlfile)
        for event, elem in ET.iterparse(f, events=("start", "end")):
            if event == "start":
                elementcount += 1
                ## we could grab all from the root element,
                ## but we want to count elements.
                if elem.text:
                    for w in elem.text.split():
                        wl.append(DecoratedWord([w,None,None,{'e':elementcount, 'l':f.lineno}]))
    return wl
def textfile2wordlist(fname):
    """ Split the text file fname into a list of DecoratedWord.

        Every word carries the attribute {'l': line index, starting at 0}.
        CAUTION if you create your text files with pdftotxt,
        things may appear in different ordering than with pdftohtml, resulting
        in an enormous diff.
    """
    # assume .txt files are utf8 encoded, but please survive binary garbage.
    with codecs.open(fname, 'r', 'utf-8', errors='ignore') as handle:
        return [DecoratedWord([token, None, None, {'l': line_idx}])
                for line_idx, text_line in enumerate(handle)
                for token in text_line.split()]
def bbox_inside(bb1, bb2):
    """ Tell whether bb2 lies completely within the bounding box bb1.

        Bounding boxes are given as [x1,y1,x2,y2]; touching the border
        counts as inside. bb2 may also be a point [x,y], which is
        handled like a box of size zero.
    """
    if len(bb2) < 4:
        # a point: both corners coincide
        px, py = bb2
        left, top, right, bottom = px, py, px, py
    else:
        left, top, right, bottom = bb2[0], bb2[1], bb2[2], bb2[3]
    sticks_out = (left < bb1[0] or top < bb1[1] or
                  right > bb1[2] or bottom > bb1[3])
    return not sticks_out
def bbox_overlap(bb1, bb2):
    """ Tell whether the rectangles bb1 and bb2 have at least one point in common.

        Both are expected as four element rectangles in the format
        [x1,y1,x2,y2] with x1 <= x2 and y1 <= y2. Touching borders count
        as overlap.

        Two rectangles overlap exactly if their x ranges overlap and their
        y ranges overlap. Testing only whether a corner of one rectangle is
        inside the other is not sufficient: a wide flat box and a tall
        narrow box can cross each other without any corner being inside.
    """
    a_x1, a_y1, a_x2, a_y2 = bb1
    b_x1, b_y1, b_x2, b_y2 = bb2
    if a_x2 < b_x1 or b_x2 < a_x1:
        return False    # side by side
    if a_y2 < b_y1 or b_y2 < a_y1:
        return False    # one above the other
    return True
def in_bbox_interpolated(bbox, word):
    """Tell whether word[0] is (at least partially) inside bbox.

       The on-page position of word[0] is only approximated: we know the
       coordinates word[3]['x','y','w','h'] of the text word[1], in which
       word[0] starts at offset word[2], and assume that all characters
       have the same width. This is done for speed; a more exact
       interpolation would be possible with the font metrics of word[3]['f'].
       A bbox of None includes everything.
    """
    if bbox is None:
        return True

    # bounding box of the entire text word[1]
    box = word[3]
    left = float(box['x'])
    right = left + float(box['w'])
    top = float(box['y'])
    bottom = top + float(box['h'])
    if bbox_inside(bbox, [left, top, right, bottom]):
        # fast track: the entire text is inside, so the word is inside too.
        return True

    n_chars = len(word[1])
    if n_chars == 0:
        # zero-size and not completely in, means out.
        return False

    # Narrow the x range down to the word itself. Only the x range shrinks,
    # the y range is taken over from the text.
    per_char = (right - left) / n_chars
    word_left = left + word[2] * per_char
    word_right = word_left + len(word[0]) * per_char
    return True if bbox_overlap(bbox, [word_left, top, word_right, bottom]) else False
def textline2wordlist(text, context, bbox=None):
    """returns a list of 4-element lists like this:
       [word, text, idx, context]
       where the word was found in the text string at offset idx.
       words are defined as any printable text delimited by whitespace.
       just as str.split() would do.
       Those 4-element lists are cast into DecoratedWord.
       The DecoratedWord type extends the list type, so that it is hashable and
       comparable using only the "word" which is the first element of the four.
       Thus our wordlists work well as sequences with difflib, although they also
       transport all the context to compute exact page positions later.

       A bbox (x1,y1, x2, y2) with x1 < x2, y1 < y2 can be specified to prefilter
       the wordlist. Only words that are (at least partially) inside the bbox
       will be returned.
       All words of one call share the same context object.
    """
    wl = []
    # Each run of non-whitespace is a word and its start offset is idx.
    # This finds the same words and offsets as splitting at whitespace and
    # adding up the lengths, without popping from the front of a list.
    for m in re.finditer(r"\S+", text):
        word = [m.group(0), text, m.start(), context]
        if in_bbox_interpolated(bbox, word):
            wl.append(DecoratedWord(word))
    return wl
def xml2wordlist(dom, first_page=None, last_page=None, margins=None):
    """input: a dom tree as generated by pdftohtml -xml.
       first_page, last_page start counting at 0; last_page is inclusive.
       If margins is not None, the coordinates of all words are filtered against
       a bounding box constructed by reducing the page box by
       margins['n'], margins['e'], margins['w'], margins['s'].

       output: a wordlist with all the metadata so that the exact coordinates
               of each word can be calculated.
               The number of pages handled is printed.
    """
    ## Caution:
    # <text font="1" height="14" left="230" top="203" width="635">8-bit microcontroller based on the AVR enhanced RISC architecture. By executing powerful</text>
    # <text font="1" height="14" left="230" top="223" width="635">i n s t r u c t i o n s i n a s i n g l e c l o c k c y c l e , t h e</text>
    ## pdftohtml -xml can return strings where each letter is padded with a whitespace.
    ## zap_letter_spacing() handles this (somewhat)
    ## Seen in atmega164_324_644_1284_8272S.pdf
    if first_page is None: first_page = 0
    wl = []
    p_nr = 0
    for p in dom.findall('page'):
        # here p_nr is still the 0-based index of this page
        if last_page is not None and p_nr > int(last_page):
            break
        p_nr += 1
        if p_nr <= int(first_page):
            continue
        p_h = float(p.attrib['height'])
        p_w = float(p.attrib['width'])
        # default bounding box is entire page:
        # bbox ordering is x1,y1, x2, y2 where x1,y1 are the smaller values.
        p_bbox = (0, 0, p_w, p_h)
        if margins is not None:
            # x1 is reduced by the west margin and y1 by the north margin,
            # matching x2 (east) and y2 (south).
            p_bbox = (margins['w'], margins['n'], p_w - margins['e'], p_h - margins['s'])
        for e in p.findall('text'):
            # <text font="0" height="19" left="54" top="107" width="87"><b>Features</b></text>
            x = e.attrib['left']
            y = e.attrib['top']
            w = e.attrib['width']
            h = e.attrib['height']
            f = e.attrib['font']
            text = ''.join(e.itertext())
            ## crude top,center,bottom location
            if float(y) > 0.66*p_h: l = 'b'
            elif float(y) > 0.33*p_h: l = 'c'
            else: l = 't'
            wl += textline2wordlist(text, {'p':p_nr, 'l':l, 'x':x, 'y':y, 'w':w, 'h':h, 'f':f}, p_bbox)
    #pprint(wl)
    print("xml2wordlist: %d pages" % (p_nr-int(first_page)))
    return wl
def xml2fontinfo(dom, last_page=None):
    """Collect the fonts declared on each page of a pdftohtml -xml dom tree.

       Returns a list that starts with None, followed by one dict per page.
       Each dict maps a fontspec id to
       {'name': family, 'size': size, 'file': font file, 'font': font object}
       and starts out as a copy of the dict of the previous page.
       last_page starts counting at 0 and is inclusive.
    """
    per_page = [None]       # each page may add (or overwrite?) some fonts
    known = {}
    for page_idx, page in enumerate(dom.findall('page')):
        if last_page is not None and page_idx > int(last_page):
            break
        known = known.copy()
        for spec in page.findall('fontspec'):
            attrs = spec.attrib
            family = attrs.get('family', 'Helvetica')
            size = attrs.get('size', 12)
            font_id = attrs.get('id')
            font_file = PGF.match_font(family)
            # On openSUSE 12.1 Beta 1 (i586,fossy) the call to PGF.Font() triggers
            # a UserWarning from pygame/pkgdata.py: "Module argparse was already
            # imported from /usr/lib/python2.7/argparse.pyc, but
            # /usr/lib/python2.7/site-packages is being added"
            font = PGF.Font(font_file, int(0.5+float(size)))
            known[font_id] = {'name': family, 'size': size, 'file': font_file, 'font': font}
        per_page.append(known)
    return per_page
def main():
parser = argparse.ArgumentParser(epilog="version: "+__VERSION__, description="Highlight changed/added/deleted/moved text in a PDF file.")
parser.def_trans = 0.6
parser.def_decrypt_key = ''
parser.def_colors = { 'E': [1,0,1, 'pink'], # extra
'A': [.3,1,.3, 'green'], # added
'D': [1,.3,.3, 'red'], # deleted
'C': [.9,.8,0, 'yellow'], # changed
'M': [.7,1,1, 'blue'], # moved
'B': [.9,.9,.9, 'gray'] } # borders
parser.def_output = 'output.pdf'
parser.def_marks = 'A,D,C'
parser.def_features = 'H,C,P,W,B'
parser.def_margins = '0,0,0,0'
parser.def_margins = '0,0,0,0'
parser.def_below = False
parser.add_argument("-c", "--compare-text", metavar="OLDFILE",
help="Mark added, deleted and replaced text (or see -m) with regard to OLDFILE. \
File formats .pdf, .xml, .txt are recognized by their suffix. \
The comparison works word by word.")
parser.add_argument("-d", "--decrypt-key", metavar="DECRYPT_KEY", default=parser.def_decrypt_key,
help="Open an encrypted PDF. Default: KEY='"+parser.def_decrypt_key+"'")
parser.add_argument("-e", "--exclude-irrelevant-pages", default=False, action="store_true",
help="With -s: show only matching pages; with -c: show only changed pages. \
Default: reproduce all pages from INFILE in OUTFILE.")
parser.add_argument("-f", "--features", metavar="FEATURES", default=parser.def_features,
help="Specify how to mark. Allowed values are 'highlight', 'changebar', 'popup', \
'navigation', 'watermark', 'margin'. Default: " + str(parser.def_features))
parser.add_argument("-i", "--nocase", default=False, action="store_true",
help="Make -s case insensitive; default: case sensitive.")
parser.add_argument("-l", "--log", metavar="LOGFILE",
help="Write an python datastructure describing all the overlay objects on each page. Default none.")
parser.add_argument("-m", "--mark", metavar="OPS", default=parser.def_marks,
help="Specify what to mark. Used with -c. Allowed values are 'add','delete','change','equal'. \
Multiple values can be listed comma-seperated; abbreviations are allowed.\
Default: " + str(parser.def_marks))
parser.add_argument("-n", "--no-output", default=False, action="store_true",
help="Do not write an output file; print diagnostics only. Default: write output file as per -o option.")
parser.add_argument("-o", "--output", metavar="OUTFILE", default=parser.def_output,
help="Write output to FILE; default: "+parser.def_output)
parser.add_argument("-s", "--search", metavar="WORD_REGEXP",
help="Highlight WORD_REGEXP")
parser.add_argument("--spell", "--spell-check", default=False, action="store_true",
help="Run the text body of the (new) pdf through hunspell. Unknown words are underlined. \
Use e.g. 'env DICTIONARY=en_US ...' (or de_DE, ...) to specify the spelling dictionary, \
if your system has more than one. To add new words to your private dictionary use e.g. \
'echo >> ~/.hunspell_en_US ownCloud'. Check with 'hunspell -D' and study 'man hunspell'.")
parser.add_argument("--strict", default=False, action="store_true",
help="Show really all differences. Default: ignore removed hyphenation; \
ignore character spacing inside a word.")
parser.add_argument("-t", "--transparency", type=float, default=parser.def_trans, metavar="TRANSP",
help="Set transparency of the highlight; invisible: 0.0; full opaque: 1.0; \
default: " + str(parser.def_trans))
parser.add_argument("-B", "--below", default=parser.def_below, action="store_true",
help="Paint the highlight markers below the text. Try this if the normal merge crashes. Use with care, highlights may disappear below background graphics. Default: BELOW='"+str(parser.def_below)+"'.")
parser.add_argument("-C", "--search-color", metavar="NAME=R,G,B", action="append",
help="Set colors of the search highlights as an RGB triplet; R,G,B ranges are 0.0-1.0 each; valid names are 'add,'delete','change','equal','margin','all'; default name is 'equal', which is also used for -s; default colors are " +
" ".join(["%s=%s,%s,%s /*%s*/ " %(x_y[0],x_y[1][0],x_y[1][1],x_y[1][2],x_y[1][3]) for x_y in list(parser.def_colors.items())]))
parser.add_argument("-D", "--debug", default=False, action="store_true",
help="Enable debugging. Prints more on stdout, dumps several *.xml or *.pdf files.")
parser.add_argument("-F", "--first-page", metavar="FIRST_PAGE",
help="Skip some pages at start of document; see also -L option. Default: all pages.")
parser.add_argument("-L", "--last-page", metavar="LAST_PAGE",
help="Limit pages processed; this counts pages, it does not use document \
page numbers; see also -F; default: all pages.")
parser.add_argument("-M", "--margins", metavar="N,E,W,S", default=parser.def_margins,
help="Specify margin space to ignore on each page. A margin width is expressed \
in units of ca. 100dpi. Specify four numbers in the order north,east,west,south. Default: "\
+ str(parser.def_margins))
parser.add_argument("-S", "--source-location", default=False, action="store_true",
help="Annotation start includes :pNX: markers where 'N' is the page number of the location \
in the original document and X is 't' for top, 'c' for center, or 'b' for bottom of the page. \
Default: Annotations start only with 'chg:', 'add:', 'del:' optionally followed by original text.")
parser.add_argument("-V", "--version", default=False, action="store_true",
help="Print the version number and exit.")
parser.add_argument("-X", "--no-compression", default=False, action="store_true",
help="Write uncompressed PDF. Default: FlateEncode filter compression.")
parser.add_argument("--leftside", default=False, action="store_true",
help="Put changebars and navigation at the left hand side of the page. Default: right hand side.")
parser.add_argument("infile", metavar="INFILE", help="The input file.")
parser.add_argument("infile2", metavar="INFILE2", nargs="?", help="Optional 'newer' input file; alternate syntax to -c")
args = parser.parse_args() # --help is automatic
args.transparency = 1 - args.transparency # it is needed reversed.
if args.version: parser.exit(__VERSION__)
global debug
debug = args.debug
global anno_popup_src_loc_ref
anno_popup_src_loc_ref = args.source_location
args.search_colors = parser.def_colors.copy()
if args.search_color:
for col in args.search_color:
val=None
b=None
try: (name,val) = col.split('=')
except: pass
if val is None: (name,val)=('equal', col)
try: (r,g,b) = val.split(',')
except: pass
if b is None: parser.exit("--search-color NAME=R,G,B: no two ',' found in '%s=%s'" % (name,val))
if name.upper() == 'ALL':
for c in list(args.search_colors.keys()):
args.search_colors[c] = [float(r),float(g),float(b)]
else:
name = name[0].upper()
args.search_colors[name] = [float(r),float(g),float(b)]
margins = parse_margins(args.margins, args.search_colors['B'])
## TEST this, fix or disable: they should work well together:
# if args.search and args.compare_text:
# parser.exit("Usage error: -s search and -c compare are mutually exclusive, try --help")
if args.compare_text is None and args.infile2 is not None:
args.compare_text,args.infile = args.infile,args.infile2
if args.search is None and args.compare_text is None and args.spell is None:
parser.exit("Oops. Nothing to do. Specify either -s or --spell or -c or two input files.")
if not os.access(args.infile, os.R_OK):
parser.exit("Cannot read input file: %s" % args.infile)
dom1 = pdf2xml(parser, args.infile, key=args.decrypt_key, firstpage=args.first_page, lastpage=args.last_page)
dom2 = None
wordlist2 = None
if args.compare_text:
if re.search('\.pdf$', args.compare_text, re.I):
dom2 = pdf2xml(parser, args.compare_text, key=args.decrypt_key, firstpage=args.first_page, lastpage=args.last_page)
first_page = args.first_page
if first_page is not None: first_page = int(first_page) - 1
last_page = args.last_page
if last_page is not None: last_page = int(last_page) - 1
wordlist2 = xml2wordlist(dom2, first_page, last_page, margins=margins)
elif re.search('\.xml$', args.compare_text, re.I):
wordlist2 = xmlfile2wordlist(args.compare_text)
else:
# assuming a plain text document
wordlist2 = textfile2wordlist(args.compare_text)
if debug:
dom1.write(args.output + ".1.xml")
if dom2:
dom2.write(args.output + ".2.xml")
PGF.init()
# This pygame.font module is used to calculate widths of all glyphs
# for words we need to mark. With this calculation, we can determine
# the exact position and length of the marks, if the marked word is
# only a substring (which it often is).
# For complete strings, we get the exact positions and size from pdftohtml -xml.
# Strings returned by pdftohtml are combinations of multiple PDF text fragments.
# This is good, as pdftohtml reassembles words and often complete lines in a perfectly
# readable way.
# The downside of this is, that the width and position calculation may be
# a bit off, due to uneven word-spacing or letter-spacing in the original PDF text line.
####
# f = PGF.Font(PGF.match_font('Times'), 13))
# f.metrics("Bernoulli")
# [(0, 8, 0, 9, 9), (0, 7, 0, 6, 6), (-1, 5, 0, 6, 4), (-1, 6, 0, 6, 6), (0, 7, 0, 6, 7), (0, 6, 0, 6, 6), (-1, 3, 0, 9, 3), (-1, 3, 0, 9, 3), (-1, 3, 0, 9, 3)]
# (minx, maxx, miny, maxy, advance)
input1 = PdfFileReader(file(args.infile, "rb"))
if input1.getIsEncrypted():
if input1.decrypt(args.decrypt_key):
if len(args.decrypt_key):
print("Decrypted using key='%s'." % args.decrypt_key)
else:
parser.exit("decrypt(key='%s') failed." % args.decrypt_key)
# last_page,first_page start counting at 0,
# args.last_page, args.first_page start counting at 1.
last_page = input1.getNumPages()-1
first_page = 0
if args.last_page and int(args.last_page) < last_page:
last_page = int(args.last_page)-1
if args.first_page:
first_page = int(args.first_page)-1
if first_page > last_page:
first_page = last_page
print("input pages: %d-%d" % (first_page+1, last_page+1))
page_marks = pdfhtml_xml_find(dom1, re_pattern=args.search,
wordlist=wordlist2,
nocase=args.nocase,
first_page=first_page,
last_page=last_page,
mark_ops=args.mark,
margins=margins,
strict=args.strict,
spell_check=args.spell,
move_similarity=0.75, # 0.75 implies 1 of 1, 2 of 2, 3 of 3, 3 of 4 identical.
move_minwords=1,
ext={'a': {'c':args.search_colors['A']},
'd': {'c':args.search_colors['D']},
'c': {'c':args.search_colors['C']},
'm': {'c':args.search_colors['M']},
'e': {'c':args.search_colors['E']} })
if args.log is not None:
lf = open(args.log, "w")
pprint(page_marks, stream=lf)
lf.close()
output = PdfFileWriter()
# Evil hack: there is no sane way to transport DocumentInfo metadata.
# : This is the insane way, we duplicate this code from
# : PdfFileWriter.__init__()
# FIXME: We should also copy the XMP metadata from the document.
try:
di = input1.getDocumentInfo()
# update ModDate, Creator, DiffCmd
selfcmd = " ".join(sys.argv) + ' # V' + __VERSION__ + ' ' + time.ctime()
if not "/Creator" in di:
di[Pdf.NameObject('/Creator')] = Pdf.createStringObject(selfcmd)
elif not '/Producer' in di:
di[Pdf.NameObject('/Producer')] = Pdf.createStringObject(selfcmd)
di[Pdf.NameObject('/DiffCmd')] = Pdf.createStringObject(selfcmd)
di[Pdf.NameObject('/ModDate')] = Pdf.createStringObject(time.strftime("D:%Y%m%d%H%M%S"))
if debug:
print("DocumentInfo():")
pprint(di)
output._objects.append(di)
except Exception as e:
print("WARNING: getDocumentInfo() failed: " + str(e) )
output._info = Pdf.IndirectObject(len(output._objects), 0, output)
pages_written = 0
total_hits = 0
outline = []
page_idx = 0
nav_bwd = None
for i in range(first_page,last_page+1):
if not nav_bwd is None: page_marks[i]['nav_bwd'] = nav_bwd
if len(page_marks[i]['rect']): nav_bwd = page_idx
page_idx += 1
nav_fwd = None
for i in range(last_page,first_page-1,-1):
page_idx -= 1
if not nav_fwd is None: page_marks[i]['nav_fwd'] = nav_fwd
if len(page_marks[i]['rect']): nav_fwd = page_idx
for i in range(first_page,last_page+1):
if args.exclude_irrelevant_pages and len(page_marks[i]['rect']) == 0:
continue
hitdetails = {'equ':0, 'add':0, 'del':0, 'chg':0, 'spl':0, 'mov':0 }
for r in page_marks[i]['rect']:
tag = r.get('t','unk')
if not tag in hitdetails:
hitdetails[tag] = 0
hitdetails[tag] += 1
total_hits += 1
hits_fmt = ''
for det,ch in (['add','+'], ['del','-'], ['chg','~'], ['equ','='], ['mov','>'], ['spl','!']):
if hitdetails[det]: hits_fmt += '%s%d' % (ch,hitdetails[det])
print(" page %d: %d hits %s" % (page_marks[i]['nr'], len(page_marks[i]['rect']), hits_fmt))
outline.append(" page %d: %d hits %s" % (page_marks[i]['nr'], len(page_marks[i]['rect']), hits_fmt))
# pprint(hitdetails)