Z_module.py
"""
Utility Functions for Multiple Modules
This script contains a collection of utility functions that are shared and
utilized by multiple modules within the project.
Grouping these functions in a single script enhances code modularity,
making it easier to maintain and update the functions from a central location.
Feel free to extend this script with additional utility functions as needed to
support various parts of the project.
"""
import json
import os
import pickle
import re
import sys
import docx
import mammoth # to convert docx to html
from docx.shared import Pt # for run.font.size = Pt(r.font.size.pt)
from tqdm import tqdm
class CompactJSONEncoder(json.JSONEncoder):
"""A JSON Encoder that puts small containers on single lines."""
CONTAINER_TYPES = (list, tuple, dict)
"""Container datatypes include primitives or other containers."""
MAX_WIDTH = 100
"""Maximum width of a container that might be put on a single line."""
MAX_ITEMS = 10
"""Maximum number of items in container that might be put on single line."""
def __init__(self, *args, **kwargs):
# using this class without indentation is pointless
if kwargs.get("indent") is None:
kwargs["indent"] = 4
super().__init__(*args, **kwargs)
self.indentation_level = 0
def encode(self, o):
"""Encode JSON object *o* with respect to single line lists."""
if isinstance(o, (list, tuple)):
return self._encode_list(o)
if isinstance(o, dict):
return self._encode_object(o)
        if isinstance(o, float):  # general format ("g"): plain or scientific, whichever is shorter
            return format(o, "g")
return json.dumps(
o,
skipkeys=self.skipkeys,
ensure_ascii=self.ensure_ascii,
check_circular=self.check_circular,
allow_nan=self.allow_nan,
sort_keys=self.sort_keys,
indent=self.indent,
separators=(self.item_separator, self.key_separator),
default=self.default if hasattr(self, "default") else None,
)
def _encode_list(self, o):
if self._put_on_single_line(o):
return "[" + ", ".join(self.encode(el) for el in o) + "]"
self.indentation_level += 1
output = [self.indent_str + self.encode(el) for el in o]
self.indentation_level -= 1
return "[\n" + ",\n".join(output) + "\n" + self.indent_str + "]"
def _encode_object(self, o):
if not o:
return "{}"
if self._put_on_single_line(o):
return (
"{ "
+ ", ".join(
f"{self.encode(k)}: {self.encode(el)}" for k, el in o.items()
)
+ " }"
)
self.indentation_level += 1
output = [
f"{self.indent_str}{json.dumps(k)}: {self.encode(v)}" for k, v in o.items()
]
self.indentation_level -= 1
return "{\n" + ",\n".join(output) + "\n" + self.indent_str + "}"
def iterencode(self, o, **kwargs):
"""Required to also work with `json.dump`."""
return self.encode(o)
def _put_on_single_line(self, o):
return (
self._primitives_only(o)
and len(o) <= self.MAX_ITEMS
and len(str(o)) - 2 <= self.MAX_WIDTH
)
def _primitives_only(self, o: list | tuple | dict):
if isinstance(o, (list, tuple)):
return not any(isinstance(el, self.CONTAINER_TYPES) for el in o)
elif isinstance(o, dict):
return not any(isinstance(el, self.CONTAINER_TYPES) for el in o.values())
@property
def indent_str(self) -> str:
if isinstance(self.indent, int):
return " " * (self.indentation_level * self.indent)
elif isinstance(self.indent, str):
return self.indentation_level * self.indent
else:
raise ValueError(
f"indent must either be of type int or str (is: {type(self.indent)})"
)
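# Illustrative usage of CompactJSONEncoder (a minimal sketch; the _demo_compact_json
# helper is hypothetical and not part of the project). Small containers of primitives
# stay on one line, anything larger is wrapped and indented.
def _demo_compact_json():
    data = {
        "phrase": "blow in",
        "senses": [1, 2, 3],
        "details": {"source": "files/clean-output.docx", "parsed": True},
    }
    # json.dumps accepts the encoder via `cls`; indent defaults to 4 in __init__
    print(json.dumps(data, cls=CompactJSONEncoder))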
def runtype(ri, run, mode=""):
"""
Function Description:
The runtype function examines the runs produced by python-docx and generates an alternative
(a meaning) for each run based on its text style properties.
It returns the alternative as a string.
Function Parameters:
- ri (int): Stands for run index. It represents the index of the run being examined.
    - run (docx.text.run.Run): The actual run object (a styled span of text) extracted
      from 'clean-output.docx'.
    - mode (str, optional): When set to "MUL", ['the', 'a', 'an'] is treated as an article
      regardless of its index in the entry. This is primarily used to fix certain cases
      where the (alt, run) pair is not handled correctly. By default, mode is an empty string.
Function Returns:
- alternative (str): A string representing the alternative meaning or category of the run
based on its text style properties.
    The runtype function takes a run and analyzes its text style properties to determine its
    meaning. It checks for various style properties, such as boldness, font name, and font size,
    to categorize the run into different alternatives.
    The mode parameter allows for special handling of certain cases where the (alt, run) pair
    needs to be adjusted.
    Note:
    the mode parameter affects the following 94 entries
[[634, 642], [701, 706], [1695, 1698], [4872, 4886], [4978, 4981], [5209, 5213], [5239, 5246],
[5323, 5325], [6023, 6028], [6394, 6400], [6556, 6561], [8503, 8508], [8670, 8679],
[14612, 14619], [14620, 14628], [15271, 15274], [15586, 15591], [16272, 16275],
[16399, 16404], [17238, 17248], [17267, 17269], [17270, 17273], [20502, 20508], [20845, 20849],
[21608, 21618], [22796, 22802], [22860, 22864], [23392, 23395], [23883, 23888], [23981, 23990],
[26601, 26606], [31062, 31068], [31263, 31266], [31442, 31448], [31549, 31556], [31799, 31803],
[32188, 32192], [32607, 32610], [32631, 32634], [34303, 34305], [34598, 34604], [34623, 34628],
[36078, 36081], [36082, 36087], [36869, 36873], [37143, 37148], [37627, 37631], [44409, 44419],
[44424, 44428], [47005, 47012], [47042, 47044], [47345, 47349], [47540, 47548], [47575, 47583],
[48171, 48174], [48305, 48308], [48318, 48321], [48322, 48328], [50327, 50332], [50664, 50673],
[50808, 50812], [51641, 51647], [52986, 52989], [55055, 55059], [56938, 56942], [57356, 57360],
[58966, 58974], [64653, 64658], [64659, 64663], [65400, 65402], [65408, 65412], [66320, 66323],
[66388, 66391], [68832, 68835], [71819, 71825], [72045, 72048], [72390, 72397], [72613, 72618],
[73210, 73214], [73849, 73856], [75139, 75141], [76109, 76111], [78661, 78668], [79184, 79187],
[79952, 79957], [80210, 80214], [81854, 81859], [81904, 81907], [82682, 82688], [83441, 83446],
[83984, 83986], [84390, 84394], [87046, 87049], [87970, 87978]]
    13 of those 94 entries are actually ['the', 'a', 'an'] at the very end of the line,
    and in fact the beginning of a definition - this will be fixed later
"""
terms = [
"Cliché",
"Euph.",
"Fig.",
"Inf.",
"Lit.",
"Prov.",
"Rur.",
"Sl.",
"Euph",
"Fig",
"Inf",
"Lit",
"Prov",
"Rur",
"Sl",
"Prov. Cliché",
"Fig. Cliché",
"Fig. Inf.",
"Fig. Sl.",
"Fig. Euph.",
"Cliché Fig.",
"Euph. Fig.",
"Inf. Fig.",
"Lit. Fig.",
"Inf. Lit.",
]
if (
run.text.strip() == "*"
and run.bold
and run.font.name == "Formata-Medium"
and run.font.size.pt == float("8.5")
):
return "asterisk"
elif (
(ri in (0, 1))
and run.text.strip().lower() in ["the", "a", "an"]
and run.font.name == "Minion-Regular"
and run.font.size.pt == float("8.5")
) or (
mode == "MUL"
and run.text.strip().lower() in ["the", "a", "an"]
and run.font.name == "Minion-Regular"
and run.font.size.pt == float("8.5")
):
return "article"
elif run.bold and re.match(r"[1-9]\.", run.text.strip()):
return "new_sense"
elif (
run.bold
and run.font.name == "Formata-Medium"
and run.font.size.pt == float("8.5")
):
return "constant"
elif run.font.name == "Formata-Condensed" and run.font.size.pt == float("9.0"):
return "variable"
elif (
(run.text.strip() in terms)
and run.italic
and (run.font.name == "Minion-Italic")
and (run.font.size.pt == float("8.5"))
):
return "term"
elif (
run.text.strip() == "and"
and run.font.name == "Minion-Regular"
and int(run.font.size.pt) == 7
):
return "and"
elif (run.font.name == "Minion-Regular" and run.font.size.pt == float("8.5")) or (
run.font.name == "Formata-Regular" and run.font.size.pt == float("8.0")
):
return "definition"
elif (
(run.text.strip() == "_")
and (run.font.name == "MathematicalPi-Six")
and (run.font.size.pt == float("8.5"))
):
return "underscore"
elif (
(run.text.strip() not in terms)
and run.italic
and (run.font.name == "Minion-Italic")
and (run.font.size.pt == float("8.5"))
):
return "example"
elif (
run.bold
and run.font.name == "Formata-Medium"
and run.font.size.pt == float("4.5")
and run.text.strip() == "†"
):
return "dagger"
else:
return "None"
def cleanup(line_alt_list, line_runs_list):
"""
Function Description:
The cleanup function examines the line_alt_list and line_runs_list produced by python-docx
and runtype(), and it removes any items that should not be part of an entry head.
It returns modified line_alt_list and line_runs_list that represent an entry head.
Function Parameters:
    - line_alt_list (list): A list of alternative meanings (as returned by runtype) for the entry's runs.
    - line_runs_list (list): A list of the run text strings for an entry.
Function Returns:
- line_alt_list (list): A modified list of alternative meanings representing an entry head.
- line_runs_list (list): A modified list of runs representing an entry head.
The cleanup function is designed to process and refine the provided lists to isolate the
entry head from other content in a document.
It performs several checks to remove specific elements that are not part of the
entry head and returns the cleaned lists.
"""
try:
def_index = line_alt_list.index("definition")
line_alt_list = line_alt_list[:def_index]
line_runs_list = line_runs_list[:def_index]
    except ValueError:
        # only two entries have no definition, ignore them for now
        pass
# remove all items in both lists from the 1st 'term' and beyond
try:
term_index = line_alt_list.index("term")
line_alt_list = line_alt_list[:term_index]
line_runs_list = line_runs_list[:term_index]
    except ValueError:
        # not all entries have a term
        pass
# remove all items in both lists from the 1st 'example' and beyond
try:
example_index = line_alt_list.index("example")
line_alt_list = line_alt_list[:example_index]
line_runs_list = line_runs_list[:example_index]
    except ValueError:
        pass
# remove '1.' if it was picked up at the very end of entry head - 1030 match
if line_alt_list[-1] == "constant" and line_runs_list[-1].strip() == "1.":
line_alt_list.pop(-1)
line_runs_list.pop(-1)
# remove 'article' if it was picked up at the very end of entry head - 13 match
# (all from multiple phrase entries)
if line_alt_list[-1] == "article" and line_runs_list[-1].strip().lower() in [
"the",
"a",
"an",
]:
line_alt_list.pop(-1)
line_runs_list.pop(-1)
return line_alt_list, line_runs_list
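# Hedged example of what cleanup() does to a hand-made entry (the values below are
# hypothetical and chosen only to show the trimming): everything from the first
# 'definition' onward is dropped, then a trailing '1.' sense marker is removed.
def _demo_cleanup():
    alts = ["constant", "variable", "constant", "definition"]
    runs = ["blow someone", "over", "1.", "to knock someone down"]
    head_alts, head_runs = cleanup(alts, runs)
    # head_alts == ["constant", "variable"]
    # head_runs == ["blow someone", "over"]
    return head_alts, head_runs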
def copy_docx(ranges_list, output_file):
"""
    A function to copy lines (within a given list of ranges) from clean-output.docx into
    files/<output_file>.docx.
    Made this a function as I needed to use this code multiple times.
"""
# open up clean-output.docx
doc = docx.Document("files/clean-output.docx")
doc_lines = doc.paragraphs
print(f"creating {output_file}.docx")
new_doc = docx.Document() # create a new document
for s, e in tqdm(ranges_list):
# add range
paragraph_1 = new_doc.add_paragraph() # initiate blank line
paragraph_1.paragraph_format.space_before = Pt(1)
paragraph_1.paragraph_format.space_after = Pt(1)
add_range = paragraph_1.add_run(f"[{s}, {e}]")
add_range.font.name = "Calibri"
add_range.font.size = docx.shared.Pt(10)
# add the actual entry lines
for line in range(s, e + 1):
            paragraph_2 = new_doc.add_paragraph()  # add an empty paragraph/line to new_doc
paragraph_2.paragraph_format.space_before = Pt(1)
paragraph_2.paragraph_format.space_after = Pt(1)
# break down the line I want to copy from 'clean-output.docx' into runs
doc_runs = doc_lines[line].runs
# copy runs from 'clean-output.docx' to new_doc and maintain the same style
for i, r in enumerate(doc_runs):
                # add each run to the new paragraph/line in new_doc
run = paragraph_2.add_run(r.text)
# apply original run style
if r.bold:
run.bold = True
if r.italic:
run.italic = True
run.font.name = r.font.name
run.font.size = Pt(r.font.size.pt)
# add a break after each entry
paragraph_3 = new_doc.add_paragraph()
paragraph_3.paragraph_format.space_before = Pt(1)
paragraph_3.paragraph_format.space_after = Pt(1)
break_line = paragraph_3.add_run(f'{"*" * 36}')
break_line.font.name = "Courier New"
break_line.font.size = docx.shared.Pt(10)
new_doc.save(f"files/{output_file}.docx") # save the docx file locally
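# Hedged usage sketch for copy_docx(); the line ranges are made up for illustration
# (real ranges come from the entry lists built elsewhere in the project) and the
# _demo_copy_docx name is hypothetical. Requires files/clean-output.docx to exist.
def _demo_copy_docx():
    sample_ranges = [(5, 10), (42, 45)]  # hypothetical (start, end) paragraph ranges
    copy_docx(sample_ranges, "sample-entries")  # writes files/sample-entries.docx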
def correct_sense_numbering(entry_html):
"""Corrects sense/definition numbering in a dictionary entry HTML string.
Args:
entry_html (str): The HTML string representing the dictionary entry.
Returns:
str: HTML string with corrected sense numbering and double spaces replaced by single spaces.
Example:
>>> entry_body_html = '''<strong>3. </strong>[for something] to</p><p>cave in to the
pressure of moving air. _ <em> The door blew in</em></p><p><em>during the storm. </em>_
<em>The window blew in from the wind.</em></p>'''
>>> corrected_html = correct_sense_numbering(entry_body_html)
>>> print(corrected_html)
        [for something] to</p><p>cave in to the pressure of moving air. _ <em> The door
        blew in</em></p><p><em>during the storm. </em>_ <em>The window blew in from
        the wind.</em></p>
"""
# Define pattern to match senses
sense_pattern = r"<strong>(\d+)\. </strong>"
# Find all matches of sense pattern
matches = re.findall(sense_pattern, entry_html)
# If no matches found, return original HTML
if not matches:
        return entry_html.replace("  ", " ")
# If only one sense remains, remove sense numbering altogether
if len(matches) == 1:
        return re.sub(sense_pattern, "", entry_html).replace("  ", " ")
# Iterate over matches and build a mapping of old sense numbers to new sense numbers
sense_mapping = {}
new_sense_number = 1
for old_sense_number in sorted(set(matches), key=int):
sense_mapping[old_sense_number] = str(new_sense_number)
new_sense_number += 1
# Replace old sense numbers with new sense numbers in HTML
corrected_html = entry_html
for old_sense, new_sense in sense_mapping.items():
corrected_html = re.sub(
f"<strong>{old_sense}\. </strong>",
f"<strong>{new_sense}. </strong>",
corrected_html,
)
    return corrected_html.replace("  ", " ")
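# One more illustrative case for correct_sense_numbering() (the HTML snippet is
# hypothetical): after a "Go to ..." sense has been stripped, the surviving senses
# 2. and 3. are renumbered to 1. and 2.
def _demo_correct_sense_numbering():
    html = (
        "<p><strong>2. </strong>to arrive casually.</p>"
        "<p><strong>3. </strong>to collapse inward.</p>"
    )
    # returns '<p><strong>1. </strong>to arrive casually.</p>'
    #         '<p><strong>2. </strong>to collapse inward.</p>'
    return correct_sense_numbering(html)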
def parse_entry(entry_range, cut_off, multiple_phrases=False, line_runs=[]):
"""
Function Description:
The parse_entry function extracts and processes an entry head and body from the
"clean-output.docx" document, returning the phrase text, phrase HTML, and definition HTML
for the entry.
Function Parameters:
- entry_range (tuple): A tuple (s, e) representing the range of lines to be copied from
"clean-output.docx." The function will extract content from line s to line e.
- cut_off (int): The index that marks the end of the entry head.
All content before this index is considered part of the entry head, and content after
it is part of the entry body.
- multiple_phrases (bool, optional): A flag indicating whether the entry head contains
multiple phrases. If True, the function will extract and process each phrase separately.
Default value is False.
    - line_runs (list, optional): A list with one item per phrase, where each item is a list
      of run text strings for that phrase.
      This is only used when multiple_phrases is True to extract individual phrases.
Function Usage:
1. The function checks whether the entry with the specified entry_range has been processed
before by loading entries from a pickled file called "entry_details.pickle."
If the entry has been processed previously, it returns the cached results.
2. If the entry is not found in the cache, it opens the "clean-output.docx" document and
extracts its content.
3. The entry head content is saved to "entry_head.docx," and the entry body content is
saved to "entry_body.docx."
4. If multiple_phrases is True, the entry head is divided into individual phrases,
and both phrase text and HTML are generated.
5. For each phrase, temporary docx files ("phrase_0.docx," "phrase_1.docx," etc.) are created
and processed, and the results are cached.
    6. HTML for the entry body is generated and post-processed: line breaks are inserted before
    senses and examples, "(See also ...)" and "Go to ..." references are removed, and sense
    numbering is corrected (the term-removal step is currently commented out).
7. The temporarily created docx files are deleted, and the results are pickled and cached.
8. The function returns phrase text, phrase HTML, and entry body HTML.
Example Usage:
```
entry_range = (5, 10) # Specify the range of lines to extract
cut_off = 7 # Index marking the end of the entry head
multiple_phrases = True # If entry head has multiple phrases
line_runs = [...] # List of runs representing lines (only needed when multiple_phrases is True)
phrase_text, phrase_html, entry_body_html = parse_entry(
        entry_range, cut_off, multiple_phrases, line_runs
)
```
"""
#######################################################
# check to see if this entry has been processed before
if not os.path.exists("files/entry_details.pickle"):
# create the file if this was the 1st script run
entry_details = []
with open("files/entry_details.pickle", "wb") as file:
pickle.dump(entry_details, file)
else:
# load the file
with open("files/entry_details.pickle", "rb") as file:
entry_details = pickle.load(file)
for entry in entry_details:
if entry[0] == entry_range:
return entry[1], entry[2], entry[3]
#######################################################
# open up clean-output.docx
doc = docx.Document("files/clean-output.docx")
lines = doc.paragraphs
# create a temp docx document for entry head
tmp_doc_1 = docx.Document()
runs_count = 0 # counter
# let's copy the entry head from clean-output.docx and save it to entry_head.docx
for i in range(entry_range[0], entry_range[1] + 1):
p = tmp_doc_1.add_paragraph() # add an empty paragraph/line to tmp_doc
# break down the line I want to copy from 'clean-output.docx' into runs
doc_runs = lines[i].runs
for r in doc_runs:
# check if I should add run or not
if runs_count < cut_off:
# add run to the new paragraph/line in tmp_doc
run = p.add_run(r.text)
# apply original run style
if r.bold:
run.bold = True
if r.italic:
run.italic = True
run.font.name = r.font.name
run.font.size = Pt(r.font.size.pt)
# increment runs_count
runs_count += 1
else:
break # break from the inner loop
else:
# Continue if the inner loop wasn't broken.
continue
# Inner loop was broken, break the outer.
break
# save the docx file locally
tmp_doc_1.save("entry_head.docx")
if multiple_phrases:
## step #1
# create phrase_text - a list of strings, each is an individual phrase in the entry head
#######################
phrase_text = []
for lr in line_runs:
pt = " ".join(lr)
pt1 = pt.replace("1.", "").replace("* ", "*")
phrase_text.append(pt1.strip())
## step #2
# create phrase_html - a list of strings, each is an HTML code block that represents
# a phrase in the entry head
#######################
phrase_html = []
tmp_docx_files = []
# open up entry_head.docx
eh_doc = docx.Document("entry_head.docx")
eh_lines = eh_doc.paragraphs
# count how many runs are there in entry_head.docx & add all entry head runs to a list
total_runs_count = 0
entry_head_runs = []
for l in eh_lines:
eh_runs = l.runs
for r in eh_runs:
total_runs_count += 1
entry_head_runs.append(r)
# add a single space ' ' after each line
total_runs_count += 1
entry_head_runs.append(" ")
# read entry head and save each individual phrase to a new docx file
run_counter = 0
done = [] # runs that I've captured already
        for i in range(20):  # assumes an entry head never holds more than 20 phrases
# create a tmp docx file to host a single phrase
sp_doc = docx.Document() # single phrase document
# add an empty paragraph/line to tmp_doc
sp_p = sp_doc.add_paragraph()
# loop over the full entry head and save phrases to tmp_doc one by one
for c, r in enumerate(entry_head_runs):
if c == run_counter and r == " ":
sp_p.add_run(r)
run_counter += 1
done.append((c, r, " "))
elif c == run_counter and runtype(c, r) != "and" and ";" not in r.text:
# add run to the new paragraph/line in tmp_doc
tmp_run = sp_p.add_run(r.text)
# apply original run style
if r.bold:
tmp_run.bold = True
if r.italic:
tmp_run.italic = True
tmp_run.font.name = r.font.name
tmp_run.font.size = Pt(r.font.size.pt)
run_counter += 1 # increment counter
done.append((c, r, r.text))
elif c == run_counter and ";" in r.text:
all_parts = r.text.split(";")
# in case r.text.startswith(';') or r.text.endswith(';') it will add an empty string to all_parts - fix it
# if len(all_parts[-1]) == 0: all_parts.pop(-1)
if len(all_parts[-1]) == 0:
all_parts[-1] = ";"
# if len(all_parts[0]) == 0: all_parts.pop(0)
if len(all_parts[0]) == 0:
all_parts[0] = ";"
# select the part that is in line
for pc, p in enumerate(all_parts, start=1):
if p not in [x[2] for x in done if x[0] == c]:
run_text = p
break
# add part to the new paragraph/line in tmp_doc
tmp_run = sp_p.add_run(run_text)
# apply original run style
if r.bold:
tmp_run.bold = True
if r.italic:
tmp_run.italic = True
tmp_run.font.name = r.font.name
tmp_run.font.size = Pt(r.font.size.pt)
done.append((c, r, run_text))
# only increment run_counter if we have been through all run parts
if run_text == all_parts[-1] and pc == len(all_parts):
run_counter += 1
continue
else:
break
elif c == run_counter and (
runtype(c, r) == "and" or r.text.strip() == ";"
):
run_counter += 1
done.append((c, r, r.text))
break
# save the docx file locally
sp_doc.save(f"phrase_{i}.docx")
tmp_docx_files.append(f"phrase_{i}.docx")
# check if we have reached the end of the file
if run_counter == total_runs_count:
break
# convert all docx files to HTML code and save it to [phrase_html]
for f in tmp_docx_files:
with open(f, "rb") as docx_file:
result = mammoth.convert_to_html(docx_file)
entry_head_html = result.value # The generated HTML
# clean up the HTML output
phrase_html.append(
entry_head_html.replace("1.", "")
.replace(" </p>", "</p>")
.replace(" </strong></p>", "</strong></p>")
.replace("</p><p>", " ")
.replace("<strong>;</strong>", "")
.replace("</em> <em>", " ")
.replace("</em><em>", "")
)
# delete the file as it's no longer needed
try:
os.remove(f)
            except OSError:
                pass
# check to make sure everything is working as it should be
if len(phrase_text) != len(phrase_html):
print("*" * 25)
print("something went wrong".upper())
print("entry_range")
print(entry_range)
print("line_runs")
print(line_runs)
print("phrase_text")
print(phrase_text)
print("len(phrase_text)")
print(len(phrase_text))
print("phrase_html")
print(phrase_html)
print("len(phrase_html)")
print(len(phrase_html))
print("*" * 25)
sys.exit()
else:
# grab phrase text from tmp_doc
tmp_a_phrase_text = []
for l in tmp_doc_1.paragraphs:
tmp_a_phrase_text.append(l.text.strip())
tmp_b_phrase_text = " ".join(tmp_a_phrase_text)
tmp_c_phrase_text = tmp_b_phrase_text.replace("1.", "")
phrase_text = tmp_c_phrase_text.strip()
# get HTML of the entry head
with open("entry_head.docx", "rb") as docx_file:
result = mammoth.convert_to_html(docx_file)
entry_head_html = result.value # The generated HTML
# clean up the HTML output
phrase_html = (
entry_head_html.replace("1.", "")
.replace(" </p>", "</p>")
.replace(" </strong></p>", "</strong></p>")
.replace("</p><p>", " ")
.replace("</em> <em>", " ")
.replace("</em><em>", "")
)
# create a temp docx document for entry body
tmp_doc_2 = docx.Document()
runs_count = 0 # reset counter
for i in range(entry_range[0], entry_range[1] + 1):
p = tmp_doc_2.add_paragraph() # add an empty paragraph/line to tmp_doc
# break down the line I want to copy from 'clean-output.docx' into runs
doc_runs = lines[i].runs
for r in doc_runs:
# check if I should add run or not
if runs_count >= cut_off:
# add run to the new paragraph/line in tmp_doc
run = p.add_run(r.text)
# apply original run style
if r.bold:
run.bold = True
if r.italic:
run.italic = True
run.font.name = r.font.name
run.font.size = Pt(r.font.size.pt)
# increment runs_count
runs_count += 1
# save the docx file locally
tmp_doc_2.save("entry_body.docx")
# generate HTML for entry body
with open("entry_body.docx", "rb") as docx_file:
result = mammoth.convert_to_html(docx_file)
generated_html = result.value # The generated HTML
# remove terms from definitions
# terms = [
# "Cliché",
# "Euph.",
# "Fig.",
# "Inf.",
# "Lit.",
# "Prov.",
# "Rur.",
# "Sl.",
# "Euph",
# "Fig",
# "Inf",
# "Lit",
# "Prov",
# "Rur",
# "Sl",
# "Prov. Cliché",
# "Fig. Cliché",
# "Fig. Inf.",
# "Fig. Sl.",
# "Fig. Euph.",
# "Cliché Fig.",
# "Euph. Fig.",
# "Inf. Fig.",
# "Lit. Fig.",
# "Inf. Lit.",
# ]
# for term in terms:
# generated_html = generated_html.replace(
# f"{term} ", ""
    # ) # remove terms and trailing whitespace
# generated_html = generated_html.replace(term, "")
# let's make some modifications to the HTML
# https://stackoverflow.com/questions/59072514/efficiently-make-many-multiple-substitutions-in-a-string/59072515#59072515
replace = {
"</p><p>": " ",
"</em> <em>": " ",
"</em><em>": "",
"_": "<br>_",
"<strong>2. </strong>": "<br><strong>2. </strong>",
"<strong>3. </strong>": "<br><strong>3. </strong>",
"<strong>4. </strong>": "<br><strong>4. </strong>",
"<strong>5. </strong>": "<br><strong>5. </strong>",
"<strong>6. </strong>": "<br><strong>6. </strong>",
"<strong>7. </strong>": "<br><strong>7. </strong>",
"<strong>8. </strong>": "<br><strong>8. </strong>",
"<strong>9. </strong>": "<br><strong>9. </strong>",
"<strong>10. </strong>": "<br><strong>10. </strong>",
}
rep = dict((re.escape(k), v) for k, v in replace.items())
pattern = re.compile("|".join(rep.keys()))
entry_body_html = pattern.sub(lambda m: rep[re.escape(m.group(0))], generated_html)
if "<strong>2." in entry_body_html and "<strong>1." not in entry_body_html:
# add "<strong>1. </strong>" to the beginning of the string
entry_body_html = entry_body_html.replace(
"<p>", "<p><strong>1. </strong>", 1
) # only replace the 1st occurrence
# remove all (See also)
entry_body_html = re.sub(
r"\(See also.+?\)\.\)|\(See also.+?\)", "", entry_body_html
)
# remove entry sense/definition if it was only reference to another entry
# e.g. "<strong>1. </strong>Go to blow in (from some place)."
# and correct definition numbering if needed
goto_pattern = r"<strong>[1-9]\. <\/strong>Go to.+?\."
entry_body_html = re.sub(goto_pattern, "", entry_body_html)
entry_body_html = correct_sense_numbering(entry_body_html)
# delete the temporarily created docx files
try:
os.remove("entry_head.docx")
os.remove("entry_body.docx")
    except OSError:
        pass
# pickle results
results = [entry_range, phrase_text, phrase_html, entry_body_html]
entry_details.append(results)
with open("files/entry_details.pickle", "wb") as file:
pickle.dump(entry_details, file)
# return results
return phrase_text, phrase_html, entry_body_html
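# Hedged sketch of how the entry_details.pickle cache written by parse_entry() can be
# inspected; the _demo_read_cache name and the default entry_range are hypothetical.
def _demo_read_cache(entry_range=(5, 10)):
    if not os.path.exists("files/entry_details.pickle"):
        return None
    with open("files/entry_details.pickle", "rb") as file:
        entry_details = pickle.load(file)
    for cached_range, phrase_text, phrase_html, entry_body_html in entry_details:
        if cached_range == entry_range:
            return phrase_text, phrase_html, entry_body_html
    return None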
if __name__ == "__main__":
pass