-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathA_breakitup.py
397 lines (346 loc) · 14.1 KB
/
A_breakitup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
""" Break the text in clean-output.docx into individual dictionary entries
Description:
This script goes through clean-output.docx and breaks it into individual dictionary entries,
and gets the range (start line, end line) of each entry - non-valid entries are ignored.
Example of a valid dictionary entry:
the ABCs of something Fig. the basic facts or principles of something.
_ I have never mastered the ABCs of car maintenance.
non-valid entries are there to reference a previous/next entry or some other entry.
e.g. "able to breathe (freely) again Go to previous."
Input:
- clean-output.docx: The input DOCX file containing data to process.
Output:
- ranges.pickle: The main output file containing ranges for all valid entries as a list of tuples.
- clean_entries.txt (optional): A text file containing valid entries.
- clean_entries.docx (optional): A DOCX file containing valid entries.
- deleted_entries.txt (optional): A text file containing non-valid entries.
Thought Process:
- we loop through the lines of the docx file, and look for the range of each entry.
- the goal here is to recognize individual dictionary entries and know where each entry
exists in the book.
- in other words, we should be able to tell that entry #1 goes from line 0 through 3,
entry #2 goes from line 4 through 6, etc. (both ends of a range are inclusive).
- the ranges will be used later to parse data from each entry.
Runtime:
- Creating ranges.pickle: Completed in 27 seconds.
- Creating deleted_entries.txt: Completed in 0 seconds (optional).
- Creating clean_entries.txt: Completed in 2 minutes and 46 seconds (optional).
- Creating clean_entries.docx: Completed in 12 minutes and 8 seconds (optional).
Usage:
Please run this script from the command line (CMD)
Example:
python A_breakitup.py
"""
import pickle
import docx
from tqdm import tqdm
from Z_module import copy_docx, runtype
def typically(paragraph=None):
    """
    returns True if the beginning of the line looks like this:
    'be ~; go ~; run ~; turn ~.) _ When did the'

    The first run must be bold 'Formata-Medium', and the second run must be
    either a bold 'Minion-Black' "~" or a 'Minion-Regular' ".)".

    Args:
        paragraph: object exposing ``.runs`` (a sequence of runs with
            ``.bold``, ``.text`` and ``.font.name``). When omitted, the
            module-level loop variable ``line`` is inspected, which is how
            the main loop of this script calls the function.

    Returns:
        bool: True when the paragraph starts with the pattern above; False
        otherwise, including when the paragraph has fewer than two runs.
    """
    if paragraph is None:
        # backward-compatible default: read the paragraph the main loop is on
        paragraph = line
    runs = paragraph.runs
    try:
        first, second = runs[0], runs[1]
    except IndexError:
        # fewer than two runs: the pattern cannot be present
        return False
    if not (first.bold and first.font.name == "Formata-Medium"):
        return False
    marker = second.text.strip()
    if second.bold and second.font.name == "Minion-Black" and marker == "~":
        return True
    return second.font.name == "Minion-Regular" and marker == ".)"
# step 1: open up the document
doc = docx.Document("files/clean-output.docx")
lines = doc.paragraphs  # every paragraph of the docx is treated as one "line" below

# step 2: determine where each entry begins and ends
#
# how to tell if this is the beginning of an entry
# 1: line begins with a bold text
# 2: line begins with 'a' 'an' 'the', and then bold text
articles = [
    "a",
    "an",
    "the",
]  # all possible articles with 'Minion-Regular' font that precede a constant
runs0vairable = [
    "one’s",
    "someone",
    "someone’s",
    "something",
    "someone or something",
    "somewhere",
    "some creature’s",
    "do something",
]  # all possible variables in runs[0] with 'Formata-Condensed' font that precede a constant
# definition numbers come in bold font; a line starting with one is not a new entry
definition_numbers = ("1.", "2.", "3.", "4.", "5.", "6.", "7.", "8.", "9.", "10.")
beginning = []  # every item is a line number in clean-output.docx that marks a new entry
# line numbers to ignore when looking for entry beginnings and endings.
# A set: it is probed once per paragraph, and the same line number may be added repeatedly.
ignore = set()
ranges = []  # a list of tuples (beginning line number, ending line number)

for line_number, line in enumerate(lines):
    # ignore empty lines
    if not line.text:
        continue
    # keep in mind: if there is a single empty line in the docx document,
    # it would mess up the code and raise an IndexError: list index out of range
    # because of the 'prev_runs[-1]' part, as the empty line won't have any runs.
    #
    # Read each `.runs` once per paragraph instead of re-evaluating it in every clause.
    runs = line.runs
    # NOTE: for line_number 0 the index is -1, so "previous" is the LAST paragraph
    # of the document (kept as in the original logic).
    previous = lines[line_number - 1]
    prev_runs = previous.runs
    if (
        # ignore the line if it begins with a constant, and the previous line ends with a variable
        not (
            runs[0].bold
            and runs[0].font.name == "Formata-Medium"
            and prev_runs[-1].font.name == "Formata-Condensed"
            and prev_runs[-1].font.size.pt == 9.0
        )
        and
        # ignore the line if the previous line ends with 'Usually'
        # this picks up entries with a long phrase
        (not prev_runs[-1].text.strip().endswith("Usually"))
        and
        # ignore the line if the previous line ends with a constant
        # this picks up entries with a long phrase
        (not (prev_runs[-1].bold and prev_runs[-1].font.name == "Formata-Medium"))
        and
        # ignore the line if the previous line ends with ';'
        (not previous.text.endswith(";"))
        and
        # ignore the line if the previous line had '*Typically:' and this line has '~;'
        not ("*Typically:" in previous.text and "~;" in line.text)
        and
        # ignore lines in [ignore]
        (line_number not in ignore)
        and
        # sometimes the beginning of the line is just a definition.
        # definition numbers come in bold font. ignore those lines
        (runs[0].text.strip() not in definition_numbers)
        and
        # ignore the line if it begins like this: ''be ~; go ~; run ~; turn ~.) _ When did the''
        # (typically() inspects the module-level loop variable `line`)
        (not typically())
        and
        # ignore the line if the previous line ends with a bold number for a new definition
        # and the word "and" (e.g 3. and)
        not (
            # run index is not a factor here, added 100
            runtype(100, prev_runs[-1]) == "and"
            and prev_runs[-2].bold
        )
        and (
            # if the 1st run in the line is bold and has 'Formata-Medium'
            # then it's probably a beginning of a phrase
            (runs[0].bold and runs[0].font.name == "Formata-Medium")
            or
            # if the 1st run is 'a','an', or 'the' and the 2nd run is bold
            (
                (runs[0].text.strip().lower() in articles)
                and runs[0].font.name == "Minion-Regular"
                and runs[1].bold
                and runs[1].font.name == "Formata-Medium"
            )
            or
            # if the 1st run is in runs0vairable and the 2nd run is bold
            # (and the 2nd run does not start with ')' )
            (
                runs[0].text.strip().lower() in runs0vairable
                and runs[0].font.name == "Formata-Condensed"
                and runs[1].bold
                and runs[1].font.name == "Formata-Medium"
                and (not runs[1].text.startswith(")"))
            )
            or
            # special case for a single entry: 'the someone or something from hell'
            (
                runs[0].text.strip() == "the"
                and runs[0].font.name == "Minion-Regular"
                and runs[1].text.strip() == "someone or something"
                and runs[1].font.name == "Formata-Condensed"
                and runs[2].bold
                and runs[2].font.name == "Formata-Medium"
                and runs[2].text.strip() == "from hell"
            )
        )
    ):
        beginning.append(line_number)

    # let's take a closer look at the current line - break it down into runs.
    # In the context of the python-docx library, "runs" refer to a sequence of characters
    # within a paragraph in a Microsoft Word document that share the same set of
    # character-level formatting properties.
    # Runs are used to represent portions of text with consistent formatting,
    # such as font style, size, color, and other character-level attributes.
    for run in runs:
        # sometimes an entry could contain two phrases, and there is an 'and' separating them.
        # this could lead to the next line having its 1st run as bold.
        # in this case the next line should not be considered as a new entry,
        # but rather a continuation of the same entry
        if (
            run.text.strip() == "and"
            and run.font.name == "Minion-Regular"
            and int(run.font.size.pt) == 7
        ):
            # ignore next line
            ignore.add(line_number + 1)

# create ranges for each entry: an entry runs from its own beginning up to the line
# right before the next beginning (both ends inclusive).
# NOTE: the final entry in the document won't be included, because there is no next beginning,
# and we can't calculate range this way
ranges.extend(
    (start, following - 1) for start, following in zip(beginning, beginning[1:])
)
# step 3: remove non-valid entries from [ranges]
ranges_to_delete = [
    (41058, 41059),
    (47646, 47648),
]  # a list that includes ranges to delete from [ranges]

# single-line entries containing any of these phrases (case-insensitive) are dropped.
# lower-cased once here instead of once per entry.
reference_phrases = tuple(
    phrase.lower()
    for phrase in (
        "See the entries beginning with",
        "See also the entries beginning with",
        "See previous.",
        "See also entries at",
        "See The game is up.",
    )
)


def _has_reference_run(paragraph, phrase):
    """Return True if any 8.5pt 'Minion-Regular' run of *paragraph* contains *phrase*."""
    return any(
        phrase in run.text
        and run.font.name == "Minion-Regular"
        and run.font.size.pt == 8.5
        for run in paragraph.runs
    )


def _is_reference_entry(s, e):
    """Return True if the entry spanning lines s..e (inclusive) is a non-valid entry.

    Non-valid entries only point to another entry ("Go to ...", "See ...").
    Reads the module-level ``lines`` list.
    """
    first = lines[s]
    if s == e:
        # that means the entry is a single line
        if _has_reference_run(first, "Go to"):
            # 1838 matches
            return True
        # also ignore any lines that have any of the reference phrases in them
        first_text = first.text.lower()
        return any(phrase in first_text for phrase in reference_phrases)

    second = lines[s + 1]
    # check the 1st two lines of the entry for 'Go to'
    # (lines holding a numbered definition "1." are never treated as references)
    for paragraph in (first, second):
        if "1." not in paragraph.text and _has_reference_run(paragraph, "Go to"):
            # 785 matches
            return True

    # check one more time for those entries where the 1st line ends with 'Go'
    # and the 2nd line begins with 'to'
    tail = first.runs[-1]
    if (
        tail.text.strip().endswith("Go")
        and tail.font.name == "Minion-Regular"
        and tail.font.size.pt == 8.5
    ):
        # only look at the 2nd line's first run once the 1st line qualifies
        head = second.runs[0]
        if (
            "to" in head.text
            and head.font.name == "Minion-Regular"
            and head.font.size.pt == 8.5
        ):
            # 22 matches
            return True

    # delete entries that contain "See the expressions" (3 matches)
    return any(
        _has_reference_run(paragraph, "See the expressions")
        for paragraph in (first, second)
    )


# every iteration is an entry; each non-valid range is recorded once
for s, e in ranges:
    if _is_reference_entry(s, e):
        ranges_to_delete.append((s, e))

# remove ranges in [ranges_to_delete] from [ranges]
# (set lookup instead of scanning the list for every range)
_doomed = set(ranges_to_delete)
clean_ranges = [item for item in ranges if item not in _doomed]
# step 4: a handful of entries slip past the rules above and were split in two;
# each pair below lists the two consecutive ranges that really form one entry.
to_merge = [
    [(17857, 17857), (17858, 17861)],
    [(55963, 55965), (55966, 55974)],
    [(82682, 82683), (82684, 82688)],
    [(39577, 39578), (39579, 39581)],
    [(39214, 39215), (39216, 39217)],
    [(31413, 31417), (31418, 31421)],
    [(22537, 22540), (22541, 22547)],  # "and" not formatted correctly
]

# pass 1: place the merged range two slots after the first half of its pair
for first_half, second_half in to_merge:
    position = clean_ranges.index(first_half)
    clean_ranges.insert(position + 2, (first_half[0], second_half[1]))

# pass 2: collect the positions of the two stale halves of every pair
stale_positions = []
for first_half, _ in to_merge:
    position = clean_ranges.index(first_half)
    stale_positions += [position, position + 1]

# drop them from the back so earlier positions stay valid while deleting
for position in sorted(stale_positions, reverse=True):
    clean_ranges.pop(position)
# Output File #1 - main
# the list of (start line, end line) tuples for all valid entries
with open("files/ranges.pickle", "wb") as file:
    pickle.dump(clean_ranges, file)

# Output File #2 (optional)
# save ranges_to_delete to a txt file to make sure only bad entries were eliminated
# text_file_1 = str()
# for s, e in ranges_to_delete:
#     # add range
#     text_file_1 = text_file_1 + "\n" + f"[{s}, {e}]"
#     # add entry lines from clean-output.docx
#     for i in range(s, e + 1):
#         text_file_1 = text_file_1 + "\n" + lines[i].text
#     # add a line break after each entry
#     text_file_1 = text_file_1 + "\n" + "*" * 50
# # write string to disk
# with open("files/deleted_entries.txt", "w") as myfile:
#     myfile.write(text_file_1)

# Output File #3 (optional)
# save clean_ranges to a text file
# Note: the pieces are collected in a list and joined once at the end; growing one
# string with `text = text + ...` recopies everything written so far on each step.
# The file is still written with a single write() call.
print("creating clean_entries.txt")
pieces = []
for s, e in tqdm(clean_ranges):
    # add range
    pieces.append(f"[{s}, {e}]")
    # add entry lines from clean-output.docx
    pieces.extend(lines[i].text for i in range(s, e + 1))
    # add a separator line after each entry
    pieces.append("*" * 50)
# every piece is preceded by a newline, exactly as before
text_file_2 = "".join(f"\n{piece}" for piece in pieces)
# write string to disk
with open("files/clean_entries.txt", "w", encoding="utf-8") as myfile:
    myfile.write(text_file_2)

# Output File #4 (optional)
# save clean_ranges to a docx file - for better readability
# copy_docx(clean_ranges, "clean_entries")