-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathK_getexamples.py
83 lines (67 loc) · 2.15 KB
/
K_getexamples.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
"""Extract Examples from 'phrases.json'
Description:
This script is designed to parse dictionary entries from phrases.json,
extracting and saving the examples contained within each entry.
The process involves iterating through all dictionary entries.
For each entry, it picks up 'definition_html' and uses regular expressions to identify and
extract example sentences, saving both the examples and their corresponding ranges. The script only
keeps unique ranges in order not to repeat examples that appear multiple times.
The script offers two output formats: a human-readable 'examples.txt' file and a serialized
'examples.pickle' file.
Input:
- phrases.json
Output:
- examples.txt (optional)
- examples.pickle
Runtime:
- Generating 'examples.txt' and 'examples.pickle': Completed in 3 seconds.
Usage:
Please run this script from the command line (CMD)
Example:
python K_getexamples.py
In total, 41,512 examples were captured by this script.
"""
import json
import pickle
import re
# Read the whole phrase dictionary from disk; the parsed JSON is kept in `data`
# and its "dictionary" key is iterated further down.
with open("englishidioms/phrases.json", encoding="UTF-8") as phrases_file:
    data = json.load(phrases_file)
# Result accumulator: one two-element list per range that produced examples,
# shaped as [[range, [examples]], [range, [examples]], ...].
er = []
# Ranges already visited, used to skip entries that share a range.
processed_ranges = []

# An example is matched from an underscore, lazily through one or more
# characters, up to a closing </em> (plus optional spaces) that is followed by
# a "<" which does not begin "<em".
_EXAMPLE_RE = re.compile(r"_.+?<\/em>[ ]*<(?!em)")
# Tags consisting solely of lowercase letters and slashes, e.g. <em> or </em>.
_TAG_RE = re.compile(r"<[a-z\/]+?>")

for entry in data["dictionary"]:
    entry_range = entry["range"]
    if entry_range in processed_ranges:
        continue

    raw_examples = _EXAMPLE_RE.findall(entry["definition_html"])
    if raw_examples:
        cleaned_examples = []
        for raw in raw_examples:
            text = _TAG_RE.sub("", raw)
            # Drop the dangling "<" captured at the end of the match and the
            # underscore that marked the start of the example.
            text = text.replace("<", "")
            text = text.replace("_", "")
            # NOTE(review): this literal reads as a plain space here, which
            # would strip every space from the example; confirm it is not
            # meant to be some other whitespace character.
            text = text.replace(" ", "")
            cleaned_examples.append(text.strip())
        er.append([entry_range, cleaned_examples])

    # NOTE(review): the range is recorded whether or not examples were found;
    # confirm this matches the intended nesting of the original statement.
    processed_ranges.append(entry_range)
# output file #1: text dump, one line per item of `er`, formed by the str() of
# the range immediately followed by the str() of its list of examples.
with open("files/examples.txt", "w", encoding="UTF-8") as f:
    f.writelines(str(r) + str(e) + "\n" for r, e in er)
# output file #2: the same `er` list, serialized with pickle in binary mode.
with open("files/examples.pickle", "wb") as pickle_file:
    pickle.dump(er, pickle_file)