-
Notifications
You must be signed in to change notification settings - Fork 0
/
change_maximum_number_of_definitions.py
117 lines (79 loc) · 4.65 KB
/
change_maximum_number_of_definitions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# README
# This program is associated with quizlet_termscraper_webdriver.py
# It limits the number of definitions kept per term to the given maximum_number_of_definitions argument
# This program is useful if you want to change the maximum_number_of_definitions argument
# given to quizlet_termscraper_webdriver.py after the termscraper has already been run, and you don't want to rerun it.
# August 4, 2021
# Phillip Long
# cat quizlet_termscraper_output | python ~/quizlet_termscraper/change_maximum_number_of_definitions.py maximum_number_of_definitions > terms_definitions_filtered.md
# Ex. : cat quizlet_termscraper_output | python ~/quizlet_termscraper/change_maximum_number_of_definitions.py 3 > terms_definitions_filtered.md
# sys.argv[1] = maximum number of definitions to output per term (-1 to output all definitions found) [REQUIRED]
import sys
import numpy
import re
# Example invocation: sys.argv = ("change_maximum_number_of_definitions.py", "1")
# Parse the required cap on definitions per term (-1 means output all definitions found).
try:
    maximum_number_of_definitions = int(sys.argv[1])
except (IndexError, ValueError):
    # A missing or non-integer argument is reported on stderr so the message
    # never ends up inside the redirected output file.
    print(f"Error: {sys.argv[0]} requires an integer maximum_number_of_definitions argument (-1 to output all definitions found)", file=sys.stderr)
    sys.exit(1)

# anything below -1 has no meaning; stop with a non-zero exit status
if maximum_number_of_definitions < -1:
    print(f"Error: faulty maximum_number_of_definitions argument ({sys.argv[1]}) provided to {sys.argv[0]}", file=sys.stderr)
    sys.exit(1)

# text that separates a term from its definition on each line
term_definition_delimiter = ": "
# a helper function that trims text and squeezes every run of whitespace into one space
def remove_whitespace(text):
    """Return text with outer whitespace removed and inner whitespace runs collapsed to single spaces."""
    # split() without a separator already ignores leading and trailing
    # whitespace, so the pieces can be re-joined directly
    words = text.split()
    return " ".join(words)
# a helper function to simplify terms and definitions for better comparison
def simplify_text(text):
    """Return a normalised form of text for comparing terms and definitions.

    The text is lower-cased, "/" and "-" are turned into spaces, every
    character that is neither a word character nor a space is deleted, and
    the remaining whitespace is collapsed by remove_whitespace.
    """
    spaced = text.lower().replace("/", " ").replace("-", " ")
    # raw string: "\w" inside a plain string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions
    letters_and_spaces = re.sub(r"[^\w ]+", "", spaced)
    return remove_whitespace(letters_and_spaces)
# a function that deals with the current chunk to make it into what we want
def subset_current_term_chunk(current_term_chunk):
    """Drop duplicate definitions from one term's lines, then cap how many are kept.

    current_term_chunk: list of lines belonging to a single term; each line is
        split on term_definition_delimiter into a term part and a definition part.

    Returns a new list:
        - maximum_number_of_definitions >= 1: at most that many lines, in input order
        - maximum_number_of_definitions == 0: one entry holding only the term part
          of the first line
        - any other value (-1): every line that survived de-duplication
    An empty chunk yields an empty list.
    """
    if not current_term_chunk:
        return []

    # keep the first line seen for each distinct (simplified) definition
    seen_definitions = set()
    unique_lines = []
    for term_definition in current_term_chunk:
        # partition splits on the FIRST delimiter only, so a definition that
        # itself contains the delimiter is compared in full
        _, delimiter_found, definition = term_definition.partition(term_definition_delimiter)
        # a line without the delimiter is compared as a whole instead of failing
        comparison_key = simplify_text(definition if delimiter_found else term_definition)
        if comparison_key not in seen_definitions:
            seen_definitions.add(comparison_key)
            unique_lines.append(term_definition)

    if maximum_number_of_definitions >= 1:
        # slicing already copes with chunks shorter than the maximum
        return unique_lines[:maximum_number_of_definitions]
    if maximum_number_of_definitions == 0:  # basically if I want just the term
        return [unique_lines[0].partition(term_definition_delimiter)[0]]
    # if maximum_number_of_definitions == -1, nothing more needs to be done
    return unique_lines
# a line beginning with "<number>. " starts a new term chunk; the dot is escaped
# so that only a literal "." matches (not e.g. "3D " or "100% ")
term_start_pattern = re.compile(r"[0-9]+\. ")
# bullet-like characters that are removed from every line
bullet_pattern = re.compile(r"○|˚|•")

term_chunks = []
current_term_chunk = []
is_introductory_lines = True

for line in sys.stdin:
    # strip trailing whitespace (including the newline), drop bullets and
    # replace the prime character with a plain apostrophe
    line = bullet_pattern.sub("", line.rstrip().replace("′", "'"))

    # the row of asterisks is echoed and ends the introductory section
    if line == "**********":
        is_introductory_lines = False
        print("**********")
        continue

    if is_introductory_lines:
        # change the heading which tells how many definitions the program filtered to
        if ("filtered to a maximum of" in line) or ("containing all definitions found" in line):
            opening_parenthesis = line.find("(")
            # look for the comma after the parenthesis so earlier commas are ignored
            comma = line.find(",", opening_parenthesis + 1)
            # only rewrite when both anchors exist; otherwise echo the line unchanged
            if opening_parenthesis != -1 and comma != -1:
                line_starter = line[:opening_parenthesis + 1]
                prioritize_definitions_method_used = line[comma:]
                if maximum_number_of_definitions == -1:
                    line = f"{line_starter}containing all definitions found{prioritize_definitions_method_used}"
                else:
                    line = f"{line_starter}filtered to a maximum of {maximum_number_of_definitions} definition(s){prioritize_definitions_method_used}"
        # introductory lines are printed straight away and never join a chunk
        print(line)
        continue

    # if the line is the start of a new term chunk, finish the previous chunk (if any)
    if term_start_pattern.match(line) and current_term_chunk:
        term_chunks.append(subset_current_term_chunk(current_term_chunk))
        current_term_chunk = [line]
    # elsewise, just add the current line to the current_term_chunk
    else:
        current_term_chunk.append(line)

# add the last chunk, unless no term lines were read at all
if current_term_chunk:
    term_chunks.append(subset_current_term_chunk(current_term_chunk))

# print every kept line, chunk by chunk
for chunk in term_chunks:
    for definition in chunk:
        print(definition)