-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathw_f_d.py
157 lines (135 loc) · 5.12 KB
/
w_f_d.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import wikipedia
import urllib3
import requests
import ctypes
import matplotlib.pyplot as plt
class WordFrequencyDistribution:
def __init__(self, text_dict=dict()):
self.__text_dict = text_dict
def merge(self, current_wfd):
if not self.__text_dict:
self.__text_dict = current_wfd.__text_dict
else:
for key in current_wfd.__text_dict:
for item in self.__text_dict:
is_found = False
if key == item:
is_found = True
break
if is_found:
self.__text_dict[key] += current_wfd.__text_dict[key]
else:
self.__text_dict[key] = current_wfd.__text_dict[key]
def sort_dictionary(self):
self.__text_dict = sorted(self.__text_dict.items(), key=lambda x: x[1], reverse=True)
def plot_graph(self):
if len(self.__text_dict) > 100:
for a in range(0, 101):
plt.plot(a + 1, self.__text_dict[a][1], 'ro', markersize=1)
plt.axis = (0, 100, 0, self.get_word_count())
else:
for a in range(0, len(self.__text_dict)):
plt.plot(a + 1, self.__text_dict[a][1], 'ro', markersize=2)
plt.axis = (0, len(dict(self.__text_dict)), 0, self.get_word_count())
plt.title("Zipf's Law")
plt.xlabel("Frequency Rank")
plt.ylabel("Number of Occurrences")
plt.show()
def get_word_count(self):
word_count = 0
for key in dict(self.__text_dict):
word_count += dict(self.__text_dict)[key]
def get_text_dict(self):
return self.__text_dict
class Page:
def __init__(self):
self.__page = list()
self.__reference = ctypes.Union
self.__content = ''
self.__url = ''
self.__word_list = list()
self.__page_wfd = dict()
def get_reference(self):
reference = ''
while reference == '':
self.__page = wikipedia.random()
try:
reference = wikipedia.page(self.__page)
except wikipedia.exceptions.DisambiguationError as e:
reference = ''
except ConnectionError as a:
reference = ''
except TimeoutError as t:
reference = ''
except urllib3.exceptions.NewConnectionError as n:
reference = ''
except urllib3.exceptions.MaxRetryError as m:
reference = ''
except requests.exceptions.ConnectionError as q:
reference = ''
except wikipedia.exceptions.PageError as w:
reference = ''
except Exception as h:
reference = ''
self.__reference = reference
def get_content(self):
error = ''
while error == '':
try:
self.__content = self.__reference.content
error = 'no'
except UnicodeEncodeError as e:
error = ''
def get_url(self):
self.__url = self.__reference.url
return self.__url
def form_list(self):
text_list = []
original_list = []
spli_cont = self.__content.split()
for b in spli_cont:
original_list.append(b)
for a in range(0, len(original_list)):
if original_list[a].isalnum():
text_list.append(original_list[a])
self.__word_list = text_list
for key in self.__word_list:
key.lower()
def get_wfd(self):
text_dict = dict()
for counter in range(0, len(self.__word_list)):
current_word = self.__word_list[counter].lower()
is_found = False
for key in text_dict:
if key == current_word:
is_found = True
break
if is_found:
text_dict[current_word] += 1
else:
text_dict[current_word] = 1
self.__page_wfd = WordFrequencyDistribution(text_dict)
return self.__page_wfd
def user_input():
n = int(input("Enter how many wiki pages to add: "))
while n <= 0 or type(n) is not int:
n = int(input("Please enter an integer bigger than 0: "))
return n
def processing():
main_wfd = WordFrequencyDistribution()
number_of_pages_to_process = user_input()
for counter in range(0, number_of_pages_to_process):
page = Page()
page.get_reference()
page.get_content()
print(counter + 1, "/", number_of_pages_to_process, "-", page.get_url())
page.form_list()
current_wfd = page.get_wfd()
main_wfd.merge(current_wfd)
main_wfd.sort_dictionary()
print(main_wfd.get_text_dict())
maximum = main_wfd.get_text_dict()[0]
print("The most frequently occurring word was \"", maximum[0], "\" which was used ", maximum[1], "times")
main_wfd.plot_graph()
if __name__ == "__main__":
processing()