-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrecommender.py
237 lines (203 loc) · 7.78 KB
/
recommender.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
from csv import DictReader
import json
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from random import shuffle, sample
import sys
NUM_RECS = 101
MAX_NEW_RECS = 8
isInit = True
def load_articles(file_path, file_format, num=None):
"""
Load articles from a specified file path and format (CSV or JSON).
Args:
file_path (str): The path to the file containing articles.
file_format (str): The format of the file ('csv' or 'json').
num (int, optional): The maximum number of articles to load. If None, all articles are loaded.
Returns:
list: A list of articles as dictionaries.
"""
articles = []
if file_format=="csv":
with open(file_path, encoding="utf-8") as csvfile:
reader = DictReader(csvfile)
for row in reader:
articles.append(row)
elif file_format=="json":
with open(file_path, encoding="utf-8") as jsonfile:
articles = json.loads(jsonfile.read())
for row in articles:
if row["title"] is None:
row["title"] = row["text"][:30]
if num:
shuffle(articles)
articles = articles[:num]
print(len(articles),"articles loaded")
return articles
def init_recommendations(n, articles):
"""
Initialize a list of random article recommendations.
Args:
n (int): The number of recommendations to generate.
articles (list): The list of articles from which to recommend.
Returns:
list: A list of indices corresponding to recommended articles.
"""
return sample(range(len(articles)), n)
def display_article_titles(recommendations, articles):
"""
Display the titles of articles based on the provided recommendations.
Args:
recommendations (list): List of indices of recommended articles.
articles (list): The list of articles containing the titles.
"""
for i in range(len(recommendations)):
art_num = recommendations[i]
print(f'{i + 1}. {articles[art_num]["title"]}')
def display_recommendations(recommendations, articles, isInit):
"""
Display a list of recommended articles based on whether this is the initial display.
Args:
recommendations (list): List of indices of recommended articles.
articles (list): The list of articles containing the titles.
isInit (bool): Indicates if this is the initial display of recommendations.
"""
if isInit:
print("\n\n\nHere is a list of articles for you:\n")
display_article_titles(recommendations, articles)
else:
print("\n\n\nHere are some new recommendations for you:\n")
display_article_titles(recommendations[:MAX_NEW_RECS], articles)
print("\nOr if you want something different, how about...\n")
for i in range(len(recommendations) - 2, len(recommendations)):
art_num = recommendations[i]
print(f'{i + 1}. {articles[art_num]['title']}')
def display_article(art_num, articles):
"""
Display the full content of a specific article.
Args:
art_num (int): The index of the article to display.
articles (list): The list of articles containing the content.
"""
print("\n\n")
print("article",art_num)
print("=========================================")
print(articles[art_num]["title"])
print()
print(articles[art_num]["text"])
print("=========================================")
print("\n\n")
def new_recommendations(last_choice, n, articles, vectors):
"""
Generate new article recommendations based on the last chosen article.
Args:
last_choice (int): The index of the last chosen article.
n (int): The number of recommendations to generate.
articles (list): The list of articles.
vectors: The vector representations of the articles for similarity computation.
Returns:
list: A list of indices corresponding to the new recommended articles.
"""
last_article_vector = vectors[last_choice].reshape(1, -1)
similarities = cosine_similarity(last_article_vector, vectors).flatten()
similar_articles = similarities.argsort()[::-1]
similar_articles = similar_articles[similar_articles != last_choice]
top_similar = similar_articles[:8]
dissimilar_articles = similar_articles[-2:]
dissimilar_articles = dissimilar_articles.tolist()
shuffle(dissimilar_articles)
recommendations = list(top_similar) + list(dissimilar_articles)
return recommendations
def choose_file():
"""
Prompt the user to choose a file to load articles from.
Returns:
str: The path to the chosen file.
"""
print("\nWelcome to the article database recommender system. choose a file to explore:\n")
print("1. wikipedia_sample.json")
print("2. arxiv_abstracts.csv")
print("3. bbc_news.csv")
print("4. Quit")
while True:
try:
choice = int(input("\nYour choice? "))
except ValueError:
print("Invalid choice. Please enter a valid number.")
continue
if choice == 1:
return "data/wikipedia_sample.json"
elif choice == 2:
return "data/arxiv_abstracts.csv"
elif choice == 3:
return "data/bbc_news.csv"
elif choice == 4:
sys.exit()
else:
print("Invalid choice. Please choose a number between 1 and 3. Or enter 4 to quit")
def get_file_type(file_path):
"""
Determine the file type based on the file path.
Args:
file_path (str): The path to the file.
Returns:
str: The format of the file ('json' or 'csv').
"""
return "json" if file_path == "data/wikipedia_sample.json" else "csv"
def get_vectors(articles):
"""
Generate vector representations for the articles using CountVectorizer.
Args:
articles (list): The list of articles to vectorize.
Returns:
Sparse matrix: The vector representations of the articles.
"""
vectorizer = CountVectorizer(max_df=0.9, min_df=3)
docs = []
docnames = []
for article_num in range(len(articles)):
try:
docs.append(articles[article_num]['text'])
docnames.append(articles[article_num]['title'])
except:
pass
vectors = vectorizer.fit_transform(docs)
return vectors
def main():
"""
Main function that runs the article recommender system.
It handles file selection, article loading, and user interaction for displaying
recommendations and articles.
"""
global isInit
while True:
file = choose_file()
file_type = get_file_type(file)
articles = load_articles(file, file_type)
vectors = get_vectors(articles)
print("\n\n")
recs = init_recommendations(NUM_RECS, articles)
while True:
display_recommendations(recs, articles, isInit)
print("\nEnter 'q' to quit the application or 'r' to return to the file selection menu.\n")
choice = input("\nYour choice? ")
if choice.lower() == 'q':
print("Goodbye!")
return
elif choice.lower() == 'r':
isInit = True
break
try:
choice = int(choice) - 1
except ValueError:
print("Invalid choice. Please enter a valid number or 'r' to return or 'q' to quit.")
continue
if choice < 0 or choice >= len(recs):
print("Invalid Choice. Goodbye!")
break
display_article(recs[choice], articles)
input("Press Enter")
recs = new_recommendations(recs[choice], NUM_RECS, articles, vectors)
isInit = False
if __name__ == "__main__":
main()