-
Notifications
You must be signed in to change notification settings - Fork 0
/
determine_relevance.py
102 lines (72 loc) · 3.75 KB
/
determine_relevance.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import requests
from google_language import GoogleLanguage
from google_language import REALLY_IMP_ENTITY_IDX
from news_utils import pretty_print_news
# Shared GoogleLanguage instance, created once at import time and used by
# get_relevant_news (document sentiment) and relevance_score_google (entities).
google_lang = GoogleLanguage()
def get_relevant_news(tweet, tweet_entities, news_articles, threshold, max_articles=3):
    """Pick the most relevant news articles for a tweet, at most one per source.

    Every article in ``news_articles`` is scored with
    ``relevance_score_google`` and the score is stored on the article under
    ``"relevance_score"`` (also for articles that end up being rejected).
    Articles scoring at least ``threshold`` are ranked by descending score,
    reduced to the best-scoring article per source and cut to
    ``max_articles``.  Each survivor gets a ``"sentiment_score"`` from
    ``google_lang.get_document_sentiment``; the result is handed to
    ``pretty_print_news`` and returned.

    Args:
        tweet: Forwarded unchanged to ``relevance_score_google``.
        tweet_entities: Forwarded unchanged to ``relevance_score_google``.
        news_articles: Iterable of article dicts with ``"title"``,
            ``"description"`` and ``"source"`` keys, where ``"source"`` is a
            dict holding an ``"id"`` (and optionally a ``"name"``).  The
            dicts are mutated in place.
            NOTE(review): this looks like the NewsAPI article schema, where
            ``description`` and ``source.id`` may be null -- confirm.
        threshold: Minimum relevance score (inclusive) for an article.
        max_articles: Upper bound on the number of returned articles.

    Returns:
        A list of at most ``max_articles`` article dicts, best score first.
    """
    ranked = []
    for article in news_articles:
        score = relevance_score_google(tweet, tweet_entities, _article_text(article))
        article["relevance_score"] = score
        if score >= threshold:
            ranked.append(article)
    # list.sort is stable, so equally scored articles keep their input order.
    ranked.sort(key=lambda article: article["relevance_score"], reverse=True)

    top_articles = []
    seen_sources = set()
    for article in ranked:
        if len(top_articles) >= max_articles:
            break
        source = article["source"]
        # Sources without an id would otherwise all share the key ``None`` and
        # be collapsed into a single entry; fall back to the source name.
        source_key = source["id"] or source.get("name")
        if source_key in seen_sources:
            continue
        seen_sources.add(source_key)
        top_articles.append(article)

    # Sentiment is only requested for the articles that are actually returned.
    for article in top_articles:
        sentiment = google_lang.get_document_sentiment(_article_text(article))
        article["sentiment_score"] = sentiment.score

    pretty_print_news(top_articles)
    return top_articles


def _article_text(article):
    """Return ``"<title>. <description>"``, treating ``None`` fields as empty."""
    return (article["title"] or "") + ". " + (article["description"] or "")
def relevance_score_google(tweet, tweet_entities, news_item):
    """Score how strongly a news text shares entities with a tweet.

    The entities of ``news_item`` are fetched via ``google_lang.get_entities``.
    For every tweet entity whose ``name`` also occurs among them, the first
    news entity with that name contributes
    ``salience * min(3, number of mentions)`` to the score; the contribution
    is multiplied by 1.5 when the entity's ``type`` is listed in
    ``REALLY_IMP_ENTITY_IDX``.

    Args:
        tweet: Not used by this function.
        tweet_entities: Iterable of objects exposing a ``name`` attribute.
        news_item: Text passed to ``google_lang.get_entities``.

    Returns:
        The summed score, or the integer ``0`` when no entity name matches.
    """
    # Keep only the first news entity per name, mirroring list.index().
    first_entity_by_name = {}
    for news_entity in google_lang.get_entities(news_item):
        first_entity_by_name.setdefault(news_entity.name, news_entity)

    score = 0
    for tweet_entity in tweet_entities:
        match = first_entity_by_name.get(tweet_entity.name)
        if match is None:
            continue
        # Mentions beyond the third do not add further weight.
        mention_factor = min(3, len(match.mentions))
        if match.type in REALLY_IMP_ENTITY_IDX:
            score += (match.salience * 1.5) * mention_factor
        else:
            score += match.salience * mention_factor
    return score
def get_relevant_news_tfidf(tweet, news_articles, threshold=0.5):
    """Select news articles whose TF-IDF similarity to a tweet is high enough.

    A gensim dictionary and TF-IDF model are built from the lower-cased,
    NLTK-tokenized ``"<title>. <description>"`` text of every article.  The
    tweet is projected into the same space and compared with each article
    through a gensim similarity index.

    Args:
        tweet: Tweet text.
        news_articles: List of article dicts with ``"title"`` and
            ``"description"`` keys (``None`` values are treated as empty
            text).  Matching dicts get a ``"relevance_score"`` key.
        threshold: Minimum similarity (inclusive) for an article to be kept.

    Returns:
        The matching article dicts, in their original order.
    """
    if not news_articles:
        # Nothing to compare against: skip the heavy imports and avoid
        # building a model/index over an empty corpus.
        return []

    import gensim
    from nltk.tokenize import word_tokenize

    def tokenize(text):
        return [token.lower() for token in word_tokenize(text)]

    article_tokens = [
        tokenize((item["title"] or "") + ". " + (item["description"] or ""))
        for item in news_articles
    ]
    dictionary = gensim.corpora.Dictionary(article_tokens)
    corpus = [dictionary.doc2bow(tokens) for tokens in article_tokens]
    tf_idf = gensim.models.TfidfModel(corpus)
    # output_prefix=None lets gensim store its index shards under a random
    # temporary filename; an empty-string prefix would place them relative to
    # the current working directory instead.
    index = gensim.similarities.Similarity(
        None, tf_idf[corpus], num_features=len(dictionary)
    )

    tweet_tf_idf = tf_idf[dictionary.doc2bow(tokenize(tweet))]

    relevant_news_articles = []
    for article, similarity_score in zip(news_articles, index[tweet_tf_idf]):
        if similarity_score >= threshold:
            # gensim yields NumPy scalars; store a plain float so the article
            # dict stays serializable with the standard json module.
            article["relevance_score"] = float(similarity_score)
            relevant_news_articles.append(article)
    return relevant_news_articles
def get_relevant_news_cosine(tweet, news_articles, threshold=0.5):
    """Select news articles whose spaCy similarity to a tweet is high enough.

    The tweet and the ``"<title>. <description>"`` text of every article are
    run through the spaCy pipeline and compared with ``Doc.similarity``.

    Args:
        tweet: Tweet text.
        news_articles: List of article dicts with ``"title"`` and
            ``"description"`` keys (``None`` values are treated as empty
            text).  Matching dicts get a ``"relevance_score"`` key.
        threshold: Minimum similarity (inclusive) for an article to be kept.

    Returns:
        The matching article dicts, in their original order.
    """
    if not news_articles:
        # Nothing to compare against: do not load the spaCy model at all.
        return []

    nlp = _load_spacy_pipeline()
    article_docs = [
        nlp((item["title"] or "") + ". " + (item["description"] or ""))
        for item in news_articles
    ]
    tweet_doc = nlp(tweet)

    relevant_news_articles = []
    for article, article_doc in zip(news_articles, article_docs):
        similarity_score = tweet_doc.similarity(article_doc)
        if similarity_score >= threshold:
            article["relevance_score"] = similarity_score
            relevant_news_articles.append(article)
    return relevant_news_articles


# Loaded spaCy pipeline, cached so repeated calls do not reload the model.
_spacy_pipeline = None


def _load_spacy_pipeline():
    """Return the spaCy pipeline, loading it on first use only."""
    global _spacy_pipeline
    if _spacy_pipeline is None:
        import spacy

        # need to download: python -m spacy download en_core_web_sm/_md/_lg
        # NOTE(review): spaCy's docs say the small models ship without word
        # vectors, which limits similarity quality -- consider _md/_lg.
        _spacy_pipeline = spacy.load("en_core_web_sm")
    return _spacy_pipeline