import random
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer
import networkx as nx
import community  # python-louvain package; provides community.best_partition
import utils
import visualiser
def compute_count_sentiment(token_list, positive_words, negative_words):
"""
    Basic sentiment analysis that counts the positive and the negative words in the token list.
    The overall polarity is the difference between the two counts.
@param token_list: token list from a post + associated comments
@param positive_words: set of positive sentiment words
@param negative_words: set of negative sentiment words
@returns: Difference of the positive and negative word count
"""
positive_word_count = len([tok for tok in token_list if tok in positive_words])
negative_word_count = len([tok for tok in token_list if tok in negative_words])
sentiment = positive_word_count - negative_word_count
return sentiment
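
# A small illustrative example (comment only, not executed): for the token list
# ['great', 'smooth', 'bitter'] with positive words {'great', 'smooth'} and negative
# words {'bitter'}, compute_count_sentiment returns 2 - 1 = 1.
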
def sentiment_analysis(method, posts_df):
"""
    Analyse the sentiment of the posts and comments using either the 'Count' or the 'Vader' method.
    Count sentiment analysis -> basic sentiment analysis that counts the positive and the negative
    words; the overall polarity is the difference between the two counts.
    Vader sentiment analysis -> uses the VADER lexicon (the compound score) instead of
    raw positive and negative word counts.
@param method: 'Vader' or 'Count'
@param posts_df: The df with post and comment data
@returns: list of reddit posts' sentiments, in the format of [date, sentiment]
"""
set_pos_words = []
set_neg_words = []
if method == 'Count':
# load pos, neg word lists
set_pos_words = utils.read_file('positive-words.txt')
set_neg_words = utils.read_file('negative-words.txt')
sentiment_list = []
vader_sentiment_analyser = SentimentIntensityAnalyzer()
for row in posts_df.itertuples(index=True):
        # only print detailed processing output for the first two posts (row[0] is the DataFrame index)
        print_processing = row[0] <= 1
token_list = row.processed_tokens
date = row.utc_date
sentiment = 0
# compute sentiment
if method == 'Vader':
sentiment = vader_sentiment_analyser.polarity_scores(" ".join(token_list))
sentiment_list.append([pd.to_datetime(date, unit='s'), sentiment['compound']])
elif method == 'Count':
sentiment = compute_count_sentiment(token_list, set_pos_words, set_neg_words)
# save the date and sentiment of each reddit post
sentiment_list.append([pd.to_datetime(date, unit='s'), sentiment])
if print_processing:
title = row.title
desc = row.desc
num_comments = row.num_comments
date = row.formatted_date
start = '\n\n------------Analysing sentiment------------\n'
end = '\n------------------------------------\n\n'
formatted_post = f'Date: {date}\n\nPost title:\n{title}\n\nPost desc:\n{desc}\n\nNum Comments: {num_comments}'
print(utils.yellow_rgb + start + formatted_post + end, end='')
utils.print_coloured_tokens(method, token_list, sentiment, set_pos_words, set_neg_words)
return sentiment_list
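
# Example usage (a sketch only; 'posts_df' is a hypothetical DataFrame that must contain the
# columns read above: processed_tokens, utc_date, title, desc, num_comments, formatted_date):
#
#   sentiments = sentiment_analysis('Vader', posts_df)
#   sentiment_df = pd.DataFrame(sentiments, columns=['date', 'sentiment'])
#   daily_sentiment = sentiment_df.set_index('date').resample('D').mean()
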
def construct_ego_graph(client, ego, ego_name, ego_graph_filepath):
"""
Constructing the ego graph for top users.
@param client: Connection to the social media API
@param ego: The current user instance
@param ego_name: User name of the current user
@param ego_graph_filepath: Filepath to save the ego graph
@returns: The constructed ego graph
"""
ego_graph = nx.DiGraph()
ego_graph.add_node(ego_name)
# get all the users that have replied to a submission of the ego
ego_submissions = ego.submissions
for submission in ego_submissions.top(time_filter="all"):
        # get comments/replies to the submission; replace_more removes MoreComments
        # placeholders so that every remaining item exposes an author
        submission.comments.replace_more(limit=0)
        for replies in submission.comments:
            if replies.author is not None:
# author of comment
replier_name = replies.author.name
try:
replier_karma = replies.author.comment_karma
except AttributeError:
replier_karma = 0
                # add the replier node and an edge from the replier to the ego
                ego_graph.add_node(replier_name, karma=replier_karma)
                ego_graph.add_edge(replier_name, ego_name)
# get all users that the ego has replied to
ego_comments = ego.comments
# get comments
for comment in ego_comments.top(time_filter="all"):
        # get the id of the comment/submission that the ego's comment is replying to
        parent_comment_id = comment.parent_id
        parent_comment = client.comment(parent_comment_id)
        if parent_comment.author is not None:
replied_to_name = parent_comment.author.name
try:
replied_karma = parent_comment.author.comment_karma
except AttributeError:
replied_karma = 0
            # add the replied-to user node and an edge from the ego to that user
            ego_graph.add_node(replied_to_name, karma=replied_karma)
            ego_graph.add_edge(ego_name, replied_to_name)
# save graph
with open(ego_graph_filepath, 'wb') as fOut:
nx.write_graphml(ego_graph, fOut)
return ego_graph
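
# Example usage (a sketch with hypothetical credentials; requires `import praw` and an
# authenticated praw.Reddit client, which this module otherwise receives as 'client'):
#
#   client = praw.Reddit(client_id='...', client_secret='...', user_agent='...')
#   ego = client.redditor('some_username')
#   ego_graph = construct_ego_graph(client, ego, 'some_username', 'some_username_ego.graphml')
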
def update_reply_graph_node(reply_graph, post_author):
"""
Updating the reply graph upon encountering a new post.
@param reply_graph: Reply graph instance
@param post_author: Author of the post
@returns: The updated reply graph
"""
# Check if author is in the reply graph
# If author is already in the reply graph, update the no. of posts associated with the user
# Else, create a new node for the author with 1 associated post
if post_author in reply_graph:
reply_graph.nodes[post_author]['subNum'] += 1
else:
reply_graph.add_node(post_author, subNum=1)
return reply_graph
def update_reply_graph_edge(reply_graph, comment_author_name, post_comment_ids, post_id, comment_parent_id):
"""
Updating the reply graph upon encountering a new comment.
@param reply_graph: Reply graph instance
@param comment_author_name: Author of the comment
@param post_comment_ids: List of post, post id, associated comments and comment ids
@param post_id: Current post id
@param comment_parent_id: Parent comment id of the current comment
@returns: The updated reply graph
"""
# If edge exists, increment the number of replies (replyNum)
# else, add a new edge
if reply_graph.has_edge(comment_author_name, post_comment_ids[post_id][comment_parent_id]):
reply_graph[comment_author_name][post_comment_ids[post_id][comment_parent_id]]['replyNum'] += 1
else:
# need to check if the nodes have been added yet, if not add it and set subNum to 0
if comment_author_name not in reply_graph:
reply_graph.add_node(comment_author_name, subNum=0)
        if post_comment_ids[post_id][comment_parent_id] not in reply_graph:
reply_graph.add_node(post_comment_ids[post_id][comment_parent_id], subNum=0)
reply_graph.add_edge(comment_author_name, post_comment_ids[post_id][comment_parent_id], replyNum=1)
return reply_graph
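
# Example usage (a sketch): the two helpers above are assumed to be called while iterating over
# posts and their comments, where 'post_comment_ids' is a nested mapping of
# {post_id: {post_or_comment_id: author_name}} inferred from how it is indexed above.
#
#   reply_graph = nx.DiGraph()
#   reply_graph = update_reply_graph_node(reply_graph, post_author)
#   reply_graph = update_reply_graph_edge(reply_graph, comment_author_name,
#                                         post_comment_ids, post_id, comment_parent_id)
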
def compute_reply_graph_stats(reply_graph, data_folder_path, social_media_id, beverage_type, color):
"""
Display the reply graph stats for the selected social media.
Update node attributes with centrality.
Save the updated graph.
@param reply_graph: The current reply graph
@param data_folder_path: Filepath to save the graph file
    @param social_media_id: Identifier of the social media platform, used in the output filename
    @param beverage_type: 'tea' or 'coffee', used in the output filename
    @param color: Bar colour for the centrality histograms
"""
degree_centrality_list = nx.degree_centrality(reply_graph)
eigen_vector_centrality_list = nx.eigenvector_centrality(reply_graph)
katz_centrality_list = nx.katz_centrality(reply_graph)
visualiser.display_centrality_histograms(degree_centrality_list,
eigen_vector_centrality_list,
katz_centrality_list, color)
# Update node attributes with centrality
# eigenvector centrality, stored in node attribute 'eigen'
for node_id, cent in eigen_vector_centrality_list.items():
reply_graph.nodes[node_id]['eigen'] = float(cent)
# katz centrality, stored in node attribute 'katz'
for node_id, cent in katz_centrality_list.items():
reply_graph.nodes[node_id]['katz'] = float(cent)
modified_reply_graph_filepath = \
f'{data_folder_path}/{social_media_id}_{beverage_type}_centrality_graph.graphml'
nx.readwrite.write_graphml(reply_graph, modified_reply_graph_filepath, infer_numeric_types=True)
# compute clustering
print(utils.yellow_rgb + f'\n\nGlobal clustering coefficient/transitivity: {nx.transitivity(reply_graph)}', end='')
# compute components
print(
utils.green_rgb + f'\n\nNumber of strongly connected components: {nx.number_strongly_connected_components(reply_graph)}',
end='')
print(
utils.red_rgb + f'\n\nNumber of weakly connected components: {nx.number_weakly_connected_components(reply_graph)}',
end='')
print(utils.yellow_rgb + f'\n\nBridges:\n{list(nx.bridges(reply_graph.to_undirected()))}', end='')
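
# Example usage (a sketch; the folder path, identifiers and colour are hypothetical):
#
#   compute_reply_graph_stats(reply_graph, 'data', 'reddit', 'coffee', color='brown')
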
def compute_community_stats(reply_graph, data_folder_path, social_media_id, beverage_type):
"""
Display the community detection stats for the selected social media.
Update node attributes with community detection.
Save the updated graph.
@param reply_graph: The current reply graph
@param data_folder_path: Filepath to save the graph file
    @param social_media_id: Identifier of the social media platform, used in the output filename
    @param beverage_type: 'tea' or 'coffee', used in the output filename
"""
    # clique size k for the clique percolation method (CPM)
    k = 3
cpm_community_list = list(nx.algorithms.community.k_clique_communities(nx.to_undirected(reply_graph), k))
print(utils.green_rgb + f'\nCPM community:\n{cpm_community_list}', end='')
# louvain
louvain_community_dict = community.best_partition(nx.to_undirected(reply_graph))
print(utils.red_rgb + f'\n\nLouvain community:\n{louvain_community_dict}', end='')
# convert output of Louvain to the same format as CPM
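    # e.g. the Louvain dict {'alice': 0, 'bob': 0, 'carol': 1} corresponds to the CPM-style
    # community list [{'alice', 'bob'}, {'carol'}]; utils.dict_to_set_format is assumed to
    # perform this conversion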
    max_num_louvain_communities = max(louvain_community_dict.values()) + 1
louvain_community_list = utils.dict_to_set_format(louvain_community_dict, max_num_louvain_communities)
# write out cpm and Louvain values to node attributes of graph
# cpm labels, stored in node attribute 'cpmClusId'
for cluster_id, community_list in enumerate(cpm_community_list):
for node_id in community_list:
reply_graph.nodes[node_id]['cpmClusId'] = cluster_id
# louvain labels, stored in node attribute 'louvain'
for cluster_id, community_list in enumerate(louvain_community_list):
for node_id in community_list:
reply_graph.nodes[node_id]['louvain'] = cluster_id
modified_reply_graph_filepath = \
f'{data_folder_path}/{social_media_id}_{beverage_type}_community_graph.graphml'
# output modified graph
nx.readwrite.write_graphml(reply_graph, modified_reply_graph_filepath, infer_numeric_types=True)
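
# Example usage (a sketch; arguments are hypothetical):
#
#   compute_community_stats(reply_graph, 'data', 'reddit', 'tea')
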
def compute_linear_threshold(graph, trial_num, list_of_seeds):
"""
Performs linear threshold model over the input directed graph.
Results are stored in two output lists.
@param graph: Input graph to perform linear threshold over.
@param trial_num: The number of runs/trials to run. The results are averaged over the trials/runs.
    @param list_of_seeds: List of initial seed nodes. Node labels are assumed to be integers
           from 0 to (number of nodes - 1).
    @return: Two lists, average_activations_per_node_list and average_activations_per_iteration_list.
             average_activations_per_node_list has one entry per node in the graph; entry i is the
             fraction of trials/runs in which node i was activated, so each value lies in the [0, 1] range.
             average_activations_per_iteration_list has one entry per trial/run; each entry is the
             total number of nodes that were active at the end of that trial/run.
"""
# generate initial lists/vectors for the two output lists
average_activations_per_node_list = [0 for x in range(nx.number_of_nodes(graph))]
average_activations_per_iteration_list = []
print('****** Begin linear threshold runs ******')
# loop through the runs/trials
for i in range(trial_num):
        print(f'Trial/run no. {i}')
# for each node, generate the random thresholds
for current_node, attr in graph.nodes(data=True):
attr['threshold'] = random.random()
# list of active nodes
active_set = set(list_of_seeds)
last_active_set = set(list_of_seeds)
new_active_set = set()
# Looping until no more new activations
while len(last_active_set) > 0:
# Get all the nodes next to the current set of active nodes
neighbour_set = set()
for active_node in last_active_set:
neighbour_set.update([neighbour for neighbour in graph.successors(active_node) if
neighbour not in active_set and neighbour not in new_active_set])
# for each of these potential neighbours to be activated, test if it will be activated
for neighbour in neighbour_set:
try:
                    # sum the weights of in-edges coming from currently active nodes;
                    # in the linear threshold model only active in-neighbours contribute
                    total_weight = sum(data_dict['weight']
                                       for (u, v, data_dict) in graph.in_edges(neighbour, data=True)
                                       if u in active_set)
# test against the node threshold
if graph.nodes[neighbour]['threshold'] < total_weight:
new_active_set.add(neighbour)
except KeyError as e:
print(f"Key error: {e}, Edge is missing weights")
# update last active
last_active_set = new_active_set
# extend active set
active_set.update(new_active_set)
# reset new active
new_active_set = set()
# update the output lists
for x in active_set:
average_activations_per_node_list[x] += 1
# update with total number of activations
average_activations_per_iteration_list.append(len(active_set))
    # average each entry in average_activations_per_node_list over the number of runs/trials
    average_activations_per_node_list = [float(count) / trial_num
                                         for count in average_activations_per_node_list]
    return average_activations_per_node_list, average_activations_per_iteration_list
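

if __name__ == '__main__':
    # Illustrative sketch only (not part of the original pipeline): build a small synthetic
    # directed graph with integer node labels 0..n-1 and 'weight' edge attributes, which is
    # the input format compute_linear_threshold expects, then run a few trials.
    demo_graph = nx.gnp_random_graph(20, 0.2, seed=42, directed=True)
    for u, v in demo_graph.edges():
        # weight each in-edge so that a node's incoming weights sum to 1
        demo_graph[u][v]['weight'] = 1.0 / max(demo_graph.in_degree(v), 1)
    per_node, per_trial = compute_linear_threshold(demo_graph, trial_num=10, list_of_seeds=[0, 1])
    print(f'Average activation fraction per node: {per_node}')
    print(f'Total activations per trial: {per_trial}')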