-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathutils.py
187 lines (177 loc) · 7.1 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
import networkx as nx
import numpy as np
from tqdm import tqdm
def read_adjacency_matrix(input_path):
"""
:return: The directed graph in networkx format
:param input_path: Path of the dataset file of the form:
<from_node> <to_node> <influence_probability>
"""
graph = nx.DiGraph()
with open(input_path, 'r') as f:
data_lines = f.readlines()
for data_line in tqdm(data_lines[1:], desc="Reading adjacency matrix"):
start, end, weight = data_line.split()
graph.add_edge(int(start), int(end), p=float(weight))
return graph
def read_weighted_and_signed_adjacency_matrix(input_path):
"""
:return: The directed graph in networkx format
:param input_path: Path of the dataset file of the form:
<from_node> <to_node> <influence_probability> <sign>
"""
graph = nx.DiGraph()
with open(input_path, 'r') as f:
data_lines = f.readlines()
for data_line in tqdm(data_lines[1:], desc="Reading adjacency matrix"):
start, end, p, sign = data_line.split()
graph.add_edge(int(start), int(end), p=float(p),s=int(sign))
return graph
def read_signed_adjacency_matrix(input_path):
"""
This function reads a signed adjacency matrix of a signed network. The influence probabilities are sampled by
a binomial distribution with parameters n=in_neighbors_positive, p=number_of_positive_edges/number_of_edges
:return: The directed graph in networkx format
:param input_path: Path of the dataset file of the form:
<from_node> <to_node> <sign>
"""
graph = nx.DiGraph()
with open(input_path, 'r') as f:
data_lines = f.readlines()
for data_line in tqdm(data_lines[1:], desc="Reading signed adjacency matrix"):
start, end, sign = data_line.split()
graph.add_edge(int(start), int(end), s=int(sign))
# Sample influence probabilities
# Parameters of the binomial distribution
# p: probability of success
# For all nodes, trusted and not trusted in-neighbors, to avoid re-computation
p_success = len([(u, v) for (u, v, attr) in graph.edges(data=True) if attr['s'] == 1]) / len(graph.edges)
#p_success= 0.6
trusted_in_neighbors = {}
not_trusted_in_neighbors = {}
num_samples = 100
for (u,v,attr) in tqdm(graph.edges(data=True), desc="Sampling influence probabilities"):
# n: number of trusted in-neighbors
n_trusted = trusted_in_neighbors.get(v,-1)
n_not_trusted = not_trusted_in_neighbors.get(v,-1)
# If the lists are not already computed, compute them
if n_trusted == -1:
n_trusted = len([x for x in graph.predecessors(v) if graph.edges[x,v]['s'] == 1])
trusted_in_neighbors[v] = n_trusted
if n_not_trusted == -1:
n_not_trusted = len([x for x in graph.predecessors(v) if graph.edges[x,v]['s'] == -1])
not_trusted_in_neighbors[v] = n_not_trusted
# Sample the probabilities num_samples times and compute the average
sum_of_prob = 0
n = n_trusted if attr['s'] == 1 else n_not_trusted
for experiment in range(num_samples):
sum_of_prob += (np.random.binomial(n, p_success)/n) * attr['s']
influence_probability = sum_of_prob/num_samples
graph.edges[u,v]['p'] = influence_probability
write_graph_to_file(graph, input_path[:-4] + "_edgelist.txt")
return graph
def write_graph_to_file(graph, output_path):
"""
Writes the edges of the graph to a text file with each row containing:
fromNode toNode <value of attribute p> <value of attribute sign>
:param graph: The directed graph in networkx format
:param output_path: Path of the output file
"""
with open(output_path, 'w') as f:
f.write(f"{len(graph.nodes)} {len(graph.edges)}\n")
for u, v, attr in graph.edges(data=True):
p = attr.get('p', 0) # Default to 0 if 'p' is not present
sign = attr.get('s', 0) # Default to 0 if 's' is not present
f.write(f"{u} {v} {p} {sign}\n")
def __my_similarity__(list1, list2):
"""
Computes the similarity between two seed sets of the same length) by counting the common elements normalized by the seed set length.
"""
if len(list1) != len(list2):
raise ValueError("Lists must have the same length")
number_of_elements_of_list1_in_list2 = 0
for x in list1:
if x in list2:
number_of_elements_of_list1_in_list2 += 1
return number_of_elements_of_list1_in_list2 / len(list1)
def __compute_similarities__(dataframe):
"""
Computes similarities between seed sets found by different algorithms.
:param: dataframe: A pandas dataframe which contains the results returned by different algorithms.
:return: The updated dataframe with the similarity values for every couple of results.
"""
algorithms = dataframe["algorithm"].unique()
for i, row in dataframe.iterrows():
current_seed_set = row["seed"]
for elem in algorithms:
other_seed_set=dataframe[dataframe["algorithm"] == elem].iloc[0]["seed"]
dataframe.loc[i, f'similarity_{elem}'] = __my_similarity__(current_seed_set, other_seed_set)
return dataframe
# Different processing methods, based on the input file format
def __process_1__(file_path):
"""
Initial format:
<from_node> <to_node>
Final format:
<num_nodes> <num_edges>
<from_node> <to_node>
"""
with open(file_path, 'r') as f:
lines = f.readlines()
nodes = set()
num_edges = len(lines)
for line in lines:
node1, node2 = map(int, line.split())
nodes.add(node1)
nodes.add(node2)
num_nodes = len(nodes)
new_first_line = f"{num_nodes} {num_edges}\n"
with open(file_path, 'w') as f:
f.write(new_first_line)
f.writelines(lines)
def __process_2__(file_path):
"""
Initial format:
<from_node>,<to_node>,<sign>
Final format:
<num_nodes> <num_edges>
<from_node> <to_node> <sign>
"""
with open(file_path, 'r') as f:
lines = f.readlines()
nodes = set()
num_edges = len(lines)
for line in lines:
node1, node2, _ = map(int, line.split(','))
nodes.add(node1)
nodes.add(node2)
num_nodes = len(nodes)
new_first_line = f"{num_nodes} {num_edges}\n"
with open(file_path, 'w') as f:
f.write(new_first_line)
for line in lines:
node1, node2, sign = map(int, line.split(','))
f.write(f"{node1} {node2} {sign}\n")
def __process_3__(file_path):
"""
Initial format:
<from_node> <to_node> <sign> <other_term>
Final format:
<num_nodes> <num_edges>
<from_node> <to_node> <sign>
"""
with open(file_path, 'r') as f:
lines = f.readlines()
nodes = set()
num_edges = len(lines)
for line in lines:
node1, node2, _, _ = map(int, line.split())
nodes.add(node1)
nodes.add(node2)
num_nodes = len(nodes)
new_first_line = f"{num_nodes} {num_edges}\n"
with open(file_path, 'w') as f:
f.write(new_first_line)
for line in lines:
node1, node2, sign, _ = map(int, line.split())
f.write(f"{node1} {node2} {sign}\n")