-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathevaluate_algorithm.py
101 lines (72 loc) · 2.86 KB
/
evaluate_algorithm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import pandas as pd
from sklearn import metrics
import numpy as np
from scipy.misc import comb
import glob
import os
'''
This script is for evaluation of event chain algorithm
'''
def myComb(a, b):
return comb(a, b, exact=True)
vComb = np.vectorize(myComb)
def get_tp_fp_tn_fn(cooccurrence_matrix):
tp_plus_fp = vComb(cooccurrence_matrix.sum(0, dtype=int), 2).sum()
tp_plus_fn = vComb(cooccurrence_matrix.sum(1, dtype=int), 2).sum()
tp = vComb(cooccurrence_matrix.astype(int), 2).sum()
fp = tp_plus_fp - tp
fn = tp_plus_fn - tp
tn = comb(cooccurrence_matrix.sum(), 2) - tp - fp - fn
return [tp, fp, tn, fn]
def precision_recall_fmeasure(cooccurrence_matrix):
tp, fp, tn, fn = get_tp_fp_tn_fn(cooccurrence_matrix)
# print ("TP: %d, FP: %d, TN: %d, FN: %d" % (tp, fp, tn, fn))
rand_index = (float(tp + tn) / (tp + fp + fn + tn))
precision = float(tp) / (tp + fp)
recall = float(tp) / (tp + fn)
f1 = ((2.0 * precision * recall) / (precision + recall))
return rand_index, precision, recall, f1
def run(input_dir, output_dir):
original_clusters_path = input_dir
file_name = '*.csv'
all_files = glob.glob(os.path.join(original_clusters_path, file_name))
gkg_id_to_index = {}
class_labels_dict = {}
label = 1
index = 0
for f in all_files:
df = pd.read_csv(f, header=None, encoding='latin-1')
df_list = df.values.tolist()
for row in df_list:
try:
gkg_id = row[0].strip()
except AttributeError:
continue
class_labels_dict[gkg_id] = label
gkg_id_to_index[gkg_id] = index
index += 1
label += 1
class_labels = [None]*len(class_labels_dict)
for key, value in class_labels_dict.items():
class_labels[gkg_id_to_index[key]] = value
formed_clusters_path = output_dir
file_name = '*.csv'
all_files = glob.glob(os.path.join(formed_clusters_path, file_name))
cluster_labels_dict = {}
label = 1
for f in all_files:
df = pd.read_csv(f, header=None, encoding='latin-1')
df_list = df.values.tolist()
for row in df_list:
gkg_id = row[0]
cluster_labels_dict[gkg_id] = label
label += 1
cluster_labels = [0] * len(cluster_labels_dict)
for key, value in cluster_labels_dict.items():
cluster_labels[gkg_id_to_index[key]] = value
matrix = metrics.cluster.contingency_matrix(class_labels, cluster_labels)
rand_index, precision, recall, f1 = precision_recall_fmeasure(matrix)
ari = metrics.cluster.adjusted_rand_score(class_labels, cluster_labels)
nmi = metrics.normalized_mutual_info_score(class_labels, cluster_labels)
result = [precision, recall, f1, ari, nmi]
return result