-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest_fixed_thresholds_with_compensation.py
134 lines (115 loc) · 7.53 KB
/
test_fixed_thresholds_with_compensation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import EncoderFactory
from DatasetManager import DatasetManager
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, precision_recall_fscore_support, confusion_matrix
from sklearn.pipeline import FeatureUnion
import time
import os
import sys
import csv
from sys import argv
import pickle
def get_min_nonmonotonic(dataSetName):
    """Return the nonmonotonic threshold configured for a dataset name.

    The name is matched by prefix: names starting with ``"traffic_fines"``
    map to 3 and names starting with ``"bpic2017"`` map to 18. Any other
    name yields ``None``.
    """
    # Checked in order; the first matching prefix wins.
    prefix_thresholds = (
        ("traffic_fines", 3),
        ("bpic2017", 18),
    )
    for prefix, threshold in prefix_thresholds:
        if str.startswith(dataSetName, prefix):
            return threshold
    return None
def calculate_cost(x, costs):
    """Return the cost of a single row ``x`` under the cost table ``costs``.

    ``x`` must provide ``'prediction'`` and ``'actual'`` entries that can be
    converted with ``int``. ``costs`` is indexed as
    ``costs[prediction, actual]`` and must yield a callable, which is then
    invoked with ``x`` itself; its result is returned.
    """
    predicted_label = int(x['prediction'])
    actual_label = int(x['actual'])
    cost_function = costs[predicted_label, actual_label]
    return cost_function(x)
# ---------------------------------------------------------------------------
# Script body.
#
# Reads "preds_<dataset>.csv" (semicolon separated; the columns used below are
# case_id, prefix_nr, predicted_proba and actual), fires an alarm for each case
# at the first prefix whose predicted probability reaches a fixed confidence
# threshold, and writes classification metrics, earliness statistics and costs
# under several cost models to "results_<dataset>_fixedconfs.csv".
# ---------------------------------------------------------------------------
if len(argv) < 4:
    # Fail with a readable message instead of a bare IndexError.
    sys.exit("Usage: %s <dataset_name> <predictions_dir> <results_dir>" % argv[0])

dataset_name = argv[1]
predictions_dir = argv[2]
results_dir = argv[3]

# create results directory (exist_ok avoids the check-then-create race)
os.makedirs(results_dir, exist_ok=True)

# load predictions
dt_preds = pd.read_csv(os.path.join(predictions_dir, "preds_%s.csv" % dataset_name), sep=";")

# set nonmonotonic threshold
nonmonotonic_threshold = get_min_nonmonotonic(dataset_name)

# The nonmonotonic cost model needs a numeric threshold. For datasets without
# a configured one, min(prefix_nr, None) would raise in the middle of the run,
# so that model is skipped and the remaining models are still evaluated.
early_types = ["const", "linear"]
if nonmonotonic_threshold is not None:
    early_types.append("nonmonotonic")
else:
    sys.stderr.write(
        "No nonmonotonic threshold configured for dataset %s; "
        "skipping the nonmonotonic cost model.\n" % dataset_name)

# Values that do not depend on the confidence threshold are computed once.
case_lengths = dt_preds.groupby("case_id").prefix_nr.max().reset_index()
case_lengths.columns = ["case_id", "case_length"]
max_prefix_nr = dt_preds.prefix_nr.max()

# misclassification cost weights as (c_miss_weight, c_action_weight) pairs
cost_weights = [(1, 1), (2, 1), (3, 1), (5, 1), (10, 1), (20, 1)]
c_postpone_weight = 0
c_com_weights = [1 / 40.0, 1 / 20.0, 1 / 10.0, 1 / 5.0, 1 / 3.0, 1 / 2.0, 1, 2, 3, 5, 10, 20, 40, 0]

# write results to file
out_filename = os.path.join(results_dir, "results_%s_fixedconfs.csv" % (dataset_name))
# newline='' is what the csv module expects from its output file;
# quotechar=None because an empty quotechar is rejected by Python 3.11+.
with open(out_filename, 'w', newline='') as fout:
    writer = csv.writer(fout, delimiter=';', quotechar=None, quoting=csv.QUOTE_NONE)
    writer.writerow(["dataset", "method", "metric", "value", "c_miss", "c_action", "c_postpone", "c_com",
                     "early_type", "threshold"])

    # thresholds 0.0, 0.1, ..., 1.1
    for conf in range(0, 120, 10):
        conf_threshold = conf / 100.0
        method = "fixed%s" % conf

        # trigger alarms according to conf_threshold: each case gets an alarm
        # at the first prefix whose predicted probability reaches the threshold
        alarm_frames = []
        unprocessed_case_ids = set(dt_preds.case_id.unique())
        for nr_events in range(1, max_prefix_nr + 1):
            if not unprocessed_case_ids:
                # every case already has an alarm; later prefixes add nothing
                break
            tmp = dt_preds[(dt_preds.case_id.isin(unprocessed_case_ids)) & (dt_preds.prefix_nr == nr_events)]
            # assign() returns a new frame, so no write goes through a slice
            tmp = tmp[tmp.predicted_proba >= conf_threshold].assign(prediction=1)
            if not tmp.empty:
                alarm_frames.append(tmp)
                unprocessed_case_ids = unprocessed_case_ids.difference(tmp.case_id)

        # cases that never reached the threshold are represented by their
        # first prefix with prediction 0
        tmp = dt_preds[(dt_preds.case_id.isin(unprocessed_case_ids)) & (dt_preds.prefix_nr == 1)].assign(prediction=0)
        if not tmp.empty or not alarm_frames:
            alarm_frames.append(tmp)
        dt_final = pd.concat(alarm_frames, axis=0)

        dt_final = dt_final.merge(case_lengths)

        # calculate precision, recall etc. independent of the costs
        prec, rec, fscore, _ = precision_recall_fscore_support(dt_final.actual, dt_final.prediction, pos_label=1,
                                                               average="binary")
        # labels=[0, 1] keeps the matrix 2x2 even when only one class occurs,
        # so the four-way unpacking below cannot fail
        tn, fp, fn, tp = confusion_matrix(dt_final.actual, dt_final.prediction, labels=[0, 1]).ravel()

        # calculate earliness based on the "true alarms" only
        tmp = dt_final[(dt_final.prediction == 1) & (dt_final.actual == 1)]
        earliness = (1 - ((tmp.prefix_nr - 1) / tmp.case_length))
        # ... and based on all alarms
        tmp = dt_final[(dt_final.prediction == 1)]
        earliness_alarms = (1 - ((tmp.prefix_nr - 1) / tmp.case_length))

        summary_metrics = [
            ("prec", prec),
            ("rec", rec),
            ("fscore", fscore),
            ("tn", tn),
            ("fp", fp),
            ("fn", fn),
            ("tp", tp),
            ("earliness_mean", earliness.mean()),
            ("earliness_std", earliness.std()),
            ("earliness_alarms_mean", earliness_alarms.mean()),
            ("earliness_alarms_std", earliness_alarms.std()),
        ]
        for metric_name, metric_value in summary_metrics:
            writer.writerow([dataset_name, method, metric_name, metric_value, None, None, None, None, None,
                             conf_threshold])

        # evaluate the cost based on different misclassification costs and earliness rewards
        for early_type in early_types:
            for c_com_weight in c_com_weights:
                for c_miss_weight, c_action_weight in cost_weights:
                    # normalise the three weights so that they sum to 1
                    weight_sum = c_miss_weight + c_action_weight + c_com_weight
                    c_miss = c_miss_weight / weight_sum
                    c_action = c_action_weight / weight_sum
                    c_com = c_com_weight / weight_sum

                    # costs[prediction, actual] is a function of the row (see
                    # calculate_cost). The lambdas read c_miss / c_action /
                    # c_com from the enclosing scope and are consumed within
                    # this same iteration, so late binding is not an issue.
                    if early_type == "linear":
                        costs = np.array([[lambda x: 0,
                                           lambda x: c_miss],
                                          [lambda x: c_action * (x['prefix_nr'] - 1) / x['case_length'] + c_com,
                                           lambda x: c_action * (x['prefix_nr'] - 1) / x['case_length'] + (
                                               x['prefix_nr'] - 1) / x['case_length'] * c_miss
                                           ]], dtype=object)
                    elif early_type == "nonmonotonic":
                        # NOTE(review): as written, only the second operand is
                        # divided by case_length (case_length - v / case_length),
                        # not the whole difference. This may be an operator
                        # precedence slip; the formula is kept unchanged here so
                        # that results stay comparable. Confirm the intent.
                        costs = np.array([[lambda x: 0,
                                           lambda x: c_miss],
                                          [lambda x: c_action * (
                                              x['case_length'] - min(x['prefix_nr'], nonmonotonic_threshold) / x['case_length']) + (
                                              c_com * (x['case_length'] - x['prefix_nr'] / x[
                                                  'case_length'])),
                                           lambda x: c_action * (
                                               x['case_length'] - min(x['prefix_nr'], nonmonotonic_threshold) / x['case_length']) + (
                                               x['prefix_nr'] - 1) / x['case_length'] * c_miss
                                           ]], dtype=object)
                    else:
                        costs = np.array([[lambda x: 0,
                                           lambda x: c_miss],
                                          [lambda x: c_action + c_com,  # 0:1
                                           lambda x: c_action + (x['prefix_nr'] - 1) / x['case_length'] * c_miss
                                           ]], dtype=object)

                    # calculate cost
                    cost = dt_final.apply(calculate_cost, costs=costs, axis=1).sum()

                    writer.writerow(
                        [dataset_name, method, "cost", cost, c_miss_weight, c_action_weight, c_postpone_weight,
                         c_com_weight, early_type, conf_threshold])
                    writer.writerow([dataset_name, method, "cost_avg", cost / len(dt_final), c_miss_weight,
                                     c_action_weight, c_postpone_weight, c_com_weight, early_type, conf_threshold])