-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmccf1.R
190 lines (164 loc) · 8.3 KB
/
mccf1.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
library(ROCR)
library(ggplot2)
#' MCC (Matthews correlation coefficient) - F1
#'
#' This function creates the "mccf1" object.
#' @param response a vector of actual values (0 or 1) in binary classification.
#' @param predictor a vector of predicted values (between 0 and 1) in binary classification.
#' @return the "mccf1" object, which is basically a list containing a vector of thresholds, a vector of corresponding normalized MCC, and a vector of corresponding F1.
#' @examples
#' response <- c(rep(1, 1000), rep(0, 10000))
#' set.seed(2017)
#' predictor <- c(rbeta(300, 12, 2), rbeta(700, 3, 4), rbeta(10000, 2, 3))
#' x <- mccf1(response, predictor)
#' head(x$normalizedMCC)
#' # [1] NaN 0.5150763 0.5213220 0.5261152 0.5301566 0.5337177
#' head(x$F1)
#' # [1] NaN 0.001998002 0.003992016 0.005982054 0.007968127 0.009950249
#' head(x$thresholds)
#' # [1] Inf 0.9935354 0.9931493 0.9930786 0.9925507 0.9900520
mccf1 <- function(response, predictor){
# get a performance object based on the classification
pred <- ROCR::prediction(predictor, response)
perf <- ROCR::performance(pred, measure = "mat", x.measure = "f")
# get MCC (Matthews correlation coefficient)
mcc <- attr(perf, "y.values")[[1]]
# get normalised MCC: change the range of MCC from [-1, 1] to [0, 1]
mcc.nor <- (mcc + 1) / 2
# get F1 score
f1 <- attr(perf, "x.values")[[1]]
# get the thresholds
thresholds <- attr(perf, "alpha.values")[[1]]
res = list(normalizedMCC = mcc.nor, F1 = f1, thresholds = thresholds)
class(res)="mccf1"
return(res)
}
#' Plot the MCC-F1 curve
#'
#' This function plots the MCC-F1 score curve using ggplot.
#' @param x "mccf1" object resulting from the function mccf1
#' @param main main title of the plot (default: "the MCC-F1 score curve")
#' @param xlab,ylab x- and y- axis annotation (default: "F1 score","normalized MCC")
#' @param .curveFileName character string, used for naming pdf file of the plot with \code{\link[grDevices]{pdf}} (if this argument is present, the plot will be stored in pdf with specified file name)
#' @param ... further arguments passed to and from methods
#' @return the ggplot object
#' @examples
#' response <- c(rep(1, 1000), rep(0, 10000))
#' predictor <- c(rbeta(300, 12, 2), rbeta(700, 3, 4), rbeta(10000, 2, 3))
#' plot(mccf1(response, predictor))
plot.mccf1 <- function(x, main = "the MCC-F1 score curve", xlab = "F1 score", ylab = "normalized MCC", .curveFileName, ...){
# get rid of NaN values in the vectors of mcc.nor and f1
mcc.nor_truncated <- x$normalizedMCC[2: (length(x$normalizedMCC) - 1)]
f_truncated <- x$F1[2: (length(x$F1) - 1)]
f <- m <- NULL
mccf1_df <- data.frame(f = f_truncated, m = mcc.nor_truncated)
# plot the MCC-F1 curve
mccf1_plot = ggplot2::ggplot(mccf1_df, ggplot2::aes(x = f, y = m, ymin = 0, ymax = 1, xmin = 0, xmax = 1)) +
ggplot2::geom_point(size = 0.2, shape = 21, fill = "white")+
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5))+
ggplot2::coord_equal(ratio = 1)+
ggplot2::labs(x = xlab, y = ylab, title = main)
plot(mccf1_plot)
if (!missing(.curveFileName)){
pdf(file = .curveFileName)
plot(mccf1_plot)
dev.off()
}
}
#' Summarize the the performance of a binary classification using MCC-F1 metric and the best threshold
#'
#' This function calculates the MCC-F1 metric and the best threshold for a binary classification.
#' @param object "mccf1" object resulting from the function mccf1
#' @param digits an integer, used for number formatting with \code{\link[base]{signif}}
#' @param bins an integer, will be used to divide the range of normalized MCC when calculating the MCC-F1 metric (default = 100)
#' @param ... further arguments passed to and from methods
#' @return a data.frame that shows the MCC-F1 metric (between 0 and 1) and the best threshold (between 0 and 1)
#' @examples
#' response <- c(rep(1, 1000), rep(0, 10000))
#' set.seed(2017)
#' predictor <- c(rbeta(300, 12, 2), rbeta(700, 3, 4), rbeta(10000, 2, 3))
#' \dontrun{summary(mccf1(response, predictor))}
#' # mccf1_metric best_threshold
#' # 0.3508904 0.786905
#' summary(mccf1(response, predictor), bins = 50)
#' # mccf1_metric best_threshold
#' # 0.3432971 0.786905
#' \dontrun{summary(mccf1(response, predictor), digits = 3)}
#' # mccf1_metric best_threshold
#' # 0.351 0.787
summary.mccf1 <- function(object, digits, bins = 100, ...){
# get rid of NaN values in the vectors of mcc.nor and F1
mcc.nor_truncated <- object$normalizedMCC[2: (length(object$normalizedMCC) - 1)]
f_truncated <- object$F1[2: (length(object$F1) - 1)]
# get the index of the point with largest normalized MCC ("point" refers to the point on the MCC-F1 curve)
index_of_max_mcc <- which.max(mcc.nor_truncated)
# define points on the MCC-F1 curve located on the left of the point with the highest normalized MCC as "left curve"
# get the left curve by getting the subvectors of MCC and F1 up to the index of the largest normalized MCC
mcc_left <- mcc.nor_truncated[1: index_of_max_mcc]
f_left <- f_truncated[1: index_of_max_mcc]
# define points on the MCC-F1 curve located on the right of the point with the highest normalized MCC as "right curve"
# get the right curve by getting the subvectors of MCC and F1 after the index of the largest normalized MCC
mcc_right <- mcc.nor_truncated[(index_of_max_mcc + 1): length(mcc.nor_truncated)]
f_right <- f_truncated[(index_of_max_mcc + 1): length(f_truncated)]
# divide the range of normalized MCC into subranges
unit_len <- (max(mcc.nor_truncated) - min(mcc.nor_truncated)) / bins
# calculate the sum of mean distances from the left curve to the point (1, 1)
mean_distances_left <- 0
for (i in 1: bins){
# find all the points on the left curve with normalized MCC between unit_len*(i-1) and unit_len*i
pos1 <- which(mcc_left >= min(mcc.nor_truncated) + (i-1) * unit_len)
pos2 <- which(mcc_left <= min(mcc.nor_truncated) + i * unit_len)
pos <- c()
for (index in pos1){
if (index %in% pos2){
pos <- c(pos, index)
}
}
sum_of_distance_within_subrange <- 0
for (index in pos){
d <- sqrt((mcc_left[index] - 1)^2 + (f_left[index] - 1)^2)
sum_of_distance_within_subrange <- sum_of_distance_within_subrange + d
}
mean_distances_left <- c(mean_distances_left, sum_of_distance_within_subrange / length(pos))
}
# get rid of NAs in mean_distances_left and sum the mean distances
num_of_na_left <- sum(is.na(mean_distances_left))
sum_of_mean_distances_left_no_na <- sum(mean_distances_left, na.rm = T)
# calculate the sum of mean distances from the right curve to the point (1, 1)
mean_distances_right <- 0
for (i in 1: bins){
# find all the points on the right curve with normalized MCC between unit_len*(i-1) and unit_len*i
pos1 <- which(mcc_right >= min(mcc.nor_truncated) + (i-1) * unit_len)
pos2 <- which(mcc_right <= min(mcc.nor_truncated) + i * unit_len)
pos <- c()
for (index in pos1){
if (index %in% pos2){
pos <- c(pos, index)
}
}
sum_of_distance_within_subrange <- 0
for (index in pos){
d <- sqrt((mcc_right[index] - 1)^2 + (f_right[index] - 1)^2)
sum_of_distance_within_subrange <- sum_of_distance_within_subrange + d
}
mean_distances_right <- c(mean_distances_right, sum_of_distance_within_subrange / length(pos))
}
# get rid of NAs in mean_distances_right and sum the mean distances
num_of_na_right <- sum(is.na(mean_distances_right))
sum_of_mean_distances_right_no_na <- sum(mean_distances_right, na.rm = T)
# calculate the MCC-F1 metric
mccf1_metric <- 1 - ((sum_of_mean_distances_left_no_na + sum_of_mean_distances_right_no_na) / (bins*2 - num_of_na_right - num_of_na_left)) / sqrt(2)
# find the best threshold
eu_distance = c()
for (i in (1: length(mcc.nor_truncated))){
eu_distance <- c(eu_distance, sqrt((1 - mcc.nor_truncated[i])^2 + (1 - f_truncated[i])^2))
}
best_threshold <- object$thresholds[2 : (length(object$thresholds) - 1)][match(min(eu_distance, na.rm = T), eu_distance)]
# output of the function is the MCC-F1 metric and the top threshold
if (!missing(digits)){
mccf1_result <- data.frame(mccf1_metric = signif(mccf1_metric, digits), best_threshold = signif(best_threshold, digits))
} else{
mccf1_result <- data.frame(mccf1_metric = mccf1_metric, best_threshold = best_threshold)
}
print(mccf1_result, row.names = FALSE)
}