polish with the rejection option
yzhao062 committed Sep 6, 2024
1 parent 0ba2fc8 commit 0c53f3b
Showing 47 changed files with 383 additions and 244 deletions.
3 changes: 2 additions & 1 deletion CHANGES.txt
@@ -196,4 +196,5 @@ v<2.0.2>, <07/01/2024> -- Add AE1SVM.
v<2.0.2>, <07/04/2024> -- Moving from TF to Torch -- reimplement ALAD.
v<2.0.2>, <07/04/2024> -- Moving from TF to Torch -- reimplement anogan.
v<2.0.2>, <07/06/2024> -- Complete removal of all TensorFlow and Keras code.
v<2.0.2>, <07/21/2024> -- Add DevNet.
v<2.0.3>, <09/06/2024> -- Add Reject Option in Unsupervised Anomaly Detection (#605).
187 changes: 85 additions & 102 deletions pyod/models/base.py
@@ -294,71 +294,76 @@ def predict_confidence(self, X):
        np.place(confidence, prediction == 0, 1 - confidence[prediction == 0])

        return confidence

    def predict_with_rejection(self, X, T=32, return_stats=False,
                               delta=0.1, c_fp=1, c_fn=1, c_r=-1):
        """Predict if a particular sample is an outlier or not,
        allowing the detector to reject (i.e., output = -2)
        low-confidence predictions.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        T : int, optional (default=32)
            It allows setting the rejection threshold to 1 - 2*exp(-T).
            The higher the value of T, the more rejections are made.

        return_stats : bool, optional (default=False)
            If True, it also returns three additional float values:
            the estimated rejection rate, the upper bound of the
            rejection rate, and the upper bound of the cost.

        delta : float, optional (default=0.1)
            The upper bound rejection rate holds with probability 1-delta.

        c_fp, c_fn, c_r : floats (positive), optional (default=[1, 1, contamination])
            Costs for false positive predictions (c_fp), false negative
            predictions (c_fn), and rejections (c_r).

        Returns
        -------
        outlier_labels : numpy array of shape (n_samples,)
            For each observation, it tells whether it should be
            considered an outlier according to the fitted model:
            0 stands for inliers, 1 for outliers, and -2 for rejection.

        expected_rejection_rate : float, if return_stats is True;
        upperbound_rejection_rate : float, if return_stats is True;
        upperbound_cost : float, if return_stats is True;
        """
        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])

        if c_r < 0:
            warnings.warn(
                "The cost of rejection must be positive. "
                "It has been set to the contamination rate.")
            c_r = self.contamination

        if delta <= 0 or delta >= 1:
            warnings.warn(
                "delta must belong to (0,1). Its value has been set to 0.1.")
            delta = 0.1

        self.rejection_threshold_ = 1 - 2 * np.exp(-T)
        prediction = self.predict(X)
        confidence = self.predict_confidence(X)
        np.place(confidence, prediction == 0, 1 - confidence[prediction == 0])
        confidence = 2 * abs(confidence - .5)
        prediction[np.where(confidence <= self.rejection_threshold_)[0]] = -2

        if return_stats:
            expected_rejrate, ub_rejrate, ub_cost = self.compute_rejection_stats(
                T=T, delta=delta, c_fp=c_fp, c_fn=c_fn, c_r=c_r)
            return prediction, [expected_rejrate, ub_rejrate, ub_cost]

        return prediction

    def compute_rejection_stats(self, T=32, delta=0.1, c_fp=1, c_fn=1, c_r=-1,
                                verbose=False):
        """Add a reject option to the unsupervised detector.
        This comes with guarantees: an estimate of the expected
        rejection rate (return_rejectrate=True), an upper
@@ -367,20 +372,21 @@ def compute_rejection_stats(self, T = 32, delta = 0.1, c_fp = 1, c_fn = 1, c_r =
        Parameters
        ----------
        T : int, optional (default=32)
            It allows setting the rejection threshold to 1 - 2*exp(-T).
            The higher the value of T, the more rejections are made.

        delta : float, optional (default=0.1)
            The upper bound rejection rate holds with probability 1-delta.

        c_fp, c_fn, c_r : floats (positive),
            optional (default=[1, 1, contamination])
            Costs for false positive predictions (c_fp),
            false negative predictions (c_fn), and rejections (c_r).

        verbose : bool, optional (default=False)
            If True, it prints the expected rejection rate, the upper
            bound of the rejection rate, and the upper bound of the cost.

        Returns
        -------
@@ -389,59 +395,67 @@ def compute_rejection_stats(self, T = 32, delta = 0.1, c_fp = 1, c_fn = 1, c_r =
            satisfied with probability 1-delta;
        upperbound_cost: float, the upper bound for the cost;
        """
        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])

        if c_r < 0:
            c_r = self.contamination

        if delta <= 0 or delta >= 1:
            delta = 0.1

        # Computing the expected rejection rate
        n = len(self.decision_scores_)
        n_gamma_minus1 = int(n * self.contamination) - 1
        argsmin = (n_gamma_minus1, n, 1 - np.exp(-T))
        argsmax = (n_gamma_minus1, n, np.exp(-T))
        q1 = root_scalar(lambda p, k, n, C: binom.cdf(k, n, p) - C,
                         bracket=[0, 1], method='brentq', args=argsmin).root
        q2 = root_scalar(lambda p, k, n, C: binom.cdf(k, n, p) - C,
                         bracket=[0, 1], method='brentq', args=argsmax).root
        expected_reject_rate = q2 - q1

        # Computing the upper bound for the rejection rate
        right_mar = (-self.contamination * (n + 2) + n + 1) / n + (
                T * (n + 2)) / (np.sqrt(2 * n ** 3 * T))
        right_mar = min(1, right_mar)
        left_mar = (
            (2 + n * (1 - self.contamination) * (n + 1)) / n ** 2
            - np.sqrt(
                0.5 * n ** 5 * (
                    2 * n * (
                        -3 * self.contamination ** 2
                        - 2 * n * (1 - self.contamination) ** 2
                        + 4 * self.contamination - 3
                    )
                    + T * (n + 2) ** 2 - 8
                )
            ) / n ** 4
        )
        left_mar = max(0, left_mar)
        add_term = 2 * np.sqrt(np.log(2 / delta) / (2 * n))
        upperbound_rejectrate = right_mar - left_mar + add_term

        # Computing the upper bound for the cost function
        n_gamma_minus1 = int(n * self.contamination) - 1
        argsmin = (n_gamma_minus1, n, 1 - np.exp(-T))
        argsmax = (n_gamma_minus1, n, np.exp(-T))
        q1 = root_scalar(lambda p, k, n, C: binom.cdf(k, n, p) - C,
                         bracket=[0, 1], method='brentq', args=argsmin).root
        q2 = root_scalar(lambda p, k, n, C: binom.cdf(k, n, p) - C,
                         bracket=[0, 1], method='brentq', args=argsmax).root
        upperbound_cost = np.min([self.contamination, q1]) * c_fp + np.min(
            [1 - q2, self.contamination]) * c_fn + (q2 - q1) * c_r

        if verbose:
            print("Expected rejection rate: ",
                  np.round(expected_reject_rate, 4), '%')
            print("Upper bound rejection rate: ",
                  np.round(upperbound_rejectrate, 4), '%')
            print("Upper bound cost: ", np.round(upperbound_cost, 4))

        return expected_reject_rate, upperbound_rejectrate, upperbound_cost

    def _predict_rank(self, X, normalized=False):
        """Predict the outlyingness rank of a sample by a fitted model. The
        method is for outlier detector score combination.

@@ -518,37 +518,6 @@ def fit_predict_score(self, X, y, scoring='roc_auc_score'):

        return score

    # def score(self, X, y, scoring='roc_auc_score'):
    #     """Returns the evaluation result on the given test data and labels.
    #     ROC is chosen as the default evaluation metric
    #
    #     :param X: The input samples
    #     :type X: numpy array of shape (n_samples, n_features)
    #
    #     :param y: Outlier labels of the input samples
    #     :type y: array, shape (n_samples,)
    #
    #     :param scoring: Evaluation metric
    #
    #         - 'roc_auc_score': ROC score
    #         - 'prc_n_score': Precision @ rank n score
    #     :type scoring: str, optional (default='roc_auc_score')
    #
    #     :return: Evaluation score
    #     :rtype: float
    #     """
    #     check_is_fitted(self, ['decision_scores_'])
    #     if scoring == 'roc_auc_score':
    #         score = roc_auc_score(y, self.decision_function(X))
    #     elif scoring == 'prc_n_score':
    #         score = precision_n_scores(y, self.decision_function(X))
    #     else:
    #         raise NotImplementedError('PyOD built-in scoring only supports '
    #                                   'ROC and Precision @ rank n')
    #
    #     print("{metric}: {score}".format(metric=scoring, score=score))
    #
    #     return score

    def _set_n_classes(self, y):
        """Set the number of classes if `y` is presented, which is not
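To illustrate the API polished here, a minimal usage sketch follows. It is not part of the diff: the detector choice (KNN), the synthetic-data helper, and all variable names are illustrative assumptions.

import numpy as np
from pyod.models.knn import KNN
from pyod.utils.data import generate_data

# Illustrative synthetic data; any fitted PyOD detector exposes the same API.
X_train, X_test, y_train, y_test = generate_data(
    n_train=500, n_test=200, contamination=0.1, random_state=42)

clf = KNN(contamination=0.1)
clf.fit(X_train)

# Labels with the reject option: 0 = inlier, 1 = outlier, -2 = rejected.
labels = clf.predict_with_rejection(X_test, T=32)
print("rejected:", int(np.sum(labels == -2)))

# With return_stats=True, the three guarantees are returned as well.
labels, (exp_rate, ub_rate, ub_cost) = clf.predict_with_rejection(
    X_test, T=32, return_stats=True, delta=0.1, c_fp=1, c_fn=1, c_r=0.1)

# The same statistics can also be computed (and printed) directly.
clf.compute_rejection_stats(T=32, verbose=True)

Rejected points are those whose rescaled confidence 2*|conf - 0.5| falls at or below the threshold 1 - 2*exp(-T), so a larger T gives a stricter threshold and more rejections.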
9 changes: 6 additions & 3 deletions pyod/test/test_abod.py
@@ -104,11 +104,14 @@ def test_prediction_proba_linear_confidence(self):
        assert (confidence.max() <= 1)

    def test_prediction_with_rejection(self):
        pred_labels = self.clf.predict_with_rejection(self.X_test,
                                                      return_stats=False)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_with_rejection_stats(self):
        _, [expected_rejrate, ub_rejrate,
            ub_cost] = self.clf.predict_with_rejection(self.X_test,
                                                       return_stats=True)
        assert (expected_rejrate >= 0)
        assert (expected_rejrate <= 1)
        assert (ub_rejrate >= 0)
9 changes: 6 additions & 3 deletions pyod/test/test_ae1svm.py
@@ -103,11 +103,14 @@ def test_fit_predict(self):
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_prediction_with_rejection(self):
        pred_labels = self.clf.predict_with_rejection(self.X_test,
                                                      return_stats=False)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_with_rejection_stats(self):
        _, [expected_rejrate, ub_rejrate,
            ub_cost] = self.clf.predict_with_rejection(self.X_test,
                                                       return_stats=True)
        assert (expected_rejrate >= 0)
        assert (expected_rejrate <= 1)
        assert (ub_rejrate >= 0)
9 changes: 6 additions & 3 deletions pyod/test/test_alad.py
@@ -120,11 +120,14 @@ def test_prediction_proba_linear_confidence(self):
        assert (confidence.max() <= 1)

    def test_prediction_with_rejection(self):
        pred_labels = self.clf.predict_with_rejection(self.X_test,
                                                      return_stats=False)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_with_rejection_stats(self):
        _, [expected_rejrate, ub_rejrate,
            ub_cost] = self.clf.predict_with_rejection(self.X_test,
                                                       return_stats=True)
        assert (expected_rejrate >= 0)
        assert (expected_rejrate <= 1)
        assert (ub_rejrate >= 0)
9 changes: 6 additions & 3 deletions pyod/test/test_auto_encoder.py
@@ -100,11 +100,14 @@ def test_prediction_proba_linear_confidence(self):
        self.assertInRange(confidence, 0, 1)

    def test_prediction_with_rejection(self):
        pred_labels = self.clf.predict_with_rejection(self.X_test,
                                                      return_stats=False)
        self.assertEqual(pred_labels.shape, self.y_test.shape)

    def test_prediction_with_rejection_stats(self):
        _, [expected_rejrate, ub_rejrate,
            ub_cost] = self.clf.predict_with_rejection(self.X_test,
                                                       return_stats=True)
        self.assertGreaterEqual(expected_rejrate, 0)
        self.assertLessEqual(expected_rejrate, 1)
        self.assertGreaterEqual(ub_rejrate, 0)
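The rejection tests above only sanity-check that the returned rates fall in [0, 1]. As a back-of-envelope illustration (not part of the diff) of why the docstrings say that a higher T means more rejections, the threshold 1 - 2*exp(-T) rises toward 1 as T grows:

import numpy as np

# Threshold used by predict_with_rejection: 1 - 2*exp(-T).
# Predictions whose rescaled confidence 2*|conf - 0.5| is at or
# below this threshold are rejected (label -2).
for T in (1, 4, 8, 32):
    print(T, 1 - 2 * np.exp(-T))
# 1 -> 0.2642..., 4 -> 0.9634..., 8 -> 0.9993..., 32 -> ~1.0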
