Commit
Merge pull request #506 from xuhongzuo/master
Deep Isolation Forest method implemented
Showing 5 changed files with 649 additions and 0 deletions.
@@ -0,0 +1,54 @@
# -*- coding: utf-8 -*-
"""Example of using Deep Isolation Forest for
outlier detection"""
# Author: Hongzuo Xu <hongzuoxu@126.com>
# License: BSD 2 clause

from __future__ import division
from __future__ import print_function

import os
import sys

# temporary solution for relative imports in case pyod is not installed
# if pyod is installed, no need to use the following line
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from pyod.models.dif import DIF
from pyod.utils.data import generate_data
from pyod.utils.data import evaluate_print


if __name__ == "__main__":
    contamination = 0.1  # percentage of outliers
    n_train = 20000  # number of training points
    n_test = 2000  # number of testing points
    n_features = 300  # number of features

    # Generate sample data
    X_train, X_test, y_train, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=n_features,
                      contamination=contamination,
                      random_state=42)

    # train the Deep Isolation Forest (DIF) detector
    clf_name = 'DIF'
    clf = DIF()
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
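
Like other PyOD detectors, DIF exposes the shared fit / decision_function / predict interface, so the example above can also be extended with the generic estimator options. The snippet below is a minimal sketch of that pattern; it is not part of the commit and assumes only the common PyOD detector API (the contamination constructor argument and predict_proba), not any DIF-specific hyperparameters.

# Minimal sketch (not part of the commit): reuse the data generated in the
# example above with the generic PyOD detector interface.
from pyod.models.dif import DIF

# pass the known contamination so predicted labels use a matching threshold
clf = DIF(contamination=contamination)
clf.fit(X_train)

# outlier probabilities via the shared BaseDetector API,
# returned as one column per class: [P(inlier), P(outlier)]
y_test_proba = clf.predict_proba(X_test)
print(y_test_proba[:5])

Passing the same contamination used by generate_data keeps the binary predictions consistent with the ground-truth outlier fraction, while decision_function scores (as in the example above) remain the preferred input for ranking-based evaluation such as evaluate_print.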