add DIF (#506)

yzhao062 · Oct 25, 2023 · 909b84f · 909b84f
1 parent f23fc0e
commit 909b84f
Show file tree

Hide file tree

Showing 8 changed files with 566 additions and 549 deletions.
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -178,4 +178,5 @@ v<1.0.8>, <03/08/2023> -- Add QMCD detector (#452).
 v<1.0.8>, <03/08/2023> -- Optimized ECDF and drop Statsmodels dependency (#467).
 v<1.0.9>, <03/19/2023> -- Hot fix for errors in ECOD and COPOD due to the issue of scipy.
 v<1.1.0>, <06/19/2023> -- Further integration of PyThresh.
-v<1.1.1>, <07/03/2023> -- Bump up sklearn requirement and some hot fixes.
+v<1.1.1>, <07/03/2023> -- Bump up sklearn requirement and some hot fixes.
+v<1.1.1>, <10/24/2023> -- Add deep isolation forest (#506).
diff --git a/README.rst b/README.rst
@@ -58,7 +58,7 @@ Python Outlier Detection (PyOD)
 
 -----
 
-**News**: We just released a 45-page, the most comprehensive `anomaly detection benchmark paper <https://www.andrew.cmu.edu/user/yuezhao2/papers/22-neurips-adbench.pdf>`_.
+**News**: We have a 45-page `anomaly detection benchmark paper <https://www.andrew.cmu.edu/user/yuezhao2/papers/22-neurips-adbench.pdf>`_, the most comprehensive of its kind.
 The fully `open-sourced ADBench <https://github.com/Minqi824/ADBench>`_ compares 30 anomaly detection algorithms on 57 benchmark datasets.
 
 **For time-series outlier detection**, please use `TODS <https://github.com/datamllab/tods>`_.
@@ -70,7 +70,7 @@ multivariate data. This exciting yet challenging field is commonly referred as
 or `Anomaly Detection <https://en.wikipedia.org/wiki/Anomaly_detection>`_.
 
 PyOD includes more than 40 detection algorithms, from classical LOF (SIGMOD 2000) to
-the latest ECOD (TKDE 2022). Since 2017, PyOD has been successfully used in numerous academic researches and
+the latest ECOD and DIF (TKDE 2022 and 2023). Since 2017, PyOD has been successfully used in numerous academic research projects and
 commercial products with more than `10 million downloads <https://pepy.tech/project/pyod>`_.
 It is also well acknowledged by the machine learning community with various dedicated posts/tutorials, including
 `Analytics Vidhya <https://www.analyticsvidhya.com/blog/2019/02/outlier-detection-python-pyod/>`_,
@@ -199,9 +199,10 @@ Alternatively, you could clone and run setup.py file:
 * numpy>=1.19
 * numba>=0.51
 * scipy>=1.5.1
-* scikit_learn>=0.20.0
+* scikit_learn>=0.22.0
 * six
 
+
 **Optional Dependencies (see details below)**\ :
 
 * combo (optional, required for models/combination.py and FeatureBagging)

diff --git a/docs/index.rst b/docs/index.rst
@@ -76,7 +76,7 @@ multivariate data. This exciting yet challenging field is commonly referred as
 or `Anomaly Detection <https://en.wikipedia.org/wiki/Anomaly_detection>`_.
 
 PyOD includes more than 40 detection algorithms, from classical LOF (SIGMOD 2000) to
-the latest ECOD (TKDE 2022). Since 2017, PyOD :cite:`a-zhao2019pyod` has been successfully used in numerous
+the latest ECOD and DIF (TKDE 2022 and 2023). Since 2017, PyOD :cite:`a-zhao2019pyod` has been successfully used in numerous
 academic research projects and commercial products with more than `10 million downloads <https://pepy.tech/project/pyod>`_.
 It is also well acknowledged by the machine learning community with various dedicated posts/tutorials, including
 `Analytics Vidhya <https://www.analyticsvidhya.com/blog/2019/02/outlier-detection-python-pyod/>`_,
@@ -209,6 +209,7 @@ Proximity-Based      SOD               Subspace Outlier Detection
 Proximity-Based      ROD               Rotation-based Outlier Detection                                                                        2020   :class:`pyod.models.rod.ROD`                         :cite:`a-almardeny2020novel`
 Outlier Ensembles    IForest           Isolation Forest                                                                                        2008   :class:`pyod.models.iforest.IForest`                 :cite:`a-liu2008isolation,a-liu2012isolation`
 Outlier Ensembles    INNE              Isolation-based Anomaly Detection Using Nearest-Neighbor Ensembles                                      2018   :class:`pyod.models.inne.INNE`                       :cite:`a-bandaragoda2018isolation`
+Outlier Ensembles    DIF               Deep Isolation Forest for Anomaly Detection                                                             2023   :class:`pyod.models.dif.DIF`                         :cite:`a-Xu2023Deep`
 Outlier Ensembles    FB                Feature Bagging                                                                                         2005   :class:`pyod.models.feature_bagging.FeatureBagging`  :cite:`a-lazarevic2005feature`
 Outlier Ensembles    LSCP              LSCP: Locally Selective Combination of Parallel Outlier Ensembles                                       2019   :class:`pyod.models.lscp.LSCP`                       :cite:`a-zhao2019lscp`
 Outlier Ensembles    XGBOD             Extreme Boosting Based Outlier Detection **(Supervised)**                                               2018   :class:`pyod.models.xgbod.XGBOD`                     :cite:`a-zhao2018xgbod`

diff --git a/docs/pyod.models.rst b/docs/pyod.models.rst
@@ -105,6 +105,17 @@ pyod.models.deep\_svdd module
     :show-inheritance:
     :inherited-members:
 
+pyod.models.dif module
+-----------------------------
+
+.. automodule:: pyod.models.dif
+    :members:
+    :exclude-members:
+    :undoc-members:
+    :show-inheritance:
+    :inherited-members:
+
+
 pyod.models.ecod module
 ------------------------
 

diff --git a/examples/dif_example.py b/examples/dif_example.py
@@ -13,42 +13,41 @@
 # temporary solution for relative imports in case pyod is not installed
 # if pyod is installed, no need to use the following line
 sys.path.append(
-    os.path.abspath(os.path.join(os.path.dirname("__file__"), '..')))
+	os.path.abspath(os.path.join(os.path.dirname("__file__"), '..')))
 
 from pyod.models.dif import DIF
 from pyod.utils.data import generate_data
 from pyod.utils.data import evaluate_print
 
-
 if __name__ == "__main__":
-    contamination = 0.1  # percentage of outliers
-    n_train = 20000  # number of training points
-    n_test = 2000  # number of testing points
-    n_features = 300  # number of features
-
-    # Generate sample data
-    X_train, X_test, y_train, y_test = \
-        generate_data(n_train=n_train,
-                      n_test=n_test,
-                      n_features=n_features,
-                      contamination=contamination,
-                      random_state=42)
-
-    # train AutoEncoder detector
-    clf_name = 'DIF'
-    clf = DIF()
-    clf.fit(X_train)
-
-    # get the prediction labels and outlier scores of the training data
-    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
-    y_train_scores = clf.decision_scores_  # raw outlier scores
-
-    # get the prediction on the test data
-    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
-    y_test_scores = clf.decision_function(X_test)  # outlier scores
-
-    # evaluate and print the results
-    print("\nOn Training Data:")
-    evaluate_print(clf_name, y_train, y_train_scores)
-    print("\nOn Test Data:")
-    evaluate_print(clf_name, y_test, y_test_scores)
+	contamination = 0.1  # percentage of outliers
+	n_train = 1000  # number of training points
+	n_test = 200  # number of testing points
+	n_features = 30  # number of features
+
+	# Generate sample data
+	X_train, X_test, y_train, y_test = \
+		generate_data(n_train=n_train,
+					  n_test=n_test,
+					  n_features=n_features,
+					  contamination=contamination,
+					  random_state=42)
+
+	# train deep isolation forest detector
+	clf_name = 'DIF'
+	clf = DIF()
+	clf.fit(X_train)
+
+	# get the prediction labels and outlier scores of the training data
+	y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
+	y_train_scores = clf.decision_scores_  # raw outlier scores
+
+	# get the prediction on the test data
+	y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
+	y_test_scores = clf.decision_function(X_test)  # outlier scores
+
+	# evaluate and print the results
+	print("\nOn Training Data:")
+	evaluate_print(clf_name, y_train, y_train_scores)
+	print("\nOn Test Data:")
+	evaluate_print(clf_name, y_test, y_test_scores)