Skip to content

Commit

Permalink
modify 2 comments / add AutoSpearman
Browse files Browse the repository at this point in the history
  • Loading branch information
MichaelFu1998-create committed Apr 14, 2021
1 parent 1c245ac commit 6ad9406
Show file tree
Hide file tree
Showing 3 changed files with 122 additions and 4 deletions.
2 changes: 1 addition & 1 deletion .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion .idea/pyExplainer.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

122 changes: 120 additions & 2 deletions pyexplainer/pyexplainer_pyexplainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
from sklearn.utils import check_random_state, all_estimators
from sklearn.ensemble import RandomForestClassifier
from pyexplainer.rulefit import RuleFit
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant


def get_base_prefix_compat():
Expand Down Expand Up @@ -226,6 +228,116 @@ def __init__(self,
self.X_explain = None
self.y_explain = None

def auto_spearman(self, correlation_threshold=0.7, correlation_method='spearman', VIF_threshold=5):
    """An automated feature selection approach that addresses collinearity and multicollinearity.

    Runs AutoSpearman in two stages: (Part 1) repeatedly drops one metric of the
    most strongly correlated pair (Spearman rank correlation), then (Part 2)
    repeatedly drops the metric with the highest Variance Inflation Factor until
    all remaining metrics fall below ``VIF_threshold``.
    For more information, please kindly refer to the `paper <https://ieeexplore.ieee.org/document/8530020>`_.

    Parameters
    ----------
    correlation_threshold : :obj:`float`
        Threshold value of correlation.
    correlation_method : :obj:`str`
        Method for computing the correlation between the features.
    VIF_threshold : :obj:`int`
        Threshold value of VIF score.

    Returns
    -------
    :obj:`list`
        Names of the selected features (a subset of ``self.X_train.columns``),
        surviving both the correlation and the VIF filtering stages.
    """
    X_AS_train = self.X_train.copy()
    AS_metrics = X_AS_train.columns
    count = 1

    # (Part 1) Automatically select non-correlated metrics based on a Spearman rank correlation test.
    print('(Part 1) Automatically select non-correlated metrics based on a Spearman rank correlation test')
    while True:
        corrmat = X_AS_train.corr(method=correlation_method)
        top_corr_features = corrmat.index
        abs_corrmat = abs(corrmat)

        # identify metric pairs whose |correlation| exceeds the threshold
        # (the diagonal of 1s is masked out via `corrmat != 1`)
        highly_correlated_metrics = ((corrmat > correlation_threshold) | (corrmat < -correlation_threshold)) & (
            corrmat != 1)
        n_correlated_metrics = np.sum(np.sum(highly_correlated_metrics))
        if n_correlated_metrics > 0:
            # find the strongest pair-wise correlation
            find_top_corr = pd.melt(abs_corrmat, ignore_index=False)
            find_top_corr.reset_index(inplace=True)
            find_top_corr = find_top_corr[find_top_corr['value'] != 1]
            top_corr_index = find_top_corr['value'].idxmax()
            top_corr_i = find_top_corr.loc[top_corr_index, :]

            # get the 2 correlated metrics with the strongest correlation.
            # NOTE: use .iloc for positional access — integer indexing on a
            # labeled Series (top_corr_i[0]) is deprecated in modern pandas.
            correlated_metric_1 = top_corr_i.iloc[0]
            correlated_metric_2 = top_corr_i.iloc[1]
            print('> Step', count, 'comparing between', correlated_metric_1, 'and', correlated_metric_2)

            # compute their correlation with other metrics outside of the pair
            correlation_with_other_metrics_1 = np.mean(abs_corrmat[correlated_metric_1][
                [i for i in top_corr_features if
                 i not in [correlated_metric_1, correlated_metric_2]]])
            correlation_with_other_metrics_2 = np.mean(abs_corrmat[correlated_metric_2][
                [i for i in top_corr_features if
                 i not in [correlated_metric_1, correlated_metric_2]]])
            print('>>', correlated_metric_1, 'has the average correlation of',
                  np.round(correlation_with_other_metrics_1, 3), 'with other metrics')
            print('>>', correlated_metric_2, 'has the average correlation of',
                  np.round(correlation_with_other_metrics_2, 3), 'with other metrics')
            # select the metric that shares the least correlation outside of the pair and exclude the other
            if correlation_with_other_metrics_1 < correlation_with_other_metrics_2:
                exclude_metric = correlated_metric_2
            else:
                exclude_metric = correlated_metric_1
            print('>>', 'Exclude', exclude_metric)
            count = count + 1
            AS_metrics = list(set(AS_metrics) - set([exclude_metric]))
            X_AS_train = X_AS_train[AS_metrics]
        else:
            break

    print('According to Part 1 of AutoSpearman,', AS_metrics, 'are selected.')

    # (Part 2) Automatically select non-correlated metrics based on a Variance Inflation Factor analysis.
    print('(Part 2) Automatically select non-correlated metrics based on a Variance Inflation Factor analysis')

    # Prepare a dataframe for VIF: an intercept column is required so VIF
    # scores are computed against a model with a constant term.
    X_AS_train = add_constant(X_AS_train)

    selected_features = X_AS_train.columns
    count = 1
    while True:
        # Calculate VIF scores
        vif_scores = pd.DataFrame([variance_inflation_factor(X_AS_train.values, i)
                                   for i in range(X_AS_train.shape[1])],
                                  index=X_AS_train.columns)
        # Prepare a final dataframe of VIF scores
        vif_scores.reset_index(inplace=True)
        vif_scores.columns = ['Feature', 'VIFscore']
        vif_scores = vif_scores.loc[vif_scores['Feature'] != 'const', :]
        vif_scores.sort_values(by=['VIFscore'], ascending=False, inplace=True)

        # Find features that have their VIF scores of above the threshold
        filtered_vif_scores = vif_scores[vif_scores['VIFscore'] >= VIF_threshold]

        # Terminate when there is no features with the VIF scores of above the threshold
        if len(filtered_vif_scores) == 0:
            break

        # exclude the metric with the highest VIF score
        metric_to_exclude = list(filtered_vif_scores['Feature'].head(1))[0]

        print('> Step', count, '- exclude', str(metric_to_exclude))
        count = count + 1

        selected_features = list(set(selected_features) - set([metric_to_exclude]))

        X_AS_train = X_AS_train.loc[:, selected_features]

    # BUGFIX: previously this returned the Part-1 result (AS_metrics),
    # silently discarding every exclusion made by the VIF analysis above.
    # Return the survivors of Part 2, dropping the synthetic intercept column.
    AS_metrics = [metric for metric in selected_features if metric != 'const']
    print('Finally, according to Part 2 of AutoSpearman,', AS_metrics, 'are selected.')
    return AS_metrics

def explain(self,
X_explain,
y_explain,
Expand Down Expand Up @@ -690,7 +802,7 @@ def generate_instance_crossover_interpolation(self, X_explain, y_explain, debug=
sample_classes_arr = []
sample_indexes_list = []

# Generating 1000 instances using interpolation technique
# Generating instances using the cross-over technique
for num in range(0, 1000):
rand_rows = train_set_neigh.sample(2)
sample_indexes_list = sample_indexes_list + rand_rows.index.values.tolist()
Expand Down Expand Up @@ -724,7 +836,7 @@ def generate_instance_crossover_interpolation(self, X_explain, y_explain, debug=
new_ins.name = num
new_con_df = new_con_df.append(new_ins, ignore_index=True)

# Generating 1000 instances using cross-over technique
# Generating instances using the mutation technique
for num in range(1000, 2000):
rand_rows = train_set_neigh.sample(3)
sample_indexes_list = sample_indexes_list + rand_rows.index.values.tolist()
Expand Down Expand Up @@ -1178,6 +1290,12 @@ def set_top_k_rules(self, top_k_rules):
else:
self.top_k_rules = top_k_rules

def set_X_train(self, X_train):
    """Setter for the training-data features consumed by AutoSpearman/explain.

    Parameters
    ----------
    X_train : :obj:`pandas.core.frame.DataFrame`
        Features of the training data. Non-DataFrame input is rejected with a
        printed message (no exception is raised, matching the file's other
        setters); ``self.X_train`` is left unchanged in that case.
    """
    if isinstance(X_train, pd.DataFrame):
        self.X_train = X_train
    else:
        # Report the problem without raising; returns None either way
        # (the old `return print(...)` was an anti-idiom — print returns None).
        print("set X_train failed, X_train should be type of pandas.core.frame.DataFrame!")

def show_visualisation(self):
"""Display items as follows,
(1) Risk Score Progress Bar (made from ipywidgets)
Expand Down

0 comments on commit 6ad9406

Please sign in to comment.