From 05a8538ace590e9fec328923c9340c642a7993f2 Mon Sep 17 00:00:00 2001 From: Sowmya V Kollipara Date: Wed, 30 Jan 2019 18:38:56 +0530 Subject: [PATCH 1/4] Adding support for training statistics in lime discretized path --- lime/discretize.py | 42 +++++++++++- lime/lime_tabular.py | 117 ++++++++++++++++++++------------ lime/tests/test_lime_tabular.py | 71 ++++++++++++++++++- 3 files changed, 184 insertions(+), 46 deletions(-) diff --git a/lime/discretize.py b/lime/discretize.py index 41635198..ca1a943d 100644 --- a/lime/discretize.py +++ b/lime/discretize.py @@ -18,7 +18,8 @@ class BaseDiscretizer(): __metaclass__ = ABCMeta # abstract class - def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None): + def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None, + data_stats=None): """Initializer Args: data: numpy 2d array @@ -33,7 +34,8 @@ def __init__(self, data, categorical_features, feature_names, labels=None, rando in the training data. """ self.to_discretize = ([x for x in range(data.shape[1]) - if x not in categorical_features]) + if x not in categorical_features]) + self.data_stats = data_stats self.names = {} self.lambdas = {} self.means = {} @@ -46,6 +48,13 @@ def __init__(self, data, categorical_features, feature_names, labels=None, rando bins = self.bins(data, labels) bins = [np.unique(x) for x in bins] + # Read the stats from data_stats if exists + if(data_stats is not None): + self.means = self.data_stats.get("means") + self.stds = self.data_stats.get("stds") + self.mins = self.data_stats.get("mins") + self.maxs = self.data_stats.get("maxs") + for feature, qts in zip(self.to_discretize, bins): n_bins = qts.shape[0] # Actually number of borders (= #bins-1) boundaries = np.min(data[:, feature]), np.max(data[:, feature]) @@ -60,6 +69,10 @@ def __init__(self, data, categorical_features, feature_names, labels=None, rando self.lambdas[feature] = lambda x, qts=qts: np.searchsorted(qts, x) discretized = self.lambdas[feature](data[:, feature]) + # If data stats are provided no need to compute the below set of details + if(data_stats is not None): + continue + self.means[feature] = [] self.stds[feature] = [] for x in range(n_bins + 1): @@ -117,6 +130,31 @@ def get_inverse(q): return ret +class StatsDiscretizer(BaseDiscretizer): + """ + Class to be used to supply the data stats info when descritize_continuos is true + """ + + def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None, + data_stats=None): + + BaseDiscretizer.__init__(self, data, categorical_features, + feature_names, labels=labels, + random_state=random_state, + data_stats=data_stats) + + def bins(self, data, labels): + bins_from_stats = self.data_stats.get("bins") + bins = [] + if bins_from_stats is not None: + for feature in self.to_discretize: + bins_from_stats_feature = bins_from_stats.get(feature) + if bins_from_stats_feature is not None: + qts = np.array(bins_from_stats_feature) + bins.append(qts) + return bins + + class QuartileDiscretizer(BaseDiscretizer): def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None): diff --git a/lime/lime_tabular.py b/lime/lime_tabular.py index fdc3fef0..fd2dcb35 100644 --- a/lime/lime_tabular.py +++ b/lime/lime_tabular.py @@ -16,6 +16,7 @@ from lime.discretize import DecileDiscretizer from lime.discretize import EntropyDiscretizer from lime.discretize import BaseDiscretizer +from lime.discretize import StatsDiscretizer from . 
import explanation from . import lime_base @@ -112,7 +113,8 @@ def __init__(self, discretize_continuous=True, discretizer='quartile', sample_around_instance=False, - random_state=None): + random_state=None, + training_data_stats=None): """Init function. Args: @@ -153,11 +155,19 @@ def __init__(self, random_state: an integer or numpy.RandomState that will be used to generate random numbers. If None, the random state will be initialized using the internal numpy seed. + training_data_stats: a dict object having the details of training data + statistics.If None, training data information will be used.only matters + if discretize_continuous is True """ self.random_state = check_random_state(random_state) self.mode = mode self.categorical_names = categorical_names or {} self.sample_around_instance = sample_around_instance + self.training_data_stats = training_data_stats + + # Validate data stats structure + if self.training_data_stats is not None: + self.validate_training_data_stats(self.training_data_stats) if categorical_features is None: categorical_features = [] @@ -169,18 +179,24 @@ def __init__(self, self.discretizer = None if discretize_continuous: + # Set the discretizer if training data stats are provided + if self.training_data_stats is not None: + discretizer = StatsDiscretizer(training_data, self.categorical_features, + self.feature_names, labels=training_labels, + data_stats=self.training_data_stats) + if discretizer == 'quartile': self.discretizer = QuartileDiscretizer( - training_data, self.categorical_features, - self.feature_names, labels=training_labels) + training_data, self.categorical_features, + self.feature_names, labels=training_labels) elif discretizer == 'decile': self.discretizer = DecileDiscretizer( - training_data, self.categorical_features, - self.feature_names, labels=training_labels) + training_data, self.categorical_features, + self.feature_names, labels=training_labels) elif discretizer == 'entropy': self.discretizer = EntropyDiscretizer( - training_data, self.categorical_features, - self.feature_names, labels=training_labels) + training_data, self.categorical_features, + self.feature_names, labels=training_labels) elif isinstance(discretizer, BaseDiscretizer): self.discretizer = discretizer else: @@ -188,7 +204,10 @@ def __init__(self, ''' 'decile', 'entropy' or a''' + ''' BaseDiscretizer instance''') self.categorical_features = list(range(training_data.shape[1])) - discretized_training_data = self.discretizer.discretize( + + # Get the discretized_training_data when the stats are not provided + if(self.training_data_stats is None): + discretized_training_data = self.discretizer.discretize( training_data) if kernel_width is None: @@ -203,21 +222,27 @@ def kernel(d, kernel_width): self.feature_selection = feature_selection self.base = lime_base.LimeBase(kernel_fn, verbose, random_state=self.random_state) - self.scaler = None self.class_names = class_names + + # Though set has no role to play if training data stats are provided + self.scaler = None self.scaler = sklearn.preprocessing.StandardScaler(with_mean=False) self.scaler.fit(training_data) self.feature_values = {} self.feature_frequencies = {} for feature in self.categorical_features: - if self.discretizer is not None: - column = discretized_training_data[:, feature] - else: - column = training_data[:, feature] + if training_data_stats is None: + if self.discretizer is not None: + column = discretized_training_data[:, feature] + else: + column = training_data[:, feature] - feature_count = 
collections.Counter(column) - values, frequencies = map(list, zip(*(feature_count.items()))) + feature_count = collections.Counter(column) + values, frequencies = map(list, zip(*(feature_count.items()))) + else: + values = training_data_stats["feature_values"][feature] + frequencies = training_data_stats["feature_frequencies"][feature] self.feature_values[feature] = values self.feature_frequencies[feature] = (np.array(frequencies) / @@ -229,6 +254,14 @@ def kernel(d, kernel_width): def convert_and_round(values): return ['%.2f' % v for v in values] + @staticmethod + def validate_training_data_stats(training_data_stats: dict): + stat_keys = list(training_data_stats.keys()) + valid_stat_keys = ["means", "mins", "maxs", "stds", "feature_values", "feature_frequencies"] + missing_keys = list(set(valid_stat_keys) - set(stat_keys)) + if len(missing_keys) > 0: + raise Exception("Missing keys in training_data_stats.Details:" % (missing_keys)) + def explain_instance(self, data_row, predict_fn, @@ -274,9 +307,9 @@ def explain_instance(self, scaled_data = (data - self.scaler.mean_) / self.scaler.scale_ distances = sklearn.metrics.pairwise_distances( - scaled_data, - scaled_data[0].reshape(1, -1), - metric=distance_metric + scaled_data, + scaled_data[0].reshape(1, -1), + metric=distance_metric ).ravel() yss = predict_fn(inverse) @@ -344,7 +377,7 @@ def explain_instance(self, discretized_feature_names = copy.deepcopy(feature_names) for f in self.discretizer.names: discretized_feature_names[f] = self.discretizer.names[f][int( - discretized_instance[f])] + discretized_instance[f])] domain_mapper = TableDomainMapper(feature_names, values, @@ -371,13 +404,13 @@ def explain_instance(self, (ret_exp.intercept[label], ret_exp.local_exp[label], ret_exp.score, ret_exp.local_pred) = self.base.explain_instance_with_data( - scaled_data, - yss, - distances, - label, - num_features, - model_regressor=model_regressor, - feature_selection=self.feature_selection) + scaled_data, + yss, + distances, + label, + num_features, + model_regressor=model_regressor, + feature_selection=self.feature_selection) if self.mode == "regression": ret_exp.intercept[1] = ret_exp.intercept[0] @@ -414,8 +447,8 @@ def __data_inverse(self, categorical_features = range(data_row.shape[0]) if self.discretizer is None: data = self.random_state.normal( - 0, 1, num_samples * data_row.shape[0]).reshape( - num_samples, data_row.shape[0]) + 0, 1, num_samples * data_row.shape[0]).reshape( + num_samples, data_row.shape[0]) if self.sample_around_instance: data = data * self.scaler.scale_ + data_row else: @@ -503,7 +536,7 @@ def __init__(self, training_data, training_labels=None, feature_names=None, # Reshape X n_samples, n_timesteps, n_features = training_data.shape training_data = np.transpose(training_data, axes=(0, 2, 1)).reshape( - n_samples, n_timesteps * n_features) + n_samples, n_timesteps * n_features) self.n_timesteps = n_timesteps self.n_features = n_features @@ -513,19 +546,19 @@ def __init__(self, training_data, training_labels=None, feature_names=None, # Send off the the super class to do its magic. 
super(RecurrentTabularExplainer, self).__init__( - training_data, - training_labels=training_labels, - feature_names=feature_names, - categorical_features=categorical_features, - categorical_names=categorical_names, - kernel_width=kernel_width, - kernel=kernel, - verbose=verbose, - class_names=class_names, - feature_selection=feature_selection, - discretize_continuous=discretize_continuous, - discretizer=discretizer, - random_state=random_state) + training_data, + training_labels=training_labels, + feature_names=feature_names, + categorical_features=categorical_features, + categorical_names=categorical_names, + kernel_width=kernel_width, + kernel=kernel, + verbose=verbose, + class_names=class_names, + feature_selection=feature_selection, + discretize_continuous=discretize_continuous, + discretizer=discretizer, + random_state=random_state) def _make_predict_proba(self, func): """ diff --git a/lime/tests/test_lime_tabular.py b/lime/tests/test_lime_tabular.py index cc860320..7b2e00cb 100644 --- a/lime/tests/test_lime_tabular.py +++ b/lime/tests/test_lime_tabular.py @@ -1,10 +1,11 @@ import unittest import numpy as np -import sklearn # noqa +import collections +import sklearn # noqa import sklearn.datasets import sklearn.ensemble -import sklearn.linear_model # noqa +import sklearn.linear_model # noqa from numpy.testing import assert_array_equal from sklearn.datasets import load_iris, make_classification from sklearn.ensemble import RandomForestClassifier @@ -12,6 +13,7 @@ from sklearn.linear_model import LinearRegression from lime.discretize import QuartileDiscretizer, DecileDiscretizer, EntropyDiscretizer + try: from sklearn.model_selection import train_test_split except ImportError: @@ -577,6 +579,71 @@ def testFeatureValues(self): assert_array_equal(explainer.feature_frequencies[1], np.array([.25, .25, .25, .25])) assert_array_equal(explainer.feature_frequencies[2], np.array([.5, .5])) + def test_lime_explainer_with_data_stats(self): + np.random.seed(1) + + rf = RandomForestClassifier(n_estimators=500) + rf.fit(self.train, self.labels_train) + i = np.random.randint(0, self.test.shape[0]) + + # Generate stats using a quartile discretizer + discretizer = QuartileDiscretizer(self.train, [], self.feature_names, self.target_names, + random_state=20) + + d_means = discretizer.means + d_stds = discretizer.stds + d_mins = discretizer.mins + d_maxs = discretizer.maxs + d_bins = discretizer.bins(self.train, self.target_names) + + # Compute feature values and frequencies of all columns + cat_features = np.arange(self.train.shape[1]) + discretized_training_data = discretizer.discretize(self.train) + + feature_values = {} + feature_frequencies = {} + for feature in cat_features: + column = discretized_training_data[:, feature] + feature_count = collections.Counter(column) + values, frequencies = map(list, zip(*(feature_count.items()))) + feature_values[feature] = values + feature_frequencies[feature] = frequencies + + # Convert bins from arrays to lists + d_bins_revised = {} + index = 0 + for bin in d_bins: + d_bins_revised[index] = bin.tolist() + index = index+1 + + # Discretized stats + data_stats = {} + data_stats["means"] = d_means + data_stats["stds"] = d_stds + data_stats["maxs"] = d_maxs + data_stats["mins"] = d_mins + data_stats["bins"] = d_bins_revised + data_stats["feature_values"] = feature_values + data_stats["feature_frequencies"] = feature_frequencies + + data = np.zeros((2, len(self.feature_names))) + explainer = LimeTabularExplainer( + data, feature_names=self.feature_names, 
random_state=10, training_data_stats=data_stats, training_labels=self.target_names) + + exp = explainer.explain_instance(self.test[i], + rf.predict_proba, + num_features=2, + model_regressor=LinearRegression()) + + self.assertIsNotNone(exp) + keys = [x[0] for x in exp.as_list()] + self.assertEqual(1, + sum([1 if 'petal width' in x else 0 for x in keys]), + "Petal Width is a major feature") + self.assertEqual(1, + sum([1 if 'petal length' in x else 0 for x in keys]), + "Petal Length is a major feature") + if __name__ == '__main__': unittest.main() From c35342c76edb213f7b1d43ddf6746f75bc5a7fe6 Mon Sep 17 00:00:00 2001 From: Sowmya V Kollipara Date: Wed, 30 Jan 2019 19:30:50 +0530 Subject: [PATCH 2/4] Merge changes --- lime/lime_tabular.py | 107 ++++++++++++------------------------------- 1 file changed, 29 insertions(+), 78 deletions(-) diff --git a/lime/lime_tabular.py b/lime/lime_tabular.py index 5fa5d5f0..02b30383 100644 --- a/lime/lime_tabular.py +++ b/lime/lime_tabular.py @@ -16,7 +16,6 @@ from lime.discretize import DecileDiscretizer from lime.discretize import EntropyDiscretizer from lime.discretize import BaseDiscretizer -from lime.discretize import StatsDiscretizer from . import explanation from . import lime_base @@ -113,8 +112,7 @@ def __init__(self, discretize_continuous=True, discretizer='quartile', sample_around_instance=False, - random_state=None, - training_data_stats=None): + random_state=None): """Init function. Args: @@ -155,19 +153,11 @@ def __init__(self, random_state: an integer or numpy.RandomState that will be used to generate random numbers. If None, the random state will be initialized using the internal numpy seed. - training_data_stats: a dict object having the details of training data - statistics.If None, training data information will be used.only matters - if discretize_continuous is True """ self.random_state = check_random_state(random_state) self.mode = mode self.categorical_names = categorical_names or {} self.sample_around_instance = sample_around_instance - self.training_data_stats = training_data_stats - - # Validate data stats structure - if self.training_data_stats is not None: - self.validate_training_data_stats(self.training_data_stats) if categorical_features is None: categorical_features = [] @@ -179,24 +169,18 @@ def __init__(self, self.discretizer = None if discretize_continuous: - # Set the discretizer if training data stats are provided - if self.training_data_stats is not None: - discretizer = StatsDiscretizer(training_data, self.categorical_features, - self.feature_names, labels=training_labels, - data_stats=self.training_data_stats) - if discretizer == 'quartile': self.discretizer = QuartileDiscretizer( - training_data, self.categorical_features, - self.feature_names, labels=training_labels) + training_data, self.categorical_features, + self.feature_names, labels=training_labels) elif discretizer == 'decile': self.discretizer = DecileDiscretizer( - training_data, self.categorical_features, - self.feature_names, labels=training_labels) + training_data, self.categorical_features, + self.feature_names, labels=training_labels) elif discretizer == 'entropy': self.discretizer = EntropyDiscretizer( - training_data, self.categorical_features, - self.feature_names, labels=training_labels) + training_data, self.categorical_features, + self.feature_names, labels=training_labels) elif isinstance(discretizer, BaseDiscretizer): self.discretizer = discretizer else: @@ -204,10 +188,7 @@ def __init__(self, ''' 'decile', 'entropy' or a''' + ''' 
BaseDiscretizer instance''') self.categorical_features = list(range(training_data.shape[1])) - - # Get the discretized_training_data when the stats are not provided - if(self.training_data_stats is None): - discretized_training_data = self.discretizer.discretize( + discretized_training_data = self.discretizer.discretize( training_data) if kernel_width is None: @@ -222,27 +203,21 @@ def kernel(d, kernel_width): self.feature_selection = feature_selection self.base = lime_base.LimeBase(kernel_fn, verbose, random_state=self.random_state) - self.class_names = class_names - - # Though set has no role to play if training data stats are provided self.scaler = None + self.class_names = class_names self.scaler = sklearn.preprocessing.StandardScaler(with_mean=False) self.scaler.fit(training_data) self.feature_values = {} self.feature_frequencies = {} for feature in self.categorical_features: - if training_data_stats is None: - if self.discretizer is not None: - column = discretized_training_data[:, feature] - else: - column = training_data[:, feature] - - feature_count = collections.Counter(column) - values, frequencies = map(list, zip(*(feature_count.items()))) + if self.discretizer is not None: + column = discretized_training_data[:, feature] else: - values = training_data_stats["feature_values"][feature] - frequencies = training_data_stats["feature_frequencies"][feature] + column = training_data[:, feature] + + feature_count = collections.Counter(column) + values, frequencies = map(list, zip(*(feature_count.items()))) self.feature_values[feature] = values self.feature_frequencies[feature] = (np.array(frequencies) / @@ -254,14 +229,6 @@ def kernel(d, kernel_width): def convert_and_round(values): return ['%.2f' % v for v in values] - @staticmethod - def validate_training_data_stats(training_data_stats: dict): - stat_keys = list(training_data_stats.keys()) - valid_stat_keys = ["means", "mins", "maxs", "stds", "feature_values", "feature_frequencies"] - missing_keys = list(set(valid_stat_keys) - set(stat_keys)) - if len(missing_keys) > 0: - raise Exception("Missing keys in training_data_stats.Details:" % (missing_keys)) - def explain_instance(self, data_row, predict_fn, @@ -307,9 +274,9 @@ def explain_instance(self, scaled_data = (data - self.scaler.mean_) / self.scaler.scale_ distances = sklearn.metrics.pairwise_distances( - scaled_data, - scaled_data[0].reshape(1, -1), - metric=distance_metric + scaled_data, + scaled_data[0].reshape(1, -1), + metric=distance_metric ).ravel() yss = predict_fn(inverse) @@ -377,7 +344,7 @@ def explain_instance(self, discretized_feature_names = copy.deepcopy(feature_names) for f in self.discretizer.names: discretized_feature_names[f] = self.discretizer.names[f][int( - discretized_instance[f])] + discretized_instance[f])] domain_mapper = TableDomainMapper(feature_names, values, @@ -404,13 +371,13 @@ def explain_instance(self, (ret_exp.intercept[label], ret_exp.local_exp[label], ret_exp.score, ret_exp.local_pred) = self.base.explain_instance_with_data( - scaled_data, - yss, - distances, - label, - num_features, - model_regressor=model_regressor, - feature_selection=self.feature_selection) + scaled_data, + yss, + distances, + label, + num_features, + model_regressor=model_regressor, + feature_selection=self.feature_selection) if self.mode == "regression": ret_exp.intercept[1] = ret_exp.intercept[0] @@ -447,8 +414,8 @@ def __data_inverse(self, categorical_features = range(data_row.shape[0]) if self.discretizer is None: data = self.random_state.normal( - 0, 1, num_samples * 
data_row.shape[0]).reshape( - num_samples, data_row.shape[0]) + 0, 1, num_samples * data_row.shape[0]).reshape( + num_samples, data_row.shape[0]) if self.sample_around_instance: data = data * self.scaler.scale_ + data_row else: @@ -538,7 +505,7 @@ def __init__(self, training_data, mode="classification", # Reshape X n_samples, n_timesteps, n_features = training_data.shape training_data = np.transpose(training_data, axes=(0, 2, 1)).reshape( - n_samples, n_timesteps * n_features) + n_samples, n_timesteps * n_features) self.n_timesteps = n_timesteps self.n_features = n_features @@ -548,21 +515,6 @@ def __init__(self, training_data, mode="classification", # Send off the the super class to do its magic. super(RecurrentTabularExplainer, self).__init__( -<<<<<<< HEAD - training_data, - training_labels=training_labels, - feature_names=feature_names, - categorical_features=categorical_features, - categorical_names=categorical_names, - kernel_width=kernel_width, - kernel=kernel, - verbose=verbose, - class_names=class_names, - feature_selection=feature_selection, - discretize_continuous=discretize_continuous, - discretizer=discretizer, - random_state=random_state) -======= training_data, mode=mode, training_labels=training_labels, @@ -577,7 +529,6 @@ def __init__(self, training_data, mode="classification", discretize_continuous=discretize_continuous, discretizer=discretizer, random_state=random_state) ->>>>>>> 9c906b4a15ed98032b0b4ea6ab4ecada35f9ec30 def _make_predict_proba(self, func): """ From b48de1fd3fdd543c4ffcd82eba20ee6e7065f23a Mon Sep 17 00:00:00 2001 From: Sowmya V Kollipara Date: Wed, 30 Jan 2019 19:46:04 +0530 Subject: [PATCH 3/4] Changes post merge conflicts --- lime/lime_tabular.py | 118 +++++++++++++++++++++++++++---------------- 1 file changed, 75 insertions(+), 43 deletions(-) diff --git a/lime/lime_tabular.py b/lime/lime_tabular.py index 02b30383..aec2dbdc 100644 --- a/lime/lime_tabular.py +++ b/lime/lime_tabular.py @@ -16,6 +16,7 @@ from lime.discretize import DecileDiscretizer from lime.discretize import EntropyDiscretizer from lime.discretize import BaseDiscretizer +from lime.discretize import StatsDiscretizer from . import explanation from . import lime_base @@ -112,7 +113,8 @@ def __init__(self, discretize_continuous=True, discretizer='quartile', sample_around_instance=False, - random_state=None): + random_state=None, + training_data_stats=None): """Init function. Args: @@ -153,11 +155,18 @@ def __init__(self, random_state: an integer or numpy.RandomState that will be used to generate random numbers. If None, the random state will be initialized using the internal numpy seed. 
+ training_data_stats: a dict object having the details of training data + statistics.If None, training data information will be used.only matters + if discretize_continuous is True """ self.random_state = check_random_state(random_state) self.mode = mode self.categorical_names = categorical_names or {} self.sample_around_instance = sample_around_instance + self.training_data_stats = training_data_stats + + if self.training_data_stats is not None: + self.validate_training_data_stats(self.training_data_stats) if categorical_features is None: categorical_features = [] @@ -169,18 +178,24 @@ def __init__(self, self.discretizer = None if discretize_continuous: + # Set the discretizer if training data stats are provided + if self.training_data_stats is not None: + discretizer = StatsDiscretizer(training_data, self.categorical_features, + self.feature_names, labels=training_labels, + data_stats=self.training_data_stats) + if discretizer == 'quartile': self.discretizer = QuartileDiscretizer( - training_data, self.categorical_features, - self.feature_names, labels=training_labels) + training_data, self.categorical_features, + self.feature_names, labels=training_labels) elif discretizer == 'decile': self.discretizer = DecileDiscretizer( - training_data, self.categorical_features, - self.feature_names, labels=training_labels) + training_data, self.categorical_features, + self.feature_names, labels=training_labels) elif discretizer == 'entropy': self.discretizer = EntropyDiscretizer( - training_data, self.categorical_features, - self.feature_names, labels=training_labels) + training_data, self.categorical_features, + self.feature_names, labels=training_labels) elif isinstance(discretizer, BaseDiscretizer): self.discretizer = discretizer else: @@ -188,7 +203,10 @@ def __init__(self, ''' 'decile', 'entropy' or a''' + ''' BaseDiscretizer instance''') self.categorical_features = list(range(training_data.shape[1])) - discretized_training_data = self.discretizer.discretize( + + # Get the discretized_training_data when the stats are not provided + if(self.training_data_stats is None): + discretized_training_data = self.discretizer.discretize( training_data) if kernel_width is None: @@ -203,21 +221,27 @@ def kernel(d, kernel_width): self.feature_selection = feature_selection self.base = lime_base.LimeBase(kernel_fn, verbose, random_state=self.random_state) - self.scaler = None self.class_names = class_names + + # Though set has no role to play if training data stats are provided + self.scaler = None self.scaler = sklearn.preprocessing.StandardScaler(with_mean=False) self.scaler.fit(training_data) self.feature_values = {} self.feature_frequencies = {} for feature in self.categorical_features: - if self.discretizer is not None: - column = discretized_training_data[:, feature] - else: - column = training_data[:, feature] + if training_data_stats is None: + if self.discretizer is not None: + column = discretized_training_data[:, feature] + else: + column = training_data[:, feature] - feature_count = collections.Counter(column) - values, frequencies = map(list, zip(*(feature_count.items()))) + feature_count = collections.Counter(column) + values, frequencies = map(list, zip(*(feature_count.items()))) + else: + values = training_data_stats["feature_values"][feature] + frequencies = training_data_stats["feature_frequencies"][feature] self.feature_values[feature] = values self.feature_frequencies[feature] = (np.array(frequencies) / @@ -229,6 +253,14 @@ def kernel(d, kernel_width): def convert_and_round(values): 
return ['%.2f' % v for v in values] + @staticmethod + def validate_training_data_stats(training_data_stats: dict): + stat_keys = list(training_data_stats.keys()) + valid_stat_keys = ["means", "mins", "maxs", "stds", "feature_values", "feature_frequencies"] + missing_keys = list(set(valid_stat_keys) - set(stat_keys)) + if len(missing_keys) > 0: + raise Exception("Missing keys in training_data_stats.Details:" % (missing_keys)) + def explain_instance(self, data_row, predict_fn, @@ -274,9 +306,9 @@ def explain_instance(self, scaled_data = (data - self.scaler.mean_) / self.scaler.scale_ distances = sklearn.metrics.pairwise_distances( - scaled_data, - scaled_data[0].reshape(1, -1), - metric=distance_metric + scaled_data, + scaled_data[0].reshape(1, -1), + metric=distance_metric ).ravel() yss = predict_fn(inverse) @@ -344,7 +376,7 @@ def explain_instance(self, discretized_feature_names = copy.deepcopy(feature_names) for f in self.discretizer.names: discretized_feature_names[f] = self.discretizer.names[f][int( - discretized_instance[f])] + discretized_instance[f])] domain_mapper = TableDomainMapper(feature_names, values, @@ -371,13 +403,13 @@ def explain_instance(self, (ret_exp.intercept[label], ret_exp.local_exp[label], ret_exp.score, ret_exp.local_pred) = self.base.explain_instance_with_data( - scaled_data, - yss, - distances, - label, - num_features, - model_regressor=model_regressor, - feature_selection=self.feature_selection) + scaled_data, + yss, + distances, + label, + num_features, + model_regressor=model_regressor, + feature_selection=self.feature_selection) if self.mode == "regression": ret_exp.intercept[1] = ret_exp.intercept[0] @@ -414,8 +446,8 @@ def __data_inverse(self, categorical_features = range(data_row.shape[0]) if self.discretizer is None: data = self.random_state.normal( - 0, 1, num_samples * data_row.shape[0]).reshape( - num_samples, data_row.shape[0]) + 0, 1, num_samples * data_row.shape[0]).reshape( + num_samples, data_row.shape[0]) if self.sample_around_instance: data = data * self.scaler.scale_ + data_row else: @@ -505,7 +537,7 @@ def __init__(self, training_data, mode="classification", # Reshape X n_samples, n_timesteps, n_features = training_data.shape training_data = np.transpose(training_data, axes=(0, 2, 1)).reshape( - n_samples, n_timesteps * n_features) + n_samples, n_timesteps * n_features) self.n_timesteps = n_timesteps self.n_features = n_features @@ -515,20 +547,20 @@ def __init__(self, training_data, mode="classification", # Send off the the super class to do its magic. 
super(RecurrentTabularExplainer, self).__init__( - training_data, - mode=mode, - training_labels=training_labels, - feature_names=feature_names, - categorical_features=categorical_features, - categorical_names=categorical_names, - kernel_width=kernel_width, - kernel=kernel, - verbose=verbose, - class_names=class_names, - feature_selection=feature_selection, - discretize_continuous=discretize_continuous, - discretizer=discretizer, - random_state=random_state) + training_data, + mode=mode, + training_labels=training_labels, + feature_names=feature_names, + categorical_features=categorical_features, + categorical_names=categorical_names, + kernel_width=kernel_width, + kernel=kernel, + verbose=verbose, + class_names=class_names, + feature_selection=feature_selection, + discretize_continuous=discretize_continuous, + discretizer=discretizer, + random_state=random_state) def _make_predict_proba(self, func): """ From b9c3991152b757975beb17259d22962cb76ddd91 Mon Sep 17 00:00:00 2001 From: Sowmya V Kollipara Date: Thu, 31 Jan 2019 15:52:17 +0530 Subject: [PATCH 4/4] Formatting conflicts --- lime/discretize.py | 6 ++-- lime/lime_tabular.py | 78 +++++++++++++++++++++++--------------------- 2 files changed, 44 insertions(+), 40 deletions(-) diff --git a/lime/discretize.py b/lime/discretize.py index ca1a943d..3520a9ea 100644 --- a/lime/discretize.py +++ b/lime/discretize.py @@ -49,7 +49,7 @@ def __init__(self, data, categorical_features, feature_names, labels=None, rando bins = [np.unique(x) for x in bins] # Read the stats from data_stats if exists - if(data_stats is not None): + if data_stats: self.means = self.data_stats.get("means") self.stds = self.data_stats.get("stds") self.mins = self.data_stats.get("mins") @@ -70,7 +70,7 @@ def __init__(self, data, categorical_features, feature_names, labels=None, rando discretized = self.lambdas[feature](data[:, feature]) # If data stats are provided no need to compute the below set of details - if(data_stats is not None): + if data_stats: continue self.means[feature] = [] @@ -132,7 +132,7 @@ def get_inverse(q): class StatsDiscretizer(BaseDiscretizer): """ - Class to be used to supply the data stats info when descritize_continuos is true + Class to be used to supply the data stats info when discretize_continuous is true """ def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None, diff --git a/lime/lime_tabular.py b/lime/lime_tabular.py index aec2dbdc..06ee92dc 100644 --- a/lime/lime_tabular.py +++ b/lime/lime_tabular.py @@ -155,8 +155,8 @@ def __init__(self, random_state: an integer or numpy.RandomState that will be used to generate random numbers. If None, the random state will be initialized using the internal numpy seed. 
- training_data_stats: a dict object having the details of training data - statistics.If None, training data information will be used.only matters + training_data_stats: a dict object having the details of training data + statistics. If None, training data information will be used; only matters if discretize_continuous is True """ self.random_state = check_random_state(random_state) @@ -165,7 +165,8 @@ def __init__(self, self.mode = mode self.categorical_names = categorical_names or {} self.sample_around_instance = sample_around_instance self.training_data_stats = training_data_stats - if self.training_data_stats is not None: + # Check and raise a proper error if stats are supplied in the non-discretized path + if self.training_data_stats: self.validate_training_data_stats(self.training_data_stats) if categorical_features is None: @@ -179,23 +180,23 @@ def __init__(self, self.discretizer = None if discretize_continuous: # Set the discretizer if training data stats are provided - if self.training_data_stats is not None: + if self.training_data_stats: discretizer = StatsDiscretizer(training_data, self.categorical_features, self.feature_names, labels=training_labels, data_stats=self.training_data_stats) if discretizer == 'quartile': self.discretizer = QuartileDiscretizer( - training_data, self.categorical_features, - self.feature_names, labels=training_labels) + training_data, self.categorical_features, + self.feature_names, labels=training_labels) elif discretizer == 'decile': self.discretizer = DecileDiscretizer( - training_data, self.categorical_features, - self.feature_names, labels=training_labels) + training_data, self.categorical_features, + self.feature_names, labels=training_labels) elif discretizer == 'entropy': self.discretizer = EntropyDiscretizer( - training_data, self.categorical_features, - self.feature_names, labels=training_labels) + training_data, self.categorical_features, + self.feature_names, labels=training_labels) elif isinstance(discretizer, BaseDiscretizer): self.discretizer = discretizer else: @@ -255,11 +256,14 @@ def convert_and_round(values): @staticmethod def validate_training_data_stats(training_data_stats: dict): + """ + Method to validate the structure of training data stats + """ stat_keys = list(training_data_stats.keys()) valid_stat_keys = ["means", "mins", "maxs", "stds", "feature_values", "feature_frequencies"] missing_keys = list(set(valid_stat_keys) - set(stat_keys)) if len(missing_keys) > 0: - raise Exception("Missing keys in training_data_stats.Details:" % (missing_keys)) + raise Exception("Missing keys in training_data_stats. 
Details: %s" % missing_keys) def explain_instance(self, data_row, predict_fn, @@ -306,9 +310,9 @@ def explain_instance(self, scaled_data = (data - self.scaler.mean_) / self.scaler.scale_ distances = sklearn.metrics.pairwise_distances( - scaled_data, - scaled_data[0].reshape(1, -1), - metric=distance_metric + scaled_data, + scaled_data[0].reshape(1, -1), + metric=distance_metric ).ravel() yss = predict_fn(inverse) @@ -376,7 +380,7 @@ def explain_instance(self, discretized_feature_names = copy.deepcopy(feature_names) for f in self.discretizer.names: discretized_feature_names[f] = self.discretizer.names[f][int( - discretized_instance[f])] + discretized_instance[f])] domain_mapper = TableDomainMapper(feature_names, values, @@ -403,13 +407,13 @@ def explain_instance(self, (ret_exp.intercept[label], ret_exp.local_exp[label], ret_exp.score, ret_exp.local_pred) = self.base.explain_instance_with_data( - scaled_data, - yss, - distances, - label, - num_features, - model_regressor=model_regressor, - feature_selection=self.feature_selection) + scaled_data, + yss, + distances, + label, + num_features, + model_regressor=model_regressor, + feature_selection=self.feature_selection) if self.mode == "regression": ret_exp.intercept[1] = ret_exp.intercept[0] @@ -537,7 +541,7 @@ def __init__(self, training_data, mode="classification", # Reshape X n_samples, n_timesteps, n_features = training_data.shape training_data = np.transpose(training_data, axes=(0, 2, 1)).reshape( - n_samples, n_timesteps * n_features) + n_samples, n_timesteps * n_features) self.n_timesteps = n_timesteps self.n_features = n_features @@ -547,20 +551,20 @@ def __init__(self, training_data, mode="classification", # Send off the the super class to do its magic. super(RecurrentTabularExplainer, self).__init__( - training_data, - mode=mode, - training_labels=training_labels, - feature_names=feature_names, - categorical_features=categorical_features, - categorical_names=categorical_names, - kernel_width=kernel_width, - kernel=kernel, - verbose=verbose, - class_names=class_names, - feature_selection=feature_selection, - discretize_continuous=discretize_continuous, - discretizer=discretizer, - random_state=random_state) + training_data, + mode=mode, + training_labels=training_labels, + feature_names=feature_names, + categorical_features=categorical_features, + categorical_names=categorical_names, + kernel_width=kernel_width, + kernel=kernel, + verbose=verbose, + class_names=class_names, + feature_selection=feature_selection, + discretize_continuous=discretize_continuous, + discretizer=discretizer, + random_state=random_state) def _make_predict_proba(self, func): """
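
For reference, a minimal usage sketch of the training_data_stats path added by this series, adapted from test_lime_explainer_with_data_stats in patch 1. The load_iris dataset and RandomForestClassifier below are assumptions borrowed from that test, not part of the patch itself; the stats keys and the StatsDiscretizer behavior are taken directly from the patched code.

import collections

import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

from lime.discretize import QuartileDiscretizer
from lime.lime_tabular import LimeTabularExplainer

# Assumed setup (mirrors the test): iris data and a fitted classifier.
iris = load_iris()
train, labels, feature_names = iris.data, iris.target, list(iris.feature_names)

rf = RandomForestClassifier(n_estimators=500)
rf.fit(train, labels)

# Fit a quartile discretizer once on the real data to collect its statistics.
discretizer = QuartileDiscretizer(train, [], feature_names, labels)
discretized_train = discretizer.discretize(train)

# Per-feature value counts over the discretized columns.
feature_values, feature_frequencies = {}, {}
for feature in range(train.shape[1]):
    counts = collections.Counter(discretized_train[:, feature])
    values, frequencies = map(list, zip(*counts.items()))
    feature_values[feature] = values
    feature_frequencies[feature] = frequencies

# The dict must carry every key checked by validate_training_data_stats,
# plus "bins" (plain lists keyed by feature index), which StatsDiscretizer reads.
training_data_stats = {
    "means": discretizer.means,
    "stds": discretizer.stds,
    "mins": discretizer.mins,
    "maxs": discretizer.maxs,
    "bins": {i: qts.tolist()
             for i, qts in enumerate(discretizer.bins(train, labels))},
    "feature_values": feature_values,
    "feature_frequencies": feature_frequencies,
}

# With stats supplied, a placeholder array of the right width is enough;
# discretization bins and sampling frequencies come from the stats dict.
placeholder = np.zeros((2, len(feature_names)))
explainer = LimeTabularExplainer(
    placeholder, feature_names=feature_names,
    training_data_stats=training_data_stats)
exp = explainer.explain_instance(train[0], rf.predict_proba, num_features=2)
print(exp.as_list())

Because the stats dict passes validate_training_data_stats and carries "bins", the explainer builds a StatsDiscretizer instead of recomputing quartiles from the data, so the actual training set does not have to be available at explanation time.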