From 27eecd36e127aa8579adc1c0f6efcbb4cf043d45 Mon Sep 17 00:00:00 2001
From: Faris Osman
Date: Sun, 28 Apr 2019 14:27:13 +0200
Subject: [PATCH 1/8] [Faris] ignores local files

---
 .gitignore | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index ff8ef67..d65bc29 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,6 +24,7 @@ wheels/
 .installed.cfg
 *.egg
 MANIFEST
+requirements.txt
 
 # PyInstaller
 #  Usually these files are written by a python script from a template
@@ -104,4 +105,7 @@ venv.bak/
 .mypy_cache/
 
 # PyCharm
-.idea/
\ No newline at end of file
+.idea/
+
+# vscode
+.vscode/
\ No newline at end of file

From ed36d2d3d8600a80cbdf357d59405f256cc83fbd Mon Sep 17 00:00:00 2001
From: Faris Osman
Date: Sun, 28 Apr 2019 14:28:36 +0200
Subject: [PATCH 2/8] test utils: get_cols_features

---
 tests/utils_test.py | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)
 create mode 100644 tests/utils_test.py

diff --git a/tests/utils_test.py b/tests/utils_test.py
new file mode 100644
index 0000000..f6e1dac
--- /dev/null
+++ b/tests/utils_test.py
@@ -0,0 +1,27 @@
+import unittest
+from causallift import utils
+import pandas as pd
+import numpy as np
+
+
+class UtilsTest(unittest.TestCase):
+
+    def setUp(self):
+        pass
+
+    def test_get_cols_features_should_return_feature_columns_excluding_default_non_feature(self):
+        df = pd.DataFrame(data=np.random.rand(3, 6), columns=['var1', 'var2', 'var3', 'Treatment', 'Outcome', 'Propensity'])
+
+        result = utils.get_cols_features(df)
+
+        self.assertEqual(['var1', 'var2', 'var3'], result)
+
+    def test_get_cols_features_should_return_feature_columns_excluding_non_default_non_feature(self):
+        df = pd.DataFrame(data=np.random.rand(3, 6), columns=['var1', 'var2', 'var3', 'MarketedTo', 'Outcome', 'Probability'])
+
+        result = utils.get_cols_features(df, non_feature_cols=['MarketedTo', 'Outcome', 'Probability'])
+
+        self.assertEqual(['var1', 'var2', 'var3'], result)
+
+if __name__ == '__main__':
+    unittest.main()
\ No newline at end of file

From dc0db6d167ffe71bd5c80a8b78b18aa93f5c9995 Mon Sep 17 00:00:00 2001
From: Faris Osman
Date: Sun, 28 Apr 2019 15:49:38 +0200
Subject: [PATCH 3/8] [Faris] test concat methods

---
 tests/utils_test.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/tests/utils_test.py b/tests/utils_test.py
index f6e1dac..19babdf 100644
--- a/tests/utils_test.py
+++ b/tests/utils_test.py
@@ -23,5 +23,23 @@ def test_get_cols_features_should_return_feature_columns_excluding_non_default_n
 
         self.assertEqual(['var1', 'var2', 'var3'], result)
 
+    def test_concat_train_test_should_concatenate_both_sets_into_series_with_keys(self):
+        train_df = pd.DataFrame(data=np.random.rand(3, 3), columns=['var1', 'var2', 'var3'])
+        test_df = pd.DataFrame(data=np.random.rand(3, 3), columns=['var1', 'var2', 'var3'])
+
+        result = utils.concat_train_test(train=train_df, test=test_df)
+
+        pd.testing.assert_series_equal(pd.Series(train_df), result.xs('train'))
+        pd.testing.assert_series_equal(pd.Series(test_df), result.xs('test'))
+
+    def test_concat_train_test_df_should_concatenate_both_sets_into_frames_with_keys(self):
+        train_df = pd.DataFrame(data=np.random.rand(3, 3), columns=['var1', 'var2', 'var3'])
+        test_df = pd.DataFrame(data=np.random.rand(3, 3), columns=['var1', 'var2', 'var3'])
+
+        result = utils.concat_train_test_df(train=train_df, test=test_df)
+
+        pd.testing.assert_frame_equal(train_df, result.xs('train'))
+        pd.testing.assert_frame_equal(test_df, result.xs('test'))
+
 if __name__ == '__main__':
     unittest.main()
\ No newline at end of file

From 3970d774345766091f483195829f08f494664be1 Mon Sep 17 00:00:00 2001
From: Faris Osman
Date: Sun, 28 Apr 2019 16:29:40 +0200
Subject: [PATCH 4/8] [Faris] test len_t and len_o

---
 tests/utils_test.py | 54 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/tests/utils_test.py b/tests/utils_test.py
index 19babdf..a0b6f01 100644
--- a/tests/utils_test.py
+++ b/tests/utils_test.py
@@ -2,6 +2,7 @@
 from causallift import utils
 import pandas as pd
 import numpy as np
+import random
 
 
 class UtilsTest(unittest.TestCase):
@@ -41,5 +42,58 @@ def test_concat_train_test_df_should_concatenate_both_sets_into_frames_with_keys(
         pd.testing.assert_frame_equal(train_df, result.xs('train'))
         pd.testing.assert_frame_equal(test_df, result.xs('test'))
 
+    def test_len_t_should_return_the_number_of_records_where_treatment_equals_1(self):
+        df = pd.DataFrame(data=np.random.rand(6, 2), columns=['var1', 'var2'])
+        df['Treatment'] = [random.sample(range(2), 1)[0] for i in range(6)]
+
+        length = df[df['Treatment'] == 1].shape[0]
+        result = utils.len_t(df)
+
+        self.assertEqual(length, result)
+
+    def test_len_t_should_return_the_number_of_records_where_treatment_equals_0(self):
+        df = pd.DataFrame(data=np.random.rand(6, 2), columns=['var1', 'var2'])
+        df['Treatment'] = [random.sample(range(2), 1)[0] for i in range(6)]
+
+        length = df[df['Treatment'] == 0].shape[0]
+        result = utils.len_t(df, treatment=0)
+
+        self.assertEqual(length, result)
+
+    def test_len_t_should_return_the_number_of_records_where_treatment_equals_0_and_treatment_col_is_not_default(self):
+        df = pd.DataFrame(data=np.random.rand(6, 2), columns=['var1', 'var2'])
+        df['MarketedTo'] = [random.sample(range(2), 1)[0] for i in range(6)]
+
+        length = df[df['MarketedTo'] == 0].shape[0]
+        result = utils.len_t(df, treatment=0, col_treatment='MarketedTo')
+
+        self.assertEqual(length, result)
+
+    def test_len_o_should_return_the_number_of_records_where_outcome_is_1(self):
+        df = pd.DataFrame(data=np.random.rand(6, 2), columns=['var1', 'var2'])
+        df['Outcome'] = [random.sample(range(2), 1)[0] for i in range(6)]
+
+        length = df[df['Outcome'] == 1].shape[0]
+        result = utils.len_o(df)
+
+        self.assertEqual(length, result)
+
+    def test_len_o_should_return_the_number_of_records_where_outcome_is_0(self):
+        df = pd.DataFrame(data=np.random.rand(6, 2), columns=['var1', 'var2'])
+        df['Outcome'] = [random.sample(range(2), 1)[0] for i in range(6)]
+
+        length = df[df['Outcome'] == 0].shape[0]
+        result = utils.len_o(df, outcome=0)
+
+        self.assertEqual(length, result)
+
+    def test_len_o_should_return_the_number_of_records_where_outcome_equals_0_and_outcome_col_is_not_default(self):
+        df = pd.DataFrame(data=np.random.rand(6, 2), columns=['var1', 'var2'])
+        df['Result'] = [random.sample(range(2), 1)[0] for i in range(6)]
+
+        length = df[df['Result'] == 0].shape[0]
+        result = utils.len_o(df, outcome=0, col_outcome='Result')
+
+        self.assertEqual(length, result)
 if __name__ == '__main__':
     unittest.main()
\ No newline at end of file

From 844569d37aad62bf08c86ca05e499eb8931dee59 Mon Sep 17 00:00:00 2001
From: Faris Osman
Date: Sun, 28 Apr 2019 19:19:55 +0200
Subject: [PATCH 5/8] [Faris] test len_to

---
 tests/utils_test.py | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/tests/utils_test.py b/tests/utils_test.py
index a0b6f01..c4929ab 100644
--- a/tests/utils_test.py
+++ b/tests/utils_test.py
@@ -95,5 +95,37 @@ def test_len_o_should_return_the_number_of_records_where_outcome_equals_0_and_ou
         result = utils.len_o(df, outcome=0, col_outcome='Result')
 
         self.assertEqual(length, result)
+
+    def test_len_to_should_return_the_number_of_records_where_outcome_and_treatment_is_1(self):
+        df = pd.DataFrame(data=np.random.rand(12, 2), columns=['var1', 'var2'])
+        df['Outcome'] = [random.sample(range(2), 1)[0] for i in range(12)]
+        df['Treatment'] = [random.sample(range(2), 1)[0] for i in range(12)]
+
+        length = df[(df['Treatment'] == 1) & (df['Outcome'] == 1)].shape[0]
+        result = utils.len_to(df)
+
+        self.assertEqual(length, result)
+
+    def test_len_to_should_return_the_number_of_records_where_outcome_and_treatment_are_different(self):
+        df = pd.DataFrame(data=np.random.rand(12, 2), columns=['var1', 'var2'])
+        df['Outcome'] = [random.sample(range(2), 1)[0] for i in range(12)]
+        df['Treatment'] = [random.sample(range(2), 1)[0] for i in range(12)]
+
+        length = df[(df['Treatment'] == 1) & (df['Outcome'] == 0)].shape[0]
+        result = utils.len_to(df, outcome=0)
+
+        self.assertEqual(length, result)
+
+    def test_len_to_should_return_the_number_of_records_where_outcome_and_treatment_are_different_with_custom_column_names(self):
+        df = pd.DataFrame(data=np.random.rand(12, 2), columns=['var1', 'var2'])
+        df['result'] = [random.sample(range(2), 1)[0] for i in range(12)]
+        df['marketed_to'] = [random.sample(range(2), 1)[0] for i in range(12)]
+
+        length = df[(df['marketed_to'] == 1) & (df['result'] == 0)].shape[0]
+        result = utils.len_to(df, outcome=0, col_outcome='result', col_treatment='marketed_to')
+
+        self.assertEqual(length, result)
+
+
 if __name__ == '__main__':
     unittest.main()
\ No newline at end of file

From e15b963111e09e42da87376b2f359d2ddf79b7e7 Mon Sep 17 00:00:00 2001
From: Faris Osman
Date: Sun, 28 Apr 2019 19:58:44 +0200
Subject: [PATCH 6/8] [Faris] test methods that compute fractions of treatment and outcome

---
 tests/utils_test.py | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/tests/utils_test.py b/tests/utils_test.py
index c4929ab..aafcd07 100644
--- a/tests/utils_test.py
+++ b/tests/utils_test.py
@@ -126,6 +126,41 @@ def test_len_to_should_return_the_number_of_records_where_outcome_and_treatment_
 
         self.assertEqual(length, result)
 
+    def test_treatment_fraction_should_compute_percentage_of_treated(self):
+        df = pd.DataFrame(data=np.random.rand(12, 2), columns=['var1', 'var2'])
+        df['Treatment'] = [random.sample(range(2), 1)[0] for i in range(12)]
+
+        value = len(df[df['Treatment'] == 1])/len(df)
+        result = utils.treatment_fraction_(df)
+
+        self.assertEqual(value, result)
+
+    def test_treatment_fraction_should_compute_percentage_of_treated_with_custom_name(self):
+        df = pd.DataFrame(data=np.random.rand(12, 2), columns=['var1', 'var2'])
+        df['marketed'] = [random.sample(range(2), 1)[0] for i in range(12)]
+
+        value = len(df[df['marketed'] == 1])/len(df)
+        result = utils.treatment_fraction_(df, col_treatment='marketed')
+
+        self.assertEqual(value, result)
+
+    def test_outcome_fraction_should_compute_percentage_of_positive_outcome(self):
+        df = pd.DataFrame(data=np.random.rand(12, 2), columns=['var1', 'var2'])
+        df['Outcome'] = [random.sample(range(2), 1)[0] for i in range(12)]
+
+        value = len(df[df['Outcome'] == 1])/len(df)
+        result = utils.outcome_fraction_(df)
+
+        self.assertEqual(value, result)
+
+    def test_outcome_fraction_should_compute_percentage_of_positive_outcome_with_custom_name(self):
+        df = pd.DataFrame(data=np.random.rand(12, 2), columns=['var1', 'var2'])
+        df['result'] = [random.sample(range(2), 1)[0] for i in range(12)]
+
+        value = len(df[df['result'] == 1])/len(df)
+        result = utils.outcome_fraction_(df, col_outcome='result')
+
+        self.assertEqual(value, result)
 
 if __name__ == '__main__':
     unittest.main()
\ No newline at end of file

From 72f416f27215844860ad926e286a0b4e5aee210d Mon Sep 17 00:00:00 2001
From: Faris Osman
Date: Sun, 28 Apr 2019 20:57:04 +0200
Subject: [PATCH 7/8] [Faris] test uplift gain for 'sure thing'. Also fixed default column names

---
 causallift/utils.py |  4 ++--
 tests/utils_test.py | 31 +++++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/causallift/utils.py b/causallift/utils.py
index 39286fe..c4ef3ce 100644
--- a/causallift/utils.py
+++ b/causallift/utils.py
@@ -45,8 +45,8 @@ def outcome_fraction_(df, col_outcome='Outcome'):
 
 def overall_uplift_gain_(df, treatment=1.0, outcome=1.0, col_treatment='Treatment', col_outcome='Outcome'):
     overall_uplift_gain = \
-        (len_to(df, col_treatment='Treatment', col_outcome='Outcome') / len_t(df, col_treatment=col_treatment)) \
-        - (len_to(df, 0, 1, col_treatment='Treatment', col_outcome='Outcome') / len_t(df, 0,
+        (len_to(df, col_treatment=col_treatment, col_outcome=col_outcome) / len_t(df, col_treatment=col_treatment)) \
+        - (len_to(df, 0, 1, col_treatment=col_treatment, col_outcome=col_outcome) / len_t(df, 0,
                                                                                           col_treatment=col_treatment))
     return overall_uplift_gain
 
diff --git a/tests/utils_test.py b/tests/utils_test.py
index aafcd07..2ef55a6 100644
--- a/tests/utils_test.py
+++ b/tests/utils_test.py
@@ -162,5 +162,36 @@ def test_outcome_fraction_should_compute_percentage_of_positive_outcome_with_cus
 
         self.assertEqual(value, result)
 
+    def test_overall_uplift_gain_should_compute_uplift_for_sure_things(self):
+        df = pd.DataFrame(data=np.random.rand(12, 2), columns=['var1', 'var2'])
+        df['Outcome'] = [random.sample(range(2), 1)[0] for i in range(12)]
+        df['Treatment'] = [random.sample(range(2), 1)[0] for i in range(12)]
+
+        no_treated_positive_outcome = df[(df['Treatment'] == 1) & (df['Outcome'] == 1)].shape[0]
+        no_not_treated_positive_outcome = df[(df['Treatment'] == 0) & (df['Outcome'] == 1)].shape[0]
+        no_treated = df[df['Treatment'] == 1].shape[0]
+        no_not_treated = df[df['Treatment'] == 0].shape[0]
+
+        gain = (no_treated_positive_outcome/no_treated) - (no_not_treated_positive_outcome/no_not_treated)
+        result = utils.overall_uplift_gain_(df)
+
+        self.assertEqual(gain, result)
+
+    def test_overall_uplift_gain_should_compute_uplift_for_sure_things_with_custom_column_names(self):
+        df = pd.DataFrame(data=np.random.rand(12, 2), columns=['var1', 'var2'])
+        df['Result'] = [random.sample(range(2), 1)[0] for i in range(12)]
+        df['Contacted'] = [random.sample(range(2), 1)[0] for i in range(12)]
+
+        no_treated_positive_outcome = df[(df['Contacted'] == 1) & (df['Result'] == 1)].shape[0]
+        no_not_treated_positive_outcome = df[(df['Contacted'] == 0) & (df['Result'] == 1)].shape[0]
+        no_treated = df[df['Contacted'] == 1].shape[0]
+        no_not_treated = df[df['Contacted'] == 0].shape[0]
+
+        gain = (no_treated_positive_outcome/no_treated) - (no_not_treated_positive_outcome/no_not_treated)
+        result = utils.overall_uplift_gain_(df, col_treatment='Contacted', col_outcome='Result')
+
+        self.assertEqual(gain, result)
+
+
 if __name__ == '__main__':
     unittest.main()
\ No newline at end of file

From 85262c2cf9d4f9534756e2c538b4dcb912fc72cd Mon Sep 17 00:00:00 2001
From: Faris Osman
Date: Sun, 28 Apr 2019 21:24:00 +0200
Subject: [PATCH 8/8] [Faris] imports missing numpy module
---
 causallift/utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/causallift/utils.py b/causallift/utils.py
index c4ef3ce..68c3d1d 100644
--- a/causallift/utils.py
+++ b/causallift/utils.py
@@ -2,6 +2,7 @@
 """ Utility functions """
 
 import pandas as pd
+import numpy as np
 from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
 from sklearn.metrics import confusion_matrix
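
Reviewer note (illustration only, not part of the patch series): the causallift/utils.py change in PATCH 7/8 matters because overall_uplift_gain_ previously hard-coded the 'Treatment' and 'Outcome' column names inside its len_to() calls, so the col_treatment and col_outcome arguments were silently ignored. The snippet below is a minimal, self-contained sketch of the quantity that function computes (the conversion rate among treated rows minus the conversion rate among untreated rows), using a hypothetical toy frame with non-default column names, mirroring the arithmetic asserted in the new tests:

    import pandas as pd

    # Hypothetical toy data with non-default column names.
    df = pd.DataFrame({
        'Contacted': [1, 1, 1, 1, 0, 0, 0, 0],  # treatment flag
        'Result':    [1, 0, 1, 1, 1, 0, 0, 0],  # outcome flag
    })

    treated = df[df['Contacted'] == 1]
    control = df[df['Contacted'] == 0]

    # Uplift gain: P(outcome == 1 | treated) - P(outcome == 1 | untreated).
    gain = (treated['Result'] == 1).mean() - (control['Result'] == 1).mean()
    print(gain)  # 0.75 - 0.25 = 0.5

With the patched code, utils.overall_uplift_gain_(df, col_treatment='Contacted', col_outcome='Result') should agree with this value; before the fix, the len_to() calls always looked up the literal 'Treatment' and 'Outcome' columns regardless of the arguments passed, which is exactly what the new custom-column-name test guards against.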