Merge pull request #2 from farismosman/master

1. Add unit tests for the utility functions in the `utils` module. 2. Fix a minor bug in `overall_uplift_gain_`: it passed the hard-coded column names 'Treatment' and 'Outcome' to `len_to` instead of the caller's `col_treatment` and `col_outcome` arguments, so non-default column names were ignored. 3. Add the missing `numpy` import. 4. The functions `score_df` and `conf_mat_df` are not unit-tested yet; a good test for them still has to be designed.
Minyus · Apr 29, 2019 · c1fae27 · c1fae27
2 parents 33a3848 + 436c06a
commit c1fae27
Show file tree

Hide file tree

Showing 3 changed files with 204 additions and 2 deletions.
diff --git a/.gitignore b/.gitignore
@@ -24,6 +24,7 @@ wheels/
 .installed.cfg
 *.egg
 MANIFEST
+requirements.txt
 
 # PyInstaller
 #  Usually these files are written by a python script from a template
@@ -113,3 +114,6 @@ Pipfile.lock
 # Output CSV files
 examples/CATE_for_Train.csv
 examples/CATE_for_Test.csv
+
+# vscode
+.vscode/
diff --git a/causallift/utils.py b/causallift/utils.py
@@ -2,6 +2,7 @@
 """ Utility functions """
 
 import pandas as pd
+import numpy as np
 from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
 from sklearn.metrics import confusion_matrix
 
@@ -45,8 +46,8 @@ def outcome_fraction_(df, col_outcome='Outcome'):
 def overall_uplift_gain_(df, treatment=1.0, outcome=1.0,
                          col_treatment='Treatment', col_outcome='Outcome'):
     overall_uplift_gain = \
-        (len_to(df, col_treatment='Treatment', col_outcome='Outcome') / len_t(df, col_treatment=col_treatment)) \
-        - (len_to(df, 0, 1, col_treatment='Treatment', col_outcome='Outcome') / len_t(df, 0,
+        (len_to(df, col_treatment=col_treatment, col_outcome=col_outcome) / len_t(df, col_treatment=col_treatment)) \
+        - (len_to(df, 0, 1, col_treatment=col_treatment, col_outcome=col_outcome) / len_t(df, 0,
                                                                                       col_treatment=col_treatment))
     return overall_uplift_gain
 

diff --git a/tests/utils_test.py b/tests/utils_test.py
@@ -0,0 +1,197 @@
+import unittest
+from causallift import utils
+import pandas as pd
+import numpy as np
+import random
+
+
+class UtilsTest(unittest.TestCase):
+    """Unit tests for the helper functions in ``causallift.utils``.
+
+    Each test builds a small random DataFrame, computes the expected value
+    directly with pandas, and compares it with what the helper returns.
+    """
+
+    def setUp(self):
+        # Seed both generators so that a failing run can be reproduced.
+        random.seed(0)
+        np.random.seed(0)
+
+    @staticmethod
+    def _feature_frame(n_rows):
+        # Two random feature columns; the tests only depend on the row count.
+        return pd.DataFrame(data=np.random.rand(n_rows, 2), columns=['var1', 'var2'])
+
+    @staticmethod
+    def _binary_column(n_rows):
+        # Random 0/1 flags that always contain at least one 0 and one 1, so
+        # that neither group is empty and ratio-based expectations never
+        # divide by zero. Requires n_rows >= 2.
+        flags = [0, 1] + [random.randint(0, 1) for _ in range(n_rows - 2)]
+        random.shuffle(flags)
+        return flags
+
+    def test_get_cols_features_should_return_feature_columns_excluding_default_non_feature(self):
+        columns = ['var1', 'var2', 'var3', 'Treatment', 'Outcome', 'Propensity']
+        df = pd.DataFrame(data=np.random.rand(3, 6), columns=columns)
+
+        result = utils.get_cols_features(df)
+
+        self.assertEqual(['var1', 'var2', 'var3'], result)
+
+    def test_get_cols_features_should_return_feature_columns_excluding_non_default_non_feature(self):
+        columns = ['var1', 'var2', 'var3', 'MarketedTo', 'Outcome', 'Probability']
+        df = pd.DataFrame(data=np.random.rand(3, 6), columns=columns)
+
+        result = utils.get_cols_features(df, non_feature_cols=['MarketedTo', 'Outcome', 'Probability'])
+
+        self.assertEqual(['var1', 'var2', 'var3'], result)
+
+    def test_concat_train_test_should_concatnate_both_sets_into_series_with_keys(self):
+        train_df = pd.DataFrame(data=np.random.rand(3, 3), columns=['var1', 'var2', 'var3'])
+        test_df = pd.DataFrame(data=np.random.rand(3, 3), columns=['var1', 'var2', 'var3'])
+
+        result = utils.concat_train_test(train=train_df, test=test_df)
+
+        # NOTE(review): the expectation wraps each DataFrame in pd.Series,
+        # presumably mirroring what concat_train_test does internally --
+        # confirm against the implementation.
+        pd.testing.assert_series_equal(pd.Series(train_df), result.xs('train'))
+        pd.testing.assert_series_equal(pd.Series(test_df), result.xs('test'))
+
+    def test_concat_train_test_df_should_concatnate_both_sets_into_frames_with_keys(self):
+        train_df = pd.DataFrame(data=np.random.rand(3, 3), columns=['var1', 'var2', 'var3'])
+        test_df = pd.DataFrame(data=np.random.rand(3, 3), columns=['var1', 'var2', 'var3'])
+
+        result = utils.concat_train_test_df(train=train_df, test=test_df)
+
+        pd.testing.assert_frame_equal(train_df, result.xs('train'))
+        pd.testing.assert_frame_equal(test_df, result.xs('test'))
+
+    def test_len_t_should_return_the_number_of_records_where_treatment_equals_1(self):
+        df = self._feature_frame(6)
+        df['Treatment'] = self._binary_column(6)
+
+        length = df[df['Treatment'] == 1].shape[0]
+        result = utils.len_t(df)
+
+        self.assertEqual(length, result)
+
+    def test_len_t_should_return_the_number_of_records_where_treatment_equals_0(self):
+        df = self._feature_frame(6)
+        df['Treatment'] = self._binary_column(6)
+
+        length = df[df['Treatment'] == 0].shape[0]
+        result = utils.len_t(df, treatment=0)
+
+        self.assertEqual(length, result)
+
+    def test_len_t_should_return_the_number_of_records_where_treatment_equals_0_and_treatment_col_is_not_default(self):
+        df = self._feature_frame(6)
+        df['MarketedTo'] = self._binary_column(6)
+
+        length = df[df['MarketedTo'] == 0].shape[0]
+        result = utils.len_t(df, treatment=0, col_treatment='MarketedTo')
+
+        self.assertEqual(length, result)
+
+    def test_len_o_should_return_the_number_of_records_where_outcome_is_1(self):
+        df = self._feature_frame(6)
+        df['Outcome'] = self._binary_column(6)
+
+        length = df[df['Outcome'] == 1].shape[0]
+        result = utils.len_o(df)
+
+        self.assertEqual(length, result)
+
+    def test_len_o_should_return_the_number_of_records_where_outcome_is_0(self):
+        df = self._feature_frame(6)
+        df['Outcome'] = self._binary_column(6)
+
+        length = df[df['Outcome'] == 0].shape[0]
+        result = utils.len_o(df, outcome=0)
+
+        self.assertEqual(length, result)
+
+    def test_len_o_should_return_the_number_of_records_where_outcome_equals_0_and_outcome_col_is_not_default(self):
+        df = self._feature_frame(6)
+        df['Result'] = self._binary_column(6)
+
+        length = df[df['Result'] == 0].shape[0]
+        result = utils.len_o(df, outcome=0, col_outcome='Result')
+
+        self.assertEqual(length, result)
+
+    def test_len_to_should_return_the_number_of_records_where_outcome_and_treatment_is_1(self):
+        df = self._feature_frame(12)
+        df['Outcome'] = self._binary_column(12)
+        df['Treatment'] = self._binary_column(12)
+
+        length = df[(df['Treatment'] == 1) & (df['Outcome'] == 1)].shape[0]
+        result = utils.len_to(df)
+
+        self.assertEqual(length, result)
+
+    def test_len_to_should_return_the_number_of_records_where_outcome_and_treatment_are_different(self):
+        df = self._feature_frame(12)
+        df['Outcome'] = self._binary_column(12)
+        df['Treatment'] = self._binary_column(12)
+
+        length = df[(df['Treatment'] == 1) & (df['Outcome'] == 0)].shape[0]
+        result = utils.len_to(df, outcome=0)
+
+        self.assertEqual(length, result)
+
+    def test_len_to_should_return_the_number_of_records_where_outcome_and_treatment_are_different_with_custom_column_names(self):
+        df = self._feature_frame(12)
+        df['result'] = self._binary_column(12)
+        df['marketed_to'] = self._binary_column(12)
+
+        length = df[(df['marketed_to'] == 1) & (df['result'] == 0)].shape[0]
+        result = utils.len_to(df, outcome=0, col_outcome='result', col_treatment='marketed_to')
+
+        self.assertEqual(length, result)
+
+    def test_treatment_fraction_should_compute_percentage_of_treated(self):
+        df = self._feature_frame(12)
+        df['Treatment'] = self._binary_column(12)
+
+        value = len(df[df['Treatment'] == 1]) / len(df)
+        result = utils.treatment_fraction_(df)
+
+        # Ratios are floats: compare with a tolerance, not exact equality.
+        self.assertAlmostEqual(value, result)
+
+    def test_treatment_fraction_should_compute_percentage_of_treated_with_custom_name(self):
+        df = self._feature_frame(12)
+        df['marketed'] = self._binary_column(12)
+
+        value = len(df[df['marketed'] == 1]) / len(df)
+        result = utils.treatment_fraction_(df, col_treatment='marketed')
+
+        self.assertAlmostEqual(value, result)
+
+    def test_outcome_fraction_should_compute_percentage_of_positive_outcome(self):
+        df = self._feature_frame(12)
+        df['Outcome'] = self._binary_column(12)
+
+        value = len(df[df['Outcome'] == 1]) / len(df)
+        result = utils.outcome_fraction_(df)
+
+        self.assertAlmostEqual(value, result)
+
+    def test_outcome_fraction_should_compute_percentage_of_positive_outcome_with_custom_name(self):
+        df = self._feature_frame(12)
+        df['result'] = self._binary_column(12)
+
+        value = len(df[df['result'] == 1]) / len(df)
+        result = utils.outcome_fraction_(df, col_outcome='result')
+
+        self.assertAlmostEqual(value, result)
+
+    def test_overall_uplift_gain_should_compute_uplift_for_sure_things(self):
+        df = self._feature_frame(12)
+        df['Outcome'] = self._binary_column(12)
+        # _binary_column guarantees both a treated and an untreated row, so
+        # the denominators below are never zero.
+        df['Treatment'] = self._binary_column(12)
+
+        treated = df['Treatment'] == 1
+        positive = df['Outcome'] == 1
+        no_treated_positive_outcome = df[treated & positive].shape[0]
+        no_not_treated_positive_outcome = df[~treated & positive].shape[0]
+        no_treated = df[treated].shape[0]
+        no_not_treated = df[~treated].shape[0]
+
+        gain = (no_treated_positive_outcome / no_treated) - (no_not_treated_positive_outcome / no_not_treated)
+        result = utils.overall_uplift_gain_(df)
+
+        self.assertAlmostEqual(gain, result)
+
+    def test_overall_uplift_gain_should_compute_uplift_for_sure_things_with_custom_colum_names(self):
+        df = self._feature_frame(12)
+        df['Result'] = self._binary_column(12)
+        # Both groups are guaranteed non-empty (see _binary_column).
+        df['Contacted'] = self._binary_column(12)
+
+        treated = df['Contacted'] == 1
+        positive = df['Result'] == 1
+        no_treated_positive_outcome = df[treated & positive].shape[0]
+        no_not_treated_positive_outcome = df[~treated & positive].shape[0]
+        no_treated = df[treated].shape[0]
+        no_not_treated = df[~treated].shape[0]
+
+        gain = (no_treated_positive_outcome / no_treated) - (no_not_treated_positive_outcome / no_not_treated)
+        result = utils.overall_uplift_gain_(df, col_treatment='Contacted', col_outcome='Result')
+
+        self.assertAlmostEqual(gain, result)
+
+
+# Run the test suite when this module is executed directly as a script.
+if __name__ == '__main__':
+    unittest.main()