Skip to content

Commit

Permalink
Merge pull request #2 from farismosman/master
Browse files Browse the repository at this point in the history
1. Unit testing utility functions in the module named utils.
2. Fix a minor bug in overall_uplift_gain_: its len_to calls hard-coded the column names 'Treatment' and 'Outcome', so the col_treatment and col_outcome arguments were ignored for those counts.
3. Add the missing numpy import to causallift/utils.py.
4. The functions score_df and conf_mat_df were not unit-tested. I will have to think of a good test.
  • Loading branch information
Minyus authored Apr 29, 2019
2 parents 33a3848 + 436c06a commit c1fae27
Show file tree
Hide file tree
Showing 3 changed files with 204 additions and 2 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ wheels/
.installed.cfg
*.egg
MANIFEST
requirements.txt

# PyInstaller
# Usually these files are written by a python script from a template
Expand Down Expand Up @@ -113,3 +114,6 @@ Pipfile.lock
# Output CSV files
examples/CATE_for_Train.csv
examples/CATE_for_Test.csv

# vscode
.vscode/
5 changes: 3 additions & 2 deletions causallift/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
""" Utility functions """

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix

Expand Down Expand Up @@ -45,8 +46,8 @@ def outcome_fraction_(df, col_outcome='Outcome'):
def overall_uplift_gain_(df, treatment=1.0, outcome=1.0,
                         col_treatment='Treatment', col_outcome='Outcome'):
    """Return the treated positive-outcome rate minus the control one.

    The treated rate is ``len_to(df) / len_t(df)`` using those helpers'
    default treatment/outcome values; the control rate uses treatment ``0``
    and outcome ``1``.  Both rates read the columns named by
    ``col_treatment`` and ``col_outcome`` instead of hard-coded names.

    Args:
        df: Data frame holding the treatment and outcome columns.
        treatment: Accepted for interface compatibility; not read here.
        outcome: Accepted for interface compatibility; not read here.
        col_treatment: Name of the treatment flag column.
        col_outcome: Name of the outcome flag column.

    Returns:
        The difference between the two rates.

    NOTE(review): no guard against an empty treated or control group; the
    division presumably fails or yields a non-finite value then -- confirm
    against ``len_t``.
    """
    # Share of the treated rows that have a positive outcome.
    treated_rate = (
        len_to(df, col_treatment=col_treatment, col_outcome=col_outcome)
        / len_t(df, col_treatment=col_treatment)
    )
    # Same share for the control group (treatment 0, outcome 1).
    control_rate = (
        len_to(df, 0, 1, col_treatment=col_treatment, col_outcome=col_outcome)
        / len_t(df, 0, col_treatment=col_treatment)
    )
    overall_uplift_gain = treated_rate - control_rate
    return overall_uplift_gain

Expand Down
197 changes: 197 additions & 0 deletions tests/utils_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
import unittest
from causallift import utils
import pandas as pd
import numpy as np
import random


class UtilsTest(unittest.TestCase):
    """Unit tests for the helper functions in ``causallift.utils``.

    The 0/1 treatment and outcome columns use fixed patterns instead of
    random draws.  Every run therefore checks the same data, and the uplift
    tests can never meet an empty treated or control group, which would
    raise ``ZeroDivisionError`` while computing the expected value.
    """

    # Each pattern contains both 0 and 1 within its first six entries, so it
    # stays usable when truncated for the smaller fixtures.  Used together as
    # treatment/outcome, all four combinations occur, and the treated and
    # control groups differ in size (7 vs 5) and in positive-outcome rate.
    TREATMENT_FLAGS = [1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1]
    OUTCOME_FLAGS = [1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0]

    @staticmethod
    def _frame_with_flags(n_rows, **flag_columns):
        """Build a frame with two random feature columns plus flag columns.

        Each keyword names a column to add; its value is a flag pattern that
        is truncated to ``n_rows`` entries.  The feature values are random
        because no assertion depends on them.
        """
        df = pd.DataFrame(data=np.random.rand(n_rows, 2), columns=['var1', 'var2'])
        for name, flags in flag_columns.items():
            df[name] = flags[:n_rows]
        return df

    def test_get_cols_features_should_return_feature_columns_excluding_default_non_feature(self):
        df = pd.DataFrame(
            data=np.random.rand(3, 6),
            columns=['var1', 'var2', 'var3', 'Treatment', 'Outcome', 'Propensity'])

        result = utils.get_cols_features(df)

        self.assertEqual(['var1', 'var2', 'var3'], result)

    def test_get_cols_features_should_return_feature_columns_excluding_non_default_non_feature(self):
        df = pd.DataFrame(
            data=np.random.rand(3, 6),
            columns=['var1', 'var2', 'var3', 'MarketedTo', 'Outcome', 'Probability'])

        result = utils.get_cols_features(
            df, non_feature_cols=['MarketedTo', 'Outcome', 'Probability'])

        self.assertEqual(['var1', 'var2', 'var3'], result)

    def test_concat_train_test_should_concatnate_both_sets_into_series_with_keys(self):
        train_df = pd.DataFrame(data=np.random.rand(3, 3), columns=['var1', 'var2', 'var3'])
        test_df = pd.DataFrame(data=np.random.rand(3, 3), columns=['var1', 'var2', 'var3'])

        result = utils.concat_train_test(train=train_df, test=test_df)

        # NOTE(review): the expected values wrap each frame in pd.Series,
        # presumably mirroring what concat_train_test does internally --
        # confirm against the implementation.
        pd.testing.assert_series_equal(pd.Series(train_df), result.xs('train'))
        pd.testing.assert_series_equal(pd.Series(test_df), result.xs('test'))

    def test_concat_train_test_df_should_concatnate_both_sets_into_frames_with_keys(self):
        train_df = pd.DataFrame(data=np.random.rand(3, 3), columns=['var1', 'var2', 'var3'])
        test_df = pd.DataFrame(data=np.random.rand(3, 3), columns=['var1', 'var2', 'var3'])

        result = utils.concat_train_test_df(train=train_df, test=test_df)

        pd.testing.assert_frame_equal(train_df, result.xs('train'))
        pd.testing.assert_frame_equal(test_df, result.xs('test'))

    def test_len_t_should_return_the_number_of_records_where_treatment_equals_1(self):
        df = self._frame_with_flags(6, Treatment=self.TREATMENT_FLAGS)

        length = df[df['Treatment'] == 1].shape[0]
        result = utils.len_t(df)

        self.assertEqual(length, result)

    def test_len_t_should_return_the_number_of_records_where_treatment_equals_0(self):
        df = self._frame_with_flags(6, Treatment=self.TREATMENT_FLAGS)

        length = df[df['Treatment'] == 0].shape[0]
        result = utils.len_t(df, treatment=0)

        self.assertEqual(length, result)

    def test_len_t_should_return_the_number_of_records_where_treatment_equals_0_and_treatment_col_is_not_default(self):
        df = self._frame_with_flags(6, MarketedTo=self.TREATMENT_FLAGS)

        length = df[df['MarketedTo'] == 0].shape[0]
        result = utils.len_t(df, treatment=0, col_treatment='MarketedTo')

        self.assertEqual(length, result)

    def test_len_o_should_return_the_number_of_records_where_outcome_is_1(self):
        df = self._frame_with_flags(6, Outcome=self.OUTCOME_FLAGS)

        length = df[df['Outcome'] == 1].shape[0]
        result = utils.len_o(df)

        self.assertEqual(length, result)

    def test_len_o_should_return_the_number_of_records_where_outcome_is_0(self):
        df = self._frame_with_flags(6, Outcome=self.OUTCOME_FLAGS)

        length = df[df['Outcome'] == 0].shape[0]
        result = utils.len_o(df, outcome=0)

        self.assertEqual(length, result)

    def test_len_o_should_return_the_number_of_records_where_outcome_equals_0_and_outcome_col_is_not_default(self):
        df = self._frame_with_flags(6, Result=self.OUTCOME_FLAGS)

        length = df[df['Result'] == 0].shape[0]
        result = utils.len_o(df, outcome=0, col_outcome='Result')

        self.assertEqual(length, result)

    def test_len_to_should_return_the_number_of_records_where_outcome_and_treatment_is_1(self):
        df = self._frame_with_flags(
            12, Outcome=self.OUTCOME_FLAGS, Treatment=self.TREATMENT_FLAGS)

        length = df[(df['Treatment'] == 1) & (df['Outcome'] == 1)].shape[0]
        result = utils.len_to(df)

        self.assertEqual(length, result)

    def test_len_to_should_return_the_number_of_records_where_outcome_and_treatment_are_different(self):
        df = self._frame_with_flags(
            12, Outcome=self.OUTCOME_FLAGS, Treatment=self.TREATMENT_FLAGS)

        length = df[(df['Treatment'] == 1) & (df['Outcome'] == 0)].shape[0]
        result = utils.len_to(df, outcome=0)

        self.assertEqual(length, result)

    def test_len_to_should_return_the_number_of_records_where_outcome_and_treatment_are_different_with_custom_column_names(self):
        df = self._frame_with_flags(
            12, result=self.OUTCOME_FLAGS, marketed_to=self.TREATMENT_FLAGS)

        length = df[(df['marketed_to'] == 1) & (df['result'] == 0)].shape[0]
        result = utils.len_to(df, outcome=0, col_outcome='result', col_treatment='marketed_to')

        self.assertEqual(length, result)

    def test_treatment_fraction_should_compute_percentage_of_treated(self):
        df = self._frame_with_flags(12, Treatment=self.TREATMENT_FLAGS)

        value = len(df[df['Treatment'] == 1]) / len(df)
        result = utils.treatment_fraction_(df)

        # Ratios are floats; compare with a tolerance rather than exactly.
        self.assertAlmostEqual(value, result)

    def test_treatment_fraction_should_compute_percentage_of_treated_with_custom_name(self):
        df = self._frame_with_flags(12, marketed=self.TREATMENT_FLAGS)

        value = len(df[df['marketed'] == 1]) / len(df)
        result = utils.treatment_fraction_(df, col_treatment='marketed')

        self.assertAlmostEqual(value, result)

    def test_outcome_fraction_should_compute_percentage_of_positive_outcome(self):
        df = self._frame_with_flags(12, Outcome=self.OUTCOME_FLAGS)

        value = len(df[df['Outcome'] == 1]) / len(df)
        result = utils.outcome_fraction_(df)

        self.assertAlmostEqual(value, result)

    def test_outcome_fraction_should_compute_percentage_of_positive_outcome_with_custom_name(self):
        df = self._frame_with_flags(12, result=self.OUTCOME_FLAGS)

        value = len(df[df['result'] == 1]) / len(df)
        result = utils.outcome_fraction_(df, col_outcome='result')

        self.assertAlmostEqual(value, result)

    def test_overall_uplift_gain_should_compute_uplift_for_sure_things(self):
        df = self._frame_with_flags(
            12, Outcome=self.OUTCOME_FLAGS, Treatment=self.TREATMENT_FLAGS)

        no_treated_positive_outcome = df[(df['Treatment'] == 1) & (df['Outcome'] == 1)].shape[0]
        no_not_treated_positive_outcome = df[(df['Treatment'] == 0) & (df['Outcome'] == 1)].shape[0]
        no_treated = df[df['Treatment'] == 1].shape[0]
        no_not_treated = df[df['Treatment'] == 0].shape[0]

        # Both denominators are non-zero because the fixed patterns contain
        # treated and control rows.
        gain = (no_treated_positive_outcome / no_treated) \
            - (no_not_treated_positive_outcome / no_not_treated)
        result = utils.overall_uplift_gain_(df)

        self.assertAlmostEqual(gain, result)

    def test_overall_uplift_gain_should_compute_uplift_for_sure_things_with_custom_colum_names(self):
        df = self._frame_with_flags(
            12, Result=self.OUTCOME_FLAGS, Contacted=self.TREATMENT_FLAGS)

        no_treated_positive_outcome = df[(df['Contacted'] == 1) & (df['Result'] == 1)].shape[0]
        no_not_treated_positive_outcome = df[(df['Contacted'] == 0) & (df['Result'] == 1)].shape[0]
        no_treated = df[df['Contacted'] == 1].shape[0]
        no_not_treated = df[df['Contacted'] == 0].shape[0]

        gain = (no_treated_positive_outcome / no_treated) \
            - (no_not_treated_positive_outcome / no_not_treated)
        result = utils.overall_uplift_gain_(df, col_treatment='Contacted', col_outcome='Result')

        self.assertAlmostEqual(gain, result)


# Run the test suite when this module is executed directly as a script.
if __name__ == '__main__':
    unittest.main()

0 comments on commit c1fae27

Please sign in to comment.