-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_mining_code.py
More file actions
1158 lines (834 loc) · 52.9 KB
/
data_mining_code.py
File metadata and controls
1158 lines (834 loc) · 52.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# -*- coding: utf-8 -*-
"""Data Mining Code.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1aaz-zth0axnIHMevlMazeB47IlS0mOAa
"""
# install the package in google collab
!pip install ydata-profiling
pip install pycountry
# Commented out IPython magic to ensure Python compatibility.
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.pyplot as plt
# %matplotlib inline
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.metrics import mean_squared_error
import plotly.graph_objs as go
from plotly.offline import iplot
import pycountry
from sklearn.model_selection import GridSearchCV
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (classification_report, f1_score, recall_score,
roc_auc_score, balanced_accuracy_score)
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
# for a more in depth EDA
from ydata_profiling import ProfileReport
"""1. Number of Contacts: Users with a larger number of contacts may be more engaged with the platform's social aspects and less likely to churn.
2. Plan Type: Users on certain plans may be more likely to churn. Higher-tier plans might have more features, which could increase user retention.
3. Notification Preferences: Users who have opted out of marketing notifications (both push and email) may be less engaged with the platform and more likely to churn.
4. Transaction Activity: Users who have not made transactions recently may be more likely to churn. This could be inferred from the "transactions_state" column.
5. Geographic Location: Churn rates may vary based on the user's country, city, or even merchant location (ea_merchant_city, ea_merchant_country). Certain regions may have different economic conditions or competition from other financial service providers, influencing churn rates.
6. Usage of Crypto Feature: Users who have unlocked the crypto feature may be more engaged with the platform and less likely to churn.
7. Device type: Maybe users who use a certain device are more likely to be churn users.
# **1. STARTING EDA**
# **Users exploration**
"""
pwd
users = pd.read_csv("users.csv")
users.head()
users.info()
number_columns = users.select_dtypes(include=['int64', 'float64'])
number_columns.mean()
# Notice that created_date is type object, lets convert it
users['created_date'] = pd.to_datetime(users['created_date'])
print('The last created user was {} and the first {}'.format(users['created_date'].max(), users['created_date'].min()))
# adding describe for a quick EDA, notice the num_referrals & num_successful_referrals column are 0 for mean, min, max meaning they have no values
users.describe()
#The plans are three: GOLD the best and Standard the worst
distinct_plan = users['plan'].unique()
print(distinct_plan)
distinct_user_settings_crypto_unlocked = users['user_settings_crypto_unlocked'].unique()
print("Unique user_settings_crypto_unlocked: {}".format(distinct_user_settings_crypto_unlocked))
distinct_attributes_notifications_marketing_push = users['attributes_notifications_marketing_push'].unique()
print("Unique attributes_notifications_marketing_push: {}".format(distinct_attributes_notifications_marketing_push))
distinct_attributes_notifications_marketing_email = users['attributes_notifications_marketing_email'].unique()
print("Unique attributes_notifications_marketing_email: {}".format(distinct_attributes_notifications_marketing_email))
distinct_num_referrals = users['num_referrals'].unique()
print("Unique num_referrals: {}".format(distinct_num_referrals))
distinct_num_successful_referrals = users['num_successful_referrals'].unique()
print("Unique num_successful_referrals: {}".format(distinct_num_successful_referrals))
#num_referrals and num_succeddful_referrals are all 0 so we can drop these two columns as they do not give us any information to distinguish churn users
distinct_country = users['country'].unique()
print(distinct_country)
distinct_city = users['city'].unique()
print(distinct_city)
print('The oldest user was born in {} and the youngest {}'.format(users['birth_year'].min(), users['birth_year'].max()))
plt.figure(figsize=(10, 6))
plt.hist(users['birth_year'], bins=range(users['birth_year'].min(), users['birth_year'].max() + 1), edgecolor='black')
plt.title('Distribution of Users per Birth Year')
plt.xlabel('Birth Year')
plt.ylabel('Number of Users')
plt.xticks(range(users['birth_year'].min(), users['birth_year'].max() + 1, 5)) # Set the step to 5 for example
plt.grid(True)
plt.show()
# Check for missing values-attributes_notifications_marketing_push and attributes_notifications_marketing_email have many null values
print(users.isnull().sum())
"""The below plot is to check outliers. num_contacts has a lot of outliers with the highest number being 3000. This could be real so I will not remove the outliers."""
users.iloc[:, 3:11].plot.box(figsize=(10,6))
users_filtered = users.drop(columns=['num_successful_referrals', 'num_referrals'])
# Histogram of a numerical column
plt.hist(users_filtered['num_contacts'])
plt.xlabel('Number of Contacts')
plt.ylabel('Frequency')
plt.title('Distribution of Number of Contacts')
plt.show()
# Scatter plot of two numerical columns
plt.scatter(users_filtered['birth_year'], users_filtered['num_contacts'])
plt.xlabel('Birth Year')
plt.ylabel('Number of Contacts')
plt.title('Relationship between Birth Year and Number of Contacts')
plt.show()
# Box plot of a numerical column grouped by a categorical column
sns.boxplot(x='plan', y='num_contacts', data=users_filtered)
plt.xlabel('Plan')
plt.ylabel('Number of Contacts')
plt.title('Distribution of Number of Contacts by Plan')
plt.show()
# correlations between user characteristics after dropping num_referrals & num_successful_referrals
users_filtered.corr(numeric_only = True)
sns.heatmap(users_filtered.corr(numeric_only = True),annot=True)
"""Weak correlations overall; somewhat medium correlation between marketing_push and marketing_email which makes intuitive sense as opting in for push will most likely opt in for email and vice versa. This could be because for instant the opting in for push / email are on the same screen, so users at that time can select either medium for opting in."""
# examine profile report
profile_users = ProfileReport(users_filtered, title = "Users Filtered Report")
profile_users
print("Summary of EDA")
print(" ")
print("1. High EU country presence with UK being #1 (1/3 users from there), and top 5 being in the EU")
print(" ")
print(users_filtered['country'].value_counts().head())
print(" ")
print("2. Regarding City, London (7.5% of users), Paris, Dublin, Warsaw and Bucharest are amongst top 5")
print(users_filtered['city'].value_counts().head())
print(" ")
print("3. User crypto setting highly unbalanced with only 18% having activated the feature")
print(" ")
print("4. User plan highly unbalanced with 93% belonging to the standard plan.")
print(" ")
print("5. Marketing push and marketing email have a high number of missing values. We will need to see how we can treat them during preprocessing")
print(" ")
"""# **Transactions exploration**"""
pwd
transactions = pd.read_csv("transactions.csv")
transactions.head()
transactions.shape
transactions.info()
"""The users who have not made recent transactions may be a churn user."""
transactions['created_date'] = pd.to_datetime(transactions['created_date'])
print('The latest transaction was {} and the first {}'.format(transactions['created_date'].max(), transactions['created_date'].min()))
number_columns = transactions.select_dtypes(include=['int64', 'float64'])
number_columns.describe()
# ea_cardholderpresence, ea_merchant_mcc, ea_merchant_city, ea_merchant_country have null values
print(transactions.isnull().sum())
distinct_transactions_type = transactions['transactions_type'].unique()
print('transactions_type: {}'.format(distinct_transactions_type))
distinct_transactions_currency = transactions['transactions_currency'].unique()
print('transactions_currency: {}'.format(distinct_transactions_currency))
distinct_transactions_state = transactions['transactions_state'].unique()
print('transactions_state: {}'.format(distinct_transactions_state))
distinct_ea_cardholderpresence = transactions['ea_cardholderpresence'].unique()
print('ea_cardholderpresence: {}'.format(distinct_ea_cardholderpresence))
#This column contains the Merchant Category Code (MCC) for the merchant involved in the transaction. The MCC is a four-digit code used to classify merchants by their primary business activities.
distinct_ea_merchant_mcc = transactions['ea_merchant_mcc'].unique()
print('ea_merchant_mcc: {}'.format(distinct_ea_merchant_mcc))
distinct_ea_merchant_city = transactions['ea_merchant_city'].unique()
print('ea_merchant_city: {}'.format(distinct_ea_merchant_city))
distinct_ea_merchant_country = transactions['ea_merchant_country'].unique()
print('ea_merchant_country: {}'.format(distinct_ea_merchant_country))
distinct_direction = transactions['direction'].unique()
print('direction: {}'.format(distinct_direction))
# the thought process was initially to keep the columns and fill them with with the Value of UNKNOWN
# However these values are close to 50% of the column values
# We will proceed with removing these columns and keep a mental note to perhaps repeat in the future with UNKNOWN
transactions.drop(columns = ["ea_cardholderpresence","ea_merchant_mcc","ea_merchant_city","ea_merchant_country"],inplace=True)
#transaction to be 0 seems wrong
print('The largest amount was {} and the smallest {}'.format(transactions['amount_usd'].max(), transactions['amount_usd'].min()))
#it is weird that those transactions were completed but we look for churn users and transactions even with 0 dollars are important to identify churn users so we will not remove them.
print(transactions[(transactions['amount_usd'] == 0.0) & (transactions['transactions_state'] == 'COMPLETED')])
# it seems unbelievable that one transaction was 74641551593.26 usd
transactions.iloc[:, 1:7].plot.box(figsize=(10,6))
z_scores = np.abs(stats.zscore(transactions['amount_usd']))
threshold = 3
transactions = transactions[(z_scores < threshold)]
transactions.iloc[:, 1:7].plot.box(figsize=(10,6))
transactions.info()
transactions.iloc[:,1:-2] # select all the transactions columns we want without having the transaction id, created date and user id
transactions.iloc[:,1:-2]["direction"].value_counts()
transactions.iloc[:,1:-2]["transactions_state"].value_counts()
transactions.iloc[:,1:-2]["transactions_currency"].value_counts()
transactions.iloc[:,1:-2]["transactions_type"].value_counts()
#profile_transactions = ProfileReport(transactions.iloc[:,1:-2], title = "Transactions Report")
#profile_transactions
# crashes in Google Collab , goes over >12.7 GB
"""Based on the above, without being able to run profilereport on transactions we have the following summary:
* Majority are outbound in terms of direction
* Card payments are the majority followed by ransfer and topup
* Completed make the majority of transactions
* EUR and GBP make the majority of currency transactions
# **Notifications exploration**
"""
# ---- Notifications dataset: load and first-pass EDA ----
notifications = pd.read_csv("notifications.csv")
notifications.head()
notifications.info()
notifications['created_date'] = pd.to_datetime(notifications['created_date'])
print('The latest notifications was {} and the first {}'.format(notifications['created_date'].max(), notifications['created_date'].min()))
# Distinct values of the three categorical columns.
distinct_reason = notifications['reason'].unique();
distinct_channel = notifications['channel'].unique();
distinct_status = notifications['status'].unique();
print(distinct_reason);
print(distinct_channel);
print(distinct_status);
print(notifications.isnull().sum())
# Distribution of notifications-per-user.
notifications.groupby("user_id")["reason"].count().describe()
notifications.groupby("user_id")["reason"].count().reset_index().boxplot()
"""The middle 50% of users that have received a notification, have 4-8 notifications. The top 25% have above 8, with the max value being 289 (intuitively this is probably a user that frequently engages with the service)"""
notifications["reason"].value_counts()
"""Most notifications are either for promo or reengagement."""
notifications["channel"].value_counts() # fairly balanced between email and push, with sms being the minority
notifications["status"].value_counts() # majority are sent. We keep the failed as well as the notification failing may provide an indication on churn (for example having deleted the app)
"""# **Devices exploration**"""
devices = pd.read_csv("devices.csv")
devices.head()
devices.info()
distinct_brand = devices['brand'].unique()
print(distinct_brand)
print(devices.isnull().sum())
# doing a value counts which will indicate the type of phone each user has, balanced dataset. We will keep the unknown
devices["brand"].value_counts()
devices["brand"].value_counts().plot.pie(y="count")
"""# Identify key relationships
Number of Contacts: Users with a larger number of contacts may be more engaged with the platform's social aspects and less likely to churn.
"""
min_num_contacts = users['num_contacts'].min();
mean_num_contacts = users['num_contacts'].mean();
max_num_contacts = users['num_contacts'].max();
print('Min number of contacts: {}, mean {}, max {}'.format(min_num_contacts, mean_num_contacts, max_num_contacts));
quantile_value_least = users['num_contacts'].quantile(0.05)
quantile_value_most = users['num_contacts'].quantile(0.95)
# Filter users who have number of contacts less than or equal to the quantile value
top_5_percent_least_contacts_users = users[users['num_contacts'] <= quantile_value_least]
# Filter users who have number of contacts more than or equal to the quantile value
top_5_percent_most_contacts_users = users[users['num_contacts'] >= quantile_value_most]
print("Users with the least amount of number of contacts (top 5%) with the least having {} and the most {}".format(
top_5_percent_least_contacts_users['num_contacts'].min(),
top_5_percent_least_contacts_users['num_contacts'].max()
))
print("Users with the most amount of number of contacts (top 5%) with the least having {} and the most {}".format(
top_5_percent_most_contacts_users['num_contacts'].min(),
top_5_percent_most_contacts_users['num_contacts'].max()
))
gold_plan_users = users[users['plan'] == 'GOLD']
silver_plan_users = users[users['plan'] == 'SILVER']
standard_plan_users = users[users['plan'] == 'STANDARD']
min_num_contacts_gold = gold_plan_users['num_contacts'].min();
mean_num_contacts_gold = gold_plan_users['num_contacts'].mean();
max_num_contacts_gold = gold_plan_users['num_contacts'].max();
print('Gold plan users: Min number of contacts: {}, mean {}, max {}'.format(min_num_contacts_gold, mean_num_contacts_gold, max_num_contacts_gold));
min_num_contacts_silver = silver_plan_users['num_contacts'].min();
mean_num_contacts_silver = silver_plan_users['num_contacts'].mean();
max_num_contacts_silver = silver_plan_users['num_contacts'].max();
print('Silver plan users: Min number of contacts: {}, mean {}, max {}'.format(min_num_contacts_silver, mean_num_contacts_silver, max_num_contacts_silver));
min_num_contacts_standard = standard_plan_users['num_contacts'].min();
mean_num_contacts_standard = standard_plan_users['num_contacts'].mean();
max_num_contacts_standard = standard_plan_users['num_contacts'].max();
print('Standard plan users: Min number of contacts: {}, mean {}, max {}'.format(min_num_contacts_standard, mean_num_contacts_standard, max_num_contacts_standard));
# Data
plan_names = ['Gold', 'Silver', 'Standard']
mean_contacts = [mean_num_contacts_gold, mean_num_contacts_silver, mean_num_contacts_standard]
# Plotting
x = range(len(plan_names))
width = 0.2
plt.bar([i + width for i in x], mean_contacts, width=width, label='Mean Contacts')
plt.xlabel('Plan')
plt.ylabel('Number of Mean Contacts')
plt.title('Number of Mean Contacts by Plan')
plt.xticks([i + width for i in x], plan_names)
plt.legend()
plt.show()
"""Having a gold plan usually means they have more contacts, the mean is twice as many as standard users. We see that the more expensive a plan is the more contacts a user has. However there are gold plan users with 0 contacts and Standard plan users with many contacts. Just from these two we can't say if a user is a churn user."""
# ---- Relationship: notification volume vs. contacts and plan ----
# Notifications received per user; then the bottom/top 5% of users by volume.
notification_counts = notifications['user_id'].value_counts()
quantile_value_least = notification_counts.quantile(0.05)
least_notifications_users = notification_counts[notification_counts <= quantile_value_least].index.tolist()
quantile_value_most = notification_counts.quantile(0.95)
most_notifications_users = notification_counts[notification_counts >= quantile_value_most].index.tolist()
print("Users with the least amount of notifications (top 5%):")
print(least_notifications_users)
print("\nUsers with the most amount of notifications (top 5%):")
print(most_notifications_users)
#Filter the users with the most notifications and find the num_contacts and plan they have.
filtered_users_most_notifications = users[users['user_id'].isin(most_notifications_users)]
print("Users with the most amount of notifications (top 5%):")
print("They have mean amount of number contacts: {}".format(filtered_users_most_notifications['num_contacts'].mean()))
print("Gold plan user: {}".format(filtered_users_most_notifications[filtered_users_most_notifications['plan'] == 'GOLD'].shape[0]/filtered_users_most_notifications.shape[0]))
print("Silver plan user: {}".format(filtered_users_most_notifications[filtered_users_most_notifications['plan'] == 'SILVER'].shape[0]/filtered_users_most_notifications.shape[0]))
print("Standard plan user: {}".format(filtered_users_most_notifications[filtered_users_most_notifications['plan'] == 'STANDARD'].shape[0]/filtered_users_most_notifications.shape[0]))
#Filter the users with the least notifications and find the num_contacts and plan they have.
filtered_users_least_notifications = users[users['user_id'].isin(least_notifications_users)]
print("Users with the least amount of notifications (top 5%):")
print("They have mean amount of number contacts: {}".format(filtered_users_least_notifications['num_contacts'].mean()))
print("Gold plan user: {}".format(filtered_users_least_notifications[filtered_users_least_notifications['plan'] == 'GOLD'].shape[0]/filtered_users_least_notifications.shape[0]))
print("Silver plan user: {}".format(filtered_users_least_notifications[filtered_users_least_notifications['plan'] == 'SILVER'].shape[0]/filtered_users_least_notifications.shape[0]))
print("Standard plan user: {}".format(filtered_users_least_notifications[filtered_users_least_notifications['plan'] == 'STANDARD'].shape[0]/filtered_users_least_notifications.shape[0]))
# Data: per-plan share of users within each notification-volume group.
plans = ['Gold', 'Silver', 'Standard']
most_notifications_percentage = [
    filtered_users_most_notifications[filtered_users_most_notifications['plan'] == 'GOLD'].shape[0] / filtered_users_most_notifications.shape[0],
    filtered_users_most_notifications[filtered_users_most_notifications['plan'] == 'SILVER'].shape[0] / filtered_users_most_notifications.shape[0],
    filtered_users_most_notifications[filtered_users_most_notifications['plan'] == 'STANDARD'].shape[0] / filtered_users_most_notifications.shape[0]
]
least_notifications_percentage = [
    filtered_users_least_notifications[filtered_users_least_notifications['plan'] == 'GOLD'].shape[0] / filtered_users_least_notifications.shape[0],
    filtered_users_least_notifications[filtered_users_least_notifications['plan'] == 'SILVER'].shape[0] / filtered_users_least_notifications.shape[0],
    filtered_users_least_notifications[filtered_users_least_notifications['plan'] == 'STANDARD'].shape[0] / filtered_users_least_notifications.shape[0]
]
# Plotting: grouped bars, one pair per plan.
x = np.arange(len(plans))  # Convert x to array
width = 0.35
fig, ax = plt.subplots()
rects1 = ax.bar(x - width/2, most_notifications_percentage, width, label='Most Notifications (top 5%)')
rects2 = ax.bar(x + width/2, least_notifications_percentage, width, label='Least Notifications (top 5%)')
ax.set_ylabel('Percentage of Users')
ax.set_title('Percentage of Users by Plan and Notification Amount')
ax.set_xticks(x)
ax.set_xticklabels(plans)
ax.legend()
def autolabel(rects):
    """Label each bar in *rects* with its height rendered as a percentage."""
    # NOTE(review): relies on the module-level `ax` created just above.
    for bar in rects:
        h = bar.get_height()
        label = '{}'.format(round(h*100, 2))
        center_x = bar.get_x() + bar.get_width() / 2
        ax.annotate(label,
                    xy=(center_x, h),
                    xytext=(0, 3),  # nudge the text 3 points upward
                    textcoords="offset points",
                    ha='center', va='bottom')
# Annotate both bar groups and render the figure.
autolabel(rects1)
autolabel(rects2)
fig.tight_layout()
plt.show()
"""The more notifications a user has, more likely to have more number contacts and the percentage of having a golden or silver plan is much more."""
# ---- Relationship: transaction volume vs. contacts and plan ----
# Same bottom/top 5% analysis as for notifications, now on transaction counts.
transactions_counts = transactions['user_id'].value_counts()
quantile_value_least = transactions_counts.quantile(0.05)
least_transactions_users = transactions_counts[transactions_counts <= quantile_value_least].index.tolist()
quantile_value_most = transactions_counts.quantile(0.95)
most_transactions_users = transactions_counts[transactions_counts >= quantile_value_most].index.tolist()
#Filter the users with the most transactions and find the num_contacts and plan they have.
filtered_users_most_transactions = users[users['user_id'].isin(most_transactions_users)]
print("Users with the most amount of transactions (top 5%):")
print("They have mean amount of number contacts: {}".format(filtered_users_most_transactions['num_contacts'].mean()))
print("Gold plan user: {}".format(filtered_users_most_transactions[filtered_users_most_transactions['plan'] == 'GOLD'].shape[0]/filtered_users_most_transactions.shape[0]))
print("Silver plan user: {}".format(filtered_users_most_transactions[filtered_users_most_transactions['plan'] == 'SILVER'].shape[0]/filtered_users_most_transactions.shape[0]))
print("Standard plan user: {}".format(filtered_users_most_transactions[filtered_users_most_transactions['plan'] == 'STANDARD'].shape[0]/filtered_users_most_transactions.shape[0]))
#Filter the users with the least transactions and find the num_contacts and plan they have.
filtered_users_least_transactions = users[users['user_id'].isin(least_transactions_users)]
print("Users with the least amount of transactions (top 5%):")
print("They have mean amount of number contacts: {}".format(filtered_users_least_transactions['num_contacts'].mean()))
print("Gold plan user: {}".format(filtered_users_least_transactions[filtered_users_least_transactions['plan'] == 'GOLD'].shape[0]/filtered_users_least_transactions.shape[0]))
print("Silver plan user: {}".format(filtered_users_least_transactions[filtered_users_least_transactions['plan'] == 'SILVER'].shape[0]/filtered_users_least_transactions.shape[0]))
print("Standard plan user: {}".format(filtered_users_least_transactions[filtered_users_least_transactions['plan'] == 'STANDARD'].shape[0]/filtered_users_least_transactions.shape[0]))
# Data: per-plan share of users within each transaction-volume group.
plans = ['Gold', 'Silver', 'Standard']
most_transactions_percentage = [
    filtered_users_most_transactions[filtered_users_most_transactions['plan'] == 'GOLD'].shape[0] / filtered_users_most_transactions.shape[0],
    filtered_users_most_transactions[filtered_users_most_transactions['plan'] == 'SILVER'].shape[0] / filtered_users_most_transactions.shape[0],
    filtered_users_most_transactions[filtered_users_most_transactions['plan'] == 'STANDARD'].shape[0] / filtered_users_most_transactions.shape[0]
]
least_transactions_percentage = [
    filtered_users_least_transactions[filtered_users_least_transactions['plan'] == 'GOLD'].shape[0] / filtered_users_least_transactions.shape[0],
    filtered_users_least_transactions[filtered_users_least_transactions['plan'] == 'SILVER'].shape[0] / filtered_users_least_transactions.shape[0],
    filtered_users_least_transactions[filtered_users_least_transactions['plan'] == 'STANDARD'].shape[0] / filtered_users_least_transactions.shape[0]
]
# Plotting: grouped bars, one pair per plan.
x = np.arange(len(plans))  # Convert x to array
width = 0.35
fig, ax = plt.subplots()
rects1 = ax.bar(x - width/2, most_transactions_percentage, width, label='Most Transactions (top 5%)')
rects2 = ax.bar(x + width/2, least_transactions_percentage, width, label='Least Transactions (top 5%)')
ax.set_ylabel('Percentage of Users')
ax.set_title('Percentage of Users by Plan and Transaction Amount')
ax.set_xticks(x)
ax.set_xticklabels(plans)
ax.legend()
# NOTE(review): duplicate of the autolabel defined earlier in the file; it is
# redefined here so it closes over the new `ax` created just above.
def autolabel(rects):
    """Attach a text label above each bar in *rects*, displaying its height as a percentage."""
    for rect in rects:
        height = rect.get_height()
        ax.annotate('{}'.format(round(height*100, 2)),
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')
# Annotate both bar groups and render the figure.
autolabel(rects1)
autolabel(rects2)
fig.tight_layout()
plt.show()
"""Again we see the more transactions a user has, more likely to have more number contacts and the percentage of having a golden or silver plan is much more."""
# Left join so users with zero transactions keep a row (count becomes NaN).
user_transactions = pd.merge(users, transactions_counts, left_on='user_id', right_index=True, how='left')
user_transactions['count'].max()
# Import necessary libraries
import plotly.graph_objects as go
import pycountry
def featureWorldMap(feature, title):
    '''
    Return a plotly choropleth world heatmap of a given feature.

    Parameters
    ----------
    feature : str
        Column name of the module-level ``user_transactions`` DataFrame
        whose per-row values are used as the map's z values.
    title : str
        Title displayed on the color bar.

    Notes
    -----
    Reads the module-level ``user_transactions`` frame; its ``country``
    column is assumed to hold ISO-3166-1 alpha-2 codes (TODO confirm).
    '''
    # Convert ISO-3166-1 alpha-2 country codes to alpha-3 (required by the
    # 'ISO-3' locationmode). pycountry returns None for codes it does not
    # recognize, so guard the lookup: an unmapped code becomes '' (plotly
    # simply skips it) instead of crashing the whole plot with an
    # AttributeError on None.alpha_3.
    iso3_codes = []
    for code in user_transactions['country']:
        match = pycountry.countries.get(alpha_2=code)
        iso3_codes.append(match.alpha_3 if match is not None else '')
    # Define the color scale and its corresponding ranges; the stop positions
    # are fractions of the max z value (3381 in this dataset).
    color_scale = [
        [0, "red"],                  # 0-5
        [5/3381, "lightcoral"],      # 5-20
        [20/3381, "palegreen"],      # 20-50
        [50/3381, "green"],          # 50+
        [1, "darkgreen"]
    ]
    # Create the choropleth map figure
    return go.Figure(
        data = {
            'type': 'choropleth',
            'locations': iso3_codes,
            'locationmode': 'ISO-3',
            'colorscale': color_scale,
            'z': user_transactions[feature],
            'colorbar': {'title': title},
            'marker': {
                'line': {
                    'color': 'rgb(255,255,255)',
                    'width': 2
                }
            }
        },
        layout = {
            'geo': {
                'scope': 'world',
            }
        }
    )
# Render the transaction-count world heatmap defined above.
featureWorldMap("count", "Number of transactions")
"""# Cutoff period to define churned users"""
# Regarding cutoff_date lets see the spread of frequency users have regarding transaction
transactions.head()
transactions.info()
# change created date to be recognized as tstamp
transactions["created_date"]=pd.to_datetime(transactions["created_date"])
"""What is a churned user? We can define it as:
* Users that have not made a transaction ever (count of transactions = 0)
* Users with > 0 transactions, but have not made a transaction in x amount of time
Side note: in terms of churned behavior, we do not care about the status of the transaction (failed/ success) as engaging with the application via a transaction even if it failed means the user engaged with the app.
Additionally in terms of churned behavior, we do not care of the direction of the transaction as our aim is predicting if a user will stop using the
company's services. Even in the case of the inbound transaction the user is technically using the company's services.
"""
max_tr_date_by_userid= transactions.groupby("user_id")["created_date"].max().reset_index() #per user date of most recent transaction by user
max_tr_date_by_userid.rename(columns={
    "created_date":"most_recent_date"
},inplace=True)
#rename column to most_recent_date
max_tr_date_by_userid
cmax_tr_date =transactions["created_date"].max() # date of most recent transaction
cmax_tr_date
transactions_w_recentdate=transactions.merge(max_tr_date_by_userid,on="user_id")
# Days of inactivity per user, measured against the newest transaction in the data.
# NOTE(review): could be vectorized as (cmax_tr_date - col).dt.days instead of apply.
max_tr_date_by_userid["days_since_last_transaction"]=max_tr_date_by_userid["most_recent_date"].apply(lambda x: (cmax_tr_date - x).days)
max_tr_date_by_userid["days_since_last_transaction"].describe()
max_tr_date_by_userid.boxplot(column=["days_since_last_transaction"])
"""We see that our "lowest" quartile (the quartile with users who have had the greatest amount of days since the last transaction, is >=92 days). Thus we can establish a cutoff point for churned users to be 3 months
Let's also explore the quartile range for each user's days since last transaction belonging to a plan as an extra exploration layer
"""
# Distribution of inactivity days broken down by subscription plan.
sns.violinplot(x=users.merge(max_tr_date_by_userid,on="user_id")["plan"],y=users.merge(max_tr_date_by_userid,on="user_id")["days_since_last_transaction"],order=["STANDARD","SILVER","GOLD"])
"""What we see is that there is a much lower variation of days since last transaction as we move from standard -> silver -> gold. Additionally, there is a higher distribution of users that are closer to 0 the higher the plan, which intuitively tells us that a user that in a higher plan is much less likely to be considered as opted out"""
users_last_transaction_days = users.merge(max_tr_date_by_userid, on="user_id")
# Filter users based on the conditions
# NOTE(review): users with exactly 90 days fall into neither group — confirm intended.
users_lt_90 = users_last_transaction_days[users_last_transaction_days["days_since_last_transaction"] < 90]
users_gt_90 = users_last_transaction_days[users_last_transaction_days["days_since_last_transaction"] > 90]
# Per-plan user counts within each inactivity group.
count_lt_90 = users_lt_90["plan"].value_counts()
count_gt_90 = users_gt_90["plan"].value_counts()
# Calculate the total number of users in each group
total_lt_90 = len(users_lt_90)
total_gt_90 = len(users_gt_90)
# Define the width of each bar
bar_width = 0.35
# Define the x positions for the bars
x_lt_90 = np.arange(len(count_lt_90))
x_gt_90 = np.arange(len(count_gt_90)) + bar_width
# Create bar chart
plt.figure(figsize=(10, 6))
bars_lt_90 = plt.bar(x_lt_90, count_lt_90.values, color="skyblue", width=bar_width, label="< 90 days")
bars_gt_90 = plt.bar(x_gt_90, count_gt_90.values, color="salmon", width=bar_width, label="> 90 days")
plt.xlabel("Plan")
plt.ylabel("Number of Users")
plt.title("Number of Users by Plan and Days Since Last Transaction")
# NOTE(review): tick labels come from count_lt_90 only; assumes both groups'
# value_counts yield the same plan order — confirm.
plt.xticks(ticks=np.arange(len(count_lt_90)) + bar_width / 2, labels=count_lt_90.index)
plt.legend()
# Add percentage values on top of each bar for < 90 days
def add_percentage_labels_lt_90(bars):
    """Annotate each bar with its percentage share of the '< 90 days' user group."""
    for rect in bars:
        bar_height = rect.get_height()
        x_center = rect.get_x() + rect.get_width() / 2
        share_pct = (bar_height / total_lt_90) * 100
        plt.text(x_center, bar_height, '{:.2f}%'.format(share_pct),
                 ha='center', va='bottom')
# Add percentage values on top of each bar for > 90 days
def add_percentage_labels_gt_90(bars):
    """Annotate each bar with its percentage share of the '> 90 days' user group."""
    for rect in bars:
        bar_height = rect.get_height()
        x_center = rect.get_x() + rect.get_width() / 2
        share_pct = (bar_height / total_gt_90) * 100
        plt.text(x_center, bar_height, '{:.2f}%'.format(share_pct),
                 ha='center', va='bottom')
add_percentage_labels_lt_90(bars_lt_90)
add_percentage_labels_gt_90(bars_gt_90)
plt.show()
# Distribution of inactivity days broken down by device brand.
sns.violinplot(x=devices.merge(max_tr_date_by_userid,on="user_id")["brand"],y=devices.merge(max_tr_date_by_userid,on="user_id")["days_since_last_transaction"],order=["Android","Apple","Unknown"])
"""What we see is that there is a lower variation of days since last transaction as we move from Android -> Apple -> Unknown. However, the distribution of users is almost the same, which tells us that the device does not tell us if a user will be a churned user."""
# Define the cutoff date, 3 months seems to be enough to say this is a churn user
cutoff_date = pd.to_datetime('2019-02-12')
# Users with no transaction after the cutoff date, i.e. whose LATEST
# transaction falls on/before the cutoff.
# BUGFIX: the previous filter selected every user with at least one
# transaction before the cutoff, which also matched still-active users.
last_tr_date_per_user = transactions.groupby('user_id')['created_date'].max()
inactive_users = last_tr_date_per_user[last_tr_date_per_user <= cutoff_date].index.to_numpy()
# BUGFIX: printed date now matches cutoff_date (previously said 2019-04-12).
print("Users who have not had a transaction after 2019-02-12:")
print(inactive_users)
max_days_since_last_transaction = users_last_transaction_days['days_since_last_transaction'].max()
print("Maximum number of days since last transaction:", max_days_since_last_transaction)
#https://levelup.gitconnected.com/plotting-choropleth-maps-in-python-b74c53b8d0a6
def featureWorldMapDaysSinceLastTransaction(feature, title):
    '''
    Returns a world choropleth heatmap of a given feature from
    users_last_transaction_days, keyed by each user's country.
    '''
    # Plotly's 'ISO-3' location mode needs alpha-3 codes; the data holds alpha-2.
    codes_alpha3 = [
        pycountry.countries.get(alpha_2=code).alpha_3
        for code in users_last_transaction_days['country']
    ]
    # Reversed palette vs. the transaction-count map: few days since last
    # transaction (active users) render green, many days render red.
    recency_scale = [
        [0, "darkgreen"],       # 0-10
        [30 / 498, "green"],    # 30-60
        [60 / 498, "palegreen"],# 60-90
        [90 / 498, "red"],      # 90-infinity
        [1, "red"],
    ]
    choropleth_trace = {
        'type': 'choropleth',
        'locations': codes_alpha3,
        'locationmode': 'ISO-3',
        'colorscale': recency_scale,
        'z': users_last_transaction_days[feature],
        'colorbar': {'title': title},
        'marker': {'line': {'color': 'rgb(255,255,255)', 'width': 2}},
    }
    return go.Figure(data=choropleth_trace, layout={'geo': {'scope': 'world'}})
# Render the inactivity-days world heatmap defined above.
featureWorldMapDaysSinceLastTransaction("days_since_last_transaction", "Days since last transaction")
"""# **2. Initial preprocessing**
This part will serve to clean the data, bring the tables together, as well as define churned behavior. Lets start with the users_filtered table, which we have already dropped two empty columns from.
### users_filtered
"""
users_filtered.info()
# rerunning user info, note we can create a new column based on birth_year which will be the user age
# we can create a user_age column based on birth_year
# we can create an age_in_service column based on created date;
# however this will require us to know the timestamp of the latest transaction in the service to be able to use that as a max value (keep a mental note for later)
# we can create a recency of account creation based on created_date, which will count the days of each user based on the most recent created date value
# Notice that created_date is type object, lets convert it
users_filtered["created_date"]=pd.to_datetime(users_filtered["created_date"])
# we can also create created_day, created_month, created_ year from created date
users_filtered["created_day_of_month"]=users_filtered["created_date"].dt.day
users_filtered["created_month_of_year"]=users_filtered["created_date"].dt.month
users_filtered["created_year"]=users_filtered["created_date"].dt.year
users_filtered.info()
"""How will we handle the missing values from marketing push and marketing email?
We could use the route of knn imputer, however:
* with only 15.5K records, and 33% of them missing I'm doubtful of how precise the knn imputer can be
* there is a low correlation between these two columns and any of the rest columns
* we do not have a great amount of columns to infer relationships.
As such, I feel imputing the values is not going to be beneficial and I believe the most appropriate course of action is to remove the columns entirely
Additionally, we had both values 1 (indicating enabled) and 0 (indicating disabled) present. If we had only values 1, we could assume that the null values = 0 with a higher likelihood in order to replace the values with 0. However, this is not the case here.
"""
# Drop the two heavily-missing marketing-notification columns per the rationale above.
users_filtered.drop(columns=["attributes_notifications_marketing_push","attributes_notifications_marketing_email"],inplace=True)
users_filtered.head()
users_filtered.info()
"""### transactions"""
transactions.head()
transactions["transactions_state"].unique()
transactions["direction"].unique()
transactions_2 = transactions.copy() # retain original dataframe
transactions_2_sorted=transactions_2.sort_values(by=["user_id","created_date"],ascending=[True,True]) # we sort the dataframe by user id then by created date
# we are thinking about inserting a transaction logic
transactions_2_sorted
# we want to evaluate the balance of the user essentially
# we define a logic regarding the usd amount based on whether it is an inbound or outbound transaction and the state of transaction
# this logic will return the value of amount_usd positive, negative or simply 0
# for example if the transaction is failed or pending, then the balance of the user has not really changed
# if the transaction is complete then the direction of the transaction is respected (inbound means positive amount usd, outbound means negative amount usd)
def amount_calculator_logic(x):
    """Return the signed USD impact of one transaction row on a user's balance.

    Sign convention, by state then direction:
    * COMPLETED: inbound adds the amount, outbound subtracts it.
    * REVERTED / CANCELLED: the opposite sign (the movement is undone).
    * Any other state (FAILED, PENDING, DECLINED, ...): 0, since the
      balance did not effectively change.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row) with keys 'direction',
        'transactions_state' and 'amount_usd'.

    Returns
    -------
    Signed amount in USD. BUGFIX: an unrecognised direction now also
    returns 0 instead of implicitly returning None, which would have
    poisoned downstream numeric aggregation.
    """
    amount = x["amount_usd"]
    state = x["transactions_state"]
    if state == "COMPLETED":
        sign = 1
    elif state in ("REVERTED", "CANCELLED"):
        sign = -1
    else:
        # Failed/pending/declined etc.: no effective balance change.
        return 0
    if x["direction"] == "INBOUND":
        return sign * amount
    if x["direction"] == "OUTBOUND":
        return -sign * amount
    # Defensive fallback: unknown direction does not change the balance.
    return 0
# Apply the signed-amount logic row by row. pandas aligns the result by
# index, so assigning the output computed on the sorted frame back onto
# transactions_2 is safe.
transactions_2["amount_usd_logic"] = transactions_2_sorted.apply(amount_calculator_logic, axis=1)
# Aggregate per user: total transacted amount and number of transactions.
# FIX: use the string "sum" — passing np.sum to DataFrame.agg is deprecated
# in recent pandas versions.
# NOTE(review): this sums the raw 'amount_usd'; given the "negative balance"
# comment below, 'amount_usd_logic' may have been intended — confirm.
userid_totalamount = transactions_2.groupby("user_id").agg({
    "amount_usd": "sum",
    "transaction_id": "count"
}).reset_index().rename(columns={
    "amount_usd": "amount_usd_total",
    "transaction_id": "count_of_transactions"
})
userid_totalamount
# note the table shows the amount from a user perspective, example user_0's balance negative balance means he has done a greater value of outbound transactions
# now we can add this to the users_filtered table
users_filtered = users_filtered.merge(userid_totalamount, how="left", on="user_id")  # merging on user_id
users_filtered.shape
users_filtered.head()
# Count of transactions per (user, transaction state) as a wide table.
user_count_transactions_state=pd.crosstab(transactions_2['user_id'], transactions_2['transactions_state']).reset_index()
# we are essentially counting the types of different transactions per user
# we have already added the total count of transactions, however this way we have more information on a per user lever, for example the number of failed transactions
# now its time to merge with the users_filtered
users_filtered=users_filtered.merge(user_count_transactions_state,how="left", on="user_id")
users_filtered.head(3)
users_filtered.shape
max_tr_date_by_userid.head() # remember our previous table that includes the days_since last transaction?
# we can add the days_since column to our dataframe as well
max_tr_date_by_userid.drop("most_recent_date",axis=1,inplace=True)
users_filtered=users_filtered.merge(max_tr_date_by_userid,how="left",on="user_id") # merging again
users_filtered.shape
# another thing we want to do is obtain the latest transaction type, currency, amount usd logic etc for each user
# a way to do it is to make sure we have the df sorted by user id, then by created date in descending order.
# then we assign a rank on a per user id, date level where the most recent date will receive a 1
# we will then filter the dataframe for rank == 1
# we will then merge based on the user id with the users_filtered
transactions_2_sorted=transactions_2.sort_values(by=["user_id","created_date"],ascending=[True,False])
transactions_2_sorted.head()
# method="first" breaks ties deterministically so exactly one row per user gets rank 1.
transactions_2_sorted["rank"]=transactions_2_sorted.groupby("user_id")["created_date"].rank(method="first",ascending=False)
# we create a rank column, groupby user id, and then rank the date assigning a rank 1 to the latest
per_userid_latest_transactions = transactions_2_sorted[transactions_2_sorted["rank"] == 1] # we filter to get only the rank ==1
# we drop the columns transaction_id, amount_usd (since we have amount_usd_logic) and rank
per_userid_latest_transactions=per_userid_latest_transactions.drop(columns=["transaction_id","amount_usd","rank"])
per_userid_latest_transactions.head()
per_userid_latest_transactions.rename(
    columns={
        "created_date": "last_transaction_date",
        "transactions_type" : "last_transaction_type",
        "transactions_currency" : "last_transaction_currency",
        "transactions_state" : "last_transaction_state",
        "direction" : "last_transaction_direction"
    }
,inplace=True) # give different name to the date column as users_filtered already has a column with that name
# we can remerge now
users_filtered=users_filtered.merge(per_userid_latest_transactions,how="left",on="user_id")
users_filtered.info()
# notice that we have missing values, which are the users that did not complete a single transaction
users_filtered.columns
# columns - count of transactions until REVERTED can be filled with 0
users_filtered[['count_of_transactions', 'CANCELLED', 'COMPLETED', 'DECLINED', 'FAILED','PENDING', 'REVERTED']]=users_filtered[['count_of_transactions', 'CANCELLED', 'COMPLETED', 'DECLINED', 'FAILED','PENDING', 'REVERTED']].fillna(0)
users_filtered.info()
# days since last transaction can be given an extremely high value such as 999999
# transaction type -> direction can be given value None
# last transaction date can be given for now the most recent transaction date (we want to create a life in service column based on the difference of last trans date and created_date)
# amount_usd_logic and amount usd total given a 0
users_filtered.isnull().sum(axis=0)
# Preview: days from account creation to the newest transaction in the data.
(cmax_tr_date - users_filtered["created_date"]).dt.days
#based on the approach you want comment whichever of those two rows
#users_filtered["days_since_last_transaction"]=users_filtered["days_since_last_transaction"].fillna(999999)
# For users with no transactions, use days since account creation instead.
users_filtered["days_since_last_transaction"]=users_filtered["days_since_last_transaction"].fillna((cmax_tr_date - users_filtered["created_date"]).dt.days)
# Categorical last-transaction columns: "None" marks users with no transactions.
users_filtered[['last_transaction_type', 'last_transaction_currency', 'last_transaction_state','last_transaction_direction']]=users_filtered[['last_transaction_type', 'last_transaction_currency', 'last_transaction_state','last_transaction_direction']].fillna("None")
#based on the approach you want comment whichever of those two rows
#users_filtered["last_transaction_date"]=users_filtered["last_transaction_date"].fillna(cmax_tr_date)
# For users with no transactions, fall back to the account creation date
# (makes age_in_service below equal 0 for them).
users_filtered["last_transaction_date"]=users_filtered["last_transaction_date"].fillna(users_filtered["created_date"])
users_filtered[["amount_usd_logic","amount_usd_total"]]=users_filtered[["amount_usd_logic","amount_usd_total"]].fillna(0)
users_filtered.info() # no nulls are present
# Days between account creation and the user's last transaction.
users_filtered["age_in_service"]=(users_filtered["last_transaction_date"] - users_filtered["created_date"]).dt.days
# Rename crosstab state columns to self-describing names; amount_usd_logic of
# the latest transaction becomes last_amount_usd_transaction.
users_filtered.rename(
    columns={
        "CANCELLED": "num_CANCELLED_transactions",
        "COMPLETED" : "num_COMPLETED_transactions",
        "DECLINED" : "num_DECLINED_transactions",
        "FAILED" : "num_FAILED_transactions",
        "PENDING" : "num_PENDING_transactions",
        "REVERTED" : "num_REVERTED_transactions",
        "amount_usd_logic" : "last_amount_usd_transaction"
    }
,inplace=True)
"""### notifications"""
notifications.info() # no null values
notifications # we will get a count of notifications by user, as well as the different counts per reason, channel status on a per user level
# Total notifications per user.
notif_count_by_user=notifications.groupby("user_id")["status"].count().reset_index().rename(columns={"status":"count of notifications"})
# Per-user counts broken down by reason, channel and status (wide crosstabs).
notif_count_by_user_reason=pd.crosstab(notifications['user_id'], notifications["reason"]).reset_index()
notif_count_by_user_channel=pd.crosstab(notifications['user_id'], notifications["channel"]).reset_index()
notif_count_by_user_status=pd.crosstab(notifications['user_id'], notifications["status"]).reset_index()
# time to merge
users_filtered=users_filtered.merge(notif_count_by_user,how="left",on="user_id")
users_filtered=users_filtered.merge(notif_count_by_user_reason,how="left",on="user_id")
users_filtered=users_filtered.merge(notif_count_by_user_channel,how="left",on="user_id")
users_filtered=users_filtered.merge(notif_count_by_user_status,how="left",on="user_id")
users_filtered.shape
notifications_sorted=notifications.sort_values(by=["user_id","created_date"],ascending=[True,False])# now we can repeat what we did for transactions, ie get the latest notification per user
# method="first" breaks ties so exactly one notification per user gets rank 1.
notifications_sorted["rank"]=notifications_sorted.groupby("user_id")["created_date"].rank(method="first",ascending=False)
notifications_sorted_rank_1=notifications_sorted[notifications_sorted["rank"] == 1]
notifications_sorted_rank_1.head()
notifications_sorted_rank_1=notifications_sorted_rank_1.rename(
    columns={
        "created_date":"latest_notification_date",
        "reason":"latest_notification_reason",
        "channel":"latest_notification_channel",
        "status":"latest_notification_status"
    }
)
# iloc[:,:-1] drops the trailing "rank" column before merging.
users_filtered=users_filtered.merge(notifications_sorted_rank_1.iloc[:,:-1],how="left",on="user_id")
users_filtered.info()
# Users who never received a notification get 0 in every count column.
users_filtered[['count of notifications',
    'BLACK_FRIDAY', 'BLUE_TUESDAY', 'ENGAGEMENT_SPLIT_BILL_RESTAURANT',
    'INVEST_IN_GOLD', 'JOINING_ANNIVERSARY', 'LOST_CARD_ORDER',
    'MADE_MONEY_REQUEST_NOT_SPLIT_BILL', 'METAL_RESERVE_PLAN',
    'NO_INITIAL_CARD_ORDER', 'NO_INITIAL_CARD_USE',
    'ONBOARDING_TIPS_ACTIVATED_USERS', 'PROMO', 'PROMO_CARD_ORDER',
    'REENGAGEMENT_ACTIVE_FUNDS', 'SILVER_ENGAGEMENT_FEES_SAVED',
    'SILVER_ENGAGEMENT_INACTIVE_CARD', 'WELCOME_BACK', 'EMAIL', 'PUSH',
    'SMS', 'FAILED', 'SENT']] = users_filtered[['count of notifications',
    'BLACK_FRIDAY', 'BLUE_TUESDAY', 'ENGAGEMENT_SPLIT_BILL_RESTAURANT',
    'INVEST_IN_GOLD', 'JOINING_ANNIVERSARY', 'LOST_CARD_ORDER',
    'MADE_MONEY_REQUEST_NOT_SPLIT_BILL', 'METAL_RESERVE_PLAN',
    'NO_INITIAL_CARD_ORDER', 'NO_INITIAL_CARD_USE',
    'ONBOARDING_TIPS_ACTIVATED_USERS', 'PROMO', 'PROMO_CARD_ORDER',
    'REENGAGEMENT_ACTIVE_FUNDS', 'SILVER_ENGAGEMENT_FEES_SAVED',
    'SILVER_ENGAGEMENT_INACTIVE_CARD', 'WELCOME_BACK', 'EMAIL', 'PUSH',
    'SMS', 'FAILED', 'SENT']].fillna(0) #fill these na values with 0