-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcust_revenue.R
258 lines (208 loc) · 10.9 KB
/
cust_revenue.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
# __________________________________________________________________
# //////////////////////////////////////////////////////////////////
#
# Author - Anupama Rajaram
# Filename - cust_revenue.R
#
# Program Description - Marketing related functions, including
# a. revenue per customer & revenue per segment
# b. Customer Scoring and revenue prediction
#
# Attribution - Some of the code, and the dataset is from a marketing
# analytics course from Essec Business school, freely available
# on Coursera Platform.
# __________________________________________________________________
# //////////////////////////////////////////////////////////////////
# To clean up the memory of your current R session run the following line
rm(list=ls(all=TRUE))
#===================================================================#
#=========== Section 1: Import data and clean/re-arrange============#
# Load text file into local variable called 'data' - 51243 rows and 3 columns
data = read.delim(file = 'purchases.txt', header = FALSE,
sep = '\t', dec = '.')
# Rename the column headers from generic V1, V2,.. to meaningful titles
colnames(data) = c('customer_id', 'purchase_amount', 'date_of_purchase')
# interpret the last column as a date, and extract year of purchase
data$date_of_purchase = as.Date(data$date_of_purchase, "%Y-%m-%d")
# the as.Date specifies date in a particular format
# so we can calculate individual components (day, month or year)
# to be used at a later time.
data$year_of_purchase = as.numeric(format(data$date_of_purchase, "%Y"))
data$days_since = as.numeric(difftime(time1 = "2016-01-01",
time2 = data$date_of_purchase,
units = "days"))
# Data exploration - Display the data after transformation
head(data)
summary(data)
# all the snippets titled 'Data exploration' are optional,
# although they are useful to view the data & check our
# program works as expected.
#===================================================================#
#=========== Section 2: Compute average revenue per customer =======#
# First Compute recency, frequency and average purchase amount
library(sqldf) # please make sure this slqdf library is installed already
customers_2015 = sqldf("SELECT customer_id,
MIN(days_since) AS 'recency',
MAX(days_since) AS 'first_purchase',
COUNT(*) AS 'frequency',
AVG(purchase_amount) AS 'amount',
MAX(purchase_amount) AS 'max_amount'
FROM data GROUP BY 1")
# Note, this dataframe has 18417 rows and 4 columns, since some users
# purchased products more than once
# Data exploration
head(customers)
summary(customers)
# Divide customers into segment based on recency and purchase amount at end of 2015
# customer segments are inactive, cold, warm high, warm low, new warm, active high,
# active low & new active
customers_2015$segment = "NA"
customers_2015$segment[which(customers_2015$recency > 365*3)] = "inactive"
customers_2015$segment[which(customers_2015$recency <= 365*3 &
customers_2015$recency > 365*2)] = "cold"
customers_2015$segment[which(customers_2015$recency <= 365*2 &
customers_2015$recency > 365*1)] = "warm"
customers_2015$segment[which(customers_2015$recency <= 365)] = "active"
customers_2015$segment[which(customers_2015$segment == "warm" &
customers_2015$first_purchase <= 365*2)] = "new warm"
customers_2015$segment[which(customers_2015$segment == "warm" &
customers_2015$amount < 100)] =
"warm low"
customers_2015$segment[which(customers_2015$segment == "warm" &
customers_2015$amount >= 100)] =
"warm high"
customers_2015$segment[which(customers_2015$segment == "active"
& customers_2015$first_purchase <= 365)] = "new active"
customers_2015$segment[which(customers_2015$segment == "active" &
customers_2015$amount < 100)] =
"active low"
customers_2015$segment[which(customers_2015$segment == "active" &
customers_2015$amount >= 100)] =
"active high"
# Next, re-order factor in a way that makes sense, then view segmentation details
customers_2015$segment = factor(x = customers_2015$segment,
levels = c("inactive", "cold",
"warm high",
"warm low",
"new warm", "active high",
"active low", "new active"
))
table(customers_2015$segment)
aggregate(x = customers_2015[, 2:4], by = list(customers_2015$segment), mean)
# Retrospective database segmenting (for 2014)
# helps us compute differences and predict values for the future
customers_2014 = sqldf("SELECT customer_id,
MIN(days_since) - 365 AS 'recency',
MAX(days_since) - 365 AS 'first_purchase',
COUNT(*) AS 'frequency',
AVG(purchase_amount) AS 'amount',
MAX(purchase_amount) AS 'max_amount'
FROM data
WHERE days_since > 365
GROUP BY 1")
# Segmenting customers for 2014
customers_2014$segment = "NA"
customers_2014$segment[which(customers_2014$recency > 365*3)] = "inactive"
customers_2014$segment[which(customers_2014$recency <= 365*3 &
customers_2014$recency > 365*2)] = "cold"
customers_2014$segment[which(customers_2014$recency <= 365*2 &
customers_2014$recency > 365*1)] = "warm"
customers_2014$segment[which(customers_2014$recency <= 365)] = "active"
customers_2014$segment[which(customers_2014$segment == "warm" &
customers_2014$first_purchase <= 365*2)] = "new warm"
customers_2014$segment[which(customers_2014$segment == "warm"
& customers_2014$amount < 100)] = "warm low"
customers_2014$segment[which(customers_2014$segment == "warm"
& customers_2014$amount >= 100)] = "warm high"
customers_2014$segment[which(customers_2014$segment == "active" &
customers_2014$first_purchase <= 365)] = "new active"
customers_2014$segment[which(customers_2014$segment == "active" & customers_2014$amount <
100)] = "active low"
customers_2014$segment[which(customers_2014$segment == "active" & customers_2014$amount >=
100)] = "active high"
# Re-order 2014 customers & Show segmentation results
customers_2014$segment = factor(x = customers_2014$segment,
levels = c("inactive", "cold",
"warm high", "warm low", "new warm",
"active high", "active low",
"new active"))
# Data exploration - viewing details of this table
table(customers_2014$segment)
aggregate(x = customers_2014[, 2:5], by = list(customers_2014$segment),
mean)
# calculate revenue - Notice that people with no revenue in 2015 do NOT appear here
revenue_2015 = sqldf("SELECT customer_id, SUM(purchase_amount)
AS 'revenue_2015'
FROM data
WHERE year_of_purchase = 2015
GROUP BY 1")
# Merge 2015 customers and 2015 revenue tables(correct)
actual = merge(customers_2015, revenue_2015, all.x = TRUE)
actual$revenue_2015[is.na(actual$revenue_2015)] = 0
# =========== Show average revenue per customer ====================#
aggregate(x = actual$revenue_2015, by = list(customers_2015$segment),
mean)
# Merge 2014 customers and 2015 revenue tables(correct)
forward = merge(customers_2014, revenue_2015, all.x = TRUE)
forward$revenue_2015[is.na(forward$revenue_2015)] = 0
# ========== Show average revenue per segment ======================#
r = aggregate(x = forward$revenue_2015, by = list(customers_2014$segment),
mean)
print(r)
# Re-order and display results in descending order of amount
r = r[order(r$x, decreasing = TRUE), ]
print(r)
barplot(r$x, names.arg = r$Group.1)
#======================================================================#
#=========== Section 3: Customer Scoring and revenue prediction =======#
# Calibrate the "probability" model
in_sample = forward
in_sample$active_2015 = as.numeric(in_sample$revenue_2015 > 0)
library(nnet)
prob.model = multinom(formula = active_2015 ~ recency +
first_purchase + frequency + amount +
max_amount, data = in_sample)
coef = summary(prob.model)$coefficients
std = summary(prob.model)$standard.errors
print(coef)
print(std)
print(coef / std)
# For the monetary model, select only those who made a purchase
z = which(in_sample$active_2015 == 1)
head(in_sample[z, ])
summary(in_sample[z, ])
# Calibrate the "monetary" model, using a log-transform
amount.model = lm(formula = log(revenue_2015) ~ log(amount) + log(max_amount), data = in_sample[z, ])
summary(amount.model)
# Plot the results of this new monetary model
plot(x = log(in_sample[z, ]$revenue_2015), y = amount.model$fitted.values)
# --- APPLY THE MODELS TO TODAY'S DATA ---------------------
# Predict the target variables based on today's data
customers_2015$prob_predicted = predict(object = prob.model,
newdata = customers_2015,
type = "probs")
customers_2015$revenue_predicted = exp(predict(object = amount.model,
newdata = customers_2015))
customers_2015$score_predicted = customers_2015$prob_predicted * customers_2015$revenue_predicted
summary(customers_2015$prob_predicted)
summary(customers_2015$revenue_predicted)
summary(customers_2015$score_predicted)
hist(customers_2015$score_predicted)
# How many customers have an expected revenue of more than $50
z = which(customers_2015$score_predicted > 50)
print(length(z))
# Data exploration - print details of customers with expected revenue > $100
# only among the first 1000 customers
# we basically append each chosen customer into a new dataframe
z4 <- data.frame( "customer_id" = numeric(), "amount" = numeric(),
"revenue_predicted" = numeric())
for(i in 1:50) {
if(customers_2015$score_predicted[i] > 50) {
newrow4 <- (customers_2015[i,c("customer_id",
"amount",
"revenue_predicted")])
z4 <- rbind(z4, newrow4)
}
}
colnames(z4) = c('customer_id', 'amount', 'revenue')
print(z4)