-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCustomer Phone Calls - Template.Rmd
389 lines (288 loc) · 9.97 KB
/
Customer Phone Calls - Template.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
---
title: "June 18, 2020 Exam PA Rmd file"
---
Task 1: Explore the data
```{r}
# Load the packages used throughout this analysis.
library(ggplot2)
library(dplyr)
library(tidyr)
library(caret)
library(ExamPAData)

# Read the data.
df <- customer_phone_calls
summary(df)

# Histogram of the interest rate variable.
ggplot(df, aes(x = irate)) +
  geom_histogram(binwidth = 5, fill = "royalblue", col = "royalblue")

# Bar chart of contact month.
ggplot(df, aes(x = month)) +
  geom_bar(stat = "count", fill = "royalblue", col = "royalblue") +
  theme(axis.text = element_text(size = 6))

# Bar chart of years of education.
ggplot(df, aes(x = edu_years)) +
  geom_bar(stat = "count", fill = "royalblue", col = "royalblue") +
  theme(axis.text = element_text(size = 6))

# Age distribution within each job category.
boxplot(age ~ job, data = df, ylab = "Age Distribution", cex.axis = 0.5)

# Proportion purchasing at each age (stacked-to-1 bar chart).
ggplot(df, aes(x = age, fill = factor(purchase))) +
  labs(y = "Proportion of Purchases") +
  ggtitle("Proportion of Purchases by age") +
  geom_bar(position = "fill")

# Proportion purchasing in each month.
ggplot(df, aes(x = month, fill = factor(purchase))) +
  labs(y = "Proportion of Purchases") +
  ggtitle("Proportion of Purchases by month") +
  geom_bar(position = "fill")
```
Task 2: Consider the education variable.
No code provided.
Task 3: Handle missing values.
```{r}
# Display the proportion of missing values for each variable that has any.
missing_proportion <- colMeans(is.na(df))
missing_data <- data.frame(colnames = colnames(df), missing_proportion = missing_proportion)
missing_data %>%
  filter(missing_proportion > 0) %>%
  ggplot(aes(x = colnames, y = missing_proportion, label = missing_proportion)) +
  geom_bar(stat = "identity", fill = "royalblue", col = "royalblue")

# For each variable with NAs, compare the purchase proportion among rows
# where the variable is missing against rows where it is present. A large
# difference suggests the missingness is informative.
print("Purchase Proportions by variable, for missing and non missing values")
print(sprintf("%10s %15s %15s", "Variable", "PP_for_NAs", "PP_for_non_NAs"))
varnames <- c("housing", "job", "loan", "marital", "edu_years")
for (t in varnames) {
  # Use [[ to extract the column as a vector; df[t] returns a one-column
  # data frame, which made the original df["purchase"][ind] indexing rely
  # on implicit matrix-style subsetting of a data frame.
  ind <- is.na(df[[t]])
  print(sprintf("%10s %15.2f %15.2f", t, mean(df$purchase[ind]), mean(df$purchase[!ind])))
}
```
The following code can be used to handle missing values.
```{r}
# Use one of the four choices below to deal with the NAs.
# Choose for each variable that has NAs.
# Replace varname with the actual variable name.
# NOTE(review): this chunk is a template -- run only the line(s) for the
# chosen approach. Executing the whole chunk verbatim would apply every
# method in sequence (and fails anyway, since no column is literally
# named "varname").
# Remove column
df$varname <- NULL
# Remove rows
df <- df[!is.na(df$varname), ]
# Convert missing values to "unknown" (works only for factor variables)
# First create a new level.
levels(df$varname)[length(levels(df$varname)) + 1] <- "unknown"
# Then use the new level to indicate NA.
df$varname[is.na(df$varname)] <- "unknown"
# Impute using the mean (works only for numeric variables)
df$varname[is.na(df$varname)] <- mean(df$varname, na.rm = TRUE)
```
Task 4: Investigate correlations.
```{r}
# Pairwise correlations of the numeric variables, computed from rows
# with no missing values in any of the selected columns.
numeric_vars <- dplyr::select(df, age, edu_years, CPI, CCI, irate, employment)
cor(numeric_vars, use = "complete.obs")
```
Task 5: Conduct a principal components analysis (PCA)
```{r}
# Principal components analysis of the four economic indicator variables.
# The variables are standardized before the components are calculated.
pca_vars <- dplyr::select(df, CPI, CCI, irate, employment)

# Pre-standardization means and standard deviations, to show why scaling
# is needed (the variables are on very different scales).
colMeans(pca_vars)
apply(pca_vars, 2, sd)

# scale = TRUE standardizes each column before computing the rotation.
pca <- prcomp(pca_vars, scale = TRUE)
pca

# Bi-plot of loadings and scores.
biplot(pca, cex = 0.8, xlabs = rep(".", nrow(pca_vars)))

# Proportion of variance explained by each component; use this to decide
# how many principal components to carry into the GLM models.
vars_pca <- apply(pca$x, 2, var)
vars_pca / sum(vars_pca)
```
Calculate the principal components and add the first one to the data frame.
```{r}
# Score every observation on the fitted components and keep only the
# first principal component as a new column.
pc_scores <- predict(pca, newdata = df[, c("CPI", "CCI", "irate", "employment")])
df$PC1 <- pc_scores[, "PC1"]
```
Split the data into training and testing. Check that the split looks reasonable.
```{r}
# Stratified 70/30 train/test split on the target variable.
set.seed(1875)
split_rows <- createDataPartition(df$purchase, p = 0.7, list = FALSE)
data_train <- df[split_rows, ]
data_test <- df[-split_rows, ]
rm(split_rows)

# The split is reasonable if the target mean is similar in both pieces.
print("Mean value of purchase on train and test data splits")
mean(data_train$purchase)
mean(data_test$purchase)
```
TASK 6: Create a generalized linear model (GLM).
```{r}
# Construct a GLM using only age as an independent variable.
# Named glm_age rather than "glm" so the base glm() function is not
# masked by a local object of the same name.
glm_age <- glm(purchase ~ age,
  data = data_train,
  family = binomial(link = "logit")
)
summary(glm_age)

# Evaluate the GLM: construct ROC curves and calculate AUC.
library(pROC)
glm_probs <- predict(glm_age, data_train, type = "response")
glm_probs_test <- predict(glm_age, data_test, type = "response")

# Training performance.
roc_train <- roc(data_train$purchase, glm_probs)
par(pty = "s")
plot(roc_train)
pROC::auc(roc_train)

# Test performance.
roc_test <- roc(data_test$purchase, glm_probs_test)
plot(roc_test)
pROC::auc(roc_test)

# Construct a GLM using a full set of independent variables and the first PC.
glm_full <- glm(purchase ~ age + job + marital + edu_years + housing + loan + phone + month + weekday + PC1,
  data = data_train,
  family = binomial(link = "logit")
)
summary(glm_full)

# Evaluate as above: ROC and AUC on train and test.
glm_probs <- predict(glm_full, data_train, type = "response")
glm_probs_test <- predict(glm_full, data_test, type = "response")

roc_train <- roc(data_train$purchase, glm_probs)
par(pty = "s")
plot(roc_train)
pROC::auc(roc_train)

roc_test <- roc(data_test$purchase, glm_probs_test)
plot(roc_test)
pROC::auc(roc_test)
```
TASK 7: Select features using stepwise selection.
```{r}
# Backward stepwise selection by AIC, starting from a GLM that also
# includes the square of age.
library(MASS)

glm_quad <- glm(purchase ~ age + I(age^2) + job + marital + edu_years + housing + loan + phone + month + weekday + PC1,
  data = data_train,
  family = binomial(link = "logit")
)

# k = 2 is the AIC penalty; the selected model is printed at the end of
# the trace.
stepAIC(glm_quad,
  direction = "backward",
  k = 2
)
```
```{r}
# To evaluate the selected model, change the variable list to those selected by stepAIC.
glm.reduced <- glm(purchase ~ age + I(age^2) + job + marital + edu_years + housing + loan + phone + month + weekday + PC1,
data = data_train,
family = binomial(link = "logit")
)
summary(glm.reduced)
```
TASK 8: Evaluate the model.
```{r}
# Evaluate the reduced GLM: ROC curves and AUC on train and test.
probs_train <- predict(glm.reduced, data_train, type = "response")
probs_test <- predict(glm.reduced, data_test, type = "response")

# Training performance.
roc_train <- roc(data_train$purchase, probs_train)
par(pty = "s")
plot(roc_train)
pROC::auc(roc_train)

# Test performance.
roc_test <- roc(data_test$purchase, probs_test)
plot(roc_test)
pROC::auc(roc_test)
```
Task 9: Investigate a shrinkage method.
```{r}
library(glmnet)
set.seed(42)

# Design matrices for the elastic net, built from the same formula as the
# stepwise starting model (defined once to avoid duplication).
enet_formula <- purchase ~ age + I(age^2) + job + marital + edu_years + housing + loan + phone + month + weekday + PC1
X.train <- model.matrix(enet_formula, data = data_train)
X.test <- model.matrix(enet_formula, data = data_test)

# Cross-validated elastic net.
# alpha = 1 implies LASSO, alpha = 0 implies ridge, values between 0 and 1 imply elastic net
m <- cv.glmnet(
  x = X.train,
  y = data_train$purchase,
  family = "binomial",
  type.measure = "class",
  alpha = 0.5
)
plot(m)
```
Use the cross-validation results to run the final elastic net regression model.
```{r}
# Fit the model
m.final <- glmnet(
x = X.train,
y = data_train$purchase,
family = "binomial",
lambda = m$lambda.min,
alpha = 0.5
)
# List variables
m.final$beta
# Evaluate against train and test sets
# Predict on training data
enet.pred.train <- predict(m.final, X.train, type = "response")
roc <- roc(as.numeric(data_train$purchase), enet.pred.train[, 1])
par(pty = "s")
plot(roc)
pROC::auc(roc)
# Predict on test data
enet.pred.test <- predict(m.final, X.test, type = "response")
roc <- roc(as.numeric(data_test$purchase), enet.pred.test[, 1])
par(pty = "s")
plot(roc)
pROC::auc(roc)
```
Task 10: Construct a decision tree.
```{r}
# Load the two needed libraries
# Load the two needed libraries.
library(rpart)
library(rpart.plot)

set.seed(1234)

# Named tree_formula rather than "formula" so the base formula() function
# is not masked by a local object of the same name.
tree_formula <- "purchase ~ age + job + marital + edu_years + housing + loan + phone + month + weekday + CPI + CCI + irate"

tree1 <- rpart(tree_formula,
  data = data_train, method = "class",
  control = rpart.control(minbucket = 5, cp = 0.0005, maxdepth = 7),
  parms = list(split = "gini")
)
rpart.plot(tree1, type = 0, digits = 4)

# Obtain predicted probabilities for train and for test.
pred.prob.tr <- predict(tree1, type = "prob")
pred.prob.te <- predict(tree1, type = "prob", newdata = data_test)

# Construct ROC and calculate AUC for the training data.
library(pROC)
print("Training ROC and AUC")
roc <- roc(data_train$purchase, pred.prob.tr[, "1"])
par(pty = "s")
plot(roc)
pROC::auc(roc)

# Do the same for test.
print("Test ROC and AUC")
roc2 <- roc(data_test$purchase, pred.prob.te[, "1"])
par(pty = "s")
plot(roc2)
pROC::auc(roc2)
```
Task 11: Employ cost-complexity pruning to construct a smaller tree.
```{r}
tree1$cptable # This code displays the complexity parameter table for tree1.
# Select the optimal pruning parameter from the table -- typically the CP
# of the row with the smallest cross-validated error (xerror).
```
```{r}
# Replace XX in the code below with the selected complexity parameter.
# Replace XX in the code below with the selected complexity parameter.
# The stray positional "CP" argument in the original call fell into the
# ... of prune.rpart() and was silently ignored, so it has been removed.
tree2 <- prune(tree1, cp = XX)
# Show the pruned tree.
rpart.plot(tree2)
# Obtain predicted probabilities for train and for test.
pred.prune.prob.tr <- predict(tree2, type = "prob")
pred.prune.prob.te <- predict(tree2, type = "prob", newdata = data_test)
# Construct ROC and calculate AUC for the training data.
library(pROC)
print("Training ROC and AUC")
roc <- roc(data_train$purchase, pred.prune.prob.tr[, "1"])
par(pty = "s")
plot(roc)
pROC::auc(roc)
# Do the same for test.
print("Test ROC and AUC")
roc2 <- roc(data_test$purchase, pred.prune.prob.te[, "1"])
par(pty = "s")
plot(roc2)
pROC::auc(roc2)
```