#Decision tree classification - Direct Marketing.R
##Classification Trees using rpart()
library(dplyr)
library(irr)
library(rpart) #contains relevant functions for classification as well as regression decision trees
library(caret)
#Tree plotting
#install.packages("rattle")
library(rattle) #provides fancyRpartPlot() for nicer tree plots, plus assorted utilities
#install.packages("rpart.plot")
library(rpart.plot)
library(RColorBrewer)
setwd("C:\\xxxxx\Decison Trees")
dm<-read.csv("dm.csv")
summary(dm)
dm%>%mutate(Target=ifelse(AmountSpent>mean(AmountSpent),1,0))->dm #Creating a binary Target: 1 if AmountSpent is above its mean, else 0
names(dm)
dm<- dm[,-10] #removing Amount spent
#Data Prep - Handle missing value
summary(dm$History)
dm$History1<-ifelse(is.na(dm$History),"Missing",as.character(dm$History))
dm$History1<-as.factor(dm$History1)
summary(dm$History1)
str(dm) #Checking which attributes can be converted to factors and then converting
dm$Children<-as.factor(dm$Children)
dm$Catalogs<-as.factor(dm$Catalogs)
names(dm)
dm<-dm[,-8]
mod<-rpart(Target~.,data=dm,
control=rpart.control(maxdepth =5),method="class",parms=list(split="gini"))
fancyRpartPlot(mod, cex = 0.7) #rattle package, cex is used to control the font sizes
fancyRpartPlot(mod)
mod #to see the actual rules
class(mod)
#there are just three depths (horizontal levels) because of the default cp value of 0.01
#When 0s have the higher percentage after the split, the box is green and the topmost number inside it is 0
#When 1s have the higher percentage after the split, the box is blue and the topmost number inside it is 1
#the %age is the size of the parent/child node as a % of the overall rows of data
#method = 'class' gives a classification tree and 'anova' a regression tree (a quick anova sketch follows below)
#a split must decrease the overall lack of fit by a factor of 0.01 (the cost complexity factor) before being attempted
#gini is the default split criterion in R, so it could have been omitted from the call above
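#A quick sketch of the 'anova' (regression tree) counterpart mentioned above. Illustrative only: the name
#dm_raw and the use of default control settings are assumptions, and dm.csv is re-read so that the numeric
#AmountSpent column (dropped from dm above) is available as the response.
dm_raw<-read.csv("dm.csv")
mod_reg<-rpart(AmountSpent~.,data=dm_raw,method="anova") #splits now minimise within-node variance of AmountSpent
fancyRpartPlot(mod_reg, cex = 0.7)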
#let us just run by cp =0.01 and see the result
mod<-rpart(Target~.,data=dm,
control=rpart.control(cp=0.01),method="class",parms=list(split="gini"))
fancyRpartPlot(mod)
#still just 3 depths: with cp = 0.01 alone we get the same tree as before, so the default cp (not maxdepth = 5) was the binding constraint in the first run
mod<-rpart(Target~.,data=dm,
control=rpart.control(cp=0.002),method="class",parms=list(split="gini"))
fancyRpartPlot(mod)
#there are 8 depths
#cp can be thought of as the minimum decrease in misclassification (lack of fit) a split must achieve
#So with a smaller cp, splits are made even when they reduce the misclassification rate only slightly
mod<-rpart(Target~.,data=dm,
control=rpart.control(cp=0.002, maxdepth=5),method="class",parms=list(split="gini"))
fancyRpartPlot(mod)
#there are 5 depths
mod1<-rpart(Target~.,data=dm,
control=rpart.control(cp=0.01, maxdepth=5),method="class",parms=list(split="gini"))
fancyRpartPlot(mod1)
#again back to three splits because of cp value. For a cp value of 0.01 it doesn't go beyond 3 splits
mod<-rpart(Target~.,data=dm,
control=rpart.control(cp=0.002, maxdepth=5),method="class",parms=list(split="gini"))
fancyRpartPlot(mod)
#some nodes holding less than 3% of the data are being split; minsplit in the next call sets a minimum node size
#since we have 1000 records we set minsplit = 100, i.e. a node must hold at least 10% of the data to be split
#minsplit is the minimum size of the node being considered for a split (the parent), not the node size after the split
#it also indirectly limits terminal nodes (a node smaller than minsplit becomes terminal); minbucket, shown in
#the sketch after the next plot, is the direct control on terminal-node size
mod<-rpart(Target~.,data=dm[,-9],
control=rpart.control(minsplit=100,cp=0.002, maxdepth=5),method="class",parms=list(split="gini"))
fancyRpartPlot(mod)
#in the result, only nodes holding at least 10% of the data are split
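#minbucket, mentioned above, limits terminal-node size directly (it defaults to minsplit/3). A sketch with an
#illustrative value of 50 records per leaf - the exact number is an assumption, not part of the original analysis:
mod_mb<-rpart(Target~.,data=dm[,-9],
control=rpart.control(minbucket=50,cp=0.002,maxdepth=5),method="class",parms=list(split="gini"))
fancyRpartPlot(mod_mb) #no leaf will hold fewer than 50 records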
#CHECK FOR OVERFITTING
mod<-rpart(Target~.,data=dm[,-9],
control=rpart.control(minsplit=100,cp=0.002, maxdepth=5),method="class",parms=list(split="gini"))
fancyRpartPlot(mod)
printcp(mod)
plotcp(mod)
#misclassification error estimation, to help interpret the printcp output
table(dm$Target)
predict(mod, type="class")
sum(predict(mod, type="class")!=dm$Target) #111 are misclassified
sum(predict(mod, type="class")!=dm$Target)/nrow(dm) #a proportion of 0.111, i.e. 11.1% misclassified
y <- sum(predict(mod, type="class")!=dm$Target)/nrow(dm) #y is the misclassification error
#Relative error is the ratio of the misclassification error to the root node error, here 399/1000 = 0.399
y/0.399 #the root node error is the same for every row of printcp, so y scaled this way is comparable to the rel error column
#rel error is computed on the training data, whereas xerror is the cross-validated (validation) error
#rpart's 10-fold cross-validation (defined at the end of this script) only affects the xerror column; any tiny
#difference between y/0.399 and the printed rel error is just rounding in the table
#If xerror (the cross-validated error) keeps decreasing as the tree grows, there is no sign of overfitting
#If xerror decreases and then increases, the model starts overfitting at the point where it turns up
#different runs of the same code give different xerror values because R draws different cross-validation folds each time
#run it a few times and check that xerror keeps decreasing
#so the model is good and can be finalized
#nsplit represents the number of parent nodes that have been split.
#CP equals the drop in rel error achieved by a split; in the printcp output, check the difference in rel error
#between two consecutively listed nsplit values
#although the tree has six splits, printcp lists only 4 rows (one row per distinct CP value, so splits sharing a
#CP are collapsed); the "CP = difference in rel error" check therefore only works between consecutive rows
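#the same quantities can be read off mod$cptable, the table that printcp prints. A small sketch tying its
#columns to the numbers computed above (the variable names root_error and cpt are illustrative):
cpt<-mod$cptable
root_error<-min(table(dm$Target))/nrow(dm) #error of always predicting the majority class (the 0.399 above)
cpt[,"rel error"]*root_error #absolute training misclassification rate at each listed cp
cpt[,"xerror"]*root_error #cross-validated counterpart; compare the two columns to spot overfitting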
#suppose we had an overfitting case like the one below
mod<-rpart(Target~.,data=dm[,-9],
control=rpart.control(minsplit=50,cp=0.002),method="class",parms=list(split="gini"))
fancyRpartPlot(mod)
printcp(mod)
plotcp(mod)
#xerror decreases and then increases (not always)
#not always, because every time we run the above code we get different xerrors due to the different-
#-cross validation folds R chooses by itself
#rerun the above code to see different xerrors
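#the fold-to-fold randomness can be fixed with a seed, making the xerror column reproducible
#(a sketch; the seed value 123 is an arbitrary choice):
set.seed(123)
mod<-rpart(Target~.,data=dm[,-9],
control=rpart.control(minsplit=50,cp=0.002),method="class",parms=list(split="gini"))
printcp(mod) #rerunning this block now gives the same xerror values every time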
#We can also use the standard rule of thumb for the cp cut-off: take the cp in plotcp for which xerror falls
#below min(xerror) + xstd, and prune the tree with prune()
#Note that the "cp value in printcp and rpart" differs from the "cp value in plotcp and prune"
#this is because (from R's source code):
#cp0 <- p.rpart[, 1L] is printcp's cp
#cp <- sqrt(cp0 * c(Inf, cp0[-length(cp0)])) is plotcp's cp, i.e. the geometric mean of adjacent printcp cps
#we use plotcp's cp for the cut-off. The horizontal line in plotcp is drawn at min(xerror) + xstd (the xstd of
#the minimum-xerror row), which is why it sits a little above the lowest point on the y axis
#now take the cp whose xerror is just below that line; in this run 0.035 and 0.0039 were the close candidates
#then we will apply that as below
mod1<-prune(mod,cp= 0.035)
fancyRpartPlot(mod1)
plotcp(mod1)
printcp(mod1)
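#a sketch making the printcp-vs-plotcp cp relationship and the min(xerror) + xstd line explicit; it mirrors the
#two lines of R source quoted above (the variable names cp0, geo_cp, xerr and threshold are illustrative):
cp0<-mod$cptable[,"CP"] #printcp's cp values
geo_cp<-sqrt(cp0*c(Inf,cp0[-length(cp0)])) #plotcp's cp values: geometric means of adjacent printcp values
xerr<-mod$cptable[,"xerror"]
threshold<-min(xerr)+mod$cptable[which.min(xerr),"xstd"] #the horizontal line in plotcp
geo_cp[xerr<threshold] #candidate cp values for prune(); the largest finite one gives the simplest tree below the line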
#another way of plotting but difficult to interpret
#plot(mod, margin=0.1, main="Classification Tree for Direct Marketing")
#text(mod, use.n=TRUE, all=TRUE, cex=.7)
#Rules derivation
mod1
#node 4, for example:
#if history1={Low,Medium,Missing} and Salary < 58650, then 0 (bad)
#If History1={low,medium,missing}.... excel sheet
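#rpart.plot also provides rpart.rules(), which prints one readable rule per leaf - a sketch, offered as an
#alternative to reading the rules off the printed tree above:
rpart.rules(mod1) #each row: the fitted value for a leaf and the conditions that lead to it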
#Confusion Matrix
actual<-dm$Target
predicted<-predict(mod,type = "class")
head(predicted)
head(as.numeric(predicted))
predicted<-as.numeric(predicted)
predicted<-ifelse(predicted==2,1,0)
head(predicted)
predicted<-as.factor(predicted)
actual<-as.factor(actual)
#actual<-as.numeric(actual)
confusionMatrix(predicted,actual,positive="1")
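#the headline numbers from confusionMatrix() recomputed by hand, as a sanity check (assuming "1" is the
#positive class, as in the call above):
tab<-table(predicted,actual) #rows = predicted class, columns = actual class
sum(diag(tab))/sum(tab) #accuracy
tab["1","1"]/sum(tab[,"1"]) #sensitivity (recall): correctly predicted 1s among all actual 1s
tab["0","0"]/sum(tab[,"0"]) #specificity: correctly predicted 0s among all actual 0s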
mod1$variable.importance #variable importance scores from the pruned tree
#kappa metric
#kappa2(data.frame(actual,predicted))
#ROC curve analysis
install.packages("ROCR")
library(ROCR)
prob<-predict(mod,type="prob")[,2] #ROC needs a score, not a hard class label; column 2 is P(Target = 1)
pred<-prediction(prob,actual) #ROCR's prediction() takes (predictions, labels), in that order
perf<-performance(pred,"tpr","fpr")
plot(perf,col="red")
abline(0,1, lty = 8, col = "grey")
auc<-performance(pred,"auc")
unlist(auc@y.values)
#What is 10-fold cross-validation?
#In k-fold cross-validation, the original sample is randomly partitioned into k equal size subsamples.-
#-Of the k subsamples, a single subsample is retained as the validation data for testing the model, -
#-and the remaining k-1 subsamples are used as training data. The cross-validation process is then -
#-repeated k times (the folds), with each of the k subsamples used exactly once as the validation data.-
#-The k results from the folds can then be averaged (or otherwise combined) to produce a single estimation.-
#-The advantage of this method is that all observations are used for both training and validation, -
#-and each observation is used for validation exactly once.
#Summarizing:
#Step A. It divides data into 10 parts.
#Step B. Build model on 9 parts(train) and cross validate on remaining 1 part.
#Step B above is performed on 10 combinations (10 different sets of 9 parts and 1 part)
#In principle the k models could be averaged, but rpart does not keep them: the tree you see is built on the-
#-full data, and the 10-fold cross-validation is only used to compute xerror/xstd for each cp, to guide pruning
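#tying this back to rpart (illustrative sketches): the number of internal folds is set by xval in
#rpart.control (default 10; xval = 0 switches cross-validation off)
mod_cv<-rpart(Target~.,data=dm[,-9],method="class",parms=list(split="gini"),
control=rpart.control(minsplit=100,cp=0.002,maxdepth=5,xval=10))
printcp(mod_cv) #the xerror and xstd columns come from these folds
#the same Steps A and B, hand-rolled with caret::createFolds - purely illustrative; the seed, fold count and
#error metric are assumptions, not part of the original analysis
set.seed(123)
folds<-createFolds(factor(dm$Target),k=10) #10 roughly equal-sized, stratified folds of row indices
cv_err<-sapply(folds,function(idx){
  fit<-rpart(Target~.,data=dm[-idx,-9],method="class",parms=list(split="gini"),
             control=rpart.control(minsplit=100,cp=0.002,maxdepth=5))
  mean(predict(fit,newdata=dm[idx,-9],type="class")!=dm$Target[idx])
})
mean(cv_err) #average held-out misclassification rate across the 10 folds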