-
Notifications
You must be signed in to change notification settings - Fork 0
/
student dropout challenge.Rmd
339 lines (326 loc) · 16 KB
/
student dropout challenge.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
---
title: "R Notebook"
output: html_notebook
---
This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Ctrl+Shift+Enter*.
Student Static Data
```{r}
setwd("C:/studentdropout/Student Retention Challenge Data/Student Static Data")
stFall2011 <- read.csv("Fall 2011_ST.csv",header = T)
stFall2012 <- read.csv("Fall 2012.csv",header = T)
stFall2013 <- read.csv("Fall 2013.csv",header = T)
stFall2014 <- read.csv("Fall 2014.csv",header = T)
stFall2015 <- read.csv("Fall 2015.csv",header = T)
stFall2016 <- read.csv("Fall 2016.csv",header = T)
stSpring2012 <- read.csv("Spring 2012_ST.csv",header = T)
stSpring2013 <- read.csv("Spring 2013.csv",header = T)
stSpring2014 <- read.csv("Spring 2014.csv",header = T)
stSpring2015 <- read.csv("Spring 2015.csv",header = T)
stSpring2016 <- read.csv("Spring 2016.csv",header = T)
studentStaticData <- rbind(stFall2011,stFall2012, stFall2013, stFall2014, stFall2015, stFall2016, stSpring2012, stSpring2013, stSpring2014, stSpring2015,
stSpring2016)
head(studentStaticData)
```
Student Progress data
```{r}
setwd("C:/studentdropout/Student Retention Challenge Data/Student Progress Data")
spFall2011<- read.csv("Fall 2011_SP.csv", header=TRUE)
spFall2012 <- read.csv("Fall 2012_SP.csv", header=TRUE)
spFall2013 <- read.csv("Fall 2013_SP.csv", header=TRUE)
spFall2014 <- read.csv("Fall 2014_SP.csv", header=TRUE)
spFall2015 <- read.csv("Fall 2015_SP.csv", header=TRUE)
spFall2016 <- read.csv("Fall 2016_SP.csv", header=TRUE)
spSpring2012 <- read.csv("Spring 2012_SP.csv", header=TRUE)
spSpring2013 <- read.csv("Spring 2013_SP.csv", header=TRUE)
spSpring2014 <- read.csv("Spring 2014_SP.csv", header=TRUE)
spSpring2015 <- read.csv("Spring 2015_SP.csv", header=TRUE)
spSpring2016 <- read.csv("Spring 2016_SP.csv", header=TRUE)
spSpring2017 <- read.csv("Spring 2017_SP.csv", header=TRUE)
spSum2012 <- read.csv("Sum 2012.csv", header=TRUE)
spSum2013 <- read.csv("Sum 2013.csv", header=TRUE)
spSum2014 <- read.csv("Sum 2014.csv", header=TRUE)
spSum2015 <- read.csv("Sum 2015.csv", header=TRUE)
spSum2016 <- read.csv("Sum 2016.csv", header=TRUE)
spSum2017 <- read.csv("Sum 2017.csv", header=TRUE)
studentProgressData_max <- rbind(spFall2011,spFall2012, spFall2013, spFall2014, spFall2015, spFall2016, spSpring2012, spSpring2013, spSpring2014, spSpring2015, spSpring2016,spSpring2017,spSum2012,spSum2013,spSum2014,spSum2015,spSum2016,spSum2017)
head(studentProgressData_max)
```
Financial Aid Data
```{r}
setwd("C:/studentdropout/Student Retention Challenge Data/Student Financial Aid Data")
financialData <- read.csv("2011-2017_Cohorts_Financial_Aid_and_Fafsa_Data.csv", header=TRUE)
head(financialData)
```
Training labels
```{r}
setwd("C:/studentdropout/Student Retention Challenge Data")
TrainLabels <- read.csv("DropoutTrainLabels.csv", header = T)
```
Test IDs
```{r}
setwd("C:/studentdropout/Student Retention Challenge Data/Test Data")
testIds <- read.csv("TestIDs.csv", header = T)
```
Exploratory Data Analysis
```{r}
summary(financialData)
summary(studentStaticData)
summary(studentProgressData_max)
```
Data Cleaning for Financial Aid data
```{r}
financialData$Marital.Status <- sub("^$", "Single", financialData$Marital.Status)
financialData$Housing <- sub("^$", "Off Campus", financialData$Housing)
#Imputing the empty values of parent’s Highest Grade level with ‘Unknown’.
financialData$Father.s.Highest.Grade.Level <- sub("^$", "Unknown", financialData$Father.s.Highest.Grade.Level)
financialData$Mother.s.Highest.Grade.Level <- sub("^$", "Unknown", financialData$Mother.s.Highest.Grade.Level)
library(imputeTS)
financialData <- na_replace(financialData, 0)
```
Data Cleaning for Student Static Data
```{r}
#All the values for Campus variable are missing for all students, not significant in analysis
studentStaticData$Campus <- NULL
#Imputing the missing value with mean for birth year
studentStaticData$BirthYear <- na_replace(studentStaticData$BirthYear, 1989)
#Converting the different columns of ethnicity to one row for simplicity of analysis
for (i in (1:nrow(studentStaticData))){
if(studentStaticData$Hispanic[i] == 1) {
studentStaticData$Ethnicity[i] <- 'Hispanic'
} else if (studentStaticData$AmericanIndian[i] == 1) {
studentStaticData$Ethnicity[i] <- 'AmericanIndian'
} else if (studentStaticData$Asian[i] == 1) {
studentStaticData$Ethnicity[i] <- 'Asian'
} else if ( studentStaticData$Black[i] == 1) {
studentStaticData$Ethnicity[i] <- 'Black'
} else if ( studentStaticData$NativeHawaiian[i] == 1) {
studentStaticData$Ethnicity[i] <- 'NativeHawaiian'
} else if ( studentStaticData$White[i] == 1) {
studentStaticData$Ethnicity[i] <- 'White'
} else if ( studentStaticData$TwoOrMoreRace[i] == 1) {
studentStaticData$Ethnicity[i] <- 'TwoOrMoreRace'
} else {
studentStaticData$Ethnicity[i] <- 'Unknown'
}
}
studentStaticData$Ethnicity <- as.factor(studentStaticData$Ethnicity)
#Missing values for HSDip is imputed as 1,and the reason is to get a admission in college, student requires high school completion certificate
studentStaticData$HSDip <- ifelse(studentStaticData$HSDip == -1, NA, studentStaticData$HSDip)
studentStaticData$HSDip <- na_replace(studentStaticData$HSDip, 1)
#Imputing themissing value of HSGPAUnwtd as zero
studentStaticData$HSGPAUnwtd <- ifelse(studentStaticData$HSGPAUnwtd == -1, NA
, studentStaticData$HSGPAUnwtd)
studentStaticData$HSGPAUnwtd <- na_replace(studentStaticData$HSGPAUnwtd, 0)
#All values of HSGPAWtd, FirstGen are missing, removing the column for analys
is
#All values of DualHSSummerEnroll are 0, which means Not past dual enrollment nor summer enrollee, removing column for analysis
studentStaticData$HSGPAWtd <- NULL
studentStaticData$FirstGen <- NULL
studentStaticData$DualHSSummerEnroll <- NULL
#Imputed the missing values to zero of credit attempt transfer
studentStaticData$NumColCredAttemptTransfer <- ifelse((studentStaticData$NumColCredAttemptTransfer == -1), NA, studentStaticData$NumColCredAttemptTransfer
)
studentStaticData$NumColCredAttemptTransfer <- na_replace(studentStaticData$NumColCredAttemptTransfer, 0)
#Imputed the missing values to zero of credit attempt transfer
studentStaticData$NumColCredAcceptTransfer <- ifelse((studentStaticData$NumColCredAcceptTransfer == -1), NA, studentStaticData$NumColCredAcceptTransfer)
studentStaticData$NumColCredAcceptTransfer <- na_replace(studentStaticData$NumColCredAcceptTransfer, 0)
#CumLoanAtEntry
#All the values are missing or unknown, removing column for analysis
studentStaticData$CumLoanAtEntry <- NULL
```
Data Cleaning for Student Progress Data
```{r}
#Imputing the missing values Major1 as zero
studentProgressData_max$Major1 <- as.numeric(studentProgressData_max$Major1)
studentProgressData_max$Major1 <- ifelse(studentProgressData_max$Major1 == -1
, NA, studentProgressData_max$Major1)
studentProgressData_max$Major1 <- na_replace(studentProgressData_max$Major1,
0)
#All the values for Complete2 are zero, removing the column for analysis
studentProgressData_max$Complete2 <- NULL
#All the values for CIP2 are unknown, so removing the column for analysis
studentProgressData_max$CompleteCIP2 <- NULL
#All the values for TransferIntent are missing, so removing the column for analysis
studentProgressData_max$TransferIntent <- NULL
#All students are pursuing bachelor's degree, so removing the column for analysis
studentProgressData_max$DegreeTypeSought <- NULL
```
Merge financial data, static data and progress data
```{r}
combinedStudentData <- merge(x = studentProgressData_max, y = studentStaticData,by = c("StudentID","Cohort","CohortTerm"))
financialData.static.progress <- merge(x = combinedStudentData, y = financialData,
by.y = "ï..ID.with.leading", by.x = "StudentID")
```
Merge the train labels with combined dataset
```{r}
CombinedData.trainLabels <- merge(x = TrainLabels, y = financialData.static.progress,
by.y = "StudentID", by.x = "StudentID")
CombinedData.trainLabels$Dropout <- as.factor(CombinedData.trainLabels$Dropout)
barplot(table(CombinedData.trainLabels$Dropout))
```
Split dataset into training and testing
```{r}
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
set.seed(31)
intrain <- createDataPartition(CombinedData.trainLabels$Dropout,p=0.75,list =
FALSE)
train1 <- CombinedData.trainLabels[intrain,]
test1 <- CombinedData.trainLabels[-intrain,]
trctrl <- trainControl(method = "cv", number = 5)
```
Model Type: Classification Tree
```{r}
model1 <- train(Dropout ~ Cohort + CohortTerm + Gender + BirthYear + BirthMonth + HSDipYr
+ HSGPAUnwtd + EnrollmentStatus + Ethnicity + HSDip + HSGPAUnwtd + NumColCredAttemptTransfer + NumColCredAcceptTransfer
+ HighDeg + MathPlacement + EngPlacement
+ GatewayMathStatus + GatewayEnglishStatus
+ Marital.Status + Adjusted.Gross.Income + Parent.Adjusted.Gross.Income
+ Father.s.Highest.Grade.Level + Mother.s.Highest.Grade.Level
+ Housing + X2012.Loan + X2012.Scholarship + X2012.Work.Study
+ X2012.Grant
+ X2013.Loan + X2013.Scholarship + X2013.Work.Study + X2013.Grant
+ X2014.Loan + X2014.Scholarship + X2014.Work.Study + X2014.Grant
+ X2015.Loan + X2015.Scholarship + X2015.Work.Study + X2015.Grant
+ X2016.Loan + X2016.Scholarship + X2016.Work.Study + X2016.Grant
+ X2017.Loan + X2017.Scholarship + X2017.Work.Study + X2017.Grant
+ Term + AcademicYear + CompleteDevEnglish
+ CompleteDevMath + Major2 + CompleteCIP1
+ Major1 + Complete1 + TermGPA + CumGPA
, data = train1, method = "rpart", trControl=trctrl)
predictions1 <- predict(model1, newdata = test1)
confusionMatrix(predictions1, test1$Dropout)$overall[1]
bagImp1 <- varImp(model1, scale=TRUE)
```
Model Type: Kth Nearest Neighbor
```{r}
model2 <- train(Dropout ~ Cohort + CohortTerm + Gender + BirthYear + BirthMonth + HSDipYr
+ HSGPAUnwtd + EnrollmentStatus + Ethnicity + HSDip + HSGPAUnwtd + NumColCredAttemptTransfer + NumColCredAcceptTransfer
+ HighDeg + MathPlacement + EngPlacement
+ GatewayMathStatus + GatewayEnglishStatus
+ Marital.Status + Adjusted.Gross.Income + Parent.Adjusted.Gross.Income
+ Father.s.Highest.Grade.Level + Mother.s.Highest.Grade.Level
+ Housing + X2012.Loan + X2012.Scholarship + X2012.Work.Study
+ X2012.Grant
+ X2013.Loan + X2013.Scholarship + X2013.Work.Study + X2013.Grant
+ X2014.Loan + X2014.Scholarship + X2014.Work.Study + X2014.Grant
+ X2015.Loan + X2015.Scholarship + X2015.Work.Study + X2015.Grant
+ X2016.Loan + X2016.Scholarship + X2016.Work.Study + X2016.Grant
+ X2017.Loan + X2017.Scholarship + X2017.Work.Study + X2017.Grant
+ Term + AcademicYear + CompleteDevEnglish
+ CompleteDevMath + Major2 + CompleteCIP1
+ Major1 + Complete1
+ TermGPA + CumGPA
, data = train1, method = "knn", trControl=trctrl)
predictions2 <- predict(model2, newdata = test1)
confusionMatrix(predictions2, test1$Dropout)$overall[1]
bagImp2 <- varImp(model2, scale=TRUE)
```
Model Type: Bagging
```{r}
model4 <- train(Dropout ~ Cohort + CohortTerm + Gender + BirthYear + BirthMonth + HSDipYr
+ HSGPAUnwtd + EnrollmentStatus + Ethnicity + HSDip + HSGPAUnwtd + NumColCredAttemptTransfer + NumColCredAcceptTransfer
+ HighDeg + MathPlacement + EngPlacement
+ GatewayMathStatus + GatewayEnglishStatus
+ Marital.Status + Adjusted.Gross.Income + Parent.Adjusted.Gross.Income
+ Father.s.Highest.Grade.Level + Mother.s.Highest.Grade.Level
+ Housing + X2012.Loan + X2012.Scholarship + X2012.Work.Study
+ X2012.Grant
+ X2013.Loan + X2013.Scholarship + X2013.Work.Study + X2013.Grant
+ X2014.Loan + X2014.Scholarship + X2014.Work.Study + X2014.Grant
+ X2015.Loan + X2015.Scholarship + X2015.Work.Study + X2015.Grant
+ X2016.Loan + X2016.Scholarship + X2016.Work.Study + X2016.Grant
+ X2017.Loan + X2017.Scholarship + X2017.Work.Study + X2017.Grant
+ Term + AcademicYear + CompleteDevEnglish
+ CompleteDevMath + Major2 + CompleteCIP1
+ Major1 + Complete1
+ TermGPA + CumGPA
, data = train1, method = "treebag", trControl=trctrl)
predictions4 <- predict(model4, newdata = test1)
confusionMatrix(predictions4, test1$Dropout)$overall[1]
bagImp4 <- varImp(model4, scale=TRUE)
```
Model Type: Logistic Regression
```{r}
model5 <- train(Dropout ~ Cohort + CohortTerm + Gender + BirthYear + BirthMonth + HSDipYr
+ HSGPAUnwtd + EnrollmentStatus + Ethnicity + HSDip + HSGPAUnwtd + NumColCredAttemptTransfer + NumColCredAcceptTransfer
+ HighDeg + MathPlacement + EngPlacement
+ GatewayMathStatus + GatewayEnglishStatus
+ Marital.Status + Adjusted.Gross.Income + Parent.Adjusted.Gross.Income
+ Father.s.Highest.Grade.Level + Mother.s.Highest.Grade.Level
+ Housing + X2012.Loan + X2012.Scholarship + X2012.Work.Study
+ X2012.Grant
+ X2013.Loan + X2013.Scholarship + X2013.Work.Study + X2013.Grant
+ X2014.Loan + X2014.Scholarship + X2014.Work.Study + X2014.rant
+ X2015.Loan + X2015.Scholarship + X2015.Work.Study + X2015.Grant
+ X2016.Loan + X2016.Scholarship + X2016.Work.Study + X2016.Grant
+ X2017.Loan + X2017.Scholarship + X2017.Work.Study + X2017.Grant
+ Term + AcademicYear + CompleteDevEnglish
+ CompleteDevMath + Major2 + CompleteCIP1
+ Major1 + Complete1
+ TermGPA + CumGPA
, data = train1, method = "glm", family="binomial", trControl
=trctrl)
predictions5 <- predict(model5, newdata = test1)
confusionMatrix(predictions5, test1$Dropout)$overall[1]
bagImp5 <- varImp(model5, scale=TRUE)
```
Model Type: Model Stacking with random forest
```{r}
#Construct data frame with predictions
predDF <- data.frame(predictions1,predictions2, predictions4, class = test1$Dropout)
predDF$class <- as.factor(predDF$class)
#Combine models using random forest
combModFit.rf <- train(class ~ .
, method = "rf", data = predDF, distribution = 'binomial')
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
combPred.rf <- predict(combModFit.rf, predDF)
confusionMatrix(combPred.rf, predDF$class)$overall[1]
```
ROC curve
```{r}
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
roccurve1 <- roc(test1$Dropout ~ as.numeric(predictions1))
roccurve2 <- roc(test1$Dropout ~ as.numeric(predictions2))
roccurve4 <- roc(test1$Dropout ~ as.numeric(predictions4))
roccurve5 <- roc(test1$Dropout ~ as.numeric(predictions5))
#ROC Curve for model stacking
roccurve <- roc(predDF$class ~ as.numeric(combPred.rf))
roccurve$auc
## Area under the curve: 0.9517
roccurve$sensitivities
## [1] 1.00000 0.93153 0.00000
roccurve$specificities
## [1] 0.0000000 0.9718235 1.0000000
plot(roccurve1, print.auc = TRUE, col = "blue",print.auc.y = .7)
plot(roccurve2, print.auc = TRUE,
col = "green", print.auc.y = .6, add = TRUE)
plot(roccurve4, print.auc = TRUE,
col = "navy blue", print.auc.y = .8, add = TRUE)
plot(roccurve5, print.auc = TRUE,
col = "orange", print.auc.y = .5, add = TRUE)
plot(roccurve, print.auc = TRUE,
col = "red", print.auc.y = .9, add = TRUE)
```
Results
```{r}
financialData.static.testIDs <- merge(x = testIds, y = financialData.static.p
rogress,
by.y = "StudentID", by.x = "StudentID")
predictions1 <- predict(model1, newdata = financialData.static.testIDs)
predictions2 <- predict(model2, newdata = financialData.static.testIDs)
predictions4 <- predict(model4, newdata = financialData.static.testIDs)
test_predDF <- data.frame( predictions1, predictions2, predictions4)
test_combPred.rf <- predict(combModFit.rf,newdata = test_predDF)
```
Add a new chunk by clicking the *Insert Chunk* button on the toolbar or by pressing *Ctrl+Alt+I*.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the *Preview* button or press *Ctrl+Shift+K* to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike *Knit*, *Preview* does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.