# ---- User Data Tracking
library(RCurl)
library(jsonlite)
library(caret) # confusionMatrix() used for evaluation below
library(e1071)
library(dplyr)
library(readr)
library(skimr)
library(statmod)
library(MASS)
library(nnet) # only one hidden layer
library(neuralnet) # multiple hidden layers possible, advanced training options
library(RSNNS) # variety of NN models
library(deepnet) # DBN and RBM capabilities
library(darch) # same, but pure R code (slow)
library(h2o) # java-based, fast
# install.packages(c("foreach", "doSNOW")) # parallel ships with base R
library(parallel)
library(foreach)
library(doSNOW)
# ---- DATA LOADING
us.train.x <- read.table("data/UCI HAR Dataset/train/X_train.txt")
us.train.y <- read.table("data/UCI HAR Dataset/train/y_train.txt")[[1]]
# [[1]] extracts the lone column as a plain vector --
# the form RSNNS expects for the class labels
us.test.x <- read.table("data/UCI HAR Dataset/test/X_test.txt")
us.test.y <- read.table("data/UCI HAR Dataset/test/y_test.txt")[[1]]
us.labels <- read.table("data/UCI HAR Dataset/activity_labels.txt")
barplot(table(us.train.y))
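# optional: the same plot with readable activity names (V2 is the label
# column read from activity_labels.txt) plus the class shares in numbers
barplot(table(us.train.y), names.arg = as.character(us.labels$V2))
round(prop.table(table(us.train.y)), 3)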
# MODEL AND CLUSTER SET-UP
# list of tuning parameters, one entry per model
tuning <- list(
  size = c(40, 20, 20, 50, 50),                  # hidden neurons
  maxit = c(60, 100, 100, 100, 100),             # training iterations
  shuffle = c(FALSE, FALSE, TRUE, FALSE, FALSE), # shuffle training patterns (rows)?
  params = list(FALSE, FALSE, FALSE, FALSE, c(0.1, 20, 3))
  # extra Rprop learnFuncParams for model 5 only: start update value 0.1,
  # max update value 20, weight-decay exponent 3 (decay = 10^-3)
)
# cluster setup
c2 <- makeCluster(4)
clusterEvalQ(c2, {
  library(RSNNS)
})
clusterExport(c2,
              c("tuning", "us.train.x", "us.train.y",
                "us.test.x", "us.test.y")
)
registerDoSNOW(c2)
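# sanity check: foreach should now report the four registered workers
getDoParWorkers()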
# MODEL ARCHITECTURE
# build models
us.models <- foreach(i = 1:5, .combine = "c") %dopar% {
  # tuning$params[[i]][1] is FALSE for models 1-4 and 0.1 for model 5;
  # if() coerces the nonzero numeric to TRUE, so only model 5 takes the
  # branch with explicit learnFuncParams
  if (tuning$params[[i]][1]) {
    set.seed(1234)
    list(Model = mlp(
      as.matrix(us.train.x),
      decodeClassLabels(us.train.y),
      size = tuning$size[[i]],
      learnFunc = "Rprop",
      shufflePatterns = tuning$shuffle[[i]],
      learnFuncParams = tuning$params[[i]],
      maxit = tuning$maxit[[i]])
    )
  } else {
    # models 1-4: RSNNS default Rprop parameters
    set.seed(1234)
    list(Model = mlp(
      as.matrix(us.train.x),
      decodeClassLabels(us.train.y),
      size = tuning$size[[i]],
      learnFunc = "Rprop",
      shufflePatterns = tuning$shuffle[[i]],
      maxit = tuning$maxit[[i]])
    )
  }
}
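# us.models is now a flat list of five fitted networks (each foreach
# iteration contributed one element named "Model"), accessed by position
length(us.models)  # should be 5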
clusterExport(c2, "us.models")
us.yhat <- foreach(i = 1:5, .combine = "c") %dopar% {
  list(list(
    # fitted.values() returns the in-sample fits of the trained network
    Insample = encodeClassLabels(fitted.values(us.models[[i]])),
    # predict() produces the actual test-set predictions
    Outsample = encodeClassLabels(predict(us.models[[i]],
                                          newdata = as.matrix(us.test.x)))
  ))
}
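# optional check: encodeClassLabels() codes ambiguous rows as 0 (no
# clearly winning output unit); count them per model
sapply(us.yhat, function(p) sum(p$Outsample == 0))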
# INSAMPLE PERFORMANCE
us.insample <- cbind(Y = us.train.y,
                     do.call(cbind.data.frame, lapply(us.yhat, `[[`, "Insample")))
# `[[` used as a function extracts the element named "Insample"
# from each model's prediction list
colnames(us.insample) <- c("Y", paste0("Yhat", 1:5))
# difference between substitute() and quote():
# quote(x + y) returns the expression x + y untouched, while substitute()
# also swaps in values/symbols found in the supplied list or environment
# expr <- substitute(x + y, list(x = 1))
# print(expr) # 1 + y
# eval(expr, list(y = 2)) # 3
performance.insample <- do.call(rbind, lapply(1:5, function(i) {
  # rbind stacks the five models' one-row summaries below each other
  us.dat <- us.insample[us.insample[, paste0("Yhat", i)] != 0, ]
  # drop the zeros: those rows are the ambiguous/uncertain predictions
  us.dat$Y <- factor(us.dat$Y, levels = 1:6)
  us.dat[, paste0("Yhat", i)] <- factor(us.dat[, paste0("Yhat", i)], levels = 1:6)
  # build the expression ~ Y + Yhat_i for xtabs(); as.name() turns the
  # column-name string into a symbol so substitute() can splice it in
  f <- substitute(~ Y + x, list(x = as.name(paste0("Yhat", i))))
  res <- caret::confusionMatrix(xtabs(f, data = us.dat))
  # pick the accuracy entries out of res$overall and transpose into one row
  cbind(Size = tuning$size[[i]],
        Maxit = tuning$maxit[[i]],
        Shuffle = tuning$shuffle[[i]],
        as.data.frame(t(res$overall[c("AccuracyNull", "Accuracy",
                                      "AccuracyLower", "AccuracyUpper")])))
}))
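# AccuracyNull is the no-information rate (always predicting the most
# frequent class); Lower/Upper are caret's exact binomial CI bounds
performance.insample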
# OUTSAMPLE PERFORMANCE
us.outsample <- cbind(Y = us.test.y,
                      do.call(cbind.data.frame, lapply(us.yhat, `[[`, "Outsample")))
colnames(us.outsample) <- c("Y", paste0("Yhat", 1:5))
# same evaluation as above, this time on the held-out test set
performance.outsample <- do.call(rbind, lapply(1:5, function(i) {
  us.dat <- us.outsample[us.outsample[, paste0("Yhat", i)] != 0, ]
  us.dat$Y <- factor(us.dat$Y, levels = 1:6)
  us.dat[, paste0("Yhat", i)] <- factor(us.dat[, paste0("Yhat", i)], levels = 1:6)
  f <- substitute(~ Y + x, list(x = as.name(paste0("Yhat", i))))
  res <- caret::confusionMatrix(xtabs(f, data = us.dat))
  cbind(Size = tuning$size[[i]],
        Maxit = tuning$maxit[[i]],
        Shuffle = tuning$shuffle[[i]],
        as.data.frame(t(res$overall[c("AccuracyNull", "Accuracy",
                                      "AccuracyLower", "AccuracyUpper")])))
}))
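# optional: put in-sample and out-of-sample accuracy side by side to
# gauge how much each model overfits
data.frame(Model = 1:5,
           In = performance.insample$Accuracy,
           Out = performance.outsample$Accuracy)
# release the workers now that all parallel steps are done
stopCluster(c2)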
# Next Steps:
# 1) compare against RF and XGBoost performance (is the NN worth it?)
# 2) look at other UCI datasets for predicting user behavior
# 3) introduce SHAP values
# 4) combine features via a PCA + NN approach (rough sketch below)
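# ---- SKETCH for step 4 (untested; component count and size are
# placeholder assumptions, not tuned values): compress the feature
# space with PCA, then train one smaller MLP on the leading components
# us.pca <- prcomp(us.train.x, center = TRUE, scale. = TRUE)
# us.train.pc <- us.pca$x[, 1:60]
# us.test.pc <- predict(us.pca, newdata = us.test.x)[, 1:60]
# set.seed(1234)
# us.model.pc <- mlp(us.train.pc, decodeClassLabels(us.train.y),
#                    size = 40, learnFunc = "Rprop", maxit = 100)
# us.yhat.pc <- encodeClassLabels(predict(us.model.pc, newdata = us.test.pc))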