gradient_boosting.R
##########################################################################
### Script for loading data and fitting gradient boosting algorithms ###
### VERSION 2: the "TOTBSQ" and "TOTUSJH" variables are used ###
##########################################################################
# libraries
library(reticulate)
# use_condaenv("baseclone")
use_python("/usr/bin/python3", required=TRUE)
py_config()
library(dplyr)
library(ggplot2)
library(e1071)
### data
path_to_data = "/home/taras/taras_personal/BigDataCup/"
file_name = "trainingSet.RData"
load(paste0(path_to_data, file_name))
# per-region summary statistics (mean, sd, skewness) of the selected variables
variables = c("ABSNJZH", "R_VALUE", "SAVNCPP", "TOTBSQ","TOTUSJH")
#variables = c("TOTUSJH", "TOTBSQ")
#variables = c("EPSX", "MEANALP")
summarisedTOT = data %>%
  mutate(ID = as.factor(ID), LABEL = as.factor(LABEL), TIME = as.factor(TIME)) %>%
  select(all_of(variables), TIME, ID, LABEL) %>%
  group_by(ID, LABEL) %>%
  summarise_if(is.numeric, list(mean = ~mean(., na.rm=TRUE), sd = ~sd(., na.rm=TRUE), skewness = ~skewness(., na.rm=TRUE)))
#summarise_if(is.numeric, list(~mean(., na.rm=TRUE), ~sd(., na.rm=TRUE), ~quantile(., probs=0.25, na.rm=TRUE), ~quantile(., probs=c(0.75), na.rm=TRUE)), na.rm=TRUE)
#summarisedTOT %>% na.omit() %>%
# # filter(TOTBSQ > 0 & TOTBSQ < 5e10) %>%
# ggplot(aes(x=sd, y=skewness, colour=LABEL)) + geom_point(size=0.2)
# transforming the summarised data into the modelling table used by the classifiers below
filtered = summarisedTOT %>% na.omit() %>% ungroup() %>% select(-ID)
#filtered[-1] = scale(filtered[-1], center=TRUE, scale=TRUE)
sample_size = dim(filtered)[1]
train_size = round(sample_size*.8)
test_size = sample_size - train_size
train_rows = sample(sample_size, size=train_size)
#data_train = filtered[1:train_size,]
#data_test = filtered[(train_size+1):sample_size,]
data_train = filtered[train_rows,]
data_test = filtered[-train_rows,]
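# Optional sketch (not in the original script): the split above ignores class balance.
# Assuming LABEL is a factor with levels "0"/"1", a stratified split can be drawn per
# class so train and test keep roughly the same flare/non-flare ratio:
# train_rows_strat = unlist(lapply(split(seq_len(sample_size), filtered$LABEL),
#                                  function(idx) sample(idx, size = round(length(idx) * .8))))
# data_train = filtered[train_rows_strat,]
# data_test = filtered[-train_rows_strat,]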
#y_train = filtered[1:train_size, 1]
#y_test = filtered[(train_size+1):sample_size, 1]
library(xgboost)
fit_xgboost = xgboost(data = as.matrix(data_train[,-1]), label = as.numeric(data_train$LABEL) - 1,
                      max_depth = 5, eta = 1, nrounds = 5, objective = "binary:logistic")
summary(fit_xgboost)
xgb.dump(fit_xgboost, 'model.dump')
# with objective = "binary:logistic", predict() already returns probabilities, so no extra plogis() is needed
predicted = predict(fit_xgboost, as.matrix(data_test[,-1]))
optCutOff <- InformationValue::optimalCutoff(as.numeric(data_test$LABEL) - 1, predicted)
predictions = ifelse(predicted > optCutOff, 1, 0) # classify using the optimal cutoff
predictions %>% table
# calculate metrics on test dataset
cat("Accuracy =", MLmetrics::Accuracy(predictions, data_test$LABEL))
# Accuracy = 0.8859593
cat("F1_Score =", MLmetrics::F1_Score(data_test$LABEL, as.integer(predictions)))
# F1_Score = 0.9324417
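# Optional inspection (a sketch, not part of the original pipeline): a confusion matrix
# on the test rows and per-feature gain from the fitted booster, assuming the
# fit_xgboost and predictions objects from above are still in scope.
# table(predicted = predictions, actual = data_test$LABEL)
# importance = xgb.importance(feature_names = colnames(data_train[,-1]), model = fit_xgboost)
# print(importance)            # Gain / Cover / Frequency per feature
# xgb.plot.importance(importance)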
library(caret)
#library(doParallel)
#cl <- makeForkCluster()
#registerDoParallel(cl)
#stopCluster(cl)
library(doMC)
registerDoMC(10)
f1 <- function(data, lev = NULL, model = NULL) {
  # print(data)
  f1_val <- MLmetrics::F1_Score(y_pred = data$pred, y_true = data$obs, positive = "1")
  c(F1 = f1_val)
}
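# Illustrative check of the summary function (not in the original script): caret calls it
# with a data.frame holding factor columns `obs` (truth) and `pred` (prediction), e.g.
# f1(data.frame(obs = factor(c("1", "1", "0")), pred = factor(c("1", "0", "0"))))
# # returns a named vector c(F1 = ...)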
#mat = lapply(c("LogitBoost", 'xgbTree', 'rf', 'svmRadial'),
# function (met) {
# train(LABEL ~ ., method=met, data=filtered)
# })
xgb_grid <- expand.grid(
  nrounds = c(10, 100, 200),
  eta = c(0.3, 0.1),
  max_depth = c(3, 5, 7),
  gamma = 0,
  colsample_bytree = c(0.8, 1.0),
  min_child_weight = 1,
  subsample = c(0.8, 1.0)
)
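# the grid above spans 3 * 2 * 3 * 2 * 2 = 72 hyper-parameter combinations, each refit on
# every resample requested in trainControl() below, so the search can take a while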
fit_xgboost = train(LABEL ~ ., method = "xgbTree", data = filtered, metric = "F1",
                    trControl = trainControl(number = 5,
                                             summaryFunction = f1),
                    tuneGrid = xgb_grid)
# note: data_test rows are also part of `filtered`, so this test-set evaluation is optimistic
predictions = predict(fit_xgboost, data_test[,-1])
predictions %>% table
MLmetrics::F1_Score(y_true=data_test$LABEL, y_pred=predictions, positive="1")
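# Possible follow-up (a sketch, not part of the original script): inspect the selected
# hyper-parameters and the resampling table, and summarise the test predictions with
# caret's confusionMatrix(); assumes the caret model fit_xgboost and predictions above.
# fit_xgboost$bestTune                                          # winning hyper-parameter combination
# head(fit_xgboost$results[order(-fit_xgboost$results$F1), ])   # grid results ranked by resampled F1
# caret::confusionMatrix(predictions, data_test$LABEL, positive = "1")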