-
Notifications
You must be signed in to change notification settings - Fork 0
/
decision_tree_on_multiple_vars.R
92 lines (66 loc) · 2.99 KB
/
decision_tree_on_multiple_vars.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
####################################################################
### Script for loading data and fitting a decision tree to data ###
### VERSION 2: the "TOTBSQ" and "TOTUSJH" variables are used ###
####################################################################
#
library(dplyr)
library(ggplot2)
library(e1071)       # skewness()
library(rpart)       # decision tree
library(rpart.plot)  # tree visualisation
library(MLmetrics)   # Accuracy(), F1_Score()
# Location of the training data; load() is expected to bring a `data`
# data.frame into the global environment (inferred from usage below).
path_to_data <- "/home/taras/R_scripts/BigDataCup/"
file_name <- "trainingSet.RData"
load(paste0(path_to_data, file_name))
# Mean/variance analysis: collapse the time series of each (ID, LABEL)
# pair into per-variable summary statistics (mean, sd, skewness).
# Explicit list names reproduce the `TOTBSQ_mean`, `TOTBSQ_sd`,
# `TOTBSQ_skewness`, ... column names that the rest of the script uses.
variables <- c("TOTBSQ", "TOTUSJH")
summarisedTOT <- data %>%
  mutate(ID = as.factor(ID), LABEL = as.factor(LABEL), TIME = as.factor(TIME)) %>%
  select(all_of(variables), TIME, ID, LABEL) %>%
  group_by(ID, LABEL) %>%
  summarise(across(where(is.numeric),
                   list(mean = ~mean(.x, na.rm = TRUE),
                        sd = ~sd(.x, na.rm = TRUE),
                        skewness = ~skewness(.x, na.rm = TRUE))))
#summarisedTOT %>% na.omit() %>%
# # filter(TOTBSQ > 0 & TOTBSQ < 5e10) %>%
# ggplot(aes(x=sd, y=skewness, colour=LABEL)) + geom_point(size=0.2)
# Transforming data for the decision tree: drop rows with missing
# statistics and the ID column, leaving LABEL plus the numeric features.
filtered <- summarisedTOT %>% na.omit() %>% ungroup() %>% select(-ID)
# Sequential 80/20 train/test split (rows are NOT shuffled, so the split
# follows the original ID ordering).
sample_size <- nrow(filtered)
train_size <- round(sample_size * 0.8)
test_size <- sample_size - train_size
data_train <- filtered[seq_len(train_size), ]
data_test <- filtered[(train_size + 1):sample_size, ]
#y_train = filtered[1:train_size, 1]
#y_test = filtered[(train_size+1):sample_size, 1]
# Fit a classification tree predicting LABEL from all summary statistics.
fit <- rpart(LABEL ~ ., data = data_train, method = "class", model = TRUE)
# Draw the decision tree.
rpart.plot(fit, type = 4, extra = 101)
# type = "vector" returns the predicted class as 1-based factor codes;
# subtracting 1 maps them back to the 0/1 labels.
predictions <- as.numeric(rpart.predict(fit, data_test[, -1], type = "vector") - 1)
# Calculate metrics on the held-out test split.
cat("Accuracy =", MLmetrics::Accuracy(predictions, data_test$LABEL), "\n")
# Accuracy = 0.8746192
cat("F1_Score =", MLmetrics::F1_Score(data_test$LABEL, as.integer(predictions)), "\n")
# F1_Score = 0.9234831
#########################
### read test dataset ###
#########################
# load() is expected to bring a `test_data` data.frame into the
# environment (inferred from usage below).
file_name <- "testSet.RData"
load(paste0(path_to_data, file_name))
# Same per-ID summary statistics as for the training set (no LABEL in
# the competition test data).
test_summarisedTOT <- test_data %>%
  mutate(ID = as.factor(ID)) %>%
  select(all_of(variables), ID) %>%
  group_by(ID) %>%
  summarise(across(where(is.numeric),
                   list(mean = ~mean(.x, na.rm = TRUE),
                        sd = ~sd(.x, na.rm = TRUE),
                        skewness = ~skewness(.x, na.rm = TRUE))))
# check for NAs
#sum(is.na(test_summarisedTOT$TOTUSJH_mean))
#sum(is.na(test_summarisedTOT$TOTUSJH_sd))
#sum(is.na(test_summarisedTOT$TOTUSJH_skewness))
# Replace any remaining NA statistics (e.g. skewness of a constant
# series) with 0 so the tree can still score those rows.
test_summarisedTOT[is.na(test_summarisedTOT)] <- 0
# Score the Kaggle test set (drop the ID column; shift 1-based factor
# codes back to the 0/1 labels).
predictions <- as.numeric(rpart.predict(fit, test_summarisedTOT[, -1], type = "vector") - 1)
predictions %>% table
# Write the submission file. `file =` replaces the deprecated `path =`
# argument of readr::write_csv() (renamed in readr 1.4).
results <- data.frame(Id = seq_along(predictions), ClassLabel = predictions)
readr::write_csv(x = results, file = "submissions/submission9_a_decision_tree_on_2_vars.csv")
# kaggle competitions submit bigdata2019-flare-prediction -f submissions/submission9_a_decision_tree_on_2_vars.csv -m "My 9th submission: a decision tree on TOTBSQ and TOTUSJH"