-
Notifications
You must be signed in to change notification settings - Fork 1
/
Kaggle_credit_card_fraud_detection_H2O.R
57 lines (38 loc) · 1.66 KB
/
Kaggle_credit_card_fraud_detection_H2O.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# Credit card fraud detection with a logistic regression (GLM) on H2O.
#
# Pipeline: read creditcard.csv -> 70/30 split stratified on `Class` ->
# fit a binomial GLM on the training part -> score the held-out part ->
# print a confusion matrix with fraud (Class == 1) as the positive class.

library(data.table)
library(caTools) # sample.split()
library(h2o)
library(caret)   # confusionMatrix()

# 1 - Data acquisition ----
# The path is built explicitly instead of calling setwd(), so the script
# does not change the session's working directory. Set the CREDIT_DATA_DIR
# environment variable to run it on another machine.
data_dir <- Sys.getenv("CREDIT_DATA_DIR", unset = "D:/R Docs/ML code/Kaggle")
data_file <- file.path(
  data_dir, "Credit Card Fraud Detection", "creditcard.csv"
)
if (!file.exists(data_file)) {
  stop("Dataset not found: ", data_file, call. = FALSE)
}
credit <- fread(file = data_file, stringsAsFactors = FALSE)

# Per-column count of missing values; colSums(is.na(.)) is much faster
# than apply(credit, 2, function(x) sum(is.na(x))).
na_counts <- colSums(is.na(credit))
print(na_counts)
if (any(na_counts > 0)) {
  warning(
    "Missing values found; fix the dataset before modelling.",
    call. = FALSE
  )
}

# 2 - Train / test split ----
# sample.split() keeps the ratio of `Class` labels in both parts; the seed
# makes the split reproducible.
set.seed(2)
split <- sample.split(credit$Class, SplitRatio = 0.7)
# `split` is already a logical vector, so it is used directly as the row
# mask (no comparison against the strings 'TRUE' / 'FALSE').
training_credit <- subset(credit, split)
testing_credit <- subset(credit, !split)

# 3 - Start a local H2O cluster and upload the data ----
# nthreads = -1 lets H2O use all available cores. A single h2o.init() call
# is enough; calling it again only reconnects to the same cluster.
localH2O <- h2o.init(nthreads = -1)
train.h2o <- as.h2o(training_credit)
test.h2o <- as.h2o(testing_credit)

# Dependent variable: position of the `Class` column, looked up by name so
# the script does not silently use the wrong column if the layout changes.
y.dep <- which(names(credit) == "Class")
stopifnot(length(y.dep) == 1L)
# Independent variables: every other column (no column is dropped).
x.indep <- setdiff(seq_along(credit), y.dep)

# 4 - Fit the logistic model ----
regression.model <- h2o.glm(
  x = x.indep,
  y = y.dep,
  training_frame = train.h2o,
  family = "binomial"
)
# Metrics on the training frame (no newdata is supplied).
print(h2o.performance(regression.model))

# 5 - Model validation on the held-out data ----
predict.reg <- as.data.frame(h2o.predict(regression.model, test.h2o))
# `p1` is used as the predicted probability of Class == 1; it is cut at
# the threshold to obtain hard 0/1 labels.
fraud_threshold <- 0.5
predicted_test <- ifelse(predict.reg$p1 > fraud_threshold, 1, 0)

# Compare predictions with the true labels of the test split.
# Explicit factor levels keep the table 2 x 2 even when only one class is
# predicted, and positive = "1" makes sensitivity / precision describe the
# fraud class instead of caret's default (the first level, "0").
class_levels <- c(0, 1)
confusion <- confusionMatrix(
  table(
    Prediction = factor(predicted_test, levels = class_levels),
    Actual = factor(testing_credit$Class, levels = class_levels)
  ),
  positive = "1"
)
print(confusion)