-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathNaive_Bayes_Random_Forest_SanjayKPattanayak.R
134 lines (93 loc) · 4.3 KB
/
Naive_Bayes_Random_Forest_SanjayKPattanayak.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# Major : Business Intelligence and Analytics
# Subject : CS-513/Knowledge Discovery in Databases
# Purpose : Assignment-5/Create Naive Bayes and Random Forest classification models
# First Name : Sanjay Kumar
# Last Name : Pattanayak
# Id : 10431486
# Date : Apr'24th 2018
############################################## ### QUESTION 1 ############################################## ###
## Question 2-The "breast cancer dataset" in CANVAS was obtained from the University of Wisconsin Hospitals,
#Madison from Dr. William H. Wolberg. The features in the dataset, described below, have been categorized from 1 to 10.
#Use these categorized features to answer the following questions.
#Important: make sure your categories are represented by the "factor" data type in R and DO NOT replace the missing values.
#Features Domain
#-- -----------------------------------------
# Sample code number id number
#F1. Clump Thickness 1 - 10
#F2. Uniformity of Cell Size 1 - 10
#F3. Uniformity of Cell Shape 1 - 10
#F4. Marginal Adhesion 1 - 10
#F5. Single Epithelial Cell Size 1 - 10
#F6. Bare Nuclei 1 - 10
#F7. Bland Chromatin 1 - 10
#F8. Normal Nucleoli 1 - 10
#F9. Mitoses 1 - 10
#Diagnosis Class: (2 for benign, 4 for malignant)
############################################## ### QUESTION 1 ############################################## ###
#7.1 Use the Naïve Bayes methodology to develop a classification model for the Diagnosis.
############################################## ### QUESTION 1 ############################################## ###
#7.2 Use the Random Forest methodology to develop a classification model for the Diagnosis. What are the top three important features?
############################################## ### SOLUTION 1 ############################################## ###
#########################################################
## Step 0: Load the packages and read the data
##
## Produces:
##   dsn  - the raw data frame as read from the CSV file
##   dsn2 - dsn without its first column (the sample id)
#########################################################
#install.packages('e1071', dependencies = TRUE)
library(class)
library(e1071)

# The global environment is deliberately not cleared with rm(list = ls()):
# doing so silently destroys whatever the person sourcing this script had in
# their workspace. Run the script in a fresh R session for a clean state.

# Location of the CSV file. The original author's path stays the default;
# set the BCW_DATA_PATH environment variable to run the script elsewhere.
bcw_path <- Sys.getenv(
  "BCW_DATA_PATH",
  unset = "C://Users/sanja/Google Drive/2ndSem/CS513_KDD_KashaDehnad/Assignments/Assignment5_C50/breast-cancer-wisconsin.data.csv"
)
if (!file.exists(bcw_path)) {
  stop("Data file not found: ", bcw_path, call. = FALSE)
}

# Read the CSV file. Passing the path lets read.csv open and close its own
# connection, so nothing is left open if parsing fails.
# NOTE(review): the UCI distribution of this dataset marks missing values
# with "?"; it is listed in na.strings so such cells become real NA (they are
# not replaced or imputed). Confirm the CANVAS copy uses the same marker.
dsn <- read.csv(bcw_path, header = TRUE, na.strings = c("NA", "?"))

# View() needs an interactive session; skip it when run in batch mode.
if (interactive()) {
  View(dsn)
}

# Drop the first column (sample code number); it is an identifier, not a
# feature. Rows with missing values are kept, as the assignment requires.
dsn2 <- dsn[, -1]

# Fix the RNG state so the train/test splits drawn later are reproducible.
set.seed(234)

if (interactive()) {
  View(dsn2)
}
summary(dsn2)
str(dsn2)
# Convert the features and the class label into factors.
# lapply() converts each column on its own. apply() would first coerce the
# whole data frame to a matrix and hand back character data, so whether the
# columns ended up as factors depended on data.frame()'s stringsAsFactors
# default, and numeric values could be blank-padded on the way.
cols <- c("F1", "F2", "F3", "F4", "F5", "F6", "F7", "F8", "F9", "Class")
dsn2[cols] <- lapply(dsn2[cols], as.factor)
str(dsn2)

# Hold out a random 25% of the rows as the test set; train on the rest.
index <- sort(sample(nrow(dsn2), round(.25 * nrow(dsn2))))
training <- dsn2[-index, ]
test <- dsn2[index, ]

## Naive Bayes classification using all variables
nBayes_all <- naiveBayes(Class ~ ., data = training)
Predicted_Class <- predict(nBayes_all, test)

# Confusion matrix: predicted class (rows) against actual class (columns).
table(NBayes_all = Predicted_Class, Class = test$Class)

# Compare labels as text so the check does not depend on both factors
# carrying identical level sets.
NB_wrong <- sum(as.character(Predicted_Class) != as.character(test$Class))
NB_error_rate <- NB_wrong / length(Predicted_Class)
paste0('The Naive Bayes model gives an Error Rate of:',round(NB_error_rate*100,2),'%')
NB_error_rate
#Random Forest
library(randomForest)

# Drop incomplete rows before fitting the forest.
# NOTE(review): na.omit() only removes rows whose cells are real NA; if the
# data file encodes missing values with some other token, those rows remain.
dsn2 <- na.omit(dsn2)
str(dsn2)

# Hold out a random 25% of the remaining rows as the test set.
index1 <- sort(sample(nrow(dsn2), round(.25 * nrow(dsn2))))
training1 <- dsn2[-index1, ]
test1 <- dsn2[index1, ]

# importance = TRUE makes the fit record the permutation-based
# MeanDecreaseAccuracy measure in addition to MeanDecreaseGini.
fit <- randomForest(Class ~ ., data = training1, importance = TRUE, ntree = 2000)
fit
summary(fit)
features <- importance(fit)
varImpPlot(fit)

Prediction <- predict(fit, test1)

# Address the label column by name rather than by position, so the code
# keeps working if the column order changes.
table(actual = test1$Class, Prediction)
wrong <- (test1$Class != Prediction)
error_rate <- sum(wrong) / length(wrong)
paste0('The Random Forest model gives an Error Rate of:',round(error_rate*100,2),'%')

#Mean Decrease in Accuracy the number or proportion of observations that are incorrectly
#classified by removing the feature (or values from the feature)
# Derive the ranking from the fitted model instead of hard-coding feature
# names, so the reported answer always matches the forest that was built.
mda <- features[, "MeanDecreaseAccuracy"]
top_three <- head(names(sort(mda, decreasing = TRUE)), 3)
paste0('The three most important factors in terms of Mean Decrease Accuracy are:',
       paste(top_three, collapse = ", "))
error_rate