diff --git a/Homework 1/solution_2.2-1.R b/Homework 1/solution_2.2-1.R new file mode 100644 index 0000000..5a1d819 --- /dev/null +++ b/Homework 1/solution_2.2-1.R @@ -0,0 +1,379 @@ +# -------------------- Code for Question 2.2 part 1 ----------------------------- +# Clear environment + +rm(list = ls()) + +# Load the kernlab library (which contains the ksvm function) and read in the data +# + +library(kernlab) + +# ---------------------------- Data manipulation ------------------------------------- + +data <- read.table("/Users/Chewy/Downloads/credit_card_data.txt", stringsAsFactors = FALSE, header = FALSE) + +# +# optional check to make sure the data is read correctly +# + +head(data) + +# Console output for head(data) +## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 +## 1 1 30.83 0.000 1.25 1 0 1 1 202 0 1 +## 2 0 58.67 4.460 3.04 1 0 6 1 43 560 1 +## 3 0 24.50 0.500 1.50 1 1 0 1 280 824 1 +## 4 1 27.83 1.540 3.75 1 0 5 0 100 3 1 +## 5 1 20.17 5.625 1.71 1 1 0 1 120 0 1 +## 6 1 32.08 4.000 2.50 1 1 0 0 360 0 1 +# NOTE: ALL ROWS OF THIS FILE STARTING WITH "##" DENOTE R OUTPUT +# + +# Setting the random number generator seed so that our results are reproducible +# (Your solution doesn't need this, but it's usually good practice to do) + +set.seed(1) + +# -------------------------- Creating the models ------------------------------------ + +# -------------------------- Scaled=TRUE model ------------------------------------ + +# Fit the model using scaled=TRUE. 
+# V11 is response, other variables are predictors +# + +model_scaled <- ksvm(as.matrix(data[,1:10]),as.factor(data[,11]), + type = "C-svc", # Use C-classification method + kernel = "vanilladot", # Use simple linear kernel + C = 100, + scaled=TRUE) # have ksvm scale the data for you + +# or you could use this call; it does the same thing + +model_scaled <- ksvm(V11~.,data=data, + type = "C-svc", # Use C-classification method + kernel = "vanilladot", # Use simple linear kernel + C = 100, + scaled=TRUE) # have ksvm scale the data for you + +#Attributes model show what the data structure model has to reference +#For example, we use model@b to get the intercept and model@coef to get the coefficients +#Those references (b and coef) can be found listed in the console by using attributes(model) + +attributes(model_scaled) + +# Console output for attributes(model_scaled) is left out since it is a long output + +#model lists some high level information about the model data structure + +model_scaled + +# Console output for model_scaled +## +## Support Vector Machine object of class "ksvm" +## SV type: C-svc (classification) +## parameter : cost C = 100 +## Linear (vanilla) kernel function. +## Number of Support Vectors : 189 +## Objective Function Value : -17887.92 +## Training error : 0.136086 + +# -------------------------- Calculating the a coefficients ------------------------------------ +# +#Classification is done using linear kernel, a*scaled(x) + a0. +# Unfortunately, the model does not output a directly, but we can use the model output to find a. 
+# calculate a1 to am using the stored data point values in the model data structure and corresponding coefficients +# multiplying the xmatrix by the coef gives the linear combination of data points that define a1,...,am +# we use the xmatrix attribute since the model stores these data points as scaled + +a_scaled <- colSums(model_scaled@xmatrix[[1]] * model_scaled@coef[[1]]) + +# +# a0 is just -model_scaled@b + +a0_scaled<- -model_scaled@b + +# + +a_scaled +a0_scaled + +#Console output for a_scaled +## V1 V2 V3 V4 V5 +## -0.0010065348 -0.0011729048 -0.0016261967 0.0030064203 1.0049405641 +## V6 V7 V8 V9 V10 +## -0.0028259432 0.0002600295 -0.0005349551 -0.0012283758 0.1063633995 + +#Console output for a0_scaled +## [1] 0.08158492 + +# -------------------------- Calculating the predicted values ------------------------------------ +# +#The ksvm package provides a predict() function that implements this for us, but we also +#show how to get the predicted values using the a coefficients + +# Calculate the predicted values using the a's we got above and our data set. +# The coefficients for this model are based on the SCALED data points, so we need to +# scale our data points to get the correct predictions. 
We do this by using the scaled +# mean and standard deviation values for V1 to V10 stored in the model data structure as: +# model@scaling$x.scale$`scaled:center` (means for V1 to V10) +# model@scaling$x.scale$`scaled:scale` (standard deviation for V1 to V10) +# Then we transform the data points into their scaled equivalent by using the function: +# scaled data point[i,1:10] = (data point[i,1:10] - model@scaling$x.scale$`scaled:center`)/model@scaling$x.scale$`scaled:scale` +# +#Create predicted vector (to hold our calculated predicted values) + +predicted_scaled<-rep(0,nrow(data)) + +#For each data point, perform the transformation, calculate a*scaled(data point)+a0, +#and predict value of data point based on the resulting value + +for (i in 1:nrow(data)){ + + #If the data point is above the classifier, predicted value = 1 + + if (sum(a_scaled*(data[i,1:10]-model_scaled@scaling$x.scale$`scaled:center`)/model_scaled@scaling$x.scale$`scaled:scale`) + a0_scaled >= 0){ + predicted_scaled[i] <- 1 + } + + #If the data point is below the classifier, predicted value = 0 + + if (sum(a_scaled*(data[i,1:10]-model_scaled@scaling$x.scale$`scaled:center`)/model_scaled@scaling$x.scale$`scaled:scale`) + a0_scaled < 0){ + predicted_scaled[i] <- 0 + } +} +predicted_scaled + +# Output from predicted_scaled +## [1] 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 +## [42] 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 +## [83] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 +## [124] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 +## [165] 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 +## [206] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 +## [247] 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +## [288] 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 
0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +## [329] 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +## [370] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +## [411] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +## [452] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 +## [493] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 +## [534] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 0 1 +## [575] 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +## [616] 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + + +# Get prediction from ksvm model we created, model_scaled +# Note that we could also get the predicted values of the model using model_scaled@fitted +# + +pred_scaled <- predict(model_scaled,data[,1:10]) +pred_scaled + +#Output from pred_scaled +## [1] 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 +## [42] 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 +## [83] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 +## [124] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 +## [165] 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 +## [206] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 +## [247] 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +## [288] 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +## [329] 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +## [370] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +## [411] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +## [452] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 +## [493] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 +## [534] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 0 1 +## [575] 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +## [616] 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + +# typing "pred_scaled" will give the sequence of 1s and 0s showing the model's classification +# As you can see in the outputs, pred and predicted have the same predicted values +# so we know that our a coefficients are correct for the SCALED data version of the model + +# -------------------------- Calculating the model's accuracy ------------------------------------ +# +# I will use a simple accuracy measure that outputs the +# percent of testing observations that are correctly classified. + +sum(pred_scaled == data$V11) / nrow(data) +sum(predicted_scaled == data$V11) / nrow(data) + +#Output from sum(pred_scaled == data$V11) / nrow(data) +## [1] 0.8639144 +# +#Output from sum(predicted_scaled == data$V11) / nrow(data) +## [1] 0.8639144 +# Note that this result is found by a wide range of values of C. + + + + + + + + + +# -------------------------- Scaled=FALSE model ------------------------------------ + +# Fit the model using scaled=FALSE. 
+# V11 is response, other variables are predictors +# + +model_unscaled <- ksvm(as.matrix(data[,1:10]),as.factor(data[,11]), + type = "C-svc", # Use C-classification method + kernel = "vanilladot", # Use simple linear kernel + C = 100, + scaled=FALSE) # ksvm will not scale the data for you + +# or you could use this call; it does the same thing + +model_unscaled <- ksvm(V11~.,data=data, + type = "C-svc", # Use C-classification method + kernel = "vanilladot", # Use simple linear kernel + C = 100, + scaled=FALSE) # ksvm will not scale the data for you + +#Attributes model show what the data structure model has to reference +#For example, we use model_unscaled@b to get the intercept and model_unscaled@coef to get the coefficients +#Those references (b and coef) can be found listed in the console by using attributes(model_unscaled) + +attributes(model_unscaled) + +# Console output for attributes(model_unscaled) is left out since it is a long output + +#model lists some high level information about the model data structure + +model_unscaled + +# Console output for model_unscaled +## +## Support Vector Machine object of class "ksvm" +## SV type: C-svc (classification) +## parameter : cost C = 100 +## Linear (vanilla) kernel function. +## Number of Support Vectors : 186 +## Objective Function Value : -2213.731 +## Training error : 0.278287 + +# -------------------------- Calculating the a coefficients ------------------------------------ +# +#Classification is done using linear kernel, a*unscaled(x) + a0 = a*x + a0. +# Unfortunately, the model does not output a directly, but we can use the model output to find a. 
+# calculate a1 to am using the stored data point values in the model data structure and corresponding coefficients +# multiplying the xmatrix by the coef gives the linear combination of data points that define a1,...,am +# we use the xmatrix attribute since the model stores these data points as unscaled + +a_unscaled <- colSums(model_unscaled@xmatrix[[1]] * model_unscaled@coef[[1]]) + +# +# a0 is just -model_unscaled@b + +a0_unscaled <- -model_unscaled@b + +# + +a_unscaled +a0_unscaled + +#Console output for a_unscaled +## V1 V2 V3 V4 V5 +## -0.0483050561 -0.0083148473 -0.0836550114 0.1751121271 1.8254844547 +## V6 V7 V8 V9 V10 +## 0.2763673361 0.0654782414 -0.1108211169 -0.0047229653 -0.0007764962 + +#Console output for a0_unscaled +## 0.5255393 + +# -------------------------- Calculating the predicted values ------------------------------------ +# +#The ksvm package provides a predict() function that implements this for us, but we also +#show how to get the predicted values using the a coefficients + +# Calculate the predicted values using the a's we got above and our data set +# The coefficients for this model are based on the UNSCALED data points, so we do not need to +# scale our data points to get the correct predictions. 
+ +#Create predicted vector (to hold our calculated predicted values) + +predicted_unscaled<-rep(0,nrow(data)) + +#For each data point, calculate a*(data point)+a0, +#and predict value of data point based on the resulting value + +for (i in 1:nrow(data)){ + + #If the data point is above the classifier, predicted value = 1 + + if (sum(a_unscaled*data[i,1:10]) + a0_unscaled >= 0){ + predicted_unscaled[i] <- 1 + } + + #If the data point is below the classifier, predicted value = 0 + + if (sum(a_unscaled*data[i,1:10]) + a0_unscaled < 0){ + predicted_unscaled[i] <- 0 + } +} +predicted_unscaled + +# Output from predicted_unscaled +## [1] 1 1 1 1 1 1 0 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 0 1 1 0 0 1 1 1 +## [42] 1 1 1 1 1 1 1 0 0 1 1 0 1 1 1 1 1 0 1 1 0 1 1 1 0 1 0 0 0 0 0 1 0 1 0 1 1 1 0 1 1 +## [83] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 +## [124] 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 1 1 0 1 1 0 1 +## [165] 0 1 1 1 1 1 1 1 1 1 1 0 1 1 0 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 +## [206] 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 0 0 0 1 0 1 0 1 0 1 1 0 1 1 1 1 1 0 1 0 +## [247] 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +## [288] 1 0 0 0 0 0 1 0 0 0 0 1 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 +## [329] 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 +## [370] 1 0 1 0 0 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 +## [411] 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 +## [452] 1 1 0 1 0 0 0 1 0 1 0 0 0 0 1 0 0 1 1 0 0 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 +## [493] 0 1 1 1 1 1 1 0 1 0 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 0 +## [534] 0 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 0 1 0 0 1 1 0 1 1 1 0 0 0 1 0 1 1 0 1 0 1 0 0 1 +## [575] 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 
0 0 0 1 +## [616] 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 + +# Get prediction from ksvm model we created +# + +pred_unscaled <- predict(model_unscaled,data[,1:10]) +pred_unscaled + +#Output from pred_unscaled +## [1] 1 1 1 1 1 1 0 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 0 1 1 0 0 1 1 1 +## [42] 1 1 1 1 1 1 1 0 0 1 1 0 1 1 1 1 1 0 1 1 0 1 1 1 0 1 0 0 0 0 0 1 0 1 0 1 1 1 0 1 1 +## [83] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 +## [124] 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 1 1 0 1 1 0 1 +## [165] 0 1 1 1 1 1 1 1 1 1 1 0 1 1 0 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 +## [206] 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 0 0 0 1 0 1 0 1 0 1 1 0 1 1 1 1 1 0 1 0 +## [247] 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +## [288] 1 0 0 0 0 0 1 0 0 0 0 1 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 +## [329] 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 +## [370] 1 0 1 0 0 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 +## [411] 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 +## [452] 1 1 0 1 0 0 0 1 0 1 0 0 0 0 1 0 0 1 1 0 0 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 +## [493] 0 1 1 1 1 1 1 0 1 0 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 0 +## [534] 0 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 0 1 0 0 1 1 0 1 1 1 0 0 0 1 0 1 1 0 1 0 1 0 0 1 +## [575] 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 +## [616] 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 + +# typing "pred_unscaled" will give the sequence of 1s and 0s showing the model's classification +# As you can see in the outputs, pred and predicted have the same predicted values +# so we know that our a coefficients are correct for the SCALED data version of the model + +# 
# -------------------------- Calculating the model's accuracy ------------------------------------
#
# I will use a simple accuracy measure that outputs the
# percent of testing observations that are correctly classified.

sum(pred_unscaled == data$V11) / nrow(data)
sum(predicted_unscaled == data$V11) / nrow(data)

#Output from sum(pred_unscaled == data$V11) / nrow(data)
## [1] 0.7217125
#
#Output from sum(predicted_unscaled == data$V11) / nrow(data)
## [1] 0.7217125

# ==============================================================================
# Next file in this patch: "Homework 1/solution_2.2-3.R" (new file)
# ==============================================================================

# -------------------- Code for Question 2.2 part 3 -----------------------------
# Clear environment

rm(list = ls())

# First, load the kknn library (which contains the kknn function) and read in the data
#

library(kknn)

data <- read.table("credit_card_data.txt", stringsAsFactors = FALSE, header = FALSE)

#
# optional check to make sure the data is read correctly
#

head(data)

## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11
## 1 1 30.83 0.000 1.25 1 0 1 1 202 0 1
## 2 0 58.67 4.460 3.04 1 0 6 1 43 560 1
## 3 0 24.50 0.500 1.50 1 1 0 1 280 824 1
## 4 1 27.83 1.540 3.75 1 0 5 0 100 3 1
## 5 1 20.17 5.625 1.71 1 1 0 1 120 0 1
## 6 1 32.08 4.000 2.50 1 1 0 0 360 0 1
# NOTE: ALL ROWS OF THIS FILE STARTING WITH "##" DENOTE R OUTPUT
#

# Estimate leave-one-out accuracy of a k-nearest-neighbors classifier with k = X.
#
# For every row i, a kknn model is fit on all OTHER rows (data[-i, ]) and used to
# predict row i; leaving row i out is essential, otherwise each point would be
# its own nearest neighbor. The continuous kknn fit is rounded at 0.5 to a 0/1
# class label, and the return value is the fraction of rows classified correctly.
# NOTE: reads the global `data` loaded above.
#
# FIX: use '<-' for assignment (matching the rest of the file's convention) and
# seq_len(nrow(data)) instead of 1:nrow(data), which would yield c(1, 0) on an
# empty data frame.

check_accuracy <- function(X) {
  predicted <- rep(0, nrow(data)) # predictions: start with a vector of all zeros

  # for each row, estimate its response based on the other rows

  for (i in seq_len(nrow(data))) {

    # data[-i] means we remove row i of the data when finding nearest neighbors...
    # ...otherwise, it'll be its own nearest neighbor!

    model <- kknn(V11~V1+V2+V3+V4+V5+V6+V7+V8+V9+V10,
                  data[-i, ], data[i, ], k = X, scale = TRUE) # use scaled data

    # record whether the prediction is at least 0.5 (round to one) or less than 0.5 (round to zero)

    predicted[i] <- as.integer(fitted(model) + 0.5) # round off to 0 or 1
  }

  # calculate fraction of correct predictions

  accuracy <- sum(predicted == data[, 11]) / nrow(data)
  return(accuracy)
}

#
# Now call the function for values of k from 1 to 20 (you could try higher values of k too)
#

acc <- rep(0, 20) # set up a vector of 20 zeros to start
for (X in 1:20) {
  acc[X] <- check_accuracy(X) # test knn with X neighbors
}

#
# report accuracies
#

acc

## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
##[1,] 0.8149847 0.8149847 0.8149847 0.8149847 0.851682 0.8455657 0.8470948 0.8486239
## [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16] [,17]
##[1,] 0.8470948 0.8501529 0.851682 0.853211 0.851682 0.851682 0.853211 0.851682 0.851682
## [,18] [,19] [,20]
##[1,] 0.851682 0.8501529 0.8501529

# ==============================================================================
# Next file in this patch: "Homework 1/solution_3.1-a.R" (new file)
# ==============================================================================

# ------------------------ Code for Question 3.1-A -------------------------------------

# Clear environment

rm(list = ls())

# Installing and calling packages

install.packages("kknn")
library(kknn)

# Reading the data

data <- read.table("credit_card_data.txt", stringsAsFactors = FALSE, header = FALSE)

#
# optional check to make sure the data is read correctly
#

head(data)

## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11
## 1 1 30.83 0.000 1.25 1 0 1 1 202 0 1
## 2 0 58.67 4.460 3.04 1 0 6 1 43 560 1
## 3 0 24.50 0.500 1.50 1 1 0 1 280 824 1
## 4 1 27.83 1.540 3.75 1 0 5 0 100 3 1
## 5 1 20.17 5.625 1.71 1 1 0 1 120 0 1
## 6 1 32.08 4.000 2.50 1 1 0 0 360 0 1
# NOTE: ALL ROWS OF THIS FILE STARTING WITH "##" DENOTE R OUTPUT
#
# Fit the model.
+# V11 is response, other variables are predictors +# + +############# METHOD 1: Using train.kknn ############# +# +# This method uses n-fold cross-validation, where n is the number +# of data points, because that's how train.kknn does +# cross validation. It's also called "leave-one-out" cross +# validation. + +# Setting the random number generator seed so that our results are reproducible + +set.seed(1) + +# set maximum value of k (number of neighbors) to test + +kmax <- 30 + +# use train.kknn for leave-one-out cross-validation up to k=kmax + +model <- train.kknn(V11~.,data,kmax=kmax,scale=TRUE) + +# create array of prediction qualities + +accuracy <- rep(0,kmax) + +# calculate prediction qualities + +for (k in 1:kmax) { + predicted <- as.integer(fitted(model)[[k]][1:nrow(data)] + 0.5) # round off to 0 or 1 + accuracy[k] <- sum(predicted == data$V11) +} + +# show accuracies + +accuracy + +## [1] 533 533 533 533 557 553 554 555 554 557 557 558 557 557 558 558 558 557 556 556 555 554 552 553 553 552 550 548 549 550 + +############# METHOD 2: cv.kknn from kknn package ############# + +# Setting the random number generator seed so that our results are reproducible + +set.seed(1) + +# set maximum value of k (number of neighbors) to test + +kmax <- 30 + +# create array of prediction qualities + +accuracy_cv <- rep(0,kmax) + +# calculate prediction qualities + +for (k in 1:kmax) { + + # run cross-validation for each value of k (number of neighbors) + model <- cv.kknn(V11~.,data, + kcv=10, # 10-fold cross-validation + k=k, # number of neighbors + scale=TRUE) # scale data + + predicted <- as.integer(model[[1]][,2] + 0.5) # round off to 0 or 1 + accuracy_cv[k] <- sum(predicted == data$V11) +} + +# show accuracies + +accuracy_cv + +## [1] 524 533 534 526 549 560 552 552 557 557 554 556 556 558 544 552 564 551 557 558 551 558 555 +##[24] 550 553 545 549 547 554 553 + +############# METHOD 3: Using caret package ############# + +# Caret is a powerful package that uses a lot 
of other packages to give a comprehensive +# toolkit for model building and validation. + +# Load the caret library to perform k-fold cross validation +# There could be issues installing this package and dependencies as our TAs faced + +install.packages("caret",dependencies = TRUE) +install.packages("quantreg") +library(caret) + +# Setting the random number generator seed so that our results are reproducible + +set.seed(1) + +# set number of values of k (number of neighbors) to test +# the default here is to try odd numbers, to avoid ties + +kmax <- 15 + +# note that the double use of "k" (k-nearest neighbors and k-fold cross validation) can be confusing + +knn_fit <- train(as.factor(V11)~V1+V2+V3+V4+V5+V6+V7+V8+V9+V10, + data, + method = "knn", # choose knn model + trControl=trainControl( + method="repeatedcv", # k-fold cross validation + number=10, # number of folds (k in cross validation) + repeats=5), # number of times to repeat k-fold cross validation + preProcess = c("center", "scale"), # standardize the data + tuneLength = kmax) # max number of neighbors (k in nearest neighbor) + +# We now check the result to identify the best value of k and the associated accuracy + +knn_fit + +# The result from the model is summarized below + +##k-Nearest Neighbors +## +##654 samples +## 10 predictor +## 2 classes: '0', '1' +## +##Pre-processing: centered, scaled +##Resampling: Cross-Validated (10 fold, repeated 5 times) +## +##Summary of sample sizes: 589, 589, 588, 588, 589, 588, ... 
+## +##Resampling results across tuning parameters: +## +## k Accuracy Kappa Accuracy SD Kappa SD +## 5 0.8458633 0.6898403 0.04193016 0.08381092 +## 7 0.8454707 0.6897756 0.04212679 0.08410594 +## 9 0.8375406 0.6738288 0.03968248 0.07920391 +## 11 0.8335688 0.6657697 0.04631226 0.09307300 +## 13 0.8335166 0.6653878 0.04783943 0.09597735 +## 15 0.8298571 0.6576198 0.05000524 0.10027540 +## 17 0.8335734 0.6653173 0.04494915 0.09025457 +## 19 0.8384454 0.6745661 0.04424897 0.08887068 +## 21 0.8409496 0.6794267 0.04723457 0.09518618 +## 23 0.8415554 0.6801108 0.04560664 0.09256435 +## 25 0.8366410 0.6694879 0.04341832 0.08854984 +## 27 0.8390746 0.6739308 0.04163978 0.08506440 +## 29 0.8433781 0.6820578 0.04426343 0.09061453 +## 31 0.8424452 0.6798053 0.04738523 0.09727813 +## 33 0.8427531 0.6802807 0.04846920 0.09929343 +## +##Accuracy was used to select the optimal model using the largest value. +##The final value used for the model was k = 5. + diff --git a/Homework 1/solution_3.1-b.R b/Homework 1/solution_3.1-b.R new file mode 100644 index 0000000..a912f33 --- /dev/null +++ b/Homework 1/solution_3.1-b.R @@ -0,0 +1,196 @@ +# ------------------------ Code for Question 3.1-B ------------------------------------- + +# Clear environment + +rm(list = ls()) + +# Load the kernlab library (which contains the ksvm function) and read in the data +# + +library(kernlab) + +# Installing and calling kknn packages + +install.packages("kknn") +library(kknn) + +# Reading the data + +data <- read.table("credit_card_data.txt", stringsAsFactors = FALSE, header = FALSE) + +# +# optional check to make sure the data is read correctly +# + +head(data) + +## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 +## 1 1 30.83 0.000 1.25 1 0 1 1 202 0 1 +## 2 0 58.67 4.460 3.04 1 0 6 1 43 560 1 +## 3 0 24.50 0.500 1.50 1 1 0 1 280 824 1 +## 4 1 27.83 1.540 3.75 1 0 5 0 100 3 1 +## 5 1 20.17 5.625 1.71 1 1 0 1 120 0 1 +## 6 1 32.08 4.000 2.50 1 1 0 0 360 0 1 +# NOTE: ALL ROWS OF THIS FILE STARTING WITH "##" 
DENOTE R OUTPUT +# +# Fit the model. +# V11 is response, other variables are predictors +# + +# Setting the random number generator seed so that our results are reproducible +# (Your solution doesn't need this, but it's usually good practice to do) + +set.seed(1) + +# --------- Split data into training, validation, and test sets --------- + +# Creating a mask using the sample function for the split +# The "mask" is the set of row indices -- for example, +# if rows 1, 4, 5, and 8 are chosen, then mask will be +# (1,4,5,8). + +# 60% for training -- "sample" selects a sample of data points + +mask_train = sample(nrow(data), size = floor(nrow(data) * 0.6)) +cred_train = data[mask_train,] # training data set + +# Using the remaining data for test and validation split + +remaining = data[-mask_train, ] # all rows except training + +# Half of what's left for validation, half for test + +mask_val = sample(nrow(remaining), size = floor(nrow(remaining)/2)) + +cred_val = remaining[mask_val,] # validation data set +cred_test = remaining[-mask_val, ] # test data set + +# +# We'll pick the best of 9 SVM models and 20 KNN models + +acc <- rep(0,29) # 1-9 are SVM, 10-29 are KNN + +# +# --------------- Train SVM models ------------------- +# + +# values of C to test + +amounts <- c(0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000) + +for (i in 1:9) { + + # fit model using training set + + model_scaled <- ksvm(as.matrix(cred_train[,1:10]), + as.factor(cred_train[,11]), + type = "C-svc", # Use C-classification method + kernel = "vanilladot", # Use simple linear kernel + C = amounts[i], + scaled=TRUE) # have ksvm scale the data for you + + # compare models using validation set + + pred <- predict(model_scaled,cred_val[,1:10]) + acc[i] = sum(pred == cred_val$V11) / nrow(cred_val) +} + +acc[1:9] + +## [1] 0.5114504 0.5114504 0.7328244 0.8320611 0.8320611 0.8320611 0.8320611 0.8320611 0.8320611 + +# find best-performing SVM model on validation data + +# Note: "\n" is a newline 
character + +cat("Best SVM model is number ",which.max(acc[1:9]),"\n") +cat("Best C value is ",amounts[which.max(acc[1:9])],"\n") +cat("Best validation set correctness is ",max(acc[1:9]),"\n") + +## Best SVM model is number 4 +## Best C value is 0.01 +## Best validation set correctness is 0.8320611 +# +# Note that as you can see above when we printed acc[1:9], +# all C values we tested from 0.01 to 1000 look the same +# + +# retrain the best model (since I've overwritten it above) + + model_scaled <- ksvm(as.matrix(cred_train[,1:10]), + as.factor(cred_train[,11]), + type = "C-svc", # Use C-classification method + kernel = "vanilladot", # Use simple linear kernel + C = amounts[which.max(acc[1:9])], + scaled=TRUE) # have ksvm scale the data for you + + +cat("Performance on test data = ",sum(predict(model_scaled,cred_test[,1:10]) == cred_test$V11) / nrow(cred_test),"\n") + +## Performance on test data = 0.8549618 + +# +# --------------- Train KNN models ------------------- +# + +for (k in 1:20) { + + # fit k-nearest-neighbor model using training set, validate on test set + + knn_model <- kknn(V11~.,cred_train,cred_val,k=k,scale=TRUE) + + # compare models using validation set + + pred <- as.integer(fitted(knn_model)+0.5) # round off to 0 or 1 + + acc[k+9] = sum(pred == cred_val$V11) / nrow(cred_val) +} + +acc[10:29] + +## [1] 0.7862595 0.7862595 0.7862595 0.7862595 0.7938931 0.7862595 0.7786260 0.7862595 0.7786260 +##[10] 0.7938931 0.7938931 0.7938931 0.7862595 0.7938931 0.7938931 0.8091603 0.8091603 0.8091603 +##[19] 0.8091603 0.8091603 + +# find best-performing KNN model on validation data + +cat("Best KNN model is k=",which.max(acc[10:29]),"\n") +cat("Best validation set correctness is ",max(acc[10:29]),"\n") + +## Best KNN model is k= 16 +## Best validation set correctness is 0.8091603 + +# run best model on test data + + knn_model <- kknn(V11~.,cred_train,cred_test, + k=which.max(acc[10:29]), + scale=TRUE) + + pred <- as.integer(fitted(knn_model)+0.5) # round off 
# ...rounded to 0 or 1

cat("Performance on test data = ", sum(pred == cred_test$V11) / nrow(cred_test), "\n")

## Performance on test data =  0.8778626

#
# --------------- Evaluate overall best model on test data -------------------
#
# acc[1:9] holds SVM validation accuracies, acc[10:29] holds KNN validation
# accuracies; whichever family owns the overall maximum is evaluated once on
# the held-out test set.

if (which.max(acc) <= 9) { # if a ksvm method is best

  # evaluate the ksvm method on the test set to find estimated quality

  cat("Use ksvm with C = ", amounts[which.max(acc[1:9])], "\n")
  # FIX: corrected typo in the output string ("performace" -> "performance")
  cat("Test performance = ", sum(predict(model_scaled, cred_test[, 1:10]) == cred_test$V11) / nrow(cred_test), "\n")

} else { # the best is a knn method

  # evaluate the knn method on the test set to find estimated quality
  # (`pred` currently holds the best KNN model's predictions on cred_test)

  cat("Use knn with k = ", which.max(acc[10:29]), "\n")
  # FIX: compare the TEST-set predictions against the TEST-set labels.
  # The original compared `pred` (length nrow(cred_test)) against the
  # VALIDATION labels cred_val$V11 and divided by nrow(cred_val) -- wrong
  # data set, and in general a length-mismatched comparison.
  cat("Test performance = ", sum(pred == cred_test$V11) / nrow(cred_test), "\n")

}

## Use ksvm with C = 0.01
## Test performance = 0.8549618