---
title: "Kickstarter_projects"
author: "Mattia Brocco"
output:
  html_document:
    df_print: paged
---
```{r, include = FALSE}
library(caret)
library(dplyr)
library(psych)
library(glmnet)
library(mosaic)
library(ggplot2)
library(reshape2)
library(lubridate)
library(tigerstats)
# MAIN REFERENCE: http://topepo.github.io/caret/index.html
```
```{r}
df <- read.csv(".\\data\\ks-projects-201801.csv", sep = ",")
head(df)
```
```{r}
dim(df)
summary(df)
```
## Data Exploration
Additional information on selected columns (an illustrative peek at them follows the list):

* *usd.pledged*: conversion of the _pledged_ column into USD (done by Kickstarter; the original column name `usd pledged` is read into R as `usd.pledged`).
* *usd_pledged_real*: conversion of the _pledged_ column into USD (via the Fixer.io API).
* *usd_goal_real*: conversion of the _goal_ column into USD (via the Fixer.io API).
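For instance, a few non-USD rows show the two conversions side by side:

```{r}
# Illustrative check: compare the two USD conversions of "pledged" for non-USD projects
head(df[df$currency != "USD",
        c("currency", "pledged", "usd.pledged", "usd_pledged_real")])
```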
```{r}
ggplot(df, aes(main_category)) + geom_bar(fill = "#0073C2FF") + theme_classic()
ggplot(df, aes(currency)) + geom_bar(fill = "#0073C2FF") + theme_classic()
ggplot(df, aes(country)) + geom_bar(fill = "#0073C2FF") + theme_classic()
ggplot(df, aes(state)) + geom_bar(fill = "#E76F51") + theme_classic()
```
```{r}
densityplot(df$usd_goal_real)
densityplot(df$usd_pledged_real)
```
```{r}
ddd <- subset(df, currency %in% c("USD", "EUR", "GBP", "CAD"))
ggplot(ddd, aes(country)) + geom_bar(fill = "#0073C2FF") + theme_classic()
```
## Data Cleaning
```{r}
# Drop all rows that contain NA values
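# Quick illustrative check (added): how many rows will be dropped
cat("Incomplete rows:", sum(!complete.cases(df)), "\n")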
df <- na.omit(df)
```
```{r}
# Reduce the levels of the feature "currency" to the four most frequent ones.
# Then, replace each currency code with its exchange rate against USD as of August 30, 2020.
df <- subset(df, currency %in% c("USD", "EUR", "GBP", "CAD"))

curr.fun <- function(el){
  if (el == "USD"){target.el <- 1}
  else if (el == "GBP"){target.el <- 0.75}
  else if (el == "EUR"){target.el <- 0.84}
  else {target.el <- 1.31}   # CAD
  return(target.el)}
df$currency <- as.numeric( lapply(df$currency, curr.fun) )
```
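A quick check that the conversion left only the four expected exchange-rate values:

```{r}
table(df$currency)
```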
```{r}
# Some rows contain a malformed country value ("N,0\""); drop them directly
df <- subset(df, country != "N,0\"")

# Look up each country's coordinates from a small reference table
country.coords <- read.table("coords.txt", sep = ",", header = T)
country.lat <- function(el){
  return(country.coords[country.coords$Abbreviation == el, ][[2]]) }
country.lon <- function(el){
  return(country.coords[country.coords$Abbreviation == el, ][[3]]) }
df$latitude <- as.numeric(lapply(df$country, country.lat))
df$longitude <- as.numeric(lapply(df$country, country.lon))
```
```{r}
# Convert the following columns to obtain dates
df$deadline <- ymd(df$deadline)
df$launched <- ymd_hms(df$launched)
# Add columns as year, month, day of venture's launch and deadline.
df$year.launch <- year(df$launched)
df$month.launch <- month(df$launched)
df$day.launch <- mday(df$launched)
df$year.deadl <- year(df$deadline)
df$month.deadl <- month(df$deadline)
df$day.deadl <- mday(df$deadline)
# Create the column "duration" as the difference, in days, between "deadline" and "launched"
df$duration <- as.numeric( difftime(df$deadline, df$launched,
                                    units = c("days")) )
```
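A quick summary of the derived duration (in days):

```{r}
summary(df$duration)
```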
```{r}
# Reduce dimensionality of the column main_category
# Then, assign each group a number and convert it to a factor (left commented out below)
# 0 <- Dance, Theater, Art, Design
# 1 <- Publishing, Journalism, Comics
# 2 <- Film & Video, Photography
# 3 <- Music, Games, Technology
# 4 <- Food
# 5 <- Fashion
# 6 <- Crafts
cat.fun <- function(el){
  if(el == "Art"){ new.el <- 0 }
  else if(el == "Theater"){ new.el <- 0 }
  else if(el == "Dance"){ new.el <- 0 }
  else if(el == "Design"){ new.el <- 0 }
  else if(el == "Publishing"){ new.el <- 1 }
  else if(el == "Journalism"){ new.el <- 1 }
  else if(el == "Comics"){ new.el <- 1 }
  else if(el == "Film & Video"){ new.el <- 2 }
  else if(el == "Photography"){ new.el <- 2 }
  else if(el == "Music"){ new.el <- 3 }
  else if(el == "Games"){ new.el <- 3 }
  else if(el == "Technology"){ new.el <- 3 }
  else if(el == "Food"){ new.el <- 4 }
  else if(el == "Fashion"){ new.el <- 5 }
  else if(el == "Crafts"){ new.el <- 6 }
  return(new.el)}
#df$main_category <- as.factor(as.character(lapply(df$main_category,
# cat.fun)))
```
```{r}
# Keep only the two target classes of the column "state":
# 0: failed
# 1: successful
# (the 0/1 encoding itself is applied later, when fitting the models)
df <- df[(df$state == "failed") | (df$state == "successful"),]
```
```{r}
## Remove upper-tail outliers (values above mean + 2*sd, i.e. roughly the top ~2.5%
## under a normal approximation) for the following columns
df <- subset(df, df$duration < mean(df$duration) + 2*sd(df$duration))
df <- subset(df, df$backers < mean(df$backers) + 2*sd(df$backers))
df <- subset(df, df$usd.pledged < mean(df$usd.pledged) + 2*sd(df$usd.pledged))
df <- subset(df, df$usd_pledged_real < mean(df$usd_pledged_real) + 2*sd(df$usd_pledged_real))
df <- subset(df, df$usd_goal_real < mean(df$usd_goal_real) + 2*sd(df$usd_goal_real))
dim(df)
```
```{r}
#### TIME-SERIES ANALYSIS
library(DIMORA)
df.sorted.date <- df
# Zero-pad the month so that the year-month strings sort chronologically
df.sorted.date$year.month.launch <- sprintf("%d-%02d",
                                            df.sorted.date$year.launch,
                                            df.sorted.date$month.launch)
pivot.usd <- df.sorted.date %>% group_by(year.month.launch) %>%
  summarize(sum_values = sum(usd_goal_real))
pivot.usd <- pivot.usd[order(pivot.usd$year.month.launch),]
plot(pivot.usd$sum_values, type = "l")
bm <- BM(pivot.usd$sum_values, display = T)
summary(bm)
```
```{r}
symnum(cor(df[,-c(1, 2, 3, 4, 6, 8, 10, 12, 25)]))
```
```{r}
# Drop columns that are no longer needed: ID and name (identifiers),
# main_category and currency, the raw deadline/launched dates (replaced by the
# derived date columns), and goal and pledged (their USD conversions are kept
# instead). The last four columns are dropped due to collinearity.
df <- subset(df, select = -c(ID, name, main_category, currency,
                             deadline, launched, goal, pledged,
                             # DUE TO COLLINEARITY
                             usd.pledged, year.deadl,
                             month.deadl, day.deadl ) )
```
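Listing the remaining columns makes the later index-based selections easier to follow:

```{r}
names(df)
```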
## Data Preparation
* Scaling (min-max scaler)
* Sampling
* Train-test split
```{r}
# The candidate scaling technique is the min-max scaler, so that latitude and
# longitude keep their reciprocal meaning and, with the outliers removed above,
# the remaining features scale reliably. It is left commented out here.
#minmax.scal <- function(el){ return( (((el - min(el))) / (max(el) - min(el))) ) }
#for (i in colnames(df)[3:15]) { df[[i]] <- minmax.scal(df[[i]]) }
```
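An equivalent result could also be obtained with `caret`'s `preProcess()` and `method = "range"`; the sketch below is left unevaluated, since the analysis proceeds on the unscaled data:

```{r, eval = FALSE}
# Sketch (not run): min-max scaling of all numeric columns via caret::preProcess
num.cols <- sapply(df, is.numeric)
range.pp <- preProcess(df[, num.cols], method = "range")
df[, num.cols] <- predict(range.pp, df[, num.cols])
```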
```{r}
set.seed(42)
# mosaic::sample() draws 20,000 random rows and adds an "orig.id" column, dropped below
sample.df <- sample(df, 20000)
sample.df <- subset(sample.df, select = -c(orig.id))
```
```{r}
set.seed(101)
split.df <- sample(seq_len(nrow(sample.df)), size = 0.7*nrow(sample.df))
train.df <- sample.df[split.df, ]
test.df <- sample.df[- split.df, ]
dim(train.df)
dim(test.df)
```
```{r}
# Are the classes balanced?
# They are clearly not perfectly balanced; however, the gap is moderate enough that
# rebalancing may not be worthwhile, so the models are trained on this
# "moderately unbalanced" data.
ggplot(train.df, aes(state)) + geom_bar(fill = "#8196D2") + theme_classic()
```
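The imbalance can also be quantified directly:

```{r}
# Share of each class in the training set
prop.table(table(train.df$state))
```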
```{r}
# Assessment of the correlation between features
to.corr <- round(cor(sample.df[,c(3, 5, 6, 7, 8, 9, 10, 11, 12)]), 3)
melted_corr <- melt(to.corr)
ggplot(data = melted_corr, aes(x = Var1, y = Var2, fill = value)) + geom_tile()
```
## Classification Models
```{r include = FALSE}
library(doParallel)
registerDoParallel(cores = 8)
```
Note: the values quoted in the ex-post comments below may differ from the actual output, since no random seed is set before training the models.
#### Logistic Regression
```{r}
# Encode the target: 0 = failed, 1 = successful
y.train <- train.df$state
y.train <- ifelse(y.train == "failed", 0, 1)
# Column 2 of train.df is "state", excluded here because y.train carries the target
log.reg <- glm(y.train ~ ., train.df[, -2], family = "binomial")
summary(log.reg)
```
From the model summary, the most relevant features for the logistic regression are listed below (their odds ratios follow the list):

* `backers`
* `usd_pledged_real`
* `usd_goal_real`
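To make the reported coefficients easier to interpret, they can be exponentiated into odds ratios (an illustrative step; the names below assume these numeric columns enter the model unchanged):

```{r}
# Odds ratios for the three most relevant predictors of the logistic regression
exp(coef(log.reg)[c("backers", "usd_pledged_real", "usd_goal_real")])
```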
#### Boosted Logistic Regression
```{r}
library(caTools)
logreg_tune <- data.frame(nIter = seq(1,20, by = 1))
logreg_mod <- train(state ~ ., data = train.df, method = "LogitBoost",
                    tuneGrid = logreg_tune,
                    trControl = trainControl(method = "cv", number = 3,
                                             allowParallel = TRUE))
summary(logreg_mod)
```
```{r}
ggplot(logreg_mod) +
geom_line(color = "#8196D2") + geom_point(color = "#8196D2") + theme_classic()
```
```{r}
# Performance assessment
logreg.pred <- predict(logreg_mod, test.df, type = "raw")
confusionMatrix(data = logreg.pred, reference = as.factor(test.df$state))
```
This model suits this binary classification task almost perfectly, as the _sensitivity_ value shows; nonetheless, _specificity_ slightly underperforms sensitivity. McNemar's test p-value is also above the 5% threshold, meaning that the related null hypothesis should not be rejected (i.e. the difference in discordant pairs is attributable to chance).
#### Discriminant analysis
This is the first model in this analysis that I had never tried to deploy before, and I decided to start from Discriminant Analysis, one of the earliest classifiers, introduced by R. A. Fisher in 1936.
```{r}
library(MASS)
lda.mod <- train(state ~ ., data = train.df, method = "lda",
                 trControl = trainControl(method = "cv", number = 3,
                                          allowParallel = TRUE))
lda.mod
```
```{r}
# Performance assessment
lda.pred <- predict(lda.mod, test.df)
confusionMatrix(data = lda.pred, reference = as.factor(test.df$state))
```
The Accuracy and Kappa values shown by the `train` output are moderate, reflecting a more-than-fair performance for this baseline model (it is not tuned because the `caret` package does not expose any tuning parameter for this specific method).
Nonetheless, the model is very good at predicting *failures* (0s), while it struggles with *successes* (1s). This leads to a _balanced accuracy_, i.e. the mean of sensitivity and specificity, of roughly $(0.9072 + 0.6363)/2 \approx 0.77$. This result may be related to the slight imbalance that exists between the classes.
#### Flexible Discriminant Analysis
Then, I wanted to add something more flexible than LDA. FDA uses optimal scoring to transform the response variable so that the data are in a better form for linear separation. Its tuning parameters (see the note on the tuning grid below) are:

* `degree`: maximum degree of interaction _(default = 1)_
* `nprune`: maximum number of terms in the pruned model _(default = NULL)_
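One note on the tuning grid below: `data.frame(degree = seq(1, 10), nprune = seq(1, 10))` pairs the two sequences row-wise, so only ten (degree, nprune) combinations are evaluated. If the full 10 × 10 grid were intended, it could be built with `expand.grid` instead; a sketch, left unevaluated because it multiplies the training time:

```{r, eval = FALSE}
# Sketch (not run): every combination of degree and nprune, 100 rows in total
fda_full_grid <- expand.grid(degree = seq(1, 10, by = 1),
                             nprune = seq(1, 10, by = 1))
```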
```{r}
library(mda)
library(earth)
fda_tune <- data.frame(degree = seq(1, 10, by = 1), nprune = seq(1, 10, by = 1))
fda_mod <- train(state ~ ., data = train.df, method = "fda", tuneGrid = fda_tune,
                 trControl = trainControl(method = "cv", number = 3,
                                          allowParallel = TRUE))
fda_mod
```
```{r}
plot(fda_mod)
```
```{r}
# Performance assessment
fda_pred <- predict(fda_mod, test.df)
confusionMatrix(data = fda_pred, reference = as.factor(test.df$state))
```
The tuning of the flexible discriminant analysis led to a drastic increase in performance. Accuracy has surpassed the $0.95$ threshold, and the same holds for sensitivity and specificity. In this case, McNemar's test p-value is far above the 0.05 threshold, meaning that the related null hypothesis should not be rejected (i.e. the difference in discordant pairs is attributable to chance).
#### Naive Bayes
* `usekernel`: whether a kernel density estimate (instead of a Gaussian) is used for numeric variables; set to `FALSE` here _(default = FALSE)_
* `laplace`: additive Laplace smoothing parameter _(default = 0)_
* `adjust`: bandwidth adjustment for the kernel density estimate; only used when `usekernel` is `TRUE`
```{r}
library(caret)
library(naivebayes)
nb_tune <- data.frame(usekernel = FALSE, laplace = seq(0, 5, by = 1),
                      adjust = seq(0, 5, by = 1))
nb_mod <- train(state ~ ., data = train.df, method = "naive_bayes",
                tuneGrid = nb_tune,
                trControl = trainControl(method = "repeatedcv", number = 3,
                                         repeats = 3, allowParallel = TRUE))
nb_mod
```
```{r}
plot(nb_mod)
```
```{r}
# Performance assessment
nb_pred <- predict(nb_mod, test.df)
confusionMatrix(data = nb_pred, reference = as.factor(test.df$state))
```
The Naive Bayes shortcomings are emphasized by _specificity_, even though the other metrics do not reflect a good performance of the model either. So far, only the Linear Discriminant Analysis has performed worse in that respect, but its sensitivity was still higher. In a real-world case, the chosen model would certainly not be a Naive Bayes.
#### Random Forest Classifier
As a final model, I decided to add a more sophisticated algorithm, both to see how much this classification could still be improved and to get closer to the state of the art than with the previous models. The tuning parameters of the ordinal forest (see the alternative sketch after this list) are:

* `nsets`: number of score sets tried prior to the approximation of the optimal score set
* `ntreeperdiv`: number of trees in the smaller forests constructed for each of the `nsets` score sets tried
* `ntreefinal`: number of trees in the larger forest constructed using the optimized score set
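For comparison, a more conventional random forest for a binary target could be trained through `caret`'s `"ranger"` method; the grid below is only an illustrative guess, and the chunk is left unevaluated:

```{r, eval = FALSE}
# Sketch (not run): a standard random forest via ranger instead of ordinalForest
rf.alt.tune <- expand.grid(mtry = c(2, 4, 6),
                           splitrule = "gini",
                           min.node.size = c(1, 5))
rf.alt.mod <- train(state ~ ., data = train.df, method = "ranger",
                    tuneGrid = rf.alt.tune,
                    trControl = trainControl(method = "cv", number = 3,
                                             allowParallel = TRUE))
```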
```{r}
library(e1071)
library(ranger)
library(dplyr)
library(ordinalForest)
# nsets default: 1000
# ntreeperdiv default: 100
# ntreefinal default: 5000
rf.tune <- data.frame(nsets = c(100, 500, 1000, 1200),
                      ntreeperdiv = c(50, 100, 150, 200),
                      ntreefinal = c(100, 1000, 5000, 6000))
rf.mod <- train(state ~ ., data = train.df, method = "ordinalRF",
                tuneGrid = rf.tune,
                trControl = trainControl(method = "cv", number = 3,
                                         allowParallel = TRUE))
rf.mod
```
```{r}
# Performance assessment
rf.pred <- predict(rf.mod, test.df)
confusionMatrix(data = rf.pred, reference = as.factor(test.df$state))
```
The model tuning selected these parameters: `nsets` = 100, `ntreeperdiv` = 50 and `ntreefinal` = 100. So far, this looks like the best model in terms of _specificity_ (which has been quite a struggle throughout the process), while in terms of _sensitivity_ the Boosted Logistic Regression still shows, by a slight margin, the best performance. In addition, McNemar's test p-value is almost zero.
#### Comparison
The output below ranks the models by accuracy: the Boosted Logistic Regression and the Random Forest classifier perform best under this metric.
```{r}
df.params <- data.frame(model = c("LR", "LDA", "FDA", "NB", "RF"),
                        accuracy = c(0.9925, 0.8002, 0.955, 0.5823, 0.987),
                        sensitivity = c(0.9908, 0.9072, 0.9366, 0.3970, 0.9855),
                        specificity = c(0.9951, 0.6363, 0.9831, 0.8662, 0.99),
                        prevalence = c(0.6074, 0.6925, 0.5733, 0.6050, 0.6682),
                        balanced.acc = c(0.9929, 0.7717, 0.9599, 0.6316, 0.9877))
df.params <- df.params[order(-df.params$accuracy), ]
df.params[,1:2]
```
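As a quick consistency check, the balanced-accuracy column can be recomputed as the mean of sensitivity and specificity:

```{r}
round((df.params$sensitivity + df.params$specificity) / 2, 4)
```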
The following graph shows how the models behave according to balanced accuracy. Since balanced accuracy is the average of sensitivity and specificity, the difference between the three best models lies in how well they predict the negative class (specificity).
```{r}
ggplot(df.params, aes(x = model, y = balanced.acc, group = 1)) +
geom_line(color = "#8196D2", size = 1.5) +
geom_point(color = "#8196D2", size = 4) +
ylim(min(df.params$balanced.acc), 1) + theme_classic()
```
In this final graph we can compare four performance metrics: _sensitivity_ and _specificity_ are solid lines, and sensitivity is always higher than specificity, which shows that these models are much better at predicting failures (0s, the *positive class*) than successes (1s). This may be caused by the disproportion between the classes, and it consequently drives the gap between _accuracy_ (dotted line) and _balanced accuracy_ (dashed line).
```{r}
ggplot(df.params, aes(x = model, group = 1)) +
geom_line(aes(y = accuracy), color = "#FFA385", size = 1.5, linetype = "dotted") +
geom_line(aes(y = sensitivity), color= "#006D77", size = 1.5) +
geom_line(aes(y = specificity), color = "#83C5BE", size = 1.5) +
geom_line(aes(y = balanced.acc), color = "#D45F35", size = 1.5, linetype = "dashed") + theme_classic()
```