# load_uplift19.R
# Forked from johaupt/supervised_randomization
library(data.table)
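# Load and preprocess the coupon-campaign data for uplift modeling (requires
# data.table; caret is used via its namespace for dummyVars). Keeps only the
# 15-currency-unit coupon campaigns, builds a treatment indicator from the
# control-group flag, drops index/duplicate/rare/collinear variables, one-hot
# encodes the remaining factors, and returns a list with:
#   X     - data.table of covariates
#   Y     - conversion indicator ("converted")
#   value - basket value ("checkoutAmount")
#   W     - treatment indicator ("treatmentGroup")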
load_uplift19 <- function(path){
data <- fread(path, check.names = TRUE)
# Keep only campaigns that assign an absolute coupon of 15 currency units, for consistency
data <- data[campaignValue==15 & campaignUnit=="CURRENCY", ]
# Keep only rows with targetViewCount <= 60
data <- data[targetViewCount<=60, ]
# Convert the control-group flag into a treatment indicator (1 = treated, 0 = control)
data[, treatmentGroup := 1 - controlGroup]
data[, controlGroup := NULL]
# Ignore unrelated discounts by adding them back to the basket amount
# Simultaneously add back the coupon cost of 15 where the coupon was applied
data[, checkoutAmount := checkoutAmount+checkoutDiscount]
data[, checkoutDiscount := NULL]
# Drop variables
dropVar <- c(
# Index and campaign descriptors
"epochSecond","campaignId", "campaignUnit", "campaignValue", "campaignTags",
# Transformed outcome variable
"label",
# Cancellation during conversion process?
"dropOff",
# Did the user confirm or abort on the banner (i.e., the treatment was seen)?
"confirmed", "aborted",
# Duplicate variables
"HoursSinceOldestSession",
# Baseline dummy for time of day
"IsEarlyMorning"
)
data[, (dropVar) := NULL]
# Zip Code Simplification
data[,ZipCode := as.character(ZipCode)]
# All German Zip Codes have 5 digits
data[nchar(ZipCode) != 5, ZipCode := "Foreign"]
# The first digit indicates the most general zone
data[ZipCode != "Foreign", ZipCode := substring(ZipCode,1,1)]
# Factor variables
data[, ZipCode := factor(ZipCode)]
data[, DeviceCategory := factor(DeviceCategory)]
# Drop binary variables whose share of positive values is below 0.01
dropVar <- c("HasConfirmedBefore", "DidConfirmLastWeek", "DidConfirmLastYear", "DidConfirmLastMonth",
             "InitPageWas.sale.", "ChannelIs.EMAIL.", "ChannelIs.PAID.", "ChannelIs.SOCIAL.",
             "ScreenTypeIs.home.", "ScreenTypeIs.sale.", "ScreenTypeIs.account.", "ScreenTypeIs.about.")
data[, (dropVar) := NULL]
# Drop constant variables
data <- data[, sapply(data, function(x) length(unique(x))) != 1, with=FALSE]
# Drop variables with a pairwise Pearson correlation > 0.9 (multicollinearity).
# The hard-coded list below was derived once with caret::findCorrelation, e.g.:
# metavars_left <- c('converted', 'checkoutAmount', 'treatmentGroup')
# num <- setdiff(names(data), metavars_left) # disregard meta-variables
# cor_num_P <- cor(data[, ..num], use="pairwise.complete.obs", method="pearson")
# #corrplot::corrplot(corr = cor_num_P, method="pie", tl.cex=0.3, diag=FALSE, type="lower")
# corr_var <- caret::findCorrelation(cor_num_P, cutoff = .90, verbose = TRUE, names = TRUE, exact = TRUE)
# data <- data[, !(names(data) %in% corr_var), with=FALSE]
dropVarColl <- c("DidConvertLastYear", "DidVisitLastYear", "ViewedBefore.overview.",
"log.of.SecondsSinceFirst.home.", "log.of.SecondsSinceFirst.account.",
"log.of.SecondsSinceFirst.about.", "log.of.SecondsSinceFirst.search.",
"log.of.SecondsSinceFirst.overview.", "log.of.SecondsSinceFirst.product.",
"log.of.SecondsSinceFirst.sale.", "log.of.ViewsOn.overview.",
"log.of.NumberOfDifferent.sale.", "log.of.NumberOfDifferent.search.",
"log.of.NumberOfDifferent.about.", "ClientKnown", "log.of.SecondsSinceOn.account.",
"log.of.SecondsSinceOn.about.", "log.of.SecondsSinceOn.sale.",
"log.of.SecondsSpentOn.account.", "log.of.SecondsSpentOn.about.",
"log.of.SecondsSpentOn.overview.", "log.of.SecondsSinceOn.home.",
"log.of.SecondsSpentOn.sale.", "ScreenTypeIs.product.", "ViewedBefore.account.",
"ViewedBefore.about.", "log.of.ViewsOn.product.", "log.of.ViewsOn.sale.",
"ViewedBefore.home.", "log.of.ViewsOn.home.", "log.of.ViewsOn.account."
)
data[, (dropVarColl) := NULL]
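# One-hot encode the remaining factor variables (fullRank drops one reference level per factor)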
data <- data.table(predict(caret::dummyVars(~., data, sep="_", fullRank=TRUE), data), keep.rownames = FALSE)
# Missing Values
#apply(data, 2, function(x) any(is.na(x))) # no missing values
# replace negative values with zero since these variables should be non-negative
#data <- as.data.frame(lapply(data, function(x){replace(x, x < 0, 0)})) # no negative values
# # Multivariate outliers
# mod <- lm(converted ~ ., data=data)
# cooksd <- cooks.distance(mod)
# sample_size <- nrow(data)
# plot(cooksd, pch="*", cex=2, main="Outliers as determined by Cook's distance")
# abline(h = 4/sample_size, col="blue")
# text(x=seq_along(cooksd)+1, y=cooksd, labels=ifelse(cooksd>4/sample_size, names(cooksd),""), col="blue")
# outl <- as.numeric(names(cooksd)[(cooksd > (4/sample_size))])
# data <- data[-outl, ] # removed 5000 observations flagged as outliers
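# Split into covariates (X), conversion outcome (Y), basket value, and treatment indicator (W)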
TARGET <- "converted"
PROFIT <- "checkoutAmount"
W <- "treatmentGroup"
return(list("X"=data[,!c(TARGET, PROFIT, W), with=FALSE], "Y"=data[[TARGET]], "value"=data[[PROFIT]], "W"=data[[W]]))
}
#### TESTS ####
#data <- load_uplift19("../data/explore.csv")
#str(data,1)
#propensity_model <- glm(w ~ ., data = cbind(data$X, "w" = data$W), family = "binomial")
#ModelMetrics::auc(actual = data$W, predicted = predict(propensity_model, type = 'response'))
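#### USAGE SKETCH ####
# A minimal sketch (not part of the original tests), assuming the same local file
# "../data/explore.csv" used above. An AUC near 0.5 in the check above indicates
# that treatment assignment is not predictable from the covariates, as expected
# under randomization; the differences in means below are naive uplift estimates.
#data <- load_uplift19("../data/explore.csv")
#ate_conversion <- mean(data$Y[data$W == 1]) - mean(data$Y[data$W == 0])
#ate_value <- mean(data$value[data$W == 1]) - mean(data$value[data$W == 0])
#ate_conversion; ate_value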