-
Notifications
You must be signed in to change notification settings - Fork 3
/
dataset.R
31 lines (27 loc) · 921 Bytes
/
dataset.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# Working data set for this script; starts out as R's built-in `iris` data frame.
dataset <- iris
# Mode of a categorical vector, used to impute missing categories.
#
# v: a vector (character, factor, ...) whose most frequent value is wanted.
#
# Returns a length-one vector of the same type as `v` holding the most
# frequent non-missing, non-empty value; on ties the value that appears
# first in `v` wins. Returns NA when `v` contains no usable value.
getmode <- function(v) {
  # Drop both NA and "" so that a missing-value marker can never be reported
  # as the mode. A bare `nchar(...) > 0` filter yields NA for NA entries,
  # which keeps them in the vector and lets them win the count.
  v <- v[!is.na(v) & nzchar(as.character(v))]
  uniqv <- unique(v)
  # match() maps every value to its position in uniqv; tabulate() counts
  # how often each position occurs.
  uniqv[which.max(tabulate(match(v, uniqv)))]
}
# Impute missing values in a data frame, one column at a time.
#
# Numeric columns: NA entries are replaced with the column mean (computed
#   with na.rm = TRUE).
# All other columns: NA and empty-string ("") entries are replaced with the
#   column mode as returned by getmode().
#
# NOTE(review): despite "median" in the name, the numeric fill value is the
# mean. It is kept as the mean here so existing results do not shift;
# confirm which statistic is actually intended.
#
# df: a data frame.
#
# Returns `df` with the same columns, with missing entries filled in.
medianModeImputation <- function(df) {
  for (col in colnames(df)) {
    values <- df[[col]]
    if (is.numeric(values)) {
      # Test the column itself. Selecting the numeric columns with
      # df[, mask] collapses to an unnamed vector when only one column is
      # numeric, which would send that column down the categorical branch.
      df[[col]] <- replace(values, is.na(values), mean(values, na.rm = TRUE))
    } else {
      # `values == ""` is NA (not TRUE) for NA entries, so NA has to be
      # tested explicitly or it would never be filled.
      is_missing <- is.na(values) | values == ""
      if (any(is_missing)) {
        df[[col]] <- replace(values, is_missing, getmode(values))
      }
    }
  }
  df
}
# Starts out as NULL; the visible code never assigns anything else to it.
# NOTE(review): presumably filled in later by whatever sources this script
# (e.g. a fitted model) — confirm against the caller.
fitCurrent <- NULL
# print(dataset)
# Overwrite the working data set with the result of medianModeImputation().
dataset <- medianModeImputation(dataset)
# Build a list of the column names of the global `dataset` in which every
# element is named after itself, e.g. list(Species = "Species").
# Takes no arguments.
namedListOfFeatures <- function() {
  features <- colnames(dataset)
  setNames(as.list(features), features)
}