-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfactor2binomial.R
136 lines (115 loc) · 4.75 KB
/
factor2binomial.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
factor2bin <- function(data,
include,
exclude,
keepFactor = c(FALSE, TRUE),
integrate = c(FALSE, TRUE),
keepVarName = c(FALSE, TRUE),
maxLevels) {
# Last updated - 03.07.17
# Was worried old version would screw with classes by creating new matrix (it doesn't, all variables are new binomials anyway, duh).
# Tidied up indentation.
# SUMMARY
# Function that converts the factor variables in a data frame or matrix into binomial variables with one column per factor level. If a single variable is supplied as character vector, a matrix is returned.
# ARGUMENTS
# data <- a dataframe or matrix containing at at least one variable of class 'factor', or a single factor variable as a character vector.
# include <- Vector of regular expressions, specifying which variables to include. If NULL, all variables (subject to other ARGS) will be evaluated.
# exclude <- Vector of variable names not to be evaluated. Overrides 'include'.
# keepFactor <- If TRUE, original factor variables will be returned as part of the output. Default is FALSE.
# integrate <- If TRUE, output variables will be integrated with the input frame, which will be returned as the output. If FALSE (default), a new matrix containing only the new variables (and the original converted variables if "keepFactor = TRUE") is returned.
# keepVarName <- If TRUE, new variables will be named "<variable name>_<factor_level>". If FALSE (default), they are named <factor level>.
# maxLevels <- A single integer providing an optional cap on the number of number of levels in a variable, above which the variable will be ignored.
# DEFAULTS/REQUIRED
library(data.table)
stopifnot(!missing(data))
if (missing(keepFactor))
keepFactor <- FALSE
if (missing(integrate))
integrate <- FALSE
if (missing(keepVarName))
keepVarName <- FALSE
if (missing(maxLevels))
maxLevels <- max(as.numeric(lapply(data, nlevels)))
if ((!is.numeric(maxLevels)) || (!length(maxLevels) == 1))
stop("maxLevels must be a single integer")
if (missing(exclude)) {
exclude <- NULL
} else {
if (!is.character(exclude))
stop("'exclude' must be a character vector.")
}
# FUNCTION
# If character vector, convert to matrix (and factor variable if not).
if ((!is.data.frame(data)) && (!is.matrix(data))) {
if (!is.factor(data)) {
data <- as.factor(data)
warning("Variable was coerced to 'factor'.")
}
data <- as.matrix(data)
}
# Require at least one factor variable [Include ignore statement?]
if (!any(as.logical(lapply(data, is.factor))))
stop ("'data' must contain at least one variable of class 'factor'.")
# Required objects
added <- 0
new <- NULL
out <- data
# Should variable be processed?
for (v in 1:dim(data)[2]) {
# Is variable a factor?
if (is.factor(data[, v])) {
# If 'include' is missing, process all factor variables.
if (missing(include)) {
pass <- TRUE
} else {
# Is factor variable specified for processsing by include
pattern <- rep(FALSE, length(include))
for (e in 1:length(include)) {
if (length(grep(include[e], colnames(data)[v]))) {
pattern[e] <- TRUE
}
}
ifelse(any(pattern), pass <- TRUE, pass <- FALSE)
}
# Is the number of levels higher than max threshold?
if (nlevels(data[, v]) > maxLevels)
pass <- FALSE
# Is the variable specified for exclusion
for (e in 1:length(exclude)) {
if (length(grep(exclude[e], colnames(data)[v])))
pass <- FALSE
}
} else {
pass <- FALSE
}
# Create new variables
if (pass) {
var <- data[, v]
new <- data.frame(matrix(0, ncol = nlevels(droplevels(var)), nrow = dim(data)[1]))
for (i in 1:length(var)){
l <- match(var[i], levels(droplevels(var)))
new[i, l] <- 1
}
# Name new variables
v.levels <- levels(droplevels(var))
if (any(v.levels == ""))
v.levels[v.levels == ""] <- "blank"
if (keepVarName) {
names(new) <- transpose(lapply(v.levels, function(v.levels) paste0(names(data)[v], "_", v.levels)))[[1]]
} else {
names(new) <- v.levels
}
# Keep factor?
if (keepFactor)
new <- data.frame(var, new)
# Reintegrate with original data
if (integrate) {
out <- data.frame(out[, 1:(v+added-1)], new, out[, (v+added+1):(dim(out)[2])])
added <- added + dim(new)[2] - 1
} else {
# New variables
out <- data.frame(out, new)
}
}
}
return(out)
}