---
title: "Robust regression with compositional covariates"
author: "Aditya Mishra"
date: "July 27, 2019"
output: pdf_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
## Load package
rm(list = ls())
library(robregcc)
library(magrittr)
```
# Simulation Model
#### Simulation settings
```{r }
## Define parameters for the simulated example
##
p <- 80               # number of predictors
n <- 300              # number of samples
O <- 0.15*n           # number of outliers, i.e., 15% of the observations
L <- 1                # indicator variable for outlier type,
                      # L = {0,1} => leveraged {no, yes}
# Generate outliers by shifting O observations by an amount equal to shFac
# times the true error variance sigma.
# shFac = {6,8} corresponds to {moderate, high} outliers
shFac <- 6
ngrp <- 4             # number of sub-compositions
snr <- 3              # signal-to-noise ratio
example_seed <- 2*p+1 # example seed
set.seed(example_seed)
```
#### Simulate model
```{r}
## Simulate the true model variables, i.e., y, X, C, beta,
## following the example from [Pixu Shi 2016].
# Simulate the sub-composition matrix
C1 <- matrix(0, ngrp, 23)
tind <- c(0, 10, 16, 20, 23)
for (ii in 1:ngrp)
  C1[ii, (tind[ii] + 1):tind[ii + 1]] <- 1
C <- matrix(0, ngrp, p)
C[, 1:ncol(C1)] <- C1
# Model parameter beta (sparse; sums to zero within each sub-composition)
beta <- c(1, -0.8, 0.4, 0, 0, -0.6, 0, 0, 0, 0, -1.5,
          0, 1.2, 0, 0, 0.3)
beta <- c(beta, rep(0, p - length(beta)))
# Simulate response and predictors, i.e., X, y
# AR(1)-type correlation structure, Sigma_ij = 0.5^|i - j|
Sigma <- 1:p %>% outer(., ., '-') %>% abs(); Sigma <- 0.5^Sigma
data.case <- vector("list", 1)
data.case <- robregcc_sim(n, beta, O = O, Sigma, levg = L, snr, shft = shFac, 0,
                          C, out = data.case)
```
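Before preprocessing, a quick sanity check on the simulated data can be useful. This is a minimal sketch that only inspects the `X` and `y` components of `data.case` used in the next step:
```{r}
# Dimensions of the simulated design and response; O of the n observations
# were contaminated by the mean shift above.
dim(data.case$X)    # expected: n x p
length(data.case$y) # expected: n
O                   # number of contaminated observations
```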
#### Data preprocessing
```{r}
X <- data.case$X # predictor matrix
y <- data.case$y # model response
#
# Predictor transformation due to compositional constraint:
# Equivalent to performing centered log-ratio transform
Xt <- svd(t(C))$u %>% tcrossprod() %>% subtract(diag(p), .) %>%
  crossprod(t(X), .)
Xm <- colMeans(Xt)
Xt <- scale(Xt,Xm,FALSE) # centering of predictors
#
mean.y <- mean(y)
y <- y - mean.y # centering of response
#
# Account for intercept in the model
Xt <- cbind(1,Xt) # accounting for intercept in predictor
C <- cbind(0,C) # accounting for intercept in constraint
bw <- c(0,rep(1,p)) # weight vector so that the intercept is not penalized
```
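The transformation above projects each row of `X` onto the null space of the constraint matrix, so the transformed predictors should satisfy the compositional constraint numerically. A small optional check (not part of the original workflow) makes this explicit:
```{r}
# The intercept column of Xt pairs with the zero column of C, and the
# remaining columns lie in the null space of the original constraint,
# so the product should be numerically zero.
max(abs(Xt %*% t(C)))
```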
# Robust regression with compositional covariates
### Initialization
```{r , echo=FALSE}
# Breakdown point for the Tukey bisquare loss function
b1 <- 0.5     # 50% breakdown point
cc1 <- 1.567  # corresponding tuning constant
# b1 <- 0.25; cc1 <- 2.937  # initialization for scale parameter
set.seed(example_seed)      # reset seed for reproducibility
# Control parameters for the initialization method
control <- robregcc_option(maxiter = 100, tol = 1e-4, lminfac = 1e-7)
# Initialization
fit.init <- cpsc_sp(Xt, y, alp = 0.4, cfac = 2, b1 = b1, cc1 = cc1,
                    C, bw, 1, control)
```
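The initialization supplies the weights and starting values for the penalized fits below. As a rough diagnostic, one can look at which observations carry the largest initial residuals; this sketch assumes `fit.init$residualR` holds the robust residual (mean-shift) estimates that are later passed to `gamma.wt`:
```{r}
# Indices of the observations with the largest robust residuals from the
# initialization; ideally these include the O contaminated samples.
head(order(abs(fit.init$residualR), decreasing = TRUE), O)
```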
### Model fitting
```{r}
# control parameters
control <- robregcc_option()
beta.wt <- fit.init$betaR  # weights for model parameter beta from the initial estimate
beta.wt[1] <- 0            # do not penalize the intercept
control$gamma = 2          # exponent gamma for constructing the weighted (adaptive) penalty
control$spb = 40/p         # maximum fraction of non-zero model parameters beta
control$outMiter = 1000    # maximum outer-loop iterations
control$inMiter = 3000     # maximum inner-loop iterations
control$nlam = 50          # number of tuning parameters lambda to explore
control$lmaxfac = 1        # parameter for constructing the sequence of lambda
control$lminfac = 1e-8     # parameter for constructing the sequence of lambda
control$tol = 1e-20        # convergence tolerance [inner loop]
control$out.tol = 1e-16    # convergence tolerance [outer loop]
control$kfold = 5          # number of cross-validation folds
# Robust regression using adaptive lasso penalty
fit.ada <- robregcc_sp(Xt, y, C, beta.init = fit.init$betaR,
                       gamma.init = fit.init$residualR,
                       beta.wt = abs(beta.wt),
                       gamma.wt = abs(fit.init$residualR),
                       control = control,
                       penalty.index = 1, alpha = 0.95)
# Robust regression using lasso penalty [Huber equivalent]
fit.soft <- robregcc_sp(Xt, y, C, beta.init = NULL, gamma.init = NULL,
                        beta.wt = bw, gamma.wt = NULL,
                        control = control, penalty.index = 2,
                        alpha = 0.95)
# Robust regression using hard thresholding penalty
control$lmaxfac = 1e2  # parameter for constructing the sequence of lambda
control$lminfac = 1e-3 # parameter for constructing the sequence of lambda
fit.hard <- robregcc_sp(Xt, y, C, beta.init = fit.init$betaf,
                        gamma.init = fit.init$residuals,
                        beta.wt = bw, gamma.wt = NULL,
                        control = control, penalty.index = 3,
                        alpha = 0.95)
```
### Extract model parameter estimates
```{r}
## Extract fitted model parameters
# coefficient estimate: [adaptive]
coef_cc(fit.ada, type = 0, s = 1)
# coefficient estimate: [lasso/Huber]
coef_cc(fit.soft, type = 0, s = 1)
# coefficient estimate: [Hard]
coef_cc(fit.hard, type = 0, s = 1)
# residual estimate: [adaptive]
residuals(fit.ada)
# residual estimate: [lasso/Huber]
residuals(fit.soft)
# residual estimate: [Hard]
residuals(fit.hard)
```
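To see how well each penalty recovers the sparsity pattern, the selected support can be compared with the true one. This is a sketch; it assumes the first column of `$beta0` is the selected coefficient vector (with the intercept in position 1), as used in the comparison plots below.
```{r}
# True non-zero positions, shifted by 1 to account for the intercept column
true_support <- which(beta != 0) + 1
true_support
# Estimated non-zero positions for each fit
which(abs(fit.ada$beta0[, 1])  > 1e-8)  # adaptive lasso
which(abs(fit.soft$beta0[, 1]) > 1e-8)  # lasso/Huber
which(abs(fit.hard$beta0[, 1]) > 1e-8)  # hard thresholding
```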
### Plot model output
```{r}
# Use par(mfrow) to arrange the three diagnostic plots side by side
# [adaptive]
par(mfrow=c(1,3))
plot_path(fit.ada)
plot_cv(fit.ada)
plot_resid(fit.ada)
# [soft]
par(mfrow=c(1,3))
plot_path(fit.soft)
plot_cv(fit.soft)
plot_resid(fit.soft)
#title(sub ='[Soft]: Solution path, Cross-validation error, residual')
# [Hard]
par(mfrow=c(1,3))
plot_path(fit.hard)
plot_cv(fit.hard)
plot_resid(fit.hard)
#title(sub ='[Hard]: Solution path, Cross-validation error, residual')
par(mfrow=c(1,1))
```
### Estimated parameter comparison
```{r}
library(reshape2)
library(ggplot2)
# [Adaptive]
tmp <- data.frame(c(0,beta),fit.ada$beta0[,1])
names(tmp) <- c('Simulated parameter','Estimated parameter')
tmp$Index <- 1:(p+1)
df <- melt(tmp,3)
names(df)[2] <- "Comparison"
ggplot(data=df, aes(x=Index, y=value, fill=Comparison)) +
  geom_bar(stat="identity", position=position_dodge()) +
  theme_bw() + theme(legend.position="bottom") + ggtitle('Adaptive lasso') +
  theme(plot.title = element_text(hjust = 0.5))
# [Lasso/Huber]
tmp <- data.frame(c(0,beta),fit.soft$beta0[,1])
names(tmp) <- c('Simulated parameter','Estimated parameter')
tmp$Index <- 1:(p+1)
df <- melt(tmp,3)
names(df)[2] <- "Comparison"
ggplot(data=df, aes(x=Index, y=value, fill=Comparison)) +
  geom_bar(stat="identity", position=position_dodge()) +
  theme_bw() + theme(legend.position="bottom") + ggtitle('Lasso') +
  theme(plot.title = element_text(hjust = 0.5))
# [Hard]
tmp <- data.frame(c(0,beta),fit.hard$beta0[,1])
names(tmp) <- c('Simulated parameter','Estimated parameter')
tmp$Index <- 1:(p+1)
df <- melt(tmp,3)
names(df)[2] <- "Comparison"
ggplot(data=df, aes(x=Index, y=value, fill=Comparison)) +
  geom_bar(stat="identity", position=position_dodge()) +
  theme_bw() + theme(legend.position="bottom") + ggtitle('Hard penalty') +
  theme(plot.title = element_text(hjust = 0.5))
```
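A compact numerical summary of the same comparison (a sketch using the same `$beta0[,1]` estimates as the bar plots above): the l2 estimation error of each fit relative to the simulated coefficient vector.
```{r}
# l2 distance between the simulated coefficients (with zero intercept)
# and each estimate shown in the bar plots above.
beta_true <- c(0, beta)
c(adaptive = sqrt(sum((beta_true - fit.ada$beta0[, 1])^2)),
  lasso    = sqrt(sum((beta_true - fit.soft$beta0[, 1])^2)),
  hard     = sqrt(sum((beta_true - fit.hard$beta0[, 1])^2)))
```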