---
title: "Robust regression with compositional covariates"
author: "Aditya Mishra"
date: "July 27, 2019"
output: pdf_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
## Load package
rm(list = ls())
library(robregcc)
library(magrittr)
```
# Simulation Model
#### Simulation settings
```{r }
## Define parameters for the simulated example
##
p <- 80               # number of predictors
n <- 300              # number of samples
O <- 0.15*n           # number of outliers, i.e., 15% of the observations
L <- 1                # indicator variable for outlier type,
                      # L = {0,1} => leveraged {no, yes}
# Generate outliers by shifting O observations by an amount equal to shFac
# times the true error variance sigma.
# shFac = {6,8} corresponds to {moderate, high} outliers
shFac <- 6
ngrp <- 4             # number of sub-compositions
snr <- 3              # signal-to-noise ratio
example_seed <- 2*p+1 # example seed
set.seed(example_seed)
```
#### Simulate model
```{r}
## Simulate the true model variables, i.e., y, X, C, beta,
## following the example from [Pixu Shi 2016].
# Simulate the sub-composition matrix
C1 <- matrix(0, ngrp, 23)
tind <- c(0, 10, 16, 20, 23)
for (ii in 1:ngrp)
  C1[ii, (tind[ii] + 1):tind[ii + 1]] <- 1
C <- matrix(0, ngrp, p)
C[, 1:ncol(C1)] <- C1
# Model parameter beta (sparse; sums to zero within each sub-composition)
beta <- c(1, -0.8, 0.4, 0, 0, -0.6, 0, 0, 0, 0, -1.5,
          0, 1.2, 0, 0, 0.3)
beta <- c(beta, rep(0, p - length(beta)))
# Simulate response and predictors, i.e., X, y
# AR(1)-type correlation structure, Sigma_ij = 0.5^|i - j|
Sigma <- 1:p %>% outer(., ., '-') %>% abs(); Sigma <- 0.5^Sigma
data.case <- vector("list", 1)
data.case <- robregcc_sim(n, beta, O = O, Sigma, levg = L, snr, shft = shFac, 0,
                          C, out = data.case)
```
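Before preprocessing, a quick sanity check on the simulated data can be useful. This is a minimal sketch that only inspects the `X` and `y` components of `data.case` used in the next step:
```{r}
# Dimensions of the simulated design and response; O of the n observations
# were contaminated by the mean shift above.
dim(data.case$X)    # expected: n x p
length(data.case$y) # expected: n
O                   # number of contaminated observations
```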
#### Data preprocessing
```{r}
X <- data.case$X # predictor matrix
y <- data.case$y # model response
#
# Predictor transformation due to compositional constraint:
# Equivalent to performing centered log-ratio transform
Xt <- svd(t(C))$u %>% tcrossprod() %>% subtract(diag(p), .) %>%
  crossprod(t(X), .)
Xm <- colMeans(Xt)
Xt <- scale(Xt,Xm,FALSE) # centering of predictors
#
mean.y <- mean(y)
y <- y - mean.y # centering of response
#
# Account for intercept in the model
Xt <- cbind(1,Xt) # accounting for intercept in predictor
C <- cbind(0,C) # accounting for intercept in constraint
bw <- c(0,rep(1,p)) # weight vector so that the intercept is not penalized
```
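The transformation above projects each row of `X` onto the null space of the constraint matrix, so the transformed predictors should satisfy the compositional constraint numerically. A small optional check (not part of the original workflow) makes this explicit:
```{r}
# The intercept column of Xt pairs with the zero column of C, and the
# remaining columns lie in the null space of the original constraint,
# so the product should be numerically zero.
max(abs(Xt %*% t(C)))
```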
# Robust regression with compositional covariates
### Initialization
```{r , echo=FALSE}
# Breakdown point for the Tukey bisquare loss function
b1 <- 0.5     # 50% breakdown point
cc1 <- 1.567  # corresponding tuning constant
# b1 <- 0.25; cc1 <- 2.937  # initialization for scale parameter
set.seed(example_seed)      # reset seed for reproducibility
# Control parameters for the initialization method
control <- robregcc_option(maxiter = 100, tol = 1e-4, lminfac = 1e-7)
# Initialization
fit.init <- cpsc_sp(Xt, y, alp = 0.4, cfac = 2, b1 = b1, cc1 = cc1,
                    C, bw, 1, control)
```
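The initialization supplies the weights and starting values for the penalized fits below. As a rough diagnostic, one can look at which observations carry the largest initial residuals; this sketch assumes `fit.init$residualR` holds the robust residual (mean-shift) estimates that are later passed to `gamma.wt`:
```{r}
# Indices of the observations with the largest robust residuals from the
# initialization; ideally these include the O contaminated samples.
head(order(abs(fit.init$residualR), decreasing = TRUE), O)
```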
### Model fitting
```{r}
# control parameters
control <- robregcc_option()
beta.wt <- fit.init$betaR  # weights for model parameter beta from the initial estimate
beta.wt[1] <- 0            # do not penalize the intercept
control$gamma = 2          # exponent gamma for constructing the weighted (adaptive) penalty
control$spb = 40/p         # maximum fraction of non-zero model parameters beta
control$outMiter = 1000    # maximum outer-loop iterations
control$inMiter = 3000     # maximum inner-loop iterations
control$nlam = 50          # number of tuning parameters lambda to explore
control$lmaxfac = 1        # parameter for constructing the sequence of lambda
control$lminfac = 1e-8     # parameter for constructing the sequence of lambda
control$tol = 1e-20        # convergence tolerance [inner loop]
control$out.tol = 1e-16    # convergence tolerance [outer loop]
control$kfold = 5          # number of cross-validation folds
# Robust regression using adaptive lasso penalty
fit.ada <- robregcc_sp(Xt, y, C, beta.init = fit.init$betaR,
                       gamma.init = fit.init$residualR,
                       beta.wt = abs(beta.wt),
                       gamma.wt = abs(fit.init$residualR),
                       control = control,
                       penalty.index = 1, alpha = 0.95)
# Robust regression using lasso penalty [Huber equivalent]
fit.soft <- robregcc_sp(Xt, y, C, beta.init = NULL, gamma.init = NULL,
                        beta.wt = bw, gamma.wt = NULL,
                        control = control, penalty.index = 2,
                        alpha = 0.95)
# Robust regression using hard thresholding penalty
control$lmaxfac = 1e2  # parameter for constructing the sequence of lambda
control$lminfac = 1e-3 # parameter for constructing the sequence of lambda
fit.hard <- robregcc_sp(Xt, y, C, beta.init = fit.init$betaf,
                        gamma.init = fit.init$residuals,
                        beta.wt = bw, gamma.wt = NULL,
                        control = control, penalty.index = 3,
                        alpha = 0.95)
```
### Extract model parameter estimates
```{r}
## Extract fitted model parameters
# coefficient estimate: [adaptive]
coef_cc(fit.ada, type = 0, s = 1)
# coefficient estimate: [lasso/Huber]
coef_cc(fit.soft, type = 0, s = 1)
# coefficient estimate: [Hard]
coef_cc(fit.hard, type = 0, s = 1)
# residual estimate: [adaptive]
residuals(fit.ada)
# residual estimate: [lasso/Huber]
residuals(fit.soft)
# residual estimate: [Hard]
residuals(fit.hard)
```
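To see how well each penalty recovers the sparsity pattern, the selected support can be compared with the true one. This is a sketch; it assumes the first column of `$beta0` is the selected coefficient vector (with the intercept in position 1), as used in the comparison plots below.
```{r}
# True non-zero positions, shifted by 1 to account for the intercept column
true_support <- which(beta != 0) + 1
true_support
# Estimated non-zero positions for each fit
which(abs(fit.ada$beta0[, 1])  > 1e-8)  # adaptive lasso
which(abs(fit.soft$beta0[, 1]) > 1e-8)  # lasso/Huber
which(abs(fit.hard$beta0[, 1]) > 1e-8)  # hard thresholding
```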
### Plot model output
```{r}
# Use par(mfrow) to arrange the three diagnostic plots side by side
# [adaptive]
par(mfrow=c(1,3))
plot_path(fit.ada)
plot_cv(fit.ada)
plot_resid(fit.ada)
# [soft]
par(mfrow=c(1,3))
plot_path(fit.soft)
plot_cv(fit.soft)
plot_resid(fit.soft)
#title(sub ='[Soft]: Solution path, Cross-validation error, residual')
# [Hard]
par(mfrow=c(1,3))
plot_path(fit.hard)
plot_cv(fit.hard)
plot_resid(fit.hard)
#title(sub ='[Hard]: Solution path, Cross-validation error, residual')
par(mfrow=c(1,1))
```
### Estimated parameter comparison
```{r}
library(reshape2)
library(ggplot2)
# [Adaptive]
tmp <- data.frame(c(0,beta),fit.ada$beta0[,1])
names(tmp) <- c('Simulated parameter','Estimated parameter')
tmp$Index <- 1:(p+1)
df <- melt(tmp,3)
names(df)[2] <- "Comparison"
ggplot(data=df, aes(x=Index, y=value, fill=Comparison)) +
  geom_bar(stat="identity", position=position_dodge()) +
  theme_bw() + theme(legend.position="bottom") + ggtitle('Adaptive lasso') +
  theme(plot.title = element_text(hjust = 0.5))
# [Lasso/Huber]
tmp <- data.frame(c(0,beta),fit.soft$beta0[,1])
names(tmp) <- c('Simulated parameter','Estimated parameter')
tmp$Index <- 1:(p+1)
df <- melt(tmp,3)
names(df)[2] <- "Comparison"
ggplot(data=df, aes(x=Index, y=value, fill=Comparison)) +
  geom_bar(stat="identity", position=position_dodge()) +
  theme_bw() + theme(legend.position="bottom") + ggtitle('Lasso') +
  theme(plot.title = element_text(hjust = 0.5))
# [Hard]
tmp <- data.frame(c(0,beta),fit.hard$beta0[,1])
names(tmp) <- c('Simulated parameter','Estimated parameter')
tmp$Index <- 1:(p+1)
df <- melt(tmp,3)
names(df)[2] <- "Comparison"
ggplot(data=df, aes(x=Index, y=value, fill=Comparison)) +
  geom_bar(stat="identity", position=position_dodge()) +
  theme_bw() + theme(legend.position="bottom") + ggtitle('Hard penalty') +
  theme(plot.title = element_text(hjust = 0.5))
```
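A compact numerical summary of the same comparison (a sketch using the same `$beta0[,1]` estimates as the bar plots above): the l2 estimation error of each fit relative to the simulated coefficient vector.
```{r}
# l2 distance between the simulated coefficients (with zero intercept)
# and each estimate shown in the bar plots above.
beta_true <- c(0, beta)
c(adaptive = sqrt(sum((beta_true - fit.ada$beta0[, 1])^2)),
  lasso    = sqrt(sum((beta_true - fit.soft$beta0[, 1])^2)),
  hard     = sqrt(sum((beta_true - fit.hard$beta0[, 1])^2)))
```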