-
Notifications
You must be signed in to change notification settings - Fork 6
/
PIP_v2.R
185 lines (145 loc) · 7.32 KB
/
PIP_v2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
#!/usr/bin/env Rscript
# read command line
args <- commandArgs(trailingOnly=TRUE)
if (length(args) != 2) {
stop("USAGE: Rscript patient_slopes.R inputPhenotypeFile CovariateList
where args[1] = inputPhenotypeFile
args[2] = CovariateList")
}
# load required packages
if (!require(tidyverse)) install.packages('tidyverse')
if (!require(data.table)) install.packages('data.table')
if (!require(dplyr)) install.packages('dplyr')
if (!require(ggplot2)) install.packages('ggplot2')
if (!require(lme4)) install.packages('lme4')
if (!require(RNOmni)) install.packages('RNOmni')
if (!require(lmerTest)) install.packages('lmerTest')
if (!require(hydroGOF)) install.packages('hydroGOF')
if (!require(rlist)) install.packages('rlist')
if (!require(knitr)) install.packages('knitr')
# Create output directory where result with be written to
system("mkdir output")
# Read in input files
tbl <- fread(args[1],header=T)
covs <- fread(args[2],header=F, stringsAsFactor = F)
# Create user defined variable names from input phenotype data
user_input_ID <- colnames(tbl)[1]
user_input_Time <- colnames(tbl)[2]
user_input_PHENO <- colnames(tbl)[3]
# The downstream function when building the model to get the slope require the quantitative covariates to be scaled
### Automatically scale continuous variables
## function to check if binary or continuous
check_var <- function(v){
v2 <- na.omit(v)
# get unique values
v_unique <- unique(v2)
# check if length is longer than 2
if(length(v_unique) > 2){
return("con")
}else{
return("bin")
}
}
binary <- list()
cont <- list()
for(i in 1:length(covs$V1)){
if(check_var(tbl[,covs[i,]]) == "bin"){
binary[[i]] <- colnames(tbl[covs[i,]])
}else if(check_var(tbl[,covs[i,]]) == "con"){
cont[[i]] <- colnames(tbl[covs[i,]])
}
}
binary_df <- as.data.frame(do.call("rbind", binary))
binary_df$V1 <- as.character(binary_df$V1)
cont_df <- as.data.frame(do.call("rbind", cont))
cont_df$V1 <- as.character(cont_df$V1)
cont_df[2,]
tbl_scaled <- tbl
for (i in 1:length(cont_df$V1)) {
con_name <- cont_df$V1[i]
con <- paste(cont_df$V1[i],"scaled",sep="_")
tbl_scaled[[con]] <- scale(tbl_scaled[[con_name]], center = F)
}
for (i in 1:length(binary_df$V1)) {
bin_name <- binary_df$V1[i]
bin <- paste(binary_df$V1[i],"scaled",sep="_")
tbl_scaled[[bin]] <- tbl_scaled[[bin_name]]
}
# Modelling with lmer to get patient-specific slopes with user-defined covs or using unbiased step-based selection
### User defined covariates: Automatically put in correct phenotype of choice and covariates
tbl_scaled[[paste(colnames(tbl_scaled)[2],"_scaledFINAL",sep="")]] <- scale(tbl_scaled[[colnames(tbl_scaled)[2]]], center = F)
tbl_scaled[[paste(colnames(tbl_scaled)[3],"_scaledFINAL",sep="")]] <- scale(tbl_scaled[[colnames(tbl_scaled)[3]]], center = F)
x <- 1:length(covs$V1)
covariates <- paste0(covs$V1[x], "_scaled",collapse = " + ")
pheno_slope <- lmer(as.formula(paste(paste(user_input_PHENO,"_scaledFINAL ~ ",sep=""), paste(covariates, sep=""), "+", paste("(",user_input_Time,"_scaledFINAL|",user_input_ID, ")",sep=""))), tbl_scaled, REML = T)
summary(pheno_slope)
## get patient-specific slopes
raneffect_pheno <- as.data.frame(ranef(pheno_slope)[[as.data.frame(ranef(pheno_slope))[1,1]]])
raneffect_pheno$`(Intercept)` <- NULL
raneffect_pheno[[as.data.frame(ranef(pheno_slope))[1,1]]] <- rownames(raneffect_pheno)
# normalization of the time data
raneffect_pheno[[paste(user_input_Time,"_scaledFINAL_NORMALIZED",sep="")]] = rankNorm(raneffect_pheno[[paste(user_input_Time,"_scaledFINAL",sep="")]])
raneffect_pheno <- raneffect_pheno[,c(2,1,3)]
# Visualise patient-specific slopes with density plot
slope_density_plot <- ggplot(raneffect_pheno, aes(x=raneffect_pheno[[colnames(raneffect_pheno)[3]]])) +
geom_density(color="darkblue", fill="lightblue", alpha=0.4) +
theme_bw() +
ggtitle(paste("Phenotype: ",user_input_PHENO,"\nDensity of Patient Slopes",sep="")) +
theme(plot.title = element_text(hjust = 0.5)) +
labs(x = paste("Slopes of Patients\nCovariates Used:\n ",paste0(covs$V1[x], "_scaled",collapse = "\n"),sep=""),
y = "Density")
## Save the density plot
ggsave(paste("output/",user_input_PHENO,"_Density_PatientSlopes.png",sep=""), slope_density_plot, width = 3, height = 4.5, units = "in")
# output patient slopes
## Write out table of patient specific slopes
write.table(raneffect_pheno, paste("output/",user_input_PHENO,"_patient_slopes.txt",sep=""), sep = "\t", append = F, row.names = F, quote = F, col.names = T)
### Unbiased covariate selection: Stepwise selection of covariates for best model
# Backwards elimination of random and fixed effects of the model
# Backward elimination using terms with default alpha-levels:
# Keep random effects
# Note: User should not put in more than one random effect
(step_res <- step(pheno_slope, reduce.random = FALSE))
back_elim_pheno_model <- get_model(step_res)
anova(back_elim_pheno_model)
# Print out summary stats
summary(back_elim_pheno_model)
# get parameters from selection for 70/30 split below
coefs <- as.data.frame(summary(back_elim_pheno_model)$coefficients)
coefs <- coefs[2:nrow(coefs),]
covariates_back <- rownames(coefs)
covariates_back <- list.append(covariates_back, paste("(",user_input_Time,"_scaledFINAL|",user_input_ID, ")",sep=""))
# get covariance matrices
full_cov <- as.matrix(summary(pheno_slope)$vcov)
full_cov <- as.data.frame(full_cov)
back_cov <- as.matrix(summary(back_elim_pheno_model)$vcov)
back_cov <- as.data.frame(back_cov)
##### test and training set RMSE
### RMSE for full model provided by user and for stepwise model
# get unique patient IDs
ID_list <- unique(tbl[[colnames(tbl)[1]]])
train_list <- sample(ID_list, round(0.7 * (length(ID_list))), replace = F)
test_list <- setdiff(ID_list, train_list)
# make datasets
train_set <- tbl_scaled[tbl_scaled[[colnames(tbl)[1]]] %in% train_list,]
test_set <- tbl_scaled[tbl_scaled[[colnames(tbl)[1]]] %in% test_list,]
# full model provided by user
model_test_full <- lmer(as.formula(paste(paste(user_input_PHENO,"_scaledFINAL ~ ",sep=""), paste(covariates, sep=""), "+", paste("(",user_input_Time,"_scaledFINAL|",user_input_ID, ")",sep=""))), tbl_scaled, REML = T)
predicted_full <- predict(model_test_full, test_set, allow.new.levels=TRUE)
RMSE <- rmse(predicted_full, test_set[[paste(user_input_PHENO,"_scaledFINAL",sep="")]])
# backward selection model
model_test_back <- lmer(as.formula(paste(paste(user_input_PHENO,"_scaledFINAL ~ ",sep=""), paste(covariates_back, collapse="+"))), train_set, REML = T)
predicted_back <- predict(model_test_back, test_set, allow.new.levels=TRUE)
RMSE_back <- rmse(predicted_back, test_set[[paste(user_input_PHENO,"_scaledFINAL",sep="")]])
### make error table
testing_error <- data.frame("RMSE_full_model" = numeric(1), "RMSE_selected_model" = numeric(1))
testing_error$RMSE_full_model <- RMSE
testing_error$RMSE_selected_model <- RMSE_back
### to do - make error message output if model fails to converge
## make 2 separate outputs - correlaton matrix for full and back model
full_cov_kable <- kable(full_cov)
write(full_cov_kable, paste("output/",user_input_PHENO,"_full_model_cov.txt",sep=""))
back_cov_kable <- kable(back_cov)
write(back_cov_kable, paste("output/",user_input_PHENO,"_back_model_cov.txt",sep=""))
## - rmse for full and back model
error_kable <- kable(testing_error)
write(error_kable, paste("output/",user_input_PHENO,"_error_back_model_cov.txt",sep=""))