institution prediction baseline.Rmd

---
title: "IPD prediction"
output: html_document
date: '2022-09-07'
---

```{r setup, include=FALSE}
# Echo the source code of every chunk in the knitted output by default.
knitr::opts_chunk$set(echo = TRUE)
```


```{r library}
library(dplyr)
library(tidyr)
library(Hmisc)
library(jomo)
library(haven)
library(lubridate)
library(finalfit)
library(mice)
library(mitools)
library(rms)
library(mitml)
library(survminer)
library(patchwork) #Arrange plot
library(ggplot2)
library(GGally)
library(gridExtra)
library(kableExtra)
library(metafor)
library(png)
library(meta)
library(grid)
library(survival)
library(flexsurv)
library(pacman)
```



```{r data}
# Import the Stata file, then round-trip it through a CSV file. The original
# author notes that later steps fail unless the data are re-read from CSV
# (presumably because read_dta() returns labelled columns -- TODO confirm).
# Side effect: writes "data.csv" into the working directory.
data <- read_dta("PICC institutionalisation.dta")

write.csv(data, "data.csv", row.names = FALSE) # need to be csv file, otherwise it fails
datanew <- read.csv("data.csv") # import data again


# Manual data correction: participant PCP66 is male and was 74.8 years old at
# baseline. %in% is used for the row mask because, unlike ==, it never returns
# NA when originalid is missing.
datanew$sex[datanew$originalid %in% "PCP66"] <- 0 # male is 0
datanew$agebl[datanew$originalid %in% "PCP66"] <- 74.8



#age at baseline, sex, smoking status, co-morbidity, living alone, type of accommodation, presence of hallucinations, presence of cognitive symptoms, falls, MDS-UPDRS part III score (converted from UPDRS part III as applicable), H&Y stage, MMSE score, Schwab and England scale

#presence of hallucinations needs mdsupdrsbl102hallucinations and updrsblitem2thoughtdisorders

#presence of cognitive symptoms needs mdsupdrsbl101cognitive and updrsblitem1intellectual


# Keep only the identifiers, candidate predictors, the item-level scores needed
# to build the derived indices, and the outcome/date fields.
# The column ORDER matters: a later chunk picks columns from data1 by position.
data1 <- select(
  datanew,
  # identifiers and demographics
  study, idpicc, originalid, agebl, agediagnosis, sex, smoking,
  charlsonbl, livesalonebl, accommodationbl,
  # hallucination / cognition / falls items
  mdsupdrsbl102hallucinations, updrsblitem2thoughtdisorders,
  mdsupdrsbl101cognitive, updrsblitem1intellectual, updrsblitem13falling,
  # motor and global scales
  mdsupdrspart3bltotalconvertedasa, hybl, mmsebltotal, sebl, dependentadlbl2,
  # activities-of-daily-living items (UPDRS, then MDS-UPDRS)
  updrsblitem10dressing, updrsblitem9cuttingfood, updrsblitem11hygiene,
  updrsblitem15walking,
  mdsupdrsbl204eating, mdsupdrsbl205dressing, mdsupdrsbl206hygiene,
  mdsupdrsbl212walking,
  # outcome and dates
  institutionalised, dateinstitution, datediagnosis, datevisitbl,
  losttofollowup, datedeath, datelost, enddateinstitutionalised
)




#------Create index for hallucinations------

# Binary indicators (1 = yes, 0 = no):
#   updrshallucinations   : mdsupdrsbl102hallucinations above 0
#   updrsthoughtdisorders : updrsblitem2thoughtdisorders above 1 (scores 2-4)
# hallucinationsindex takes the first indicator and falls back to the second
# wherever the first is missing.
data1 <- data1 %>%
  mutate(
    updrshallucinations = ifelse(mdsupdrsbl102hallucinations > 0, 1, 0),
    updrsthoughtdisorders = ifelse(updrsblitem2thoughtdisorders > 1, 1, 0),
    hallucinationsindex = coalesce(updrshallucinations, updrsthoughtdisorders)
  )


#-----Create index for cognitive---------

#summary(data1$mdsupdrsbl101cognitive)
#summary(data1$updrsblitem2thoughtdisorders)

# Both source items are overwritten in place: every score above 1 becomes 1
# (missing values stay missing). cognitiveindex (0 = no, 1 = yes) then uses the
# MDS-UPDRS item and falls back to the UPDRS item where that is missing.
data1 <- data1 %>%
  mutate(
    mdsupdrsbl101cognitive = replace(
      mdsupdrsbl101cognitive, which(mdsupdrsbl101cognitive > 1), 1
    ),
    updrsblitem1intellectual = replace(
      updrsblitem1intellectual, which(updrsblitem1intellectual > 1), 1
    ),
    cognitiveindex = coalesce(mdsupdrsbl101cognitive, updrsblitem1intellectual)
  )

# Cross-tabulate the two items by study on the unmodified import (`data`),
# i.e. before the recoding applied to data1 above.
table(data$study, data$mdsupdrsbl101cognitive)
table(data$study, data$updrsblitem1intellectual)

#-------------create index for falls-----------

# Collapse every non-missing falling score above 1 into 1; which() drops the
# NA comparisons, so missing scores are left untouched.
data1$updrsblitem13falling[which(data1$updrsblitem13falling > 1)] <- 1

#summary(data1$updrsblitem13falling)


#----Create smoking----

# Codes below 3 become 1 (ever smoker), code 3 becomes 0 (never smoker).
# The 1s written by the first step are not matched by the second step.
data1$smoking[which(data1$smoking < 3)] <- 1
data1$smoking[which(data1$smoking == 3)] <- 0
#summary(data1$smoking)



#-----Create activities----

# Walking items are collapsed before summing:
#   updrsblitem15walking : 4 -> 3. The original note says no 4s occur in the
#                          PICC data, so this step is expected to be a no-op.
#   mdsupdrsbl212walking : 3 -> 2, then 4 -> 3 (the original note counted 18
#                          patients with a 3 and 2 patients with a 4).
# mdsactivities / updractivities add up the dressing, eating (cutting food),
# hygiene and walking items of each scale; `activities` uses the MDS-UPDRS sum
# and falls back to the UPDRS sum where the former is missing.
data1 <- data1 %>%
  mutate(
    updrsblitem15walking = replace(
      updrsblitem15walking, which(updrsblitem15walking == 4), 3
    ),
    mdsupdrsbl212walking = replace(
      mdsupdrsbl212walking, which(mdsupdrsbl212walking == 3), 2
    ),
    mdsupdrsbl212walking = replace(
      mdsupdrsbl212walking, which(mdsupdrsbl212walking == 4), 3
    ),
    mdsactivities = mdsupdrsbl205dressing + mdsupdrsbl204eating +
      mdsupdrsbl206hygiene + mdsupdrsbl212walking,
    updractivities = updrsblitem10dressing + updrsblitem11hygiene +
      updrsblitem9cuttingfood + updrsblitem15walking,
    activities = coalesce(mdsactivities, updractivities)
  )

#table(data1$mdsactivities)
#table(data1$updractivities)
#table(data1$activities)

#table(data1$activities)

#A<-data1%>%
# filter(study=="PINE" & is.na(activities))


#---Redefine Sebl---

# Tabulate the raw scores by study. Original note: 65 occurs once (PINE),
# 85 occurs 3x in NYPUM and 9x in PINE, 95 occurs 14x in NYPUM and 28x in PINE,
# and 100 occurs twice in PINE.
table(data1$study, data1$sebl)

# Move the in-between scores onto the 10-point grid. which() skips missing
# scores, exactly as a logical mask with a single replacement value does.
data1$sebl[which(data1$sebl == 65)] <- 60
data1$sebl[which(data1$sebl == 85)] <- 80
data1$sebl[which(data1$sebl == 95)] <- 90
# NOTE(review): 98 is not among the values listed in the tabulation note
# above -- confirm this recode is intended.
data1$sebl[which(data1$sebl == 98)] <- 100

#-----Factor---

# Turn the numeric codes into labelled factors. Codes not listed in `levels`
# become NA. Every column is converted in place except `fall`, which is a new
# column derived from updrsblitem13falling and is appended after the existing
# columns.
data1 <- data1 %>%
  mutate(
    study = factor(
      study,
      levels = c(1, 2, 3, 4, 5, 6),
      labels = c("CamPalGN", "ICICLE", "NYPUM", "ParkWest", "PICNICS", "PINE")
    ),
    sex = factor(
      sex,
      levels = c(0, 1),
      labels = c("male", "female")
    ),
    smoking = factor(
      smoking,
      levels = c(0, 1),
      labels = c("Never smoker", "Ever smoker")
    ),
    # NOTE(review): code 0 is labelled "lives alone" here -- confirm the
    # coding of livesalonebl against the data dictionary.
    livesalonebl = factor(
      livesalonebl,
      levels = c(0, 1),
      labels = c("lives alone", "lives with other(s)")
    ),
    fall = factor(
      updrsblitem13falling,
      levels = c(0, 1),
      labels = c("no", "yes")
    ),
    hallucinationsindex = factor(
      hallucinationsindex,
      levels = c(0, 1),
      labels = c("no", "yes")
    ),
    cognitiveindex = factor(
      cognitiveindex,
      levels = c(0, 1),
      labels = c("no", "yes")
    ),
    accommodationbl = factor(
      accommodationbl,
      levels = c(1, 2, 3, 4),
      labels = c("At home", "Nursing home", "Other", "Sheltered housing")
    ),
    # The original author flagged this label order as unverified ("Is it?").
    dependentadlbl2 = factor(
      dependentadlbl2,
      levels = c(0, 1),
      labels = c("independency", "dependency")
    )
  )

# Number of patients per study.
summary(data1$study)





```


```{r follow-up}

# PICNICS participant PCP66 did not enter an institution, so the date of death
# is copied into enddateinstitutionalised. %in% yields an NA-free mask; a
# subscripted assignment with a multi-element right-hand side does not accept
# NA subscripts.
data1$enddateinstitutionalised[data1$originalid %in% "PCP66"] <-
  data1$datedeath[data1$originalid %in% "PCP66"]


# t = date of institutionalisation when recorded, otherwise the end date of
# the institutionalisation follow-up.
data1 <- data1 %>%
  mutate(t = coalesce(dateinstitution, enddateinstitutionalised))

sum(is.na(data1$t)) # 37 missing: neither an institution date nor an end date

#A<-data1%>%
#  filter(is.na(t))%>%
#  select(study,idpicc,originalid, institutionalised,enddateinstitutionalised,datelost,datedeath)

#table(A$study)

#sum(A$institutionalised,na.rm = T)  #35 patients know entered in the nursing home but don't know when

#2 patients in CamPalGN without any information of instituionlisation record

# Exclusion tally: 35 + 2 patients with missing t, plus 26 patients with a
# non-positive follow-up time (removed further down).
35 + 2 + 26

# Remove the patients with missing t: 1109 - 37 = 1072
data1 <- data1 %>%
  filter(!is.na(t)) # now 1072 patients in data

# Event indicator: a recorded institution date is an event.
data1$cens <- ifelse(is.na(data1$dateinstitution), 0, 1) # 0=right censored, 1=event

# Follow-up time from the baseline visit, in days (tt) and in years (year).
data1$tt <- as.numeric(
  as.Date(as.character(data1$t), format = "%Y-%m-%d") -
    as.Date(as.character(data1$datevisitbl), format = "%Y-%m-%d")
)

data1$year <- data1$tt / 365.25


#A<-data1%>%
#  filter(data1$tt<=0)

#table(A$study)  #To see how many in different study

# Only keep strictly positive follow-up times. The column is referenced
# directly (not as data1$tt) so the condition is evaluated on the data being
# filtered; rows with a missing tt are dropped as well.
data1 <- data1 %>%
  filter(tt > 0) # 1046 remain, 26 patients removed


#summary(data$study)  #1109 patients in original data
#summary(data1$study)  #after the removals, 1046 patients in data

#Surv(data1$year,data1$cens) 0=right censored, 1=event checking

#--------Change censor 10y and 4.8y---------

# data3 is data1 with administrative censoring: ICICLE at 4.8 years, every
# other study at 10 years. The event flag is reset BEFORE the time is
# truncated, because both masks test the untruncated `year`.
data3 <- data1

#ICICLE study
data3$cens[data3$study == "ICICLE" & data3$year > 4.8] <- 0 # 0=right censored, 1=event
data3$year[data3$study == "ICICLE" & data3$year > 4.8] <- 4.8
#Other study
data3$cens[data3$study != "ICICLE" & data3$year > 10] <- 0 # 0=right censored, 1=event
data3$year[data3$study != "ICICLE" & data3$year > 10] <- 10


table(data3$study)

```

```{r only keep variables needed}

# Analysis variables, selected by NAME instead of by column position
# (formerly c(1:4,6,16:18,29:30,45:48)), so the selection cannot silently
# shift when a column is added, removed or reordered upstream.
analysis_vars <- c(
  "study", "idpicc", "originalid", "agebl", "sex",
  "mdsupdrspart3bltotalconvertedasa", "hybl", "mmsebltotal",
  "institutionalised", "dateinstitution",
  "t", "cens", "tt", "year"
)

# data4: follow-up as observed (data1 is without the censoring change).
data4 <- data1[, analysis_vars]

#colnames(data4)
#summary(data4)

# data5: follow-up censored at 4.8 years (ICICLE) / 10 years (other studies).
data5 <- data3[, analysis_vars]

#colnames(data5)
#summary(data5)

# Study plus the candidate predictors (formerly columns c(1,4:8) of data5),
# used to count rows with at least one missing value.
data6 <- data5[, c(
  "study", "agebl", "sex",
  "mdsupdrspart3bltotalconvertedasa", "hybl", "mmsebltotal"
)]

sum(!complete.cases(data6)) # rows with any missing value (30 when written)

mean(!complete.cases(data6)) * 100 # as a percentage (30/1046 = 2.9% when written)

# Missing MDS-UPDRS part III and MMSE values per study.
# When written: PICNICS missed 7 MDS-UPDRS and 1 MMSE,
#               PINE missed 1 MDS-UPDRS and 15 MMSE.
data6 %>%
  group_by(study) %>%
  summarise(sum(is.na(mdsupdrspart3bltotalconvertedasa)), sum(is.na(mmsebltotal)))

# List the incomplete rows; when written, nobody was missing both scores.
data6 %>%
  group_by(study) %>%
  filter(is.na(mdsupdrspart3bltotalconvertedasa) | is.na(mmsebltotal)) %>%
  select(study, mdsupdrspart3bltotalconvertedasa, mmsebltotal)

```


```{r missing pattern}

#----With sebl----

# Copy of data5 with display names for the missing-data pattern plot.
data.rename <- rename(
  data5,
  `Age at baseline` = agebl,
  `Sex` = sex,
  `MDS-UPDRS part3` = mdsupdrspart3bltotalconvertedasa,
  `Hoehn and Yahr Scale` = hybl,
  `MMSE` = mmsebltotal
)

# The two variables that have missing values.
explanatory <- c("MDS-UPDRS part3", "MMSE")
# Defined but not passed to missing_pattern() below.
dependent <- c("cens", "tt")

# NOTE(review): `explanatory` is passed positionally, i.e. as the second
# argument of missing_pattern() -- confirm that is the intended parameter.
mispattern <- missing_pattern(data.rename, explanatory)



#png("missingp1.png",width = 1500,height =1500,res = 400)

#data.rename %>% 
#  missing_pattern(explanatory)

#dev.off()



```


```{r jomo}

# Multilevel joint-model imputation (jomo) of the two incomplete predictors,
# with study as the clustering variable.

# Constant column, used as the intercept of the imputation model.
data5$cons<-1

# Nelson-Aalen estimate computed from the follow-up time and the event flag;
# it is added to the covariates of the imputation model below.
data5$nelsonaalen<-nelsonaalen(data5,year,cens) #0 is right censor,1 is event

# Y: variables to impute.
Y<- data5[,c("mdsupdrspart3bltotalconvertedasa","mmsebltotal")] 

# X: covariates of the imputation model.
X<-data5[,c("cons","agebl","sex","hybl","nelsonaalen")] #adding Nelson-Aalen estimate  

# Cluster indicator: the study each patient belongs to.
clus<-data5$study

# Two-iteration dry run. It is executed BEFORE set.seed(), so it is not
# reproducible itself; the seed below resets the generator for the real runs.
imp.dry<-jomo.MCMCchain(Y = Y,X = X,clus = clus, nburn = 2)

# First chain of 5000 iterations, used to inspect convergence.
set.seed(15678)
imp1 <- jomo.MCMCchain(Y = Y, X = X, clus = clus, nburn = 5000)


#head(imp1$collectbeta) # check beta


#plot trace for each parameter value

#png("Jomo1.png",width = 3500,height =1000,res = 400)

#par(mfrow=c(1,4))

# Trace plots of the first fixed-effect coefficient of each imputed variable.
plot(imp1$collectbeta[1, 1, 1:5000], type = "l", ylab = expression(beta["MDS-UPDRS,0"]),
     xlab = "Iteration number" )

plot(imp1$collectbeta[1, 2, 1:5000], type = "l", ylab = expression(beta["MMSE,0"]),
     xlab = "Iteration number" )

#plot trace for cov matrix element
#imp1$collectomega[,,1] #check the row and col name
#Category variable don't need to plot, just a straight line

# Trace plots of the two diagonal elements of the level-1 covariance matrix.
# NOTE(review): inside expression(), MDS-UPDRS is unquoted and is therefore
# parsed as "MDS minus UPDRS" for the axis label.
plot(imp1$collectomega[1, 1, 1:5000], type = "l", ylab = expression(omega[MDS-UPDRS,1,1]^2),
     xlab = "Iteration number" )

plot(imp1$collectomega[2, 2, 1:5000], type = "l", ylab = expression(omega[MMSE,1,1]^2),
     xlab = "Iteration number" )


#dev.off()


# Capture the state of the sampler as starting values for the second set of iterations:
beta.start <- imp1$collectbeta[,,5000] # capture the fixed parameter values
l1cov.start <- imp1$collectomega[,,5000] # capture the level-1 covariance matrix values
start.imp <- imp1$finimp.latnorm # capture the final imputed data set 



#Re-run the same function for a larger number of iterations
# (another 5000 iterations, continuing from the state captured above; no new
# seed is set, so the result depends on the RNG state left by imp1)
imp2 <- jomo.MCMCchain(Y = Y, X = X, clus = clus, beta.start = beta.start, l1cov.start = l1cov.start,
                       start.imp = start.imp, nburn = 5000)

# Check the trace again

#png("Jomo2.png",width = 3500,height =1000,res = 400)

#par(mfrow=c(1,4))

plot(imp2$collectbeta[1, 1, 1:5000], type = "l", ylab = expression(beta["mdsupdrspart3,0"]),
     xlab = "Iteration number" )

plot(imp2$collectbeta[1, 2, 1:5000], type = "l", ylab = expression(beta["mmse,0"]),
     xlab = "Iteration number" )

#plot trace for cov matrix element

plot(imp2$collectomega[1, 1, 1:5000], type = "l", ylab = expression(omega[mdsupdrspart3,1,1]^2),
     xlab = "Iteration number" )

plot(imp2$collectomega[2, 2, 1:5000], type = "l", ylab = expression(omega[mmse,1,1]^2),
     xlab = "Iteration number" )


#dev.off()

#collect posterior mean of cov matrix
# (element-wise mean over the iterations stored in the third array dimension)
l1cov.guess <- apply(imp2$collectomega, c(1, 2), mean)

#dim(imp2$collectomega[,,1])

# Multiply by degrees of freedom to get scale matrix
# (the factor 2 presumably equals the number of imputed variables -- TODO confirm)
l1cov.prior <- l1cov.guess*2

# Perform multilevel imputation:
# 5000 burn-in iterations, 1000 iterations between imputations, 3 imputed
# data sets. NOTE(review): meth = "random" is presumably the option for
# study-specific covariance matrices -- confirm against the jomo documentation.
imp3 <- jomo(Y = Y, X = X, clus = clus, l1cov.prior = l1cov.prior, nburn = 5000, nbetween = 1000, nimp =3,meth = "random" )  


```



```{r Choose one imputation datasets to use}
# Keep the rows of the second imputed data set only.
# NOTE(review): the analysis continues with this single imputation; the other
# imputations are not used in the visible part of the file.
imp3.2 <- filter(imp3, Imputation == 2)
  
```

```{r merge data}

# Row-position id used as the merge key. seq_len() is used instead of
# seq(nrow()) so that an empty data frame yields an empty id, not c(1, 0).
# Assumes the `id` column returned by jomo numbers the rows of Y in their
# original order -- TODO confirm.
data5$id <- seq_len(nrow(data5))

# Outcome and bookkeeping columns that are not part of the jomo output.
data5.time <- data5 %>%
  select(id, year, cens, idpicc, institutionalised, t, tt)

# Attach them to the chosen imputed data set.
imp3.new <- merge(imp3.2, data5.time, by = "id")
```


```{r prepare data before model}

# Rescale age and MDS-UPDRS part III so that one unit equals 10 years / 10 points.
imp3.new <- imp3.new %>%
  mutate(
    age10 = agebl / 10,
    mdsupdrs3.10 = mdsupdrspart3bltotalconvertedasa / 10
  )

# Inspect the imputed data before capping: MMSE shows values above 30.
summary(imp3.new)

# Cap MMSE at 30; which() leaves missing values untouched.
imp3.new$mmsebltotal[which(imp3.new$mmsebltotal > 30)] <- 30

#The next step is a one-stage IPD meta-analysis, so all studies must be kept in one data set and the model stratified by study

```

```{r censor for 4 years}

# Split every follow-up record at 4 years; `timegroup` numbers the resulting
# episodes.
imp3.temp <- survSplit(
  Surv(year, cens) ~ .,
  data = imp3.new,
  cut = 4,
  episode = "timegroup"
)

# Keep the first episode only, i.e. follow-up up to 4 years. The object is
# named imp3.5y although the cut point is 4 years.
imp3.5y <- imp3.temp[which(imp3.temp$timegroup == 1), ]
```

```{r knots}

# Largest, smallest and median event time in the 4-year data; these are the
# values typed into the log() calls and the bknots arguments further down.
imp3.5y %>%
  filter(cens == 1) %>% # cens == 1 marks an event
  summarise(max(year), min(year), median(year))

# NOTE(review): the original note quoted a maximum event time of 9.943874
# years, which does not match the 3.978097 used below -- it presumably
# predates the 4-year truncation. The quoted minimum was 0.1067762 years.

log(3.978097) # Kmax: upper boundary knot on the log-time axis
log(0.1067762) # Kmin: lower boundary knot on the log-time axis
log(2.818617) # presumably the median event time, on the log scale -- confirm
```

```{r PO model}

# Spline survival models on the proportional-odds scale with 0, 1 and 2
# interior knots. Study (clus) enters as a covariate. The boundary knots are
# the log of the smallest and largest event time.

# k = 0: log-logistic model
PO0 <- flexsurvspline(
  Surv(year, cens) ~ age10 + sex + mdsupdrs3.10 + mmsebltotal + clus,
  data = imp3.5y,
  k = 0,
  scale = "odds"
)

PO1 <- flexsurvspline(
  Surv(year, cens) ~ age10 + sex + mdsupdrs3.10 + mmsebltotal + clus,
  data = imp3.5y,
  k = 1,
  bknots = c(log(0.1067762), log(3.978097)),
  scale = "odds"
)

PO2 <- flexsurvspline(
  Surv(year, cens) ~ age10 + sex + mdsupdrs3.10 + mmsebltotal + clus,
  data = imp3.5y,
  k = 2,
  bknots = c(log(0.1067762), log(3.978097)),
  scale = "odds"
)
```

```{r PH model}
# Same three models on the proportional-hazards scale (0, 1 and 2 interior
# knots), with the same covariates and boundary knots as the PO models.
PH0 <- flexsurvspline(
  Surv(year, cens) ~ age10 + sex + mdsupdrs3.10 + mmsebltotal + clus,
  data = imp3.5y,
  k = 0,
  scale = "hazard"
)

PH1 <- flexsurvspline(
  Surv(year, cens) ~ age10 + sex + mdsupdrs3.10 + mmsebltotal + clus,
  data = imp3.5y,
  k = 1,
  bknots = c(log(0.1067762), log(3.978097)),
  scale = "hazard"
)

PH2 <- flexsurvspline(
  Surv(year, cens) ~ age10 + sex + mdsupdrs3.10 + mmsebltotal + clus,
  data = imp3.5y,
  k = 2,
  bknots = c(log(0.1067762), log(3.978097)),
  scale = "hazard"
)
```

```{r Probit model}

#Pr0<-flexsurvspline(Surv(year,cens)~age10+sex+mdsupdrs3.10+hybl+mmsebltotal+clus,data=imp3.5y,k=0,scale = "normal") #log-normal model

#Pr1<-flexsurvspline(Surv(year,cens)~age10+sex+mdsupdrs3.10+hybl+mmsebltotal+clus,data=imp3.5y,k=1,bknots = c(log(0.1067762),log(3.978097)),scale = "normal")

#Pr2<-flexsurvspline(Surv(year,cens)~age10+sex+mdsupdrs3.10+hybl+mmsebltotal+clus,data=imp3.5y,k=2,bknots = c(log(0.1067762),log(3.978097)),scale = "normal")
```

```{r POPH compare}

# Information criteria of the candidate models (smaller is better).
# The knot positions quoted below are the author's description of where
# flexsurvspline places the interior knots.

# Proportional-odds models
AIC(PO0) # no interior knot
AIC(PO1) #1 knots in median
AIC(PO2) #2 knots one in 33% one in 67%

BIC(PO0) # no interior knot
BIC(PO1) #1 knots in median
BIC(PO2) #2 knots one in 33% one in 67%


# Proportional-hazards models
AIC(PH0) # no interior knot
AIC(PH1) #1 knots in median
AIC(PH2) #2 knots one in 33% one in 67%

BIC(PH0) # no interior knot
BIC(PH1) #1 knots in median
BIC(PH2) #2 knots one in 33% one in 67%

#AIC(Pr0)
#AIC(Pr2)

#BIC(Pr0)
#BIC(Pr2)
```


```{r final model}
# Print the selected model: proportional-odds scale with one interior knot.
PO1
```


```{r Harrell Uno}

#---leave CamPalGN out----

# Development set: all studies except CamPalGN. Validation set: CamPalGN only.
data5y.1 <- filter(imp3.5y, clus != "CamPalGN")
data5y.1v <- filter(imp3.5y, clus == "CamPalGN")

# Range of the event times in the development set, printed so the boundary
# knots typed into the model call can be checked against it.
data5y.1 %>%
  filter(cens == 1) %>% # cens == 1 marks an event
  summarise(max(year), min(year))

# Refit the one-knot proportional-odds model without the study term.
model1 <- flexsurvspline(
  Surv(year, cens) ~ age10 + sex + mdsupdrs3.10 + mmsebltotal,
  data = data5y.1,
  k = 1,
  bknots = c(log(0.1067762), log(3.978097)),
  scale = "odds"
)

# Linear predictor in the held-out study: covariate design matrix (intercept
# column removed) multiplied by the covariate coefficients.
des_matr1 <- as.data.frame(
  model.matrix(~ age10 + sex + mdsupdrs3.10 + mmsebltotal, data = data5y.1v)
)
des_matr1 <- des_matr1[, names(des_matr1) != "(Intercept)", drop = FALSE]
# Coefficients 4-7 are taken as the four covariate effects -- this assumes the
# first three entries are the spline parameters of the k = 1 model; TODO confirm.
coef1 <- model1$coefficients[4:7]
data5y.1v$lp.1 <- as.vector(as.matrix(des_matr1) %*% coef1)

# (A hand-written age10*b1 + sex*b2 + ... cross-check of the linear predictor
# was kept here as commented-out code in the original.)

# Survival probabilities at 4 years; no newdata is given, so these are for
# the data model1 was fitted on.
s5.1 <- predict(model1, type = "survival", times = 4)

# Harrell's C in the validation set. reverse = TRUE treats larger values of
# the linear predictor as predicting shorter times.
harrell_C_1v <- concordance(
  Surv(year, cens) ~ lp.1,
  data = data5y.1v,
  reverse = TRUE
)

# Uno's C: same call with the "n/G2" time weights.
Uno_C_1v <- concordance(
  Surv(year, cens) ~ lp.1,
  data = data5y.1v,
  reverse = TRUE,
  timewt = "n/G2"
)

#---leave ICICLE out----

# Development set: all studies except ICICLE. Validation set: ICICLE only.
data5y.2 <- filter(imp3.5y, clus != "ICICLE")
data5y.2v <- filter(imp3.5y, clus == "ICICLE")

# Range of the event times in the development set (check for the boundary knots).
data5y.2 %>%
  filter(cens == 1) %>% # cens == 1 marks an event
  summarise(max(year), min(year))

# Refit the one-knot proportional-odds model without the study term.
model2 <- flexsurvspline(
  Surv(year, cens) ~ age10 + sex + mdsupdrs3.10 + mmsebltotal,
  data = data5y.2,
  k = 1,
  bknots = c(log(0.1067762), log(3.978097)),
  scale = "odds"
)

# Survival probabilities at 4 years for the data model2 was fitted on.
s5.2 <- predict(model2, type = "survival", times = 4)

# Linear predictor in the held-out study: design matrix without the intercept
# column times coefficients 4-7 (assumed to be the covariate effects of the
# k = 1 model -- TODO confirm).
des_matr2 <- as.data.frame(
  model.matrix(~ age10 + sex + mdsupdrs3.10 + mmsebltotal, data = data5y.2v)
)
des_matr2 <- des_matr2[, names(des_matr2) != "(Intercept)", drop = FALSE]
coef2 <- model2$coefficients[4:7]
data5y.2v$lp.2 <- as.vector(as.matrix(des_matr2) %*% coef2)

# Harrell's C (larger linear predictor = shorter time, hence reverse = TRUE).
harrell_C_2v <- concordance(
  Surv(year, cens) ~ lp.2,
  data = data5y.2v,
  reverse = TRUE
)

# Uno's C: same call with the "n/G2" time weights.
Uno_C_2v <- concordance(
  Surv(year, cens) ~ lp.2,
  data = data5y.2v,
  reverse = TRUE,
  timewt = "n/G2"
)


#---leave NYPUM out----

# Development set: all studies except NYPUM. Validation set: NYPUM only.
data5y.3 <- filter(imp3.5y, clus != "NYPUM")
data5y.3v <- filter(imp3.5y, clus == "NYPUM")

# Range of the event times in the development set (check for the boundary knots).
data5y.3 %>%
  filter(cens == 1) %>% # cens == 1 marks an event
  summarise(max(year), min(year))

# Refit the one-knot proportional-odds model without the study term.
model3 <- flexsurvspline(
  Surv(year, cens) ~ age10 + sex + mdsupdrs3.10 + mmsebltotal,
  data = data5y.3,
  k = 1,
  bknots = c(log(0.1067762), log(3.978097)),
  scale = "odds"
)

# Survival probabilities at 4 years for the data model3 was fitted on.
s5.3 <- predict(model3, type = "survival", times = 4)

# Linear predictor in the held-out study: design matrix without the intercept
# column times coefficients 4-7 (assumed to be the covariate effects of the
# k = 1 model -- TODO confirm).
des_matr3 <- as.data.frame(
  model.matrix(~ age10 + sex + mdsupdrs3.10 + mmsebltotal, data = data5y.3v)
)
des_matr3 <- des_matr3[, names(des_matr3) != "(Intercept)", drop = FALSE]
coef3 <- model3$coefficients[4:7]
data5y.3v$lp.3 <- as.vector(as.matrix(des_matr3) %*% coef3)

# Harrell's C (larger linear predictor = shorter time, hence reverse = TRUE).
harrell_C_3v <- concordance(
  Surv(year, cens) ~ lp.3,
  data = data5y.3v,
  reverse = TRUE
)

# Uno's C: same call with the "n/G2" time weights.
Uno_C_3v <- concordance(
  Surv(year, cens) ~ lp.3,
  data = data5y.3v,
  reverse = TRUE,
  timewt = "n/G2"
)


#---leave ParkWest out----

# Development set: all studies except ParkWest. Validation set: ParkWest only.
data5y.4 <- filter(imp3.5y, clus != "ParkWest")
data5y.4v <- filter(imp3.5y, clus == "ParkWest")

# Range of the event times in the development set (check for the boundary knots).
data5y.4 %>%
  filter(cens == 1) %>% # cens == 1 marks an event
  summarise(max(year), min(year))

# Refit the one-knot proportional-odds model without the study term.
model4 <- flexsurvspline(
  Surv(year, cens) ~ age10 + sex + mdsupdrs3.10 + mmsebltotal,
  data = data5y.4,
  k = 1,
  bknots = c(log(0.1067762), log(3.978097)),
  scale = "odds"
)

# Survival probabilities at 4 years for the data model4 was fitted on.
s5.4 <- predict(model4, type = "survival", times = 4)

# Linear predictor in the held-out study: design matrix without the intercept
# column times coefficients 4-7 (assumed to be the covariate effects of the
# k = 1 model -- TODO confirm).
des_matr4 <- as.data.frame(
  model.matrix(~ age10 + sex + mdsupdrs3.10 + mmsebltotal, data = data5y.4v)
)
des_matr4 <- des_matr4[, names(des_matr4) != "(Intercept)", drop = FALSE]
coef4 <- model4$coefficients[4:7]
data5y.4v$lp.4 <- as.vector(as.matrix(des_matr4) %*% coef4)

# Harrell's C (larger linear predictor = shorter time, hence reverse = TRUE).
harrell_C_4v <- concordance(
  Surv(year, cens) ~ lp.4,
  data = data5y.4v,
  reverse = TRUE
)

# Uno's C: same call with the "n/G2" time weights.
Uno_C_4v <- concordance(
  Surv(year, cens) ~ lp.4,
  data = data5y.4v,
  reverse = TRUE,
  timewt = "n/G2"
)


#---leave PICNICS out----

# Development set: all studies except PICNICS. Validation set: PICNICS only.
data5y.5 <- filter(imp3.5y, clus != "PICNICS")
data5y.5v <- filter(imp3.5y, clus == "PICNICS")

# Range of the event times in the development set (check for the boundary knots).
data5y.5 %>%
  filter(cens == 1) %>% # cens == 1 marks an event
  summarise(max(year), min(year))

# Refit the one-knot proportional-odds model without the study term. The
# lower boundary knot differs from the other refits (0.5557837 years).
model5 <- flexsurvspline(
  Surv(year, cens) ~ age10 + sex + mdsupdrs3.10 + mmsebltotal,
  data = data5y.5,
  k = 1,
  bknots = c(log(0.5557837), log(3.978097)),
  scale = "odds"
)

# Survival probabilities at 4 years for the data model5 was fitted on.
s5.5 <- predict(model5, type = "survival", times = 4)

# Linear predictor in the held-out study: design matrix without the intercept
# column times coefficients 4-7 (assumed to be the covariate effects of the
# k = 1 model -- TODO confirm).
des_matr5 <- as.data.frame(
  model.matrix(~ age10 + sex + mdsupdrs3.10 + mmsebltotal, data = data5y.5v)
)
des_matr5 <- des_matr5[, names(des_matr5) != "(Intercept)", drop = FALSE]
coef5 <- model5$coefficients[4:7]
data5y.5v$lp.5 <- as.vector(as.matrix(des_matr5) %*% coef5)

# Harrell's C (larger linear predictor = shorter time, hence reverse = TRUE).
harrell_C_5v <- concordance(
  Surv(year, cens) ~ lp.5,
  data = data5y.5v,
  reverse = TRUE
)

# Uno's C: same call with the "n/G2" time weights.
Uno_C_5v <- concordance(
  Surv(year, cens) ~ lp.5,
  data = data5y.5v,
  reverse = TRUE,
  timewt = "n/G2"
)

#---leave PINE out----

# Development set: all studies except PINE. Validation set: PINE only.
data5y.6 <- filter(imp3.5y, clus != "PINE")
data5y.6v <- filter(imp3.5y, clus == "PINE")

# Range of the event times in the development set (check for the boundary knots).
data5y.6 %>%
  filter(cens == 1) %>% # cens == 1 marks an event
  summarise(max(year), min(year))

# Refit the one-knot proportional-odds model without the study term. The
# upper boundary knot differs from the other refits (3.8987 years).
model6 <- flexsurvspline(
  Surv(year, cens) ~ age10 + sex + mdsupdrs3.10 + mmsebltotal,
  data = data5y.6,
  k = 1,
  bknots = c(log(0.1067762), log(3.8987)),
  scale = "odds"
)

# Survival probabilities at 4 years for the data model6 was fitted on.
s5.6 <- predict(model6, type = "survival", times = 4)

# Linear predictor in the held-out study: design matrix without the intercept
# column times coefficients 4-7 (assumed to be the covariate effects of the
# k = 1 model -- TODO confirm).
des_matr6 <- as.data.frame(
  model.matrix(~ age10 + sex + mdsupdrs3.10 + mmsebltotal, data = data5y.6v)
)
des_matr6 <- des_matr6[, names(des_matr6) != "(Intercept)", drop = FALSE]
coef6 <- model6$coefficients[4:7]
data5y.6v$lp.6 <- as.vector(as.matrix(des_matr6) %*% coef6)

# Harrell's C (larger linear predictor = shorter time, hence reverse = TRUE).
harrell_C_6v <- concordance(
  Surv(year, cens) ~ lp.6,
  data = data5y.6v,
  reverse = TRUE
)

# Uno's C: same call with the "n/G2" time weights.
Uno_C_6v <- concordance(
  Surv(year, cens) ~ lp.6,
  data = data5y.6v,
  reverse = TRUE,
  timewt = "n/G2"
)
```


```{r Time-dependent AUC}

# Time-dependent AUC of the linear predictor in each held-out study, using
# marginal weighting and iid = TRUE. The AUC is evaluated at 3.99 years in
# every study; the original note says the ICICLE result was NA at the longer
# horizon because ICICLE follow-up is shorter than 5 years, which is why the
# horizon was moved to about 4 years.

#---leave CamPalGN out----

Uno_1v <- with(
  data5y.1v,
  timeROC::timeROC(
    T = year,
    delta = cens,
    marker = lp.1,
    cause = 1,
    weighting = "marginal",
    times = 3.99,
    iid = TRUE
  )
)


#---leave ICICLE out----

Uno_2v <- with(
  data5y.2v,
  timeROC::timeROC(
    T = year,
    delta = cens,
    marker = lp.2,
    cause = 1,
    weighting = "marginal",
    times = 3.99,
    iid = TRUE
  )
)


#---leave NYPUM out----

Uno_3v <- with(
  data5y.3v,
  timeROC::timeROC(
    T = year,
    delta = cens,
    marker = lp.3,
    cause = 1,
    weighting = "marginal",
    times = 3.99,
    iid = TRUE
  )
)


#---leave ParkWest out----

Uno_4v <- with(
  data5y.4v,
  timeROC::timeROC(
    T = year,
    delta = cens,
    marker = lp.4,
    cause = 1,
    weighting = "marginal",
    times = 3.99,
    iid = TRUE
  )
)


#---leave PICNICS out----

Uno_5v <- with(
  data5y.5v,
  timeROC::timeROC(
    T = year,
    delta = cens,
    marker = lp.5,
    cause = 1,
    weighting = "marginal",
    times = 3.99,
    iid = TRUE
  )
)

#---leave PINE out----

Uno_6v <- with(
  data5y.6v,
  timeROC::timeROC(
    T = year,
    delta = cens,
    marker = lp.6,
    cause = 1,
    weighting = "marginal",
    times = 3.99,
    iid = TRUE
  )
)

```


```{r Discrimination 4 year}

# Print the discrimination results of the six leave-one-study-out validations
# (1 = CamPalGN, 2 = ICICLE, 3 = NYPUM, 4 = ParkWest, 5 = PICNICS, 6 = PINE).

# Harrell's C
harrell_C_1v 
harrell_C_2v 
harrell_C_3v 
harrell_C_4v 
harrell_C_5v
harrell_C_6v 



# Uno's C
Uno_C_1v
Uno_C_2v
Uno_C_3v
Uno_C_4v
Uno_C_5v
Uno_C_6v


# Time-dependent AUC at 3.99 years
Uno_1v
Uno_2v
Uno_3v
Uno_4v
Uno_5v
Uno_6v

# Hand-typed estimate + 1.96 * standard error, i.e. an upper 95% limit.
# NOTE(review): 71.16 and 5.58 are presumably copied from one of the outputs
# above -- confirm which one, or compute the limit from the object instead.
round(71.16+1.96*5.58,2)
```

```{r Mean calibration}

# Mean calibration: ratio of observed to expected 4-year risk in each
# held-out study. alpha sets the level of the confidence limits.
alpha <- .05

#---leave CamPalGN out----

# Observed risk: complement of the Kaplan-Meier estimate at 4 years.
obj.1 <- summary(
  survfit(Surv(year, cens) ~ 1, data = data5y.1v),
  times = 4
)
obs_t.1 <- 1 - obj.1$surv

# Expected risk: mean over the validation patients of 1 - S(4 | X), where
# S(4 | X) is the survival probability predicted by the development model
# (second column of the predict() output).
data5y.1v$pred <- predict(
  model1,
  newdata = data5y.1v,
  type = "survival",
  times = 4
)[[2]]
exp_t.1 <- mean(1 - data5y.1v$pred)

# Observed / Expected ratio; the limits are built on the log scale with
# standard error sqrt(1 / number of events).
OE_t.1 <- obs_t.1 / exp_t.1

OE_summary.1 <- local({
  half_width <- qnorm(1 - alpha / 2) * sqrt(1 / obj.1$n.event)
  c(
    "OE" = OE_t.1,
    "2.5 %" = OE_t.1 * exp(-half_width),
    "97.5 %" = OE_t.1 * exp(+half_width)
  )
})

OE_summary.1


#---leave ICICLE out----

# Observed risk: complement of the Kaplan-Meier estimate at 4 years.
obj.2 <- summary(
  survfit(Surv(year, cens) ~ 1, data = data5y.2v),
  times = 4
)
obs_t.2 <- 1 - obj.2$surv

# Expected risk: mean of 1 - S(4 | X) predicted by the development model
# (second column of the predict() output).
data5y.2v$pred <- predict(
  model2,
  newdata = data5y.2v,
  type = "survival",
  times = 4
)[[2]]
exp_t.2 <- mean(1 - data5y.2v$pred)

# Observed / Expected ratio; limits on the log scale with standard error
# sqrt(1 / number of events).
OE_t.2 <- obs_t.2 / exp_t.2

OE_summary.2 <- local({
  half_width <- qnorm(1 - alpha / 2) * sqrt(1 / obj.2$n.event)
  c(
    "OE" = OE_t.2,
    "2.5 %" = OE_t.2 * exp(-half_width),
    "97.5 %" = OE_t.2 * exp(+half_width)
  )
})

OE_summary.2


#---leave NYPUM out----

# Observed risk: complement of the Kaplan-Meier estimate at 4 years.
obj.3 <- summary(
  survfit(Surv(year, cens) ~ 1, data = data5y.3v),
  times = 4
)
obs_t.3 <- 1 - obj.3$surv

# Expected risk: mean of 1 - S(4 | X) predicted by the development model
# (second column of the predict() output).
data5y.3v$pred <- predict(
  model3,
  newdata = data5y.3v,
  type = "survival",
  times = 4
)[[2]]
exp_t.3 <- mean(1 - data5y.3v$pred)

# Observed / Expected ratio; limits on the log scale with standard error
# sqrt(1 / number of events).
OE_t.3 <- obs_t.3 / exp_t.3

OE_summary.3 <- local({
  half_width <- qnorm(1 - alpha / 2) * sqrt(1 / obj.3$n.event)
  c(
    "OE" = OE_t.3,
    "2.5 %" = OE_t.3 * exp(-half_width),
    "97.5 %" = OE_t.3 * exp(+half_width)
  )
})

OE_summary.3


#---leave ParkWest out----

# Observed risk: complement of the Kaplan-Meier estimate at 4 years.
obj.4 <- summary(
  survfit(Surv(year, cens) ~ 1, data = data5y.4v),
  times = 4
)
obs_t.4 <- 1 - obj.4$surv

# Expected risk: mean of 1 - S(4 | X) predicted by the development model
# (second column of the predict() output).
data5y.4v$pred <- predict(
  model4,
  newdata = data5y.4v,
  type = "survival",
  times = 4
)[[2]]
exp_t.4 <- mean(1 - data5y.4v$pred)

# Observed / Expected ratio; limits on the log scale with standard error
# sqrt(1 / number of events).
OE_t.4 <- obs_t.4 / exp_t.4

OE_summary.4 <- local({
  half_width <- qnorm(1 - alpha / 2) * sqrt(1 / obj.4$n.event)
  c(
    "OE" = OE_t.4,
    "2.5 %" = OE_t.4 * exp(-half_width),
    "97.5 %" = OE_t.4 * exp(+half_width)
  )
})

OE_summary.4


#---leave PICNICS out----

# Observed risk: complement of the Kaplan-Meier estimate at 4 years.
obj.5 <- summary(
  survfit(Surv(year, cens) ~ 1, data = data5y.5v),
  times = 4
)
obs_t.5 <- 1 - obj.5$surv

# Expected risk: mean of 1 - S(4 | X) predicted by the development model
# (second column of the predict() output).
data5y.5v$pred <- predict(
  model5,
  newdata = data5y.5v,
  type = "survival",
  times = 4
)[[2]]
exp_t.5 <- mean(1 - data5y.5v$pred)

# Observed / Expected ratio; limits on the log scale with standard error
# sqrt(1 / number of events).
OE_t.5 <- obs_t.5 / exp_t.5

OE_summary.5 <- local({
  half_width <- qnorm(1 - alpha / 2) * sqrt(1 / obj.5$n.event)
  c(
    "OE" = OE_t.5,
    "2.5 %" = OE_t.5 * exp(-half_width),
    "97.5 %" = OE_t.5 * exp(+half_width)
  )
})

OE_summary.5

#---leave PINE out----

# Observed risk: complement of the Kaplan-Meier estimate at 4 years.
obj.6 <- summary(
  survfit(Surv(year, cens) ~ 1, data = data5y.6v),
  times = 4
)
obs_t.6 <- 1 - obj.6$surv

# Expected risk: mean of 1 - S(4 | X) predicted by the development model
# (second column of the predict() output).
data5y.6v$pred <- predict(
  model6,
  newdata = data5y.6v,
  type = "survival",
  times = 4
)[[2]]
exp_t.6 <- mean(1 - data5y.6v$pred)

# Observed / Expected ratio; limits on the log scale with standard error
# sqrt(1 / number of events).
OE_t.6 <- obs_t.6 / exp_t.6

OE_summary.6 <- local({
  half_width <- qnorm(1 - alpha / 2) * sqrt(1 / obj.6$n.event)
  c(
    "OE" = OE_t.6,
    "2.5 %" = OE_t.6 * exp(-half_width),
    "97.5 %" = OE_t.6 * exp(+half_width)
  )
})

OE_summary.6

```



```{r  Moderate calibration}

#moderate calibration uses smooth curves of predicted risk from a more complex ‘secondary’ Cox model against the predicted risk from the development model 


#---leave CamPalGN out----

# Predicted 4-year risk from the development model, 1 - S(4 | X), and its
# complementary log-log transform, which is the only covariate of the
# secondary (recalibration) model. data5y.1v$pred holds the survival
# probabilities stored in the mean-calibration step.
data5y.1v$risk <- (1 - data5y.1v$pred)

data5y.1v$risk.cll <- log(-log(1 - data5y.1v$risk)) # complementary log-log link

# Range of the event times in the validation set; the boundary knots of the
# secondary model below are typed in from this output.
data5y.1v %>%
  filter(cens == 1) %>% # cens == 1 marks an event
  summarise(max(year), min(year))


# Estimate actual risk: secondary spline model (one interior knot,
# proportional-odds scale) fitted in the validation set.
vcal.1.RP <- flexsurvspline(
  Surv(year, cens) ~ risk.cll,
  data = data5y.1v,
  k = 1,
  bknots = c(log(1.147159), log(3.761807)),
  scale = "odds"
)

# (An rms::cph() / rms::survest() version of this step was kept as
# commented-out code in the original.)

# Refitted ("observed") risk with its confidence band. predict() is called
# ONCE with conf.int = TRUE: the interval is simulation-based, so taking the
# lower and upper bounds from separate calls would mix two different random
# draws and evaluate the model three times. Columns 2, 3 and 4 are read as
# the survival estimate and its lower and upper limit; survival limits swap
# roles when converted to risk.
dat_cal.1.RP <- local({
  cal_pred <- predict(
    vcal.1.RP,
    type = "survival",
    conf.int = TRUE,
    times = 4,
    newdata = data5y.1v
  )
  cbind.data.frame(
    "obs" = 1 - cal_pred[[2]], # 1 - S(t = 4): risk from the refitted model
    "lower" = 1 - cal_pred[[4]],
    "upper" = 1 - cal_pred[[3]],
    "pred" = data5y.1v$risk
  )
})

# Order by predicted risk so the curves are drawn from left to right.
dat_cal.1.RP <- dat_cal.1.RP[order(dat_cal.1.RP$pred), ]

#png("CamPalGN-4y.png",width = 3500,height =2000,res = 400)
par(xaxs = "i", yaxs = "i", las = 1)
plot(
  x = dat_cal.1.RP$pred, # risk predicted by the development model
  y = dat_cal.1.RP$obs, # risk from the refitted model
  type = "l",
  lty = 1,
  xlim = c(0, 1),
  ylim = c(0, 1),
  lwd = 2,
  xlab = "Predicted risk from developed model",
  ylab = "Predicted risk from refitted model",
  main = "Validation dataset is CamPalGN",
  bty = "n" # no box
)

lines(dat_cal.1.RP$pred,
      dat_cal.1.RP$lower,
      type = "l",
      lty = 2,
      lwd = 2)
lines(dat_cal.1.RP$pred,
      dat_cal.1.RP$upper,
      type = "l",
      lty = 2,
      lwd = 2)
abline(0, 1, lwd = 2, lty = 2, col = 2) # line of perfect calibration
legend("topleft",
        c("Ideal calibration",
          "Calibration curve based on secondary Royston-Parmar model",
          "95% confidence interval"),
        col = c(2, 1, 1),
        lty = c(2, 1, 2),
        lwd = c(2, 2, 2),
        bty = "n",
        cex = 0.85)

#dev.off()

# Numerical measures: absolute difference between the predicted and the
# refitted risk, summarised by its mean (ICI), median (E50), 90th percentile
# (E90) and maximum (Emax).
absdiff_cph.1.RP <- abs(dat_cal.1.RP$pred - dat_cal.1.RP$obs)

numsum_cph.1.RP <- c(
  "ICI" = mean(absdiff_cph.1.RP),
  setNames(quantile(absdiff_cph.1.RP, c(0.5, 0.9)), c("E50", "E90")),
  "Emax" = max(absdiff_cph.1.RP)
)
numsum_cph.1.RP



#---leave ICICLE out----

# predicted risk
# Risk is 1 - S(t|X), the probability of the event; data5y.2v$pred is taken to
# be the survival probability from the developed model -- TODO confirm.
data5y.2v$risk <- 1 - data5y.2v$pred

# Complementary log-log transform of the predicted risk; it is the only
# covariate of the secondary calibration model fitted below.
data5y.2v$risk.cll <- log(-log(1 - data5y.2v$risk))

# Check location of knots again: latest and earliest event time
# (cens == 1 marks an event).
# NOTE(review): the hard-coded boundary knots below look like the logs of these
# two values -- re-check them whenever the data change.
data5y.2v %>%
  filter(cens == 1) %>%
  summarise(max(year), min(year))

# Estimate actual risk: secondary flexsurvspline model (one internal knot,
# odds scale) of the outcome on the cloglog-transformed predicted risk.
vcal.2.RP <- flexsurvspline(
  Surv(year, cens) ~ risk.cll,
  data = data5y.2v,
  k = 1,
  bknots = c(log(1.601643), log(3.827515)),
  scale = "odds"
)

# Alternative kept for reference (restricted cubic spline of risk.cll):
#vcal.2.RP <- flexsurvspline(Surv(year, cens) ~ rcs(risk.cll,3), data = data5y.2v, k=1,bknots = c(log(1.601643),log(3.827515)),scale = "odds")


# Survival at t = 4 from the refitted model. predict() is called ONCE with
# conf.int = TRUE so that the estimate and both limits come from the same call;
# flexsurv obtains the limits by simulation, so separate calls would each use
# their own random draws. Columns are used by position as in the original code:
# 2 = survival estimate, 3 = lower limit, 4 = upper limit.
surv_cal.2.RP <- predict(
  vcal.2.RP,
  type = "survival",
  conf.int = TRUE,
  times = 4,
  newdata = data5y.2v
)

# Convert survival to risk (1 - S(4)); the limits swap because of the
# subtraction.
dat_cal.2.RP <- cbind.data.frame(
  "obs" = 1 - surv_cal.2.RP[[2]],   # risk from the refitted model
  "lower" = 1 - surv_cal.2.RP[[4]],
  "upper" = 1 - surv_cal.2.RP[[3]],
  "pred" = data5y.2v$risk           # risk from the developed model
)


# Sort by predicted risk so the calibration curve is drawn left to right.
dat_cal.2.RP <- dat_cal.2.RP[order(dat_cal.2.RP$pred), ]

#png("ICICLE-4y.png",width = 3500,height =2000,res = 400)
par(xaxs = "i", yaxs = "i", las = 1)
plot(
  x = dat_cal.2.RP$pred, # predicted risk from the developed model
  y = dat_cal.2.RP$obs,  # risk estimated by the refitted model
  type = "l",
  lty = 1,
  xlim = c(0, 1),
  ylim = c(0, 1),
  lwd = 2,
  xlab = "Predicted risk from developed model",
  ylab = "Predicted risk from refitted model",
  main = "Validation dataset is ICICLE",
  bty = "n" # no box
)

# Dashed confidence limits around the calibration curve.
lines(dat_cal.2.RP$pred,
      dat_cal.2.RP$lower,
      type = "l",
      lty = 2,
      lwd = 2)
lines(dat_cal.2.RP$pred,
      dat_cal.2.RP$upper,
      type = "l",
      lty = 2,
      lwd = 2)
# Diagonal reference line: perfect agreement between the two risks.
abline(0, 1, lwd = 2, lty = 2, col = 2)
legend("topleft",
        c("Ideal calibration",
          "Calibration curve based on secondary Royston-Parmar model",
          "95% confidence interval"),
        col = c(2, 1, 1),
        lty = c(2, 1, 2),
        lwd = c(2, 2, 2),
        bty = "n",
        cex = 0.8)
#dev.off()

# Numerical measures: absolute difference between the developed-model risk and
# the refitted-model risk, summarised as mean (ICI), median (E50),
# 90th percentile (E90) and maximum (Emax).
absdiff_cph.2.RP <- abs(dat_cal.2.RP$pred - dat_cal.2.RP$obs)

numsum_cph.2.RP <- c(
  "ICI" = mean(absdiff_cph.2.RP),
  setNames(quantile(absdiff_cph.2.RP, c(0.5, 0.9)), c("E50", "E90")),
  "Emax" = max(absdiff_cph.2.RP)
)
numsum_cph.2.RP



#---leave NYPUM out----

# predicted risk
# Risk is 1 - S(t|X), the probability of the event; data5y.3v$pred is taken to
# be the survival probability from the developed model -- TODO confirm.
data5y.3v$risk <- 1 - data5y.3v$pred

# Complementary log-log transform of the predicted risk; it is the only
# covariate of the secondary calibration model fitted below.
data5y.3v$risk.cll <- log(-log(1 - data5y.3v$risk))

# Check location of knots again: latest and earliest event time
# (cens == 1 marks an event).
# NOTE(review): the hard-coded boundary knots below look like the logs of these
# two values -- re-check them whenever the data change.
data5y.3v %>%
  filter(cens == 1) %>%
  summarise(max(year), min(year))

# Estimate actual risk: secondary flexsurvspline model (one internal knot,
# odds scale) of the outcome on the cloglog-transformed predicted risk.
vcal.3.RP <- flexsurvspline(
  Surv(year, cens) ~ risk.cll,
  data = data5y.3v,
  k = 1,
  bknots = c(log(0.8925394), log(3.789185)),
  scale = "odds"
)

# Alternative kept for reference (restricted cubic spline of risk.cll):
#vcal.3.RP <- flexsurvspline(Surv(year, cens) ~ rcs(risk.cll,3), data = data5y.3v, k=1,bknots = c(log(0.8925394),log(3.789185)),scale = "odds")

# Survival at t = 4 from the refitted model. predict() is called ONCE with
# conf.int = TRUE so that the estimate and both limits come from the same call;
# flexsurv obtains the limits by simulation, so separate calls would each use
# their own random draws. Columns are used by position as in the original code:
# 2 = survival estimate, 3 = lower limit, 4 = upper limit.
surv_cal.3.RP <- predict(
  vcal.3.RP,
  type = "survival",
  conf.int = TRUE,
  times = 4,
  newdata = data5y.3v
)

# Convert survival to risk (1 - S(4)); the limits swap because of the
# subtraction.
dat_cal.3.RP <- cbind.data.frame(
  "obs" = 1 - surv_cal.3.RP[[2]],   # risk from the refitted model
  "lower" = 1 - surv_cal.3.RP[[4]],
  "upper" = 1 - surv_cal.3.RP[[3]],
  "pred" = data5y.3v$risk           # risk from the developed model
)


# Sort by predicted risk so the calibration curve is drawn left to right.
dat_cal.3.RP <- dat_cal.3.RP[order(dat_cal.3.RP$pred), ]

#png("NYPUM-4y.png",width = 3500,height =2000,res = 400)
par(xaxs = "i", yaxs = "i", las = 1)
plot(
  x = dat_cal.3.RP$pred, # predicted risk from the developed model
  y = dat_cal.3.RP$obs,  # risk estimated by the refitted model
  type = "l",
  lty = 1,
  xlim = c(0, 1),
  ylim = c(0, 1),
  lwd = 2,
  xlab = "Predicted risk from developed model",
  ylab = "Predicted risk from refitted model",
  main = "Validation dataset is NYPUM",
  bty = "n" # no box
)

# Dashed confidence limits around the calibration curve.
lines(dat_cal.3.RP$pred,
      dat_cal.3.RP$lower,
      type = "l",
      lty = 2,
      lwd = 2)
lines(dat_cal.3.RP$pred,
      dat_cal.3.RP$upper,
      type = "l",
      lty = 2,
      lwd = 2)
# Diagonal reference line: perfect agreement between the two risks.
abline(0, 1, lwd = 2, lty = 2, col = 2)
legend("bottomright",
        c("Ideal calibration",
          "Calibration curve based on secondary Royston-Parmar model",
          "95% confidence interval"),
        col = c(2, 1, 1),
        lty = c(2, 1, 2),
        lwd = c(2, 2, 2),
        bty = "n",
        cex = 0.85)
#dev.off()


# Numerical measures: absolute difference between the developed-model risk and
# the refitted-model risk, summarised as mean (ICI), median (E50),
# 90th percentile (E90) and maximum (Emax).
absdiff_cph.3.RP <- abs(dat_cal.3.RP$pred - dat_cal.3.RP$obs)

numsum_cph.3.RP <- c(
  "ICI" = mean(absdiff_cph.3.RP),
  setNames(quantile(absdiff_cph.3.RP, c(0.5, 0.9)), c("E50", "E90")),
  "Emax" = max(absdiff_cph.3.RP)
)
numsum_cph.3.RP



#---leave ParkWest out----

# predicted risk
# Risk is 1 - S(t|X), the probability of the event; data5y.4v$pred is taken to
# be the survival probability from the developed model -- TODO confirm.
data5y.4v$risk <- 1 - data5y.4v$pred

# Complementary log-log transform of the predicted risk; it is the only
# covariate of the secondary calibration model fitted below.
data5y.4v$risk.cll <- log(-log(1 - data5y.4v$risk))

# Check location of knots again: latest and earliest event time
# (cens == 1 marks an event).
# NOTE(review): the hard-coded boundary knots below look like the logs of these
# two values -- re-check them whenever the data change.
data5y.4v %>%
  filter(cens == 1) %>%
  summarise(max(year), min(year))

# Estimate actual risk: secondary flexsurvspline model (one internal knot,
# odds scale) of the outcome on the cloglog-transformed predicted risk.
vcal.4.RP <- flexsurvspline(
  Surv(year, cens) ~ risk.cll,
  data = data5y.4v,
  k = 1,
  bknots = c(log(1.990418), log(3.811088)),
  scale = "odds"
)

# Alternative kept for reference (restricted cubic spline of risk.cll):
#vcal.4.RP <- flexsurvspline(Surv(year, cens) ~ rcs(risk.cll,3), data = data5y.4v, k=1,bknots = c(log(1.990418),log(3.811088)),scale = "odds")

# Survival at t = 4 from the refitted model. predict() is called ONCE with
# conf.int = TRUE so that the estimate and both limits come from the same call;
# flexsurv obtains the limits by simulation, so separate calls would each use
# their own random draws. Columns are used by position as in the original code:
# 2 = survival estimate, 3 = lower limit, 4 = upper limit.
surv_cal.4.RP <- predict(
  vcal.4.RP,
  type = "survival",
  conf.int = TRUE,
  times = 4,
  newdata = data5y.4v
)

# Convert survival to risk (1 - S(4)); the limits swap because of the
# subtraction.
dat_cal.4.RP <- cbind.data.frame(
  "obs" = 1 - surv_cal.4.RP[[2]],   # risk from the refitted model
  "lower" = 1 - surv_cal.4.RP[[4]],
  "upper" = 1 - surv_cal.4.RP[[3]],
  "pred" = data5y.4v$risk           # risk from the developed model
)


# Sort by predicted risk so the calibration curve is drawn left to right.
dat_cal.4.RP <- dat_cal.4.RP[order(dat_cal.4.RP$pred), ]

#png("ParkWest-4y.png",width = 3500,height =2000,res = 400)
par(xaxs = "i", yaxs = "i", las = 1)
plot(
  x = dat_cal.4.RP$pred, # predicted risk from the developed model
  y = dat_cal.4.RP$obs,  # risk estimated by the refitted model
  type = "l",
  lty = 1,
  xlim = c(0, 1),
  ylim = c(0, 1),
  lwd = 2,
  xlab = "Predicted risk from developed model",
  ylab = "Predicted risk from refitted model",
  main = "Validation dataset is ParkWest",
  bty = "n" # no box
)

# Dashed confidence limits around the calibration curve.
lines(dat_cal.4.RP$pred,
      dat_cal.4.RP$lower,
      type = "l",
      lty = 2,
      lwd = 2)
lines(dat_cal.4.RP$pred,
      dat_cal.4.RP$upper,
      type = "l",
      lty = 2,
      lwd = 2)
# Diagonal reference line: perfect agreement between the two risks.
abline(0, 1, lwd = 2, lty = 2, col = 2)
legend("topleft",
        c("Ideal calibration",
          "Calibration curve based on secondary Royston-Parmar model",
          "95% confidence interval"),
        col = c(2, 1, 1),
        lty = c(2, 1, 2),
        lwd = c(2, 2, 2),
        bty = "n",
        cex = 0.85)

#dev.off()

# Numerical measures: absolute difference between the developed-model risk and
# the refitted-model risk, summarised as mean (ICI), median (E50),
# 90th percentile (E90) and maximum (Emax).
absdiff_cph.4.RP <- abs(dat_cal.4.RP$pred - dat_cal.4.RP$obs)

numsum_cph.4.RP <- c(
  "ICI" = mean(absdiff_cph.4.RP),
  setNames(quantile(absdiff_cph.4.RP, c(0.5, 0.9)), c("E50", "E90")),
  "Emax" = max(absdiff_cph.4.RP)
)
numsum_cph.4.RP


#-----leave PICNICS out------

# predicted risk
# Risk is 1 - S(t|X), the probability of the event; data5y.5v$pred is taken to
# be the survival probability from the developed model -- TODO confirm.
data5y.5v$risk <- 1 - data5y.5v$pred

# Complementary log-log transform of the predicted risk; it is the covariate
# of the secondary calibration model fitted below.
data5y.5v$risk.cll <- log(-log(1 - data5y.5v$risk))

# Check location of knots again: latest and earliest event time
# (cens == 1 marks an event).
# NOTE(review): the hard-coded boundary knots below look like the logs of these
# two values -- re-check them whenever the data change.
data5y.5v %>%
  filter(cens == 1) %>%
  summarise(max(year), min(year))

# Linear-in-risk.cll alternative kept for reference:
#vcal.5.RP <- flexsurvspline(Surv(year, cens) ~ risk.cll, data = data5y.5v, k=1,bknots = c(log(0.1067762),log(3.8987)),scale = "odds")

# Estimate actual risk: secondary flexsurvspline model (one internal knot,
# odds scale). Unlike the linear alternative above, this cohort enters
# risk.cll through rcs(risk.cll, 3).
vcal.5.RP <- flexsurvspline(
  Surv(year, cens) ~ rcs(risk.cll, 3),
  data = data5y.5v,
  k = 1,
  bknots = c(log(0.1067762), log(3.8987)),
  scale = "odds"
)

# Survival at t = 4 from the refitted model. predict() is called ONCE with
# conf.int = TRUE so that the estimate and both limits come from the same call;
# flexsurv obtains the limits by simulation, so separate calls would each use
# their own random draws. Columns are used by position as in the original code:
# 2 = survival estimate, 3 = lower limit, 4 = upper limit.
surv_cal.5.RP <- predict(
  vcal.5.RP,
  type = "survival",
  conf.int = TRUE,
  times = 4,
  newdata = data5y.5v
)

# Convert survival to risk (1 - S(4)); the limits swap because of the
# subtraction.
dat_cal.5.RP <- cbind.data.frame(
  "obs" = 1 - surv_cal.5.RP[[2]],   # risk from the refitted model
  "lower" = 1 - surv_cal.5.RP[[4]],
  "upper" = 1 - surv_cal.5.RP[[3]],
  "pred" = data5y.5v$risk           # risk from the developed model
)


# Sort by predicted risk so the calibration curve is drawn left to right.
dat_cal.5.RP <- dat_cal.5.RP[order(dat_cal.5.RP$pred), ]

#png("PICNICS-4y.png",width = 3500,height =2000,res = 400)
par(xaxs = "i", yaxs = "i", las = 1)
plot(
  x = dat_cal.5.RP$pred, # predicted risk from the developed model
  y = dat_cal.5.RP$obs,  # risk estimated by the refitted model
  type = "l",
  lty = 1,
  xlim = c(0, 1),
  ylim = c(0, 1),
  lwd = 2,
  xlab = "Predicted risk from developed model",
  ylab = "Predicted risk from refitted model",
  main = "Validation dataset is PICNICS",
  bty = "n" # no box
)

# Dashed confidence limits around the calibration curve.
lines(dat_cal.5.RP$pred,
      dat_cal.5.RP$lower,
      type = "l",
      lty = 2,
      lwd = 2)
lines(dat_cal.5.RP$pred,
      dat_cal.5.RP$upper,
      type = "l",
      lty = 2,
      lwd = 2)
# Diagonal reference line: perfect agreement between the two risks.
abline(0, 1, lwd = 2, lty = 2, col = 2)
legend("bottomright",
        c("Ideal calibration",
          "Calibration curve based on secondary Royston-Parmar model",
          "95% confidence interval"),
        col = c(2, 1, 1),
        lty = c(2, 1, 2),
        lwd = c(2, 2, 2),
        bty = "n",
        cex = 0.8)
#dev.off()

# Numerical measures: absolute difference between the developed-model risk and
# the refitted-model risk, summarised as mean (ICI), median (E50),
# 90th percentile (E90) and maximum (Emax).
absdiff_cph.5.RP <- abs(dat_cal.5.RP$pred - dat_cal.5.RP$obs)

numsum_cph.5.RP <- c(
  "ICI" = mean(absdiff_cph.5.RP),
  setNames(quantile(absdiff_cph.5.RP, c(0.5, 0.9)), c("E50", "E90")),
  "Emax" = max(absdiff_cph.5.RP)
)
numsum_cph.5.RP


#----leave PINE out----


# predicted risk
# Risk is 1 - S(t|X), the probability of the event; data5y.6v$pred is taken to
# be the survival probability from the developed model -- TODO confirm.
data5y.6v$risk <- 1 - data5y.6v$pred

# Complementary log-log transform of the predicted risk; it is the covariate
# of the secondary calibration model fitted below.
data5y.6v$risk.cll <- log(-log(1 - data5y.6v$risk))

# Check location of knots again: latest and earliest event time
# (cens == 1 marks an event).
# NOTE(review): the hard-coded boundary knots below look like the logs of these
# two values -- re-check them whenever the data change.
data5y.6v %>%
  filter(cens == 1) %>%
  summarise(max(year), min(year))


# Linear-in-risk.cll alternative kept for reference:
#vcal.6.RP <- flexsurvspline(Surv(year, cens) ~ risk.cll, data = data5y.6v, k=1,bknots = c(log(0.5557837),log(3.978097)),scale = "odds")

# Estimate actual risk: secondary flexsurvspline model (one internal knot,
# odds scale). Unlike the linear alternative above, this cohort enters
# risk.cll through rcs(risk.cll, 3).
vcal.6.RP <- flexsurvspline(
  Surv(year, cens) ~ rcs(risk.cll, 3),
  data = data5y.6v,
  k = 1,
  bknots = c(log(0.5557837), log(3.978097)),
  scale = "odds"
)

# Survival at t = 4 from the refitted model. predict() is called ONCE with
# conf.int = TRUE so that the estimate and both limits come from the same call;
# flexsurv obtains the limits by simulation, so separate calls would each use
# their own random draws. Columns are used by position as in the original code:
# 2 = survival estimate, 3 = lower limit, 4 = upper limit.
surv_cal.6.RP <- predict(
  vcal.6.RP,
  type = "survival",
  conf.int = TRUE,
  times = 4,
  newdata = data5y.6v
)

# Convert survival to risk (1 - S(4)); the limits swap because of the
# subtraction.
dat_cal.6.RP <- cbind.data.frame(
  "obs" = 1 - surv_cal.6.RP[[2]],   # risk from the refitted model
  "lower" = 1 - surv_cal.6.RP[[4]],
  "upper" = 1 - surv_cal.6.RP[[3]],
  "pred" = data5y.6v$risk           # risk from the developed model
)


# Sort by predicted risk so the calibration curve is drawn left to right.
dat_cal.6.RP <- dat_cal.6.RP[order(dat_cal.6.RP$pred), ]

#png("PINE-4y.png",width = 3500,height =2000,res = 400)
par(xaxs = "i", yaxs = "i", las = 1)
plot(
  x = dat_cal.6.RP$pred, # predicted risk from the developed model
  y = dat_cal.6.RP$obs,  # risk estimated by the refitted model
  type = "l",
  lty = 1,
  xlim = c(0, 1),
  ylim = c(0, 1),
  lwd = 2,
  xlab = "Predicted risk from developed model",
  ylab = "Predicted risk from refitted model",
  main = "Validation dataset is PINE",
  bty = "n" # no box
)

# Dashed confidence limits around the calibration curve.
lines(dat_cal.6.RP$pred,
      dat_cal.6.RP$lower,
      type = "l",
      lty = 2,
      lwd = 2)
lines(dat_cal.6.RP$pred,
      dat_cal.6.RP$upper,
      type = "l",
      lty = 2,
      lwd = 2)
# Diagonal reference line: perfect agreement between the two risks.
abline(0, 1, lwd = 2, lty = 2, col = 2)
legend("topleft",
        c("Ideal calibration",
          "Calibration curve based on secondary Royston-Parmar model",
          "95% confidence interval"),
        col = c(2, 1, 1),
        lty = c(2, 1, 2),
        lwd = c(2, 2, 2),
        bty = "n",
        cex = 0.8)

#dev.off()

# Numerical measures: absolute difference between the developed-model risk and
# the refitted-model risk, summarised as mean (ICI), median (E50),
# 90th percentile (E90) and maximum (Emax).
absdiff_cph.6.RP <- abs(dat_cal.6.RP$pred - dat_cal.6.RP$obs)

numsum_cph.6.RP <- c(
  "ICI" = mean(absdiff_cph.6.RP),
  setNames(quantile(absdiff_cph.6.RP, c(0.5, 0.9)), c("E50", "E90")),
  "Emax" = max(absdiff_cph.6.RP)
)
numsum_cph.6.RP
```


```{r recalibration}


#----CamPalGN----

#Try t=4

# Earlier attempt using the knots of model1, kept for reference:
#rcs.mod1<-as.data.frame(basis(model1$knots,log(4))) #basis spline
#data5y.1v$rcs1<-rcs.mod1[,2]
#data5y.1v$rcs2<-rcs.mod1[,3]
#data5y.1v$pred.re<-1/(1+exp(-17.5665135+0.9822*log(4)-0.4243082*data5y.1v$rcs2+data5y.1v$lp.1))
#data5y.1v$risk.re<-1-data5y.1v$pred.re

#model1

#This is right!!!! Finally!!! t=4 is right

#Adding intercept and changing the knots location

# Refit on data5y.1 with the boundary knots moved to the hard-coded positions
# (the same values used for the calibration model of this cohort).
model1.re <- flexsurvspline(
  Surv(year, cens) ~ age10 + sex + mdsupdrs3.10 + hybl,
  data = data5y.1,
  k = 1,
  bknots = c(log(1.147159), log(3.761807)),
  scale = "odds"
)

# Spline basis of the refitted model evaluated at log(4). The result has one
# row, so columns 2 and 3 are recycled onto every row of the validation data.
rcs.mod1 <- as.data.frame(basis(model1.re$knots, log(4)))
data5y.1v$rcs1 <- rcs.mod1[, 2]
data5y.1v$rcs2 <- rcs.mod1[, 3]

log((1 - obj.1$surv) / mean(1 - data5y.1v$pred)) # ln mean O/E risk ratio

model1.re$res[c("gamma0", "gamma1", "gamma2"), "est"]

#data5y.1v$lp.1 <- as.vector(as.matrix(des_matr1) %*% cbind(coef1))

#data5y.1v$lp.1 <- data5y.1v$lp.1

# Recalibrated survival at t = 4: new knot positions and s(ln t; gamma) plus an
# intercept shift, with the linear predictor lp.1 kept unchanged.
# NOTE(review): the four constants look like gamma0, ln(O/E), gamma1 and gamma2
# copied by hand from the two values printed above -- confirm, and update them
# whenever the data or the model change.
data5y.1v$pred.re <- 1 / (1 + exp(
  -16.937753 - 0.9346098 +
    1.575655 * data5y.1v$rcs1 -
    1.286890 * data5y.1v$rcs2 +
    data5y.1v$lp.1
))

(1 - obj.1$surv) / mean(1 - data5y.1v$pred.re) # Mean calibration improve

#Before is (1-obj.1$surv)/mean(1-data5y.1v$pred), 0.3927391, now is 0.8493298

# Recalibrated risk = 1 - recalibrated survival.
data5y.1v$risk.re <- 1 - data5y.1v$pred.re

# Survival at t = 4 from the secondary calibration model vcal.1.RP.
# predict() is called ONCE with conf.int = TRUE so that the estimate and both
# limits come from the same call; flexsurv obtains the limits by simulation, so
# separate calls would each use their own random draws. Columns by position:
# 2 = survival estimate, 3 = lower limit, 4 = upper limit.
surv_cal.1.RP.re <- predict(
  vcal.1.RP,
  type = "survival",
  conf.int = TRUE,
  times = 4,
  newdata = data5y.1v
)

# Convert survival to risk (1 - S(4)); the limits swap because of the
# subtraction. "pred" is now the recalibrated risk.
dat_cal.1.RP.re <- cbind.data.frame(
  "obs" = 1 - surv_cal.1.RP.re[[2]],
  "lower" = 1 - surv_cal.1.RP.re[[4]],
  "upper" = 1 - surv_cal.1.RP.re[[3]],
  "pred" = data5y.1v$risk.re
)


# Sort by recalibrated risk so the curve is drawn left to right.
dat_cal.1.RP.re <- dat_cal.1.RP.re[order(dat_cal.1.RP.re$pred), ]

par(xaxs = "i", yaxs = "i", las = 1)
plot(
  x = dat_cal.1.RP.re$pred, # recalibrated predicted risk
  y = dat_cal.1.RP.re$obs,  # risk estimated by the refitted model
  type = "l",
  lty = 1,
  xlim = c(0, 1),
  ylim = c(0, 1),
  lwd = 2,
  xlab = "Predicted risk from developed model",
  ylab = "Predicted risk from refitted model",
  bty = "n" # no box
)

# Dashed confidence limits around the calibration curve.
lines(dat_cal.1.RP.re$pred,
      dat_cal.1.RP.re$lower,
      type = "l",
      lty = 2,
      lwd = 2)
lines(dat_cal.1.RP.re$pred,
      dat_cal.1.RP.re$upper,
      type = "l",
      lty = 2,
      lwd = 2)
# Diagonal reference line: perfect agreement between the two risks.
abline(0, 1, lwd = 2, lty = 2, col = 2)
legend("bottomright",
        c("Ideal calibration",
          "Calibration curve based on secondary Royston-Parmar model",
          "95% confidence interval"),
        col = c(2, 1, 1),
        lty = c(2, 1, 2),
        lwd = c(2, 2, 2),
        bty = "n",
        cex = 0.85)

# Numerical measures: absolute difference between the recalibrated risk and
# the refitted-model risk, summarised as mean (ICI), median (E50),
# 90th percentile (E90) and maximum (Emax).
absdiff_cph.1.RP.re <- abs(dat_cal.1.RP.re$pred - dat_cal.1.RP.re$obs)

numsum_cph.1.RP.re <- c(
  "ICI" = mean(absdiff_cph.1.RP.re),
  setNames(quantile(absdiff_cph.1.RP.re, c(0.5, 0.9)), c("E50", "E90")),
  "Emax" = max(absdiff_cph.1.RP.re)
)
numsum_cph.1.RP.re


#-----ICICLE-----

# Refit on data5y.2 with the boundary knots moved to the hard-coded positions
# (the same values used for the calibration model of this cohort).
model2.re <- flexsurvspline(
  Surv(year, cens) ~ age10 + sex + mdsupdrs3.10 + hybl,
  data = data5y.2,
  k = 1,
  bknots = c(log(1.601643), log(3.827515)),
  scale = "odds"
)

# Spline basis of the refitted model evaluated at log(4). The result has one
# row, so columns 2 and 3 are recycled onto every row of the validation data.
rcs.mod2 <- as.data.frame(basis(model2.re$knots, log(4)))
data5y.2v$rcs1 <- rcs.mod2[, 2]
data5y.2v$rcs2 <- rcs.mod2[, 3]

log((1 - obj.2$surv) / mean(1 - data5y.2v$pred)) # ln mean O/E risk ratio

model2.re$res[c("gamma0", "gamma1", "gamma2"), "est"]

# Recalibrated survival at t = 4: new knot positions and s(ln t; gamma) plus an
# intercept shift, with the linear predictor lp.2 kept unchanged.
# NOTE(review): the four constants look like gamma0, ln(O/E), gamma1 and gamma2
# copied by hand from the two values printed above -- confirm, and update them
# whenever the data or the model change.
data5y.2v$pred.re <- 1 / (1 + exp(
  -16.175303 + 0.03743414 +
    1.611698 * data5y.2v$rcs1 -
    2.118458 * data5y.2v$rcs2 +
    data5y.2v$lp.2
))

(1 - obj.2$surv) / mean(1 - data5y.2v$pred.re) # Mean calibration improve
#(1-obj.2$surv)/mean(1-data5y.2v$pred) improve 1.04 to 1.01

# Recalibrated risk = 1 - recalibrated survival.
data5y.2v$risk.re <- 1 - data5y.2v$pred.re

# Survival at t = 4 from the secondary calibration model vcal.2.RP.
# predict() is called ONCE with conf.int = TRUE so that the estimate and both
# limits come from the same call; flexsurv obtains the limits by simulation, so
# separate calls would each use their own random draws. Columns by position:
# 2 = survival estimate, 3 = lower limit, 4 = upper limit.
surv_cal.2.RP.re <- predict(
  vcal.2.RP,
  type = "survival",
  conf.int = TRUE,
  times = 4,
  newdata = data5y.2v
)

# Convert survival to risk (1 - S(4)); the limits swap because of the
# subtraction. "pred" is now the recalibrated risk.
dat_cal.2.RP.re <- cbind.data.frame(
  "obs" = 1 - surv_cal.2.RP.re[[2]],
  "lower" = 1 - surv_cal.2.RP.re[[4]],
  "upper" = 1 - surv_cal.2.RP.re[[3]],
  "pred" = data5y.2v$risk.re
)


# Sort by recalibrated risk so the curve is drawn left to right.
dat_cal.2.RP.re <- dat_cal.2.RP.re[order(dat_cal.2.RP.re$pred), ]

#png("ICICLE-4y-re1.png",width = 3500,height =2000,res = 400)
par(xaxs = "i", yaxs = "i", las = 1)
plot(
  x = dat_cal.2.RP.re$pred, # recalibrated predicted risk
  y = dat_cal.2.RP.re$obs,  # risk estimated by the refitted model
  type = "l",
  lty = 1,
  xlim = c(0, 1),
  ylim = c(0, 1),
  lwd = 2,
  xlab = "Predicted risk from development model with new boundary knots position and intercept",
  ylab = "Predicted risk from refitted model",
  main = "Validation dataset is ICICLE",
  bty = "n" # no box
)

# Dashed confidence limits around the calibration curve.
lines(dat_cal.2.RP.re$pred,
      dat_cal.2.RP.re$lower,
      type = "l",
      lty = 2,
      lwd = 2)
lines(dat_cal.2.RP.re$pred,
      dat_cal.2.RP.re$upper,
      type = "l",
      lty = 2,
      lwd = 2)
# Diagonal reference line: perfect agreement between the two risks.
abline(0, 1, lwd = 2, lty = 2, col = 2)
legend("bottomright",
        c("Ideal calibration",
          "Calibration curve based on secondary Royston-Parmar model",
          "95% confidence interval"),
        col = c(2, 1, 1),
        lty = c(2, 1, 2),
        lwd = c(2, 2, 2),
        bty = "n",
        cex = 0.85)
#dev.off()

# Numerical measures: absolute difference between the recalibrated risk and
# the refitted-model risk, summarised as mean (ICI), median (E50),
# 90th percentile (E90) and maximum (Emax).
absdiff_cph.2.RP.re <- abs(dat_cal.2.RP.re$pred - dat_cal.2.RP.re$obs)

numsum_cph.2.RP.re <- c(
  "ICI" = mean(absdiff_cph.2.RP.re),
  setNames(quantile(absdiff_cph.2.RP.re, c(0.5, 0.9)), c("E50", "E90")),
  "Emax" = max(absdiff_cph.2.RP.re)
)
numsum_cph.2.RP.re

#-----NYPUM----

# Refit on data5y.3 with the boundary knots moved to the hard-coded positions
# (the same values used for the calibration model of this cohort).
model3.re <- flexsurvspline(
  Surv(year, cens) ~ age10 + sex + mdsupdrs3.10 + hybl,
  data = data5y.3,
  k = 1,
  bknots = c(log(0.8925394), log(3.789185)),
  scale = "odds"
)

# Spline basis of the refitted model evaluated at log(4). The result has one
# row, so columns 2 and 3 are recycled onto every row of the validation data.
rcs.mod3 <- as.data.frame(basis(model3.re$knots, log(4)))
data5y.3v$rcs1 <- rcs.mod3[, 2]
data5y.3v$rcs2 <- rcs.mod3[, 3]

log((1 - obj.3$surv) / mean(1 - data5y.3v$pred)) # ln mean O/E risk ratio

model3.re$res[c("gamma0", "gamma1", "gamma2"), "est"]

# Recalibrated survival at t = 4: new knot positions and s(ln t; gamma) plus an
# intercept shift, with the linear predictor lp.3 kept unchanged.
# NOTE(review): the four constants look like gamma0, ln(O/E), gamma1 and gamma2
# copied by hand from the two values printed above -- confirm, and update them
# whenever the data or the model change.
data5y.3v$pred.re <- 1 / (1 + exp(
  -15.773117 + 0.03063814 +
    1.286205 * data5y.3v$rcs1 -
    1.578305 * data5y.3v$rcs2 +
    data5y.3v$lp.3
))

(1 - obj.3$surv) / mean(1 - data5y.3v$pred.re) # Mean calibration improve
#(1-obj.3$surv)/mean(1-data5y.3v$pred) improve 1.03 to 1.03 not change

# Recalibrated risk = 1 - recalibrated survival.
data5y.3v$risk.re <- 1 - data5y.3v$pred.re

# Survival at t = 4 from the secondary calibration model vcal.3.RP.
# predict() is called ONCE with conf.int = TRUE so that the estimate and both
# limits come from the same call; flexsurv obtains the limits by simulation, so
# separate calls would each use their own random draws. Columns by position:
# 2 = survival estimate, 3 = lower limit, 4 = upper limit.
surv_cal.3.RP.re <- predict(
  vcal.3.RP,
  type = "survival",
  conf.int = TRUE,
  times = 4,
  newdata = data5y.3v
)

# Convert survival to risk (1 - S(4)); the limits swap because of the
# subtraction. "pred" is now the recalibrated risk.
dat_cal.3.RP.re <- cbind.data.frame(
  "obs" = 1 - surv_cal.3.RP.re[[2]],
  "lower" = 1 - surv_cal.3.RP.re[[4]],
  "upper" = 1 - surv_cal.3.RP.re[[3]],
  "pred" = data5y.3v$risk.re
)


# Sort by recalibrated risk so the curve is drawn left to right.
dat_cal.3.RP.re <- dat_cal.3.RP.re[order(dat_cal.3.RP.re$pred), ]

par(xaxs = "i", yaxs = "i", las = 1)
plot(
  x = dat_cal.3.RP.re$pred, # recalibrated predicted risk
  y = dat_cal.3.RP.re$obs,  # risk estimated by the refitted model
  type = "l",
  lty = 1,
  xlim = c(0, 1),
  ylim = c(0, 1),
  lwd = 2,
  xlab = "Predicted risk from developed model",
  ylab = "Predicted risk from refitted model",
  bty = "n" # no box
)

# Dashed confidence limits around the calibration curve.
lines(dat_cal.3.RP.re$pred,
      dat_cal.3.RP.re$lower,
      type = "l",
      lty = 2,
      lwd = 2)
lines(dat_cal.3.RP.re$pred,
      dat_cal.3.RP.re$upper,
      type = "l",
      lty = 2,
      lwd = 2)
# Diagonal reference line: perfect agreement between the two risks.
abline(0, 1, lwd = 2, lty = 2, col = 2)
legend("bottomright",
        c("Ideal calibration",
          "Calibration curve based on secondary Royston-Parmar model",
          "95% confidence interval"),
        col = c(2, 1, 1),
        lty = c(2, 1, 2),
        lwd = c(2, 2, 2),
        bty = "n",
        cex = 0.85)

# Numerical measures: absolute difference between the recalibrated risk and
# the refitted-model risk, summarised as mean (ICI), median (E50),
# 90th percentile (E90) and maximum (Emax).
absdiff_cph.3.RP.re <- abs(dat_cal.3.RP.re$pred - dat_cal.3.RP.re$obs)

numsum_cph.3.RP.re <- c(
  "ICI" = mean(absdiff_cph.3.RP.re),
  setNames(quantile(absdiff_cph.3.RP.re, c(0.5, 0.9)), c("E50", "E90")),
  "Emax" = max(absdiff_cph.3.RP.re)
)
numsum_cph.3.RP.re

#-----ParkWest-----

# Refit on data5y.4 with the boundary knots moved to the hard-coded positions
# (the same values used for the calibration model of this cohort).
model4.re <- flexsurvspline(
  Surv(year, cens) ~ age10 + sex + mdsupdrs3.10 + hybl,
  data = data5y.4,
  k = 1,
  bknots = c(log(1.990418), log(3.811088)),
  scale = "odds"
)

# Spline basis of the refitted model evaluated at log(4). The result has one
# row, so columns 2 and 3 are recycled onto every row of the validation data.
rcs.mod4 <- as.data.frame(basis(model4.re$knots, log(4)))
data5y.4v$rcs1 <- rcs.mod4[, 2]
data5y.4v$rcs2 <- rcs.mod4[, 3]

log((1 - obj.4$surv) / mean(1 - data5y.4v$pred)) # ln mean O/E risk ratio

model4.re$res[c("gamma0", "gamma1", "gamma2"), "est"]

# Recalibrated survival at t = 4: new knot positions and s(ln t; gamma) plus an
# intercept shift, with the linear predictor lp.4 kept unchanged.
# NOTE(review): the four constants look like gamma0, ln(O/E), gamma1 and gamma2
# copied by hand from the two values printed above -- confirm, and update them
# whenever the data or the model change.
data5y.4v$pred.re <- 1 / (1 + exp(
  -16.463195 - 0.4890715 +
    1.718182 * data5y.4v$rcs1 -
    2.387994 * data5y.4v$rcs2 +
    data5y.4v$lp.4
))

(1 - obj.4$surv) / mean(1 - data5y.4v$pred.re) # Mean calibration improve
#(1-obj.4$surv)/mean(1-data5y.4v$pred) improve 0.61 to 0.93

# Recalibrated risk = 1 - recalibrated survival.
data5y.4v$risk.re <- 1 - data5y.4v$pred.re

# Survival at t = 4 from the secondary calibration model vcal.4.RP.
# predict() is called ONCE with conf.int = TRUE so that the estimate and both
# limits come from the same call; flexsurv obtains the limits by simulation, so
# separate calls would each use their own random draws. Columns by position:
# 2 = survival estimate, 3 = lower limit, 4 = upper limit.
surv_cal.4.RP.re <- predict(
  vcal.4.RP,
  type = "survival",
  conf.int = TRUE,
  times = 4,
  newdata = data5y.4v
)

# Convert survival to risk (1 - S(4)); the limits swap because of the
# subtraction. "pred" is now the recalibrated risk.
dat_cal.4.RP.re <- cbind.data.frame(
  "obs" = 1 - surv_cal.4.RP.re[[2]],
  "lower" = 1 - surv_cal.4.RP.re[[4]],
  "upper" = 1 - surv_cal.4.RP.re[[3]],
  "pred" = data5y.4v$risk.re
)


# Sort by recalibrated risk so the curve is drawn left to right.
dat_cal.4.RP.re <- dat_cal.4.RP.re[order(dat_cal.4.RP.re$pred), ]

par(xaxs = "i", yaxs = "i", las = 1)
plot(
  x = dat_cal.4.RP.re$pred, # recalibrated predicted risk
  y = dat_cal.4.RP.re$obs,  # risk estimated by the refitted model
  type = "l",
  lty = 1,
  xlim = c(0, 1),
  ylim = c(0, 1),
  lwd = 2,
  xlab = "Predicted risk from developed model",
  ylab = "Predicted risk from refitted model",
  bty = "n" # no box
)

# Dashed confidence limits around the calibration curve.
lines(dat_cal.4.RP.re$pred,
      dat_cal.4.RP.re$lower,
      type = "l",
      lty = 2,
      lwd = 2)
lines(dat_cal.4.RP.re$pred,
      dat_cal.4.RP.re$upper,
      type = "l",
      lty = 2,
      lwd = 2)
# Diagonal reference line: perfect agreement between the two risks.
abline(0, 1, lwd = 2, lty = 2, col = 2)
legend("bottomright",
        c("Ideal calibration",
          "Calibration curve based on secondary Royston-Parmar model",
          "95% confidence interval"),
        col = c(2, 1, 1),
        lty = c(2, 1, 2),
        lwd = c(2, 2, 2),
        bty = "n",
        cex = 0.85)

# Numerical measures: absolute difference between the recalibrated risk and
# the refitted-model risk, summarised as mean (ICI), median (E50),
# 90th percentile (E90) and maximum (Emax).
absdiff_cph.4.RP.re <- abs(dat_cal.4.RP.re$pred - dat_cal.4.RP.re$obs)

numsum_cph.4.RP.re <- c(
  "ICI" = mean(absdiff_cph.4.RP.re),
  setNames(quantile(absdiff_cph.4.RP.re, c(0.5, 0.9)), c("E50", "E90")),
  "Emax" = max(absdiff_cph.4.RP.re)
)
numsum_cph.4.RP.re

#----PICNICS-----

# Refit on data5y.5 with the boundary knots moved to the hard-coded positions
# (the same values used for the calibration model of this cohort). This cohort
# uses mmsebltotal as the fourth covariate.
model5.re <- flexsurvspline(
  Surv(year, cens) ~ age10 + sex + mdsupdrs3.10 + mmsebltotal,
  data = data5y.5,
  k = 1,
  bknots = c(log(0.1067762), log(3.8987)),
  scale = "odds"
)

#The boundary knot position is based on the validation dataset, but the internal knot is based on the development data set.

model5.re$knots # check the knot position

#The coefficients are the same as in the development model, so they should be obtained from model5, not model5.re

# Spline basis of the refitted model evaluated at log(4). The result has one
# row, so columns 2 and 3 are recycled onto every row of the validation data.
rcs.mod5 <- as.data.frame(basis(model5.re$knots, log(4)))
data5y.5v$rcs1 <- rcs.mod5[, 2]
data5y.5v$rcs2 <- rcs.mod5[, 3]

log((1 - obj.5$surv) / mean(1 - data5y.5v$pred)) # ln mean O/E risk ratio

model5.re$res[c("gamma0", "gamma1", "gamma2"), "est"]

# Recalibrated survival at t = 4: new knot positions and s(ln t; gamma) plus an
# intercept shift, with the linear predictor lp.5 kept unchanged.
# NOTE(review): the four constants look like gamma0, ln(O/E), gamma1 and gamma2
# copied by hand from the two values printed above -- confirm, and update them
# whenever the data or the model change.
data5y.5v$pred.re <- 1 / (1 + exp(
  -9.29836083 - 0.2930496 +
    2.03774141 * data5y.5v$rcs1 -
    0.03969151 * data5y.5v$rcs2 +
    data5y.5v$lp.5
))

(1 - obj.5$surv) / mean(1 - data5y.5v$pred.re) # Mean calibration improve
#(1-obj.5$surv)/mean(1-data5y.5v$pred) improve 0.75 to 0.95

# Recalibrated risk = 1 - recalibrated survival.
data5y.5v$risk.re <- 1 - data5y.5v$pred.re

# Survival at t = 4 from the secondary calibration model vcal.5.RP.
# predict() is called ONCE with conf.int = TRUE so that the estimate and both
# limits come from the same call; flexsurv obtains the limits by simulation, so
# separate calls would each use their own random draws. Columns by position:
# 2 = survival estimate, 3 = lower limit, 4 = upper limit.
surv_cal.5.RP.re <- predict(
  vcal.5.RP,
  type = "survival",
  conf.int = TRUE,
  times = 4,
  newdata = data5y.5v
)

# Convert survival to risk (1 - S(4)); the limits swap because of the
# subtraction. "pred" is now the recalibrated risk.
dat_cal.5.RP.re <- cbind.data.frame(
  "obs" = 1 - surv_cal.5.RP.re[[2]],
  "lower" = 1 - surv_cal.5.RP.re[[4]],
  "upper" = 1 - surv_cal.5.RP.re[[3]],
  "pred" = data5y.5v$risk.re
)


# Sort by recalibrated risk so the curve is drawn left to right.
dat_cal.5.RP.re <- dat_cal.5.RP.re[order(dat_cal.5.RP.re$pred), ]


# The figure is written to a PNG file; everything up to dev.off() draws into
# that file rather than into the rendered document.
png("PIC-4yr-re1.png",width = 3500,height =2000,res = 400)
par(xaxs = "i", yaxs = "i", las = 1)
plot(
  x = dat_cal.5.RP.re$pred, # recalibrated predicted risk
  y = dat_cal.5.RP.re$obs,  # risk estimated by the refitted model
  type = "l",
  lty = 1,
  xlim = c(0, 1),
  ylim = c(0, 1),
  lwd = 2,
  xlab = "Predicted risk from developed model with new boundary knots position and intercept",
  ylab = "Predicted risk from refitted model",
  main = "Validation dataset is PICNICS",
  bty = "n" # no box
)

# Dashed confidence limits around the calibration curve.
lines(dat_cal.5.RP.re$pred,
      dat_cal.5.RP.re$lower,
      type = "l",
      lty = 2,
      lwd = 2)
lines(dat_cal.5.RP.re$pred,
      dat_cal.5.RP.re$upper,
      type = "l",
      lty = 2,
      lwd = 2)
# Diagonal reference line: perfect agreement between the two risks.
abline(0, 1, lwd = 2, lty = 2, col = 2)
legend("bottomright",
        c("Ideal calibration",
          "Calibration curve based on secondary Royston-Parmar model",
          "95% confidence interval"),
        col = c(2, 1, 1),
        lty = c(2, 1, 2),
        lwd = c(2, 2, 2),
        bty = "n",
        cex = 0.85)
dev.off()

# Numerical measures: absolute difference between the recalibrated risk and
# the refitted-model risk, summarised as mean (ICI), median (E50),
# 90th percentile (E90) and maximum (Emax).
absdiff_cph.5.RP.re <- abs(dat_cal.5.RP.re$pred - dat_cal.5.RP.re$obs)

numsum_cph.5.RP.re <- c(
  "ICI" = mean(absdiff_cph.5.RP.re),
  setNames(quantile(absdiff_cph.5.RP.re, c(0.5, 0.9)), c("E50", "E90")),
  "Emax" = max(absdiff_cph.5.RP.re)
)
numsum_cph.5.RP.re

#------PINE------

# Refit on data5y.6 with the boundary knots moved to the hard-coded positions
# (the same values used for the calibration model of this cohort). This cohort
# uses mmsebltotal as the fourth covariate.
model6.re <- flexsurvspline(
  Surv(year, cens) ~ age10 + sex + mdsupdrs3.10 + mmsebltotal,
  data = data5y.6,
  k = 1,
  bknots = c(log(0.5557837), log(3.978097)),
  scale = "odds"
) # change the knots position

# Spline basis of the refitted model evaluated at log(4). The result has one
# row, so columns 2 and 3 are recycled onto every row of the validation data.
rcs.mod6 <- as.data.frame(basis(model6.re$knots, log(4)))
data5y.6v$rcs1 <- rcs.mod6[, 2]
data5y.6v$rcs2 <- rcs.mod6[, 3]

model6.re$res[c("gamma0", "gamma1", "gamma2"), "est"]
log((1 - obj.6$surv) / mean(1 - data5y.6v$pred)) # ln mean O/E risk ratio

# Recalibrated survival at t = 4, with the linear predictor lp.6 kept
# unchanged. Unlike the other cohorts this formula multiplies the second
# constant by log(4) directly instead of by data5y.6v$rcs1.
# NOTE(review): presumably rcs1 equals log(4) (the linear basis column), which
# would make the two forms equivalent -- confirm.
# NOTE(review): the constants look like gamma0, ln(O/E), gamma1 and gamma2
# copied by hand from the values printed above -- confirm. exp(0.4388156) is
# about 1.55, whereas the comment below quotes an O/E of 1.42 before
# recalibration; one of the two may be stale.
data5y.6v$pred.re<-1/(1+exp(-21.10189+0.4388156+1.13516*log(4)-1.24703*data5y.6v$rcs2+data5y.6v$lp.6))

(1 - obj.6$surv) / mean(1 - data5y.6v$pred.re) # Mean calibration improve
#(1-obj.6$surv)/mean(1-data5y.6v$pred) improve 1.42 to 1.14

# Recalibrated risk = 1 - recalibrated survival.
data5y.6v$risk.re <- 1 - data5y.6v$pred.re


# Survival at t = 4 from the secondary calibration model vcal.6.RP.
# predict() is called ONCE with conf.int = TRUE so that the estimate and both
# limits come from the same call; flexsurv obtains the limits by simulation, so
# separate calls would each use their own random draws. Columns by position:
# 2 = survival estimate, 3 = lower limit, 4 = upper limit.
surv_cal.6.RP.re <- predict(
  vcal.6.RP,
  type = "survival",
  conf.int = TRUE,
  times = 4,
  newdata = data5y.6v
)

# Convert survival to risk (1 - S(4)); the limits swap because of the
# subtraction. "pred" is now the recalibrated risk.
dat_cal.6.RP.re <- cbind.data.frame(
  "obs" = 1 - surv_cal.6.RP.re[[2]],
  "lower" = 1 - surv_cal.6.RP.re[[4]],
  "upper" = 1 - surv_cal.6.RP.re[[3]],
  "pred" = data5y.6v$risk.re
)

# Sort by recalibrated risk so the curve is drawn left to right.
dat_cal.6.RP.re <- dat_cal.6.RP.re[order(dat_cal.6.RP.re$pred), ]



# The figure is written to a PNG file; everything up to dev.off() draws into
# that file rather than into the rendered document.
png("PINE-4yr-re1.png",width = 3500,height =2000,res = 400)
par(xaxs = "i", yaxs = "i", las = 1)
plot(
  x = dat_cal.6.RP.re$pred, # recalibrated predicted risk
  y = dat_cal.6.RP.re$obs,  # risk estimated by the refitted model
  type = "l",
  lty = 1,
  xlim = c(0, 1),
  ylim = c(0, 1),
  lwd = 2,
  xlab = "Predicted risk from development model with new boundary knots position and intercept",
  ylab = "Predicted risk from refitted model",
  main = "Validation dataset is PINE",
  bty = "n" # no box
)

# Dashed confidence limits around the calibration curve.
lines(dat_cal.6.RP.re$pred,
      dat_cal.6.RP.re$lower,
      type = "l",
      lty = 2,
      lwd = 2)
lines(dat_cal.6.RP.re$pred,
      dat_cal.6.RP.re$upper,
      type = "l",
      lty = 2,
      lwd = 2)
# Diagonal reference line: perfect agreement between the two risks.
abline(0, 1, lwd = 2, lty = 2, col = 2)
legend("topleft",
        c("Ideal calibration",
          "Calibration curve based on secondary Royston-Parmar model",
          "95% confidence interval"),
        col = c(2, 1, 1),
        lty = c(2, 1, 2),
        lwd = c(2, 2, 2),
        bty = "n",
        cex = 0.8)
dev.off()

# Numerical measures: absolute difference between the recalibrated risk and
# the refitted-model risk, summarised as mean (ICI), median (E50),
# 90th percentile (E90) and maximum (Emax).
absdiff_cph.6.RP.re <- abs(dat_cal.6.RP.re$pred - dat_cal.6.RP.re$obs)

numsum_cph.6.RP.re <- c(
  "ICI" = mean(absdiff_cph.6.RP.re),
  setNames(quantile(absdiff_cph.6.RP.re, c(0.5, 0.9)), c("E50", "E90")),
  "Emax" = max(absdiff_cph.6.RP.re)
)
numsum_cph.6.RP.re

```


```{r Weak calibration not run}

#---leave CamPalGN out----

# Author's note: if data5y.1v$pred<-predict(model1,newdata = data5y.1v,type = "survival",times = 4)
# (the development model's own survival probabilities) is used here, the slope
# is 1, so the "actual" survival from a refitted model is used instead.

# Author's open question: I am not sure if this is actual survival?

# Secondary flexsurvspline model (one internal knot, odds scale) of the
# outcome on the cloglog-transformed predicted risk.
vcal.1.weak <- flexsurvspline(
  Surv(year, cens) ~ risk.cll,
  data = data5y.1v,
  k = 1,
  bknots = c(log(1.147159), log(3.761807)),
  scale = "odds"
)

# Survival at t = 4 from the secondary model (column 2 of the prediction).
data5y.1v$pred.act <- predict(
  vcal.1.weak,
  type = "survival",
  times = 4,
  newdata = data5y.1v
)[[2]] # actual survival

# log((1 - S) / S) of that survival.
data5y.1v$g_st <- log((1 / data5y.1v$pred.act) - 1)

# Regress the transformed survival on the linear predictor lp.1; the
# coefficient of lp.1 is reported as the calibration slope.
val.1 <- lm(g_st ~ lp.1, data = data5y.1v)

#val.1<-lm(g_st~risk.cll,data = data5y.1v)

summary(val.1)

# Standard error of the slope taken from the fitted model rather than typed in
# from the printed summary (previously hard-coded as 0.001632), so the interval
# cannot go stale when the data change.
val.1.se <- summary(val.1)$coefficients[2, "Std. Error"]

# Slope with a normal-approximation 95% interval (estimate +/- 1.96 * SE).
calslope_summary.1 <- c(
  "calibration slope" = val.1$coefficients[2],
  "2.5 %"  = val.1$coefficients[2] - 1.96 * val.1.se,
  "97.5 %" = val.1$coefficients[2] + 1.96 * val.1.se
)

calslope_summary.1

#---leave ICICLE out----

# Weak calibration (calibration slope) with ICICLE as the validation set.

# Spline model (odds scale, 1 internal knot) of the outcome on risk.cll,
# fitted in the validation data.
vcal.2.weak <- flexsurvspline(
  Surv(year, cens) ~ risk.cll,
  data = data5y.2v,
  k = 1,
  bknots = c(log(1.601643), log(3.827515)),
  scale = "odds"
)

# Column 2 of the predict() result is the survival estimate at times = 4.
data5y.2v$pred.act <- predict(
  vcal.2.weak,
  type = "survival",
  times = 4,
  newdata = data5y.2v
)[[2]]

# log((1 - S) / S): log odds of the event given survival S.
data5y.2v$g_st <- log((1 / data5y.2v$pred.act) - 1)

# The coefficient of lp.2 is the calibration slope.
val.2 <- lm(g_st ~ lp.2, data = data5y.2v)

summary(val.2)

# Standard error read from the fitted model rather than hard-coded, so the
# interval stays in sync with the fit.
calslope_se.2 <- summary(val.2)$coefficients["lp.2", "Std. Error"]

calslope_summary.2 <- c(
  "calibration slope" = val.2$coefficients[2],
  "2.5 %"  = val.2$coefficients[2] - 1.96 * calslope_se.2,
  "97.5 %" = val.2$coefficients[2] + 1.96 * calslope_se.2
)


calslope_summary.2

#---leave NYPUM out----

# Weak calibration (calibration slope) with NYPUM as the validation set.

# Spline model (odds scale, 1 internal knot) of the outcome on risk.cll,
# fitted in the validation data.
vcal.3.weak <- flexsurvspline(
  Surv(year, cens) ~ risk.cll,
  data = data5y.3v,
  k = 1,
  bknots = c(log(0.8925394), log(3.789185)),
  scale = "odds"
)

# Column 2 of the predict() result is the survival estimate at times = 4.
data5y.3v$pred.act <- predict(
  vcal.3.weak,
  type = "survival",
  times = 4,
  newdata = data5y.3v
)[[2]]

# log((1 - S) / S): log odds of the event given survival S.
data5y.3v$g_st <- log((1 / data5y.3v$pred.act) - 1)

# The coefficient of lp.3 is the calibration slope.
val.3 <- lm(g_st ~ lp.3, data = data5y.3v)

#summary(val.3)

# Standard error read from the fitted model rather than hard-coded, so the
# interval stays in sync with the fit.
calslope_se.3 <- summary(val.3)$coefficients["lp.3", "Std. Error"]

calslope_summary.3 <- c(
  "calibration slope" = val.3$coefficients[2],
  "2.5 %"  = val.3$coefficients[2] - 1.96 * calslope_se.3,
  "97.5 %" = val.3$coefficients[2] + 1.96 * calslope_se.3
)

calslope_summary.3

#---leave ParkWest out----

# Weak calibration (calibration slope) with ParkWest as the validation set.

# Spline model (odds scale, 1 internal knot) of the outcome on risk.cll,
# fitted in the validation data.
vcal.4.weak <- flexsurvspline(
  Surv(year, cens) ~ risk.cll,
  data = data5y.4v,
  k = 1,
  bknots = c(log(1.990418), log(3.811088)),
  scale = "odds"
)

# Column 2 of the predict() result is the survival estimate at times = 4.
data5y.4v$pred.act <- predict(
  vcal.4.weak,
  type = "survival",
  times = 4,
  newdata = data5y.4v
)[[2]]

# log((1 - S) / S): log odds of the event given survival S.
data5y.4v$g_st <- log((1 / data5y.4v$pred.act) - 1)

# The coefficient of lp.4 is the calibration slope.
val.4 <- lm(g_st ~ lp.4, data = data5y.4v)

#summary(val.4)

# Standard error read from the fitted model rather than hard-coded, so the
# interval stays in sync with the fit.
calslope_se.4 <- summary(val.4)$coefficients["lp.4", "Std. Error"]

calslope_summary.4 <- c(
  "calibration slope" = val.4$coefficients[2],
  "2.5 %"  = val.4$coefficients[2] - 1.96 * calslope_se.4,
  "97.5 %" = val.4$coefficients[2] + 1.96 * calslope_se.4
)

calslope_summary.4

#---leave PICNICS out----

# Weak calibration (calibration slope) with PICNICS as the validation set.

# Spline model (odds scale, 1 internal knot) of the outcome on risk.cll,
# fitted in the validation data.
vcal.5.weak <- flexsurvspline(
  Surv(year, cens) ~ risk.cll,
  data = data5y.5v,
  k = 1,
  bknots = c(log(0.1067762), log(3.8987)),
  scale = "odds"
)

# Column 2 of the predict() result is the survival estimate at times = 4.
data5y.5v$pred.act <- predict(
  vcal.5.weak,
  type = "survival",
  times = 4,
  newdata = data5y.5v
)[[2]]

# log((1 - S) / S): log odds of the event given survival S.
data5y.5v$g_st <- log((1 / data5y.5v$pred.act) - 1)

# The coefficient of lp.5 is the calibration slope.
val.5 <- lm(g_st ~ lp.5, data = data5y.5v)

#summary(val.5)

# Standard error read from the fitted model rather than hard-coded, so the
# interval stays in sync with the fit.
calslope_se.5 <- summary(val.5)$coefficients["lp.5", "Std. Error"]

calslope_summary.5 <- c(
  "calibration slope" = val.5$coefficients[2],
  "2.5 %"  = val.5$coefficients[2] - 1.96 * calslope_se.5,
  "97.5 %" = val.5$coefficients[2] + 1.96 * calslope_se.5
)

calslope_summary.5


#---leave PINE out----

# Weak calibration (calibration slope) with PINE as the validation set.
#
# Fix: this section previously refitted `vcal.5.weak` on `data5y.5v` and wrote
# `pred.act` back into `data5y.5v` (a copy-paste from the PICNICS section),
# so `data5y.6v$pred.act` was never created here and the PICNICS values were
# overwritten. Everything below now uses the PINE objects (suffix 6).

# Spline model (odds scale, 1 internal knot) of the outcome on risk.cll,
# fitted in the PINE validation data.
vcal.6.weak <- flexsurvspline(
  Surv(year, cens) ~ risk.cll,
  data = data5y.6v,
  k = 1,
  bknots = c(log(0.5557837), log(3.978097)),
  scale = "odds"
)

# Column 2 of the predict() result is the survival estimate at times = 4.
data5y.6v$pred.act <- predict(
  vcal.6.weak,
  type = "survival",
  times = 4,
  newdata = data5y.6v
)[[2]]

# log((1 - S) / S): log odds of the event given survival S.
data5y.6v$g_st <- log((1 / data5y.6v$pred.act) - 1)

# The coefficient of lp.6 is the calibration slope.
val.6 <- lm(g_st ~ lp.6, data = data5y.6v)

summary(val.6)

# Standard error read from the fitted model rather than hard-coded; the old
# constant (0.005033) was copied from a summary produced before the fix above.
calslope_se.6 <- summary(val.6)$coefficients["lp.6", "Std. Error"]

calslope_summary.6 <- c(
  "calibration slope" = val.6$coefficients[2],
  "2.5 %"  = val.6$coefficients[2] - 1.96 * calslope_se.6,
  "97.5 %" = val.6$coefficients[2] + 1.96 * calslope_se.6
)

calslope_summary.6


```

```{r recalibration2 for PICNICS PINE}

#----PICNICS----

# Second recalibration for PICNICS: the intercept is changed based on the
# weak-calibration results.

#summary(val.5)

# Recalibrated survival, computed as 1 / (1 + exp(log cumulative odds)).
# NOTE(review): the original inline comment said the knot positions and
# s(ln t; r) change while the linear predictor stays the same, yet lp.5 is
# multiplied by 2.082 here - confirm which description is intended.
data5y.5v$pred.re.1 <- local({
  log_cum_odds <- -9.29836083 - 0.2930496 - 4.654 +
    2.03774141 * data5y.5v$rcs1 -
    0.03969151 * data5y.5v$rcs2 +
    2.082 * data5y.5v$lp.5
  1 / (1 + exp(log_cum_odds))
})

# Risk is the complement of the recalibrated survival.
data5y.5v$risk.re.1 <- 1 - data5y.5v$pred.re.1

# Mean calibration: (1 - obj.5$surv) divided by the mean predicted risk.
# Author's note: relative to the first recalibration,
# (1-obj.5$surv)/mean(1-data5y.5v$pred.re), the ratio decreases from 0.95 to 0.79.
# Earlier note kept from the author: "Before is
# (1-obj.1$surv)/mean(1-data5y.1v$pred.re), 0.8493298 to 1.04".
(1 - obj.5$surv) / mean(data5y.5v$risk.re.1)

# Survival at times = 4 from vcal.5.RP for every PICNICS row, with its
# confidence limits. A single predict() call provides the estimate and both
# limits; previously predict() was run three times, so the lower and upper
# limits came from two separate conf.int runs (flexsurv derives these limits
# by simulation, see ?summary.flexsurvreg) and the work was repeated.
# Columns are read by position: 2 = estimate, 3 = lower, 4 = upper.
surv4_cal.5.RP.re.1 <- predict(
  vcal.5.RP,
  type = "survival",
  conf.int = TRUE,
  times = 4,
  newdata = data5y.5v
)

dat_cal.5.RP.re.1 <- cbind.data.frame(
  # 1 - S(t = 4): risk predicted by the refitted model
  "obs" = 1 - surv4_cal.5.RP.re.1[[2]],
  # the upper survival limit becomes the lower risk limit, and vice versa
  "lower" = 1 - surv4_cal.5.RP.re.1[[4]],
  "upper" = 1 - surv4_cal.5.RP.re.1[[3]],
  "pred" = data5y.5v$risk.re.1
)



# Sort by predicted risk so the calibration curve can be drawn as a line.
dat_cal.5.RP.re.1 <- dat_cal.5.RP.re.1[order(dat_cal.5.RP.re.1$pred), ]


# Calibration plot for PICNICS after the second recalibration, written to a
# PNG file. x-axis: column `pred`; y-axis: column `obs`.
png(filename = "PIC-4yr-re2.png", width = 3500, height = 2000, res = 400)
par(las = 1, xaxs = "i", yaxs = "i")
plot(
  dat_cal.5.RP.re.1$pred,
  dat_cal.5.RP.re.1$obs,
  type = "l",
  lty = 1,
  lwd = 2,
  xlim = c(0, 1),
  ylim = c(0, 1),
  bty = "n", # no box around the plot
  main = "Validation dataset is PICNICS",
  xlab = "Predicted risk from developed model with new boundary knots position and intercept and coefficients",
  ylab = "Predicted risk from refitted model"
)

# Dashed confidence limits; both bounds share the same line settings.
invisible(lapply(c("lower", "upper"), function(limit) {
  lines(dat_cal.5.RP.re.1$pred, dat_cal.5.RP.re.1[[limit]], lty = 2, lwd = 2)
}))

# Red dashed diagonal marks perfect agreement.
abline(a = 0, b = 1, col = 2, lty = 2, lwd = 2)
legend(
  x = "bottomright",
  legend = c(
    "Ideal calibration",
    "Calibration curve based on secondary Royston-Parmar model",
    "95% confidence interval"
  ),
  col = c(2, 1, 1),
  lty = c(2, 1, 2),
  lwd = rep(2, 3),
  bty = "n",
  cex = 0.85
)
dev.off()

# Numerical calibration measures, all based on the absolute difference
# between the two risk columns (`pred` and `obs`):
#   ICI  = mean, E50 = median, E90 = 90th percentile, Emax = maximum.
absdiff_cph.5.RP.re.1 <- with(dat_cal.5.RP.re.1, abs(pred - obs))

numsum_cph.5.RP.re.1 <- c(
  "ICI" = mean(absdiff_cph.5.RP.re.1),
  "E50" = unname(quantile(absdiff_cph.5.RP.re.1, 0.5)),
  "E90" = unname(quantile(absdiff_cph.5.RP.re.1, 0.9)),
  "Emax" = max(absdiff_cph.5.RP.re.1)
)
numsum_cph.5.RP.re.1


#----PINE----

#summary(val.6)

#log(O/E)

# Earlier version kept for reference (lp.6 not rescaled):
#1/(1+exp(-21.016487+0.3555012+1.127613*log(4)-1.261545*data5y.6v$rcs2+data5y.6v$lp.6))

# Recalibrated survival, computed as 1 / (1 + exp(log cumulative odds)).
# Author's note: obtain the coefficients of model 6 and multiply them by 0.36.
data5y.6v$pred.re.1 <- local({
  log_cum_odds <- -21.10189 + 0.4388156 + 10.386 +
    1.13516 * log(4) -
    1.24703 * data5y.6v$rcs2 +
    0.36 * data5y.6v$lp.6
  1 / (1 + exp(log_cum_odds))
})

# Risk is the complement of the recalibrated survival.
data5y.6v$risk.re.1 <- 1 - data5y.6v$pred.re.1

# Mean calibration: (1 - obj.6$surv) divided by the mean predicted risk.
# Author's note: relative to the first recalibration,
# (1-obj.6$surv)/mean(1-data5y.6v$pred.re), the ratio improves from 1.14 to 0.97.
(1 - obj.6$surv) / mean(data5y.6v$risk.re.1)

# Survival at times = 4 from vcal.6.RP for every PINE row, with its
# confidence limits. A single predict() call provides the estimate and both
# limits; previously predict() was run three times, so the lower and upper
# limits came from two separate conf.int runs (flexsurv derives these limits
# by simulation, see ?summary.flexsurvreg) and the work was repeated.
# Columns are read by position: 2 = estimate, 3 = lower, 4 = upper.
surv4_cal.6.RP.re.1 <- predict(
  vcal.6.RP,
  type = "survival",
  conf.int = TRUE,
  times = 4,
  newdata = data5y.6v
)

dat_cal.6.RP.re.1 <- cbind.data.frame(
  # 1 - S(t = 4): risk predicted by the refitted model
  "obs" = 1 - surv4_cal.6.RP.re.1[[2]],
  # the upper survival limit becomes the lower risk limit, and vice versa
  "lower" = 1 - surv4_cal.6.RP.re.1[[4]],
  "upper" = 1 - surv4_cal.6.RP.re.1[[3]],
  "pred" = data5y.6v$risk.re.1
)

# Sort by predicted risk so the calibration curve can be drawn as a line.
dat_cal.6.RP.re.1 <- dat_cal.6.RP.re.1[order(dat_cal.6.RP.re.1$pred), ]

# Calibration plot for PINE after the second recalibration, written to a
# PNG file. x-axis: column `pred`; y-axis: column `obs`.
png(filename = "PINE-4yr-re2.png", width = 3500, height = 2000, res = 400)
par(las = 1, xaxs = "i", yaxs = "i")
plot(
  dat_cal.6.RP.re.1$pred,
  dat_cal.6.RP.re.1$obs,
  type = "l",
  lty = 1,
  lwd = 2,
  xlim = c(0, 1),
  ylim = c(0, 1),
  bty = "n", # no box around the plot
  main = "Validation dataset is PINE",
  xlab = "Predicted risk from development model with new boundary knots position and intercept and coefficients",
  ylab = "Predicted risk from refitted model"
)

# Dashed confidence limits; both bounds share the same line settings.
invisible(lapply(c("lower", "upper"), function(limit) {
  lines(dat_cal.6.RP.re.1$pred, dat_cal.6.RP.re.1[[limit]], lty = 2, lwd = 2)
}))

# Red dashed diagonal marks perfect agreement.
abline(a = 0, b = 1, col = 2, lty = 2, lwd = 2)
legend(
  x = "bottomright",
  legend = c(
    "Ideal calibration",
    "Calibration curve based on secondary Royston-Parmar model",
    "95% confidence interval"
  ),
  col = c(2, 1, 1),
  lty = c(2, 1, 2),
  lwd = rep(2, 3),
  bty = "n",
  cex = 0.85
)

dev.off()

# Numerical calibration measures, all based on the absolute difference
# between the two risk columns (`pred` and `obs`):
#   ICI  = mean, E50 = median, E90 = 90th percentile, Emax = maximum.
absdiff_cph.6.RP.re.1 <- with(dat_cal.6.RP.re.1, abs(pred - obs))

numsum_cph.6.RP.re.1 <- c(
  "ICI" = mean(absdiff_cph.6.RP.re.1),
  "E50" = unname(quantile(absdiff_cph.6.RP.re.1, 0.5)),
  "E90" = unname(quantile(absdiff_cph.6.RP.re.1, 0.9)),
  "Emax" = max(absdiff_cph.6.RP.re.1)
)
numsum_cph.6.RP.re.1

```

```{r mean/weak calibration after recalibration}

#-----PICNICS----

# O/E ratio after the first recalibration (pred.re): (1 - obj.5$surv) over the
# mean predicted risk. The interval is OE * exp(-/+ z * sqrt(1 / n.event)),
# with z = qnorm(1 - alpha / 2); `alpha` is defined elsewhere in the file.
OE_t.5.re <- (1 - obj.5$surv) / mean(1 - data5y.5v$pred.re)
OE_summary.5.re <- local({
  log_half_width <- qnorm(1 - alpha / 2) * sqrt(1 / obj.5$n.event)
  c(
    "OE" = OE_t.5.re,
    "2.5 %" = OE_t.5.re * exp(-log_half_width),
    "97.5 %" = OE_t.5.re * exp(log_half_width)
  )
})

OE_summary.5.re


# Same summary after the second recalibration (pred.re.1).
OE_t.5.re.1 <- (1 - obj.5$surv) / mean(1 - data5y.5v$pred.re.1)
OE_summary.5.re.1 <- local({
  log_half_width <- qnorm(1 - alpha / 2) * sqrt(1 / obj.5$n.event)
  c(
    "OE" = OE_t.5.re.1,
    "2.5 %" = OE_t.5.re.1 * exp(-log_half_width),
    "97.5 %" = OE_t.5.re.1 * exp(log_half_width)
  )
})

OE_summary.5.re.1


# Linear predictor after the second recalibration: design matrix times the
# coefficients scaled by 2.082.
data5y.5v$lp.5.re <- as.vector(as.matrix(des_matr5) %*% cbind(2.082 * coef5))

# The coefficient of lp.5.re is the calibration slope after recalibration.
val.5.re <- lm(g_st ~ lp.5.re, data = data5y.5v)

summary(val.5.re)

# Standard error read from the fitted model rather than hard-coded, so the
# interval stays in sync with the fit.
calslope_se.5.re <- summary(val.5.re)$coefficients["lp.5.re", "Std. Error"]

calslope_summary.5.re <- c(
  "calibration slope" = val.5.re$coefficients[2],
  "2.5 %"  = val.5.re$coefficients[2] - 1.96 * calslope_se.5.re,
  "97.5 %" = val.5.re$coefficients[2] + 1.96 * calslope_se.5.re
)

calslope_summary.5.re


#-----PINE----

# O/E ratio after the first recalibration (pred.re): (1 - obj.6$surv) over the
# mean predicted risk. The second recalibration (pred.re.1) is summarised
# separately below. The interval is OE * exp(-/+ z * sqrt(1 / n.event)),
# with z = qnorm(1 - alpha / 2); `alpha` is defined elsewhere in the file.
OE_t.6.re <- (1 - obj.6$surv) / mean(1 - data5y.6v$pred.re)

OE_summary.6.re <- local({
  log_half_width <- qnorm(1 - alpha / 2) * sqrt(1 / obj.6$n.event)
  c(
    "OE" = OE_t.6.re,
    "2.5 %" = OE_t.6.re * exp(-log_half_width),
    "97.5 %" = OE_t.6.re * exp(log_half_width)
  )
})

OE_summary.6.re


# Same summary after the second recalibration (pred.re.1).
OE_t.6.re.1 <- (1 - obj.6$surv) / mean(1 - data5y.6v$pred.re.1)

OE_summary.6.re.1 <- local({
  log_half_width <- qnorm(1 - alpha / 2) * sqrt(1 / obj.6$n.event)
  c(
    "OE" = OE_t.6.re.1,
    "2.5 %" = OE_t.6.re.1 * exp(-log_half_width),
    "97.5 %" = OE_t.6.re.1 * exp(log_half_width)
  )
})

OE_summary.6.re.1



# Linear predictor after the second recalibration: design matrix times the
# coefficients scaled by 0.36.
data5y.6v$lp.6.re <- as.vector(as.matrix(des_matr6) %*% cbind(0.36 * coef6))

# The coefficient of lp.6.re is the calibration slope after recalibration.
val.6.re <- lm(g_st ~ lp.6.re, data = data5y.6v)

summary(val.6.re)

# Standard error read from the fitted model rather than hard-coded, so the
# interval stays in sync with the fit.
calslope_se.6.re <- summary(val.6.re)$coefficients["lp.6.re", "Std. Error"]

calslope_summary.6.re <- c(
  "calibration slope" = val.6.re$coefficients[2],
  "2.5 %"  = val.6.re$coefficients[2] - 1.96 * calslope_se.6.re,
  "97.5 %" = val.6.re$coefficients[2] + 1.96 * calslope_se.6.re
)

calslope_summary.6.re


```


```{r mean and weak calibration 4 year not run}

# Print the O/E summaries for validation sets 1-6. The calibration sections
# label the sets, in order, CamPalGN, ICICLE, NYPUM, ParkWest, PICNICS, PINE.
# NOTE(review): the OE_summary.* objects are created elsewhere in the file.
OE_summary.1
OE_summary.2
OE_summary.3
OE_summary.4
OE_summary.5
OE_summary.6  


# Print the calibration-slope summaries (estimate and 95% limits) for the
# same six validation sets.
calslope_summary.1
calslope_summary.2
calslope_summary.3
calslope_summary.4
calslope_summary.5
calslope_summary.6

```