PERMIT SIR reshaping and cleaning

memory.size(100000) #Assign sufficient memory to R
load ("sir.data.rda") #Long format input file

#load("small.rda") #This is the demographic data from he processed sir.data.rda, 
#comprising PatientID, Gender, BirthDate and LSOA
#small<-unique(small)
#sir.data<-merge(sir.data,small,all.x=TRUE)
#save(sir.data,file="sirdata.rda")
#load("sirdata.rda")

library(zoo)
library(plyr)
library(tidyverse)
library(zoo)
library(data.table)
library(survival)
library(lubridate)

#READ CONDITION FILES- suffix names with a common suffix to import together
temp = list.files(pattern="*1.csv")
for (i in 1:length(temp)) assign(temp[i], read.csv(temp[i]))

#######################################################################################
#SELECT PATIENTS 18 OR ABOVE AT HEART FAILURE DIAGNOSIS
hf<-sir.data[sir.data$ReadCode %in% HeartFailure1.csv$ReadCode,] #Create subset of HF patients
length(unique(as.factor(hf$PatientID))) #7254 Individuals had heart failure
hf$Age<-(as.numeric(year(strptime(hf$EntryDate, format="%Y%m%d"))))-hf$BirthYear
hf$hfage<-hf$Age
smalltab<-hf[,c("PatientID","hfage")]
smalltab$PatientID<-as.factor(smalltab$PatientID)
smalltab<-smalltab[!is.na(smalltab$hfage),]

first<-smalltab %>% group_by(PatientID) %>%
slice(which.min(hfage))
ungroup(first)
head(first)

sir.data<-merge(sir.data[sir.data$PatientID %in% hf$PatientID,],as.data.frame(first),all.x=TRUE)
sir.data<-sir.data[sir.data$hfage>=18,]

length(unique(as.factor(sir.data$PatientID))) #7208  hf patients who were 18 or over at first diagnosis
#save(sir.data,file="sirdatahfonly.rda") #full patient records from all adult hf patients from all years

####################################################################################
#COHORT SELECTION
#SELECT PATIENTS WITH CREATININE DATA
sir.data[sir.data$ReadCode=="44J3.",]->crea 
crea <- droplevels(crea)

#CHECK FOR CASES WHERE A VIABLE VALUE IS ENTERED IN THE UNITS COLUMN BY MISTAKE
temp<-ifelse(as.numeric(as.character(crea$CodeUnits))>0 & as.numeric(as.character(crea$CodeUnits))<1000  & !is.na(crea$CodeValue),crea$CodeUnits,NA)
table(temp) # 51 records affected, but most not viable values
crea$CodeValue<-ifelse(as.numeric(as.character(crea$CodeUnits))>0 & 
as.numeric(as.character(crea$CodeUnits))<1000 & 
is.na(crea$CodeValue),crea$CodeUnits,crea$CodeValue)
crea$CodeUnits<-ifelse(!is.na(temp),paste(""),paste(crea$CodeUnits))
crea$CodeValue<-as.numeric(as.character(crea$CodeValue))
crea<-crea[!is.na(crea$CodeValue),]

#length(unique(as.factor(crea$PatientID))) #6971 hf patients over 18 at diagnosis with creatinine data


###########################################################################
#SENSITIVITY TESTS
#HOW MANY ZERO CR VALUES AND HOW MANY CR VALUES UNDER 20
lowcr<-crea[crea$CodeValue<18 & !is.na(crea$CodeValue),] 
#length(lowcr$PatientID)#21727
levels(as.factor(lowcr$PatientID)) 
table(lowcr$CodeValue)
#4121 patients are affected, 471 are zero values

crea<-crea[crea$CodeValue>0,]
lowcr<-crea[crea$CodeValue<18,] 
length(lowcr$PatientID) 
#ARE THESE FROM OLD ASSAY?
lowcr <- droplevels(lowcr)
head(lowcr)
table(lowcr$CodeUnits)
#             %      g/L    g/mol        L micmol/l     mmol   mmol/L 
#   4400        1        1        1        1     6099        1      766 

#  umol/l   umol/L        h    mol/L 
#       5   891607        1        1 


#In the first iteration 891607 records were retained. 
#Evidently some were omitted due to minor typographic changes
#We can assume post 2008 values unless otherwise marked were completed with the new assay as we believed the lab switched over to this in 2005
#There is only one regional lab so the tests will be consistent, just potentially not the units used for recording.
#Some units may ahve been incorrectly selected from the pulldown menu, so values over 50 should be assumed to be in umol not mmol.
#We can use a simple formula to convert values under 100 in mmol to umol

crea$CodeUnits<-as.factor(crea$CodeUnits)
levels(crea$CodeUnits)
#[1] ""         "%"        "g/L"      "g/mol"    "L"        "micmol/l"
 #[7] "mmol"     "mmol/L"   "umol/l"   "None"  "umol/L"  
 
levels(crea$CodeUnits)[c(1,6,9)]<-"umol/L"
levels(crea$CodeUnits)[c(6)]<-"mmol/L"
levels(crea$CodeUnits)[c(2:5)]<-"NA"
crea<-crea[!is.na(crea$CodeUnits),]
crea$CodeValue<-ifelse(crea$CodeUnits=="mmol/L" & as.numeric(crea$CodeValue)<50,(as.numeric(crea$CodeValue)*1000),as.numeric(crea$CodeValue))
crea$CodeUnits<-"umol/L"
summary(crea$CodeValue) #Any remaining high and low values will be removed
crea<-crea[as.numeric(crea$CodeValue)>=20 & !is.na(crea$CodeValue),] 
#Upper limit TBA

save(crea,file="crearecleanedincd.rda") #includes dialysis patients
#or save(crea,file="crearecleaned.rda") #no dialysis patients
###############################################################################
#DATA CLEANING 

#load("sirdatahfonly.rda") #full patient records from all adult hf patients from all years
#load("crearecleaned.rda") # all steps above completed

length(crea$CodeValue) #412394 records left
#422129 with dialysis pats

#REMOVE SAME DAY CREATININE ENTRIES IF THE SOURCE LOCATION CODE DIFFERS.
names(crea)
crea<-crea[order(crea$PatientID,crea$CodeValue, rev(crea$Source)),]
crea2<-crea[(duplicated(crea[,c(1,4,6)])&!duplicated(crea[,7])),]
length(crea2$CodeValue) #Only 24 same day duplicates are from different sources
crea3<-crea[(duplicated(crea[,c(1,4,6)])),]
length(crea3$CodeValue) #in total, 105665 of 412394 (26%) records have a same day duplicate
#106686 with dialysis

crea<-crea[!rownames(crea) %in% rownames(crea2),]

#REMOVE DELAYED CREATININE ENTRIES FROM SAME CALENDAR MONTH IF THE SOURCE LOCATION CODE DIFFERS.

crea$event.date<-as.Date(as.character(crea$EntryDate),format="%Y%m%d")
(as.numeric(year(strptime(crea$event.date, format="%Y-%m-%d")))) -> year
(as.numeric(month(strptime(crea$event.date, format="%Y-%m-%d")))) -> month
crea$EntryPeriod<-paste(month,year)

crea$Source<-ifelse(crea$Source=="salfordt",paste("2"),paste("1")) #2 for hospital, 1 for GP
crea$Source<-ifelse(is.na(crea$Source),paste("2"),crea$Source)

crea<-crea[order(crea[,1], -(crea[,4]),(crea[,7])),]
crea4<-crea[(duplicated(crea[,c(1,6,14])&!duplicated(crea[,7])),] #DELAYED DUPLICATES (SAME PATIENT, MONTH, VALUE)
length(crea4$PatientID) #- Only 1 of the remaining potential duplicate records outside of the same day window come from different locations
crea5<-crea[duplicated(crea[,c(1,6,14)]),] 
table(crea5$Source) #The vast majority of out of range duplicates are multiple tests from within hospital. 

crea<-crea[!rownames(crea) %in% rownames(crea4),]

length(crea$PatientID) # 412369 records retained 
#422102 with dial included


#SELECT MEAN DAILY CREATININE IF MULTIPLE ENTRIES AFTER REMOVING DELAYED DUPLICATES AND OUT OF RANGE VALUES
smalltab<-crea[,c("PatientID","CodeValue", "EntryDate")]
xcrea<-smalltab %>% group_by(PatientID, EntryDate) %>%
summarize(Creatinine = mean(as.numeric(as.character(CodeValue))))
head(xcrea)
ungroup(xcrea)
crea<-merge(crea,as.data.frame(xcrea),all.x=TRUE)

save(crea,file="crearecleaned.rda")
#or
#save(crea,file="crearecleanedincd.rda")
######################################################
#Add demographic variables from lookup tables

#LSOA and age already added in SIR

#ASSIGN AGE
crea$Age<-(as.numeric(year(strptime(crea$event.date, format="%Y-%m-%d"))))-crea$BirthYear

#CODE ETHNICITY
ethnic.data<-read.table("ethnic.data.csv",header=TRUE,sep=",")
ethnic.data$Category<-floor(ethnic.data$Category)
crea<-merge(crea,ethnic.data,by.x="Ethnicity",by.y="ClinCode2",all.x=TRUE, all.y=FALSE)
names(crea)
crea<-subset(crea, select=-c(Ethnicity,ClinCode1,EntryPeriod,Rubric))
colnames(crea)[which(names(crea) == "Category")] <- "Ethnicity"
save(crea, file = "crea.ongoing.Rdata")

levels(unique(as.factor(crea$PatientID))) 
#6918 remaining with hf as an adult, all same day duplicates removed.
#No other duplicate pathology data removed, no data time range applied.

#ADD LSOA
imd<-read.csv("IMD2010.csv")
imd<-imd[,c("LSOA","IMD_Decile2010")]
crea<-merge(crea,imd,all.x=TRUE)

#COHORT SELECTION
#LIMIT TO PATIENTS WITH AT LEAST 2 POST 2008 CREATININE TEST VALUES

crea<-crea[as.numeric(year(strptime(crea$event.date, format="%Y-%m-%d")))>=2008,]
table(crea$PatientID) < 2 -> rare  
rownames(as.matrix(rare)) -> ids
table(rare)
crea[!(crea$PatientID %in% ids[rare]),] -> crea.rep	
levels(unique(as.factor(crea.rep$PatientID))) # 6589 non-dialysis adult hf patients have 2 or more post-2008 creatinine tests
# 6635 adult hf patients have 2 or more post-2008 creatinine tests
length(crea.rep$PatientID) #284233 remaining, 290959 inc dialysis

sir.data<-sir.data[sir.data$PatientID %in% crea.rep$PatientID,]
#Breakpoint
#######################################################
save(crea.rep, file = "crea.rephf2tests.Rdata")
save(sir.data, file = "sir.datahf2tests.Rdata")
#######################################################
save(crea.rep, file = "crea.rephf2testsd.Rdata")
save(sir.data, file = "sir.datahf2testsd.Rdata")
#######################################################

#ADD IN VARIOUS CONDITIONS AND FLAGS FROM MAIN EHR FILES USING R SCRIPT IN SAME FOLDER
#ADD PRESCRIPTION DATA USING R SCRIPT IN SAME FOLDER

#Narrow lookup table to the full list of codes of interest to speed up processing
#temp = list.files(pattern="*1.csv")
#for (i in 1:length(temp)) assign(temp[i], read.csv(temp[i]))
#merged <- Reduce(function(x, y) merge(x, y, all=TRUE), 
#list(AF1.csv,BMI1.csv,BNP1.csv,BUN1.csv,cessation1.csv,DBP1.csv,Diabetes1.csv,Dialysis1.csv,Haemoglobin1.csv,HeartFailure1.csv, HeartRate1.csv,IHD1.csv,MCV1.csv,Nephrectomy1.csv,NTPROBNP1.csv,PVD1.csv,RM1.csv,SBP1.csv,SerumAlbumin1.csv,SerumPotassium1.csv,SerumSodium1.csv,smoking1.csv,transplant1.csv,UACR1.csv,UAlbumin1.csv,UricAcid1.csv))
#sir.data<-sir.data[which(sir.data$PatientID %in% crea.rep$PatientID & sir.data$ReadCode %in% merged$ReadCode),]

##########################################################################
#FROM VERSIONS 061217 onwards:
#SELECT PATIENTS WHICH HAVE AT LEAST 2 TESTS, OPTION FOR TIME RANGE RESTRICTION
library(lubridate)
sir.data$event.date<-as.Date(as.character(sir.data$EntryDate),format="%Y%m%d")

aggregate(as.numeric(year(strptime(sir.data$event.date, format="%Y-%m-%d"))), list(sir.data$PatientID), range) -> ranges
ranges$x[,2] - ranges$x[,1] -> ranges$range
ranges[(which(ranges$range<2)),1] -> range_short_ids    # define exclusion range as 2 years
crea.rep[-which(crea.rep$PatientID %in% range_short_ids),]->crea.rep 
unique(crea.rep$PatientID)#5950/6623
save(crea.rep,file="crea.rep2yrsall.rda")
#save(crea.rep,file="crea.rep2yrsalld.rda")

#IF DELETING DELAYED PATHOLOGY DUPLICATES
#names(sir.data)
#d<-sir.data[duplicated(sir.data[,c(1,2,6,15)])&sir.data$ReadCode %in% Pathology1.csv$ReadCode,c(1,2,6,7,15)]
#d<-unique(d[order(as.Date(d$EntryDate,format="%Y%m%d")),,drop=FALSE,fromLast=FALSE]) #REMAINING DUPLICATES BASED ON SAME VALUE AND CALENDAR MONTH BUT DIFFERENT SOURCE LOCATION
#THE EARLIEST OF THE DUPLICATES IS KEPT.

#Then use strptime (strip time) to convert dates into POSIX format
#date_vec <-strptime(paste(crea.rep$entry.date), "%Y/%m/%d")
#compare observation 1 and 2, 2 and 3, 3 and 4...
#first_date <- date_vec[1:(length(date_vec)-1)]
#second_date <- date_vec[2:length(date_vec)]
#second_gap <- difftime(second_date, first_date, units="days")
#Determine the gaps that are less than 30 days apart.Leave TRUE in to keep 1st instance
#dup_index <- second_gap>10
#dup_index <- c(TRUE, dup_index)
#dat[dup_index, ]