-
Notifications
You must be signed in to change notification settings - Fork 4
/
PERMIT SIR reshaping and cleaning
252 lines (204 loc) · 11.2 KB
/
PERMIT SIR reshaping and cleaning
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
memory.size(100000) #Assign sufficient memory to R
load ("sir.data.rda") #Long format input file
#load("small.rda") #This is the demographic data from he processed sir.data.rda,
#comprising PatientID, Gender, BirthDate and LSOA
#small<-unique(small)
#sir.data<-merge(sir.data,small,all.x=TRUE)
#save(sir.data,file="sirdata.rda")
#load("sirdata.rda")
library(zoo)
library(plyr)
library(tidyverse)
library(zoo)
library(data.table)
library(survival)
library(lubridate)
#READ CONDITION FILES- suffix names with a common suffix to import together
temp = list.files(pattern="*1.csv")
for (i in 1:length(temp)) assign(temp[i], read.csv(temp[i]))
#######################################################################################
#SELECT PATIENTS 18 OR ABOVE AT HEART FAILURE DIAGNOSIS
hf<-sir.data[sir.data$ReadCode %in% HeartFailure1.csv$ReadCode,] #Create subset of HF patients
length(unique(as.factor(hf$PatientID))) #7254 Individuals had heart failure
hf$Age<-(as.numeric(year(strptime(hf$EntryDate, format="%Y%m%d"))))-hf$BirthYear
hf$hfage<-hf$Age
smalltab<-hf[,c("PatientID","hfage")]
smalltab$PatientID<-as.factor(smalltab$PatientID)
smalltab<-smalltab[!is.na(smalltab$hfage),]
first<-smalltab %>% group_by(PatientID) %>%
slice(which.min(hfage))
ungroup(first)
head(first)
sir.data<-merge(sir.data[sir.data$PatientID %in% hf$PatientID,],as.data.frame(first),all.x=TRUE)
sir.data<-sir.data[sir.data$hfage>=18,]
length(unique(as.factor(sir.data$PatientID))) #7208 hf patients who were 18 or over at first diagnosis
#save(sir.data,file="sirdatahfonly.rda") #full patient records from all adult hf patients from all years
####################################################################################
#COHORT SELECTION
#SELECT PATIENTS WITH CREATININE DATA
sir.data[sir.data$ReadCode=="44J3.",]->crea
crea <- droplevels(crea)
#CHECK FOR CASES WHERE A VIABLE VALUE IS ENTERED IN THE UNITS COLUMN BY MISTAKE
temp<-ifelse(as.numeric(as.character(crea$CodeUnits))>0 & as.numeric(as.character(crea$CodeUnits))<1000 & !is.na(crea$CodeValue),crea$CodeUnits,NA)
table(temp) # 51 records affected, but most not viable values
crea$CodeValue<-ifelse(as.numeric(as.character(crea$CodeUnits))>0 &
as.numeric(as.character(crea$CodeUnits))<1000 &
is.na(crea$CodeValue),crea$CodeUnits,crea$CodeValue)
crea$CodeUnits<-ifelse(!is.na(temp),paste(""),paste(crea$CodeUnits))
crea$CodeValue<-as.numeric(as.character(crea$CodeValue))
crea<-crea[!is.na(crea$CodeValue),]
#length(unique(as.factor(crea$PatientID))) #6971 hf patients over 18 at diagnosis with creatinine data
###########################################################################
#SENSITIVITY TESTS
#HOW MANY ZERO CR VALUES AND HOW MANY CR VALUES UNDER 20
lowcr<-crea[crea$CodeValue<18 & !is.na(crea$CodeValue),]
#length(lowcr$PatientID)#21727
levels(as.factor(lowcr$PatientID))
table(lowcr$CodeValue)
#4121 patients are affected, 471 are zero values
crea<-crea[crea$CodeValue>0,]
lowcr<-crea[crea$CodeValue<18,]
length(lowcr$PatientID)
#ARE THESE FROM OLD ASSAY?
lowcr <- droplevels(lowcr)
head(lowcr)
table(lowcr$CodeUnits)
# % g/L g/mol L micmol/l mmol mmol/L
# 4400 1 1 1 1 6099 1 766
# umol/l umol/L h mol/L
# 5 891607 1 1
#In the first iteration 891607 records were retained.
#Evidently some were omitted due to minor typographic changes
#We can assume post 2008 values unless otherwise marked were completed with the new assay as we believed the lab switched over to this in 2005
#There is only one regional lab so the tests will be consistent, just potentially not the units used for recording.
#Some units may ahve been incorrectly selected from the pulldown menu, so values over 50 should be assumed to be in umol not mmol.
#We can use a simple formula to convert values under 100 in mmol to umol
crea$CodeUnits<-as.factor(crea$CodeUnits)
levels(crea$CodeUnits)
#[1] "" "%" "g/L" "g/mol" "L" "micmol/l"
#[7] "mmol" "mmol/L" "umol/l" "None" "umol/L"
levels(crea$CodeUnits)[c(1,6,9)]<-"umol/L"
levels(crea$CodeUnits)[c(6)]<-"mmol/L"
levels(crea$CodeUnits)[c(2:5)]<-"NA"
crea<-crea[!is.na(crea$CodeUnits),]
crea$CodeValue<-ifelse(crea$CodeUnits=="mmol/L" & as.numeric(crea$CodeValue)<50,(as.numeric(crea$CodeValue)*1000),as.numeric(crea$CodeValue))
crea$CodeUnits<-"umol/L"
summary(crea$CodeValue) #Any remaining high and low values will be removed
crea<-crea[as.numeric(crea$CodeValue)>=20 & !is.na(crea$CodeValue),]
#Upper limit TBA
save(crea,file="crearecleanedincd.rda") #includes dialysis patients
#or save(crea,file="crearecleaned.rda") #no dialysis patients
###############################################################################
#DATA CLEANING
#load("sirdatahfonly.rda") #full patient records from all adult hf patients from all years
#load("crearecleaned.rda") # all steps above completed
length(crea$CodeValue) #412394 records left
#422129 with dialysis pats
#REMOVE SAME DAY CREATININE ENTRIES IF THE SOURCE LOCATION CODE DIFFERS.
names(crea)
crea<-crea[order(crea$PatientID,crea$CodeValue, rev(crea$Source)),]
crea2<-crea[(duplicated(crea[,c(1,4,6)])&!duplicated(crea[,7])),]
length(crea2$CodeValue) #Only 24 same day duplicates are from different sources
crea3<-crea[(duplicated(crea[,c(1,4,6)])),]
length(crea3$CodeValue) #in total, 105665 of 412394 (26%) records have a same day duplicate
#106686 with dialysis
crea<-crea[!rownames(crea) %in% rownames(crea2),]
#REMOVE DELAYED CREATININE ENTRIES FROM SAME CALENDAR MONTH IF THE SOURCE LOCATION CODE DIFFERS.
crea$event.date<-as.Date(as.character(crea$EntryDate),format="%Y%m%d")
(as.numeric(year(strptime(crea$event.date, format="%Y-%m-%d")))) -> year
(as.numeric(month(strptime(crea$event.date, format="%Y-%m-%d")))) -> month
crea$EntryPeriod<-paste(month,year)
crea$Source<-ifelse(crea$Source=="salfordt",paste("2"),paste("1")) #2 for hospital, 1 for GP
crea$Source<-ifelse(is.na(crea$Source),paste("2"),crea$Source)
crea<-crea[order(crea[,1], -(crea[,4]),(crea[,7])),]
crea4<-crea[(duplicated(crea[,c(1,6,14])&!duplicated(crea[,7])),] #DELAYED DUPLICATES (SAME PATIENT, MONTH, VALUE)
length(crea4$PatientID) #- Only 1 of the remaining potential duplicate records outside of the same day window come from different locations
crea5<-crea[duplicated(crea[,c(1,6,14)]),]
table(crea5$Source) #The vast majority of out of range duplicates are multiple tests from within hospital.
crea<-crea[!rownames(crea) %in% rownames(crea4),]
length(crea$PatientID) # 412369 records retained
#422102 with dial included
#SELECT MEAN DAILY CREATININE IF MULTIPLE ENTRIES AFTER REMOVING DELAYED DUPLICATES AND OUT OF RANGE VALUES
smalltab<-crea[,c("PatientID","CodeValue", "EntryDate")]
xcrea<-smalltab %>% group_by(PatientID, EntryDate) %>%
summarize(Creatinine = mean(as.numeric(as.character(CodeValue))))
head(xcrea)
ungroup(xcrea)
crea<-merge(crea,as.data.frame(xcrea),all.x=TRUE)
save(crea,file="crearecleaned.rda")
#or
#save(crea,file="crearecleanedincd.rda")
######################################################
#Add demographic variables from lookup tables
#LSOA and age already added in SIR
#ASSIGN AGE
crea$Age<-(as.numeric(year(strptime(crea$event.date, format="%Y-%m-%d"))))-crea$BirthYear
#CODE ETHNICITY
ethnic.data<-read.table("ethnic.data.csv",header=TRUE,sep=",")
ethnic.data$Category<-floor(ethnic.data$Category)
crea<-merge(crea,ethnic.data,by.x="Ethnicity",by.y="ClinCode2",all.x=TRUE, all.y=FALSE)
names(crea)
crea<-subset(crea, select=-c(Ethnicity,ClinCode1,EntryPeriod,Rubric))
colnames(crea)[which(names(crea) == "Category")] <- "Ethnicity"
save(crea, file = "crea.ongoing.Rdata")
levels(unique(as.factor(crea$PatientID)))
#6918 remaining with hf as an adult, all same day duplicates removed.
#No other duplicate pathology data removed, no data time range applied.
#ADD LSOA
imd<-read.csv("IMD2010.csv")
imd<-imd[,c("LSOA","IMD_Decile2010")]
crea<-merge(crea,imd,all.x=TRUE)
#COHORT SELECTION
#LIMIT TO PATIENTS WITH AT LEAST 2 POST 2008 CREATININE TEST VALUES
crea<-crea[as.numeric(year(strptime(crea$event.date, format="%Y-%m-%d")))>=2008,]
table(crea$PatientID) < 2 -> rare
rownames(as.matrix(rare)) -> ids
table(rare)
crea[!(crea$PatientID %in% ids[rare]),] -> crea.rep
levels(unique(as.factor(crea.rep$PatientID))) # 6589 non-dialysis adult hf patients have 2 or more post-2008 creatinine tests
# 6635 adult hf patients have 2 or more post-2008 creatinine tests
length(crea.rep$PatientID) #284233 remaining, 290959 inc dialysis
sir.data<-sir.data[sir.data$PatientID %in% crea.rep$PatientID,]
#Breakpoint
#######################################################
save(crea.rep, file = "crea.rephf2tests.Rdata")
save(sir.data, file = "sir.datahf2tests.Rdata")
#######################################################
save(crea.rep, file = "crea.rephf2testsd.Rdata")
save(sir.data, file = "sir.datahf2testsd.Rdata")
#######################################################
#ADD IN VARIOUS CONDITIONS AND FLAGS FROM MAIN EHR FILES USING R SCRIPT IN SAME FOLDER
#ADD PRESCRIPTION DATA USING R SCRIPT IN SAME FOLDER
#Narrow lookup table to the full list of codes of interest to speed up processing
#temp = list.files(pattern="*1.csv")
#for (i in 1:length(temp)) assign(temp[i], read.csv(temp[i]))
#merged <- Reduce(function(x, y) merge(x, y, all=TRUE),
#list(AF1.csv,BMI1.csv,BNP1.csv,BUN1.csv,cessation1.csv,DBP1.csv,Diabetes1.csv,Dialysis1.csv,Haemoglobin1.csv,HeartFailure1.csv, HeartRate1.csv,IHD1.csv,MCV1.csv,Nephrectomy1.csv,NTPROBNP1.csv,PVD1.csv,RM1.csv,SBP1.csv,SerumAlbumin1.csv,SerumPotassium1.csv,SerumSodium1.csv,smoking1.csv,transplant1.csv,UACR1.csv,UAlbumin1.csv,UricAcid1.csv))
#sir.data<-sir.data[which(sir.data$PatientID %in% crea.rep$PatientID & sir.data$ReadCode %in% merged$ReadCode),]
##########################################################################
#FROM VERSIONS 061217 onwards:
#SELECT PATIENTS WHICH HAVE AT LEAST 2 TESTS, OPTION FOR TIME RANGE RESTRICTION
library(lubridate)
sir.data$event.date<-as.Date(as.character(sir.data$EntryDate),format="%Y%m%d")
aggregate(as.numeric(year(strptime(sir.data$event.date, format="%Y-%m-%d"))), list(sir.data$PatientID), range) -> ranges
ranges$x[,2] - ranges$x[,1] -> ranges$range
ranges[(which(ranges$range<2)),1] -> range_short_ids # define exclusion range as 2 years
crea.rep[-which(crea.rep$PatientID %in% range_short_ids),]->crea.rep
unique(crea.rep$PatientID)#5950/6623
save(crea.rep,file="crea.rep2yrsall.rda")
#save(crea.rep,file="crea.rep2yrsalld.rda")
#IF DELETING DELAYED PATHOLOGY DUPLICATES
#names(sir.data)
#d<-sir.data[duplicated(sir.data[,c(1,2,6,15)])&sir.data$ReadCode %in% Pathology1.csv$ReadCode,c(1,2,6,7,15)]
#d<-unique(d[order(as.Date(d$EntryDate,format="%Y%m%d")),,drop=FALSE,fromLast=FALSE]) #REMAINING DUPLICATES BASED ON SAME VALUE AND CALENDAR MONTH BUT DIFFERENT SOURCE LOCATION
#THE EARLIEST OF THE DUPLICATES IS KEPT.
#Then use strptime (strip time) to convert dates into POSIX format
#date_vec <-strptime(paste(crea.rep$entry.date), "%Y/%m/%d")
#compare observation 1 and 2, 2 and 3, 3 and 4...
#first_date <- date_vec[1:(length(date_vec)-1)]
#second_date <- date_vec[2:length(date_vec)]
#second_gap <- difftime(second_date, first_date, units="days")
#Determine the gaps that are less than 30 days apart.Leave TRUE in to keep 1st instance
#dup_index <- second_gap>10
#dup_index <- c(TRUE, dup_index)
#dat[dup_index, ]