This repository has been archived by the owner on Nov 10, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
bt-prepdata.R
326 lines (226 loc) · 12.7 KB
/
bt-prepdata.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
#####################################################################################################################
# This script is run to prepare data for twin modelling (wide format)
# using the BATSS sample for the paper:
# The latent structure of emerging cognitive abilities: an infant twin study.
# Bussu G., Taylor M., Tammimies K., Ronald A., Falck-Ytter T.
#
# coded by Giorgia Bussu, June 2022
#
####################################
# The script needs as input developmental and demographic scores from the entire BATSS sample in the long format.
#
# The script gives as output the selected dataset based on inclusion criteria, in wide (twin_data.csv) and long format (phenotypic_data.csv).
#
#####################################################################################################################
rm(list=ls())
### set a working directory (otherwise create a new R project):
setwd('your folder')
list.files()
### import and check data:
data_msel <- readxl::read_xlsx('Mullen.xlsx')
names(data_msel);dim(data_msel)
data_demo <- readxl::read_xlsx('Demographics_5m.xlsx')
names(data_demo);dim(data_demo)
data_exclusion<-readxl::read_xlsx('BabyTwins background summary and exclusion.xlsx')
names(data_exclusion); dim(data_exclusion)
### checking match across files
length(which(data_msel$`Participant EASE Code`!=data_demo$`Participant EASE Code`| data_msel$`Participant EASE Code`!=data_exclusion$Code | data_demo$`Participant EASE Code`!=data_exclusion$Code))
mm_indx<-which(data_msel$`Participant EASE Code`!=data_demo$`Participant EASE Code`| data_msel$`Participant EASE Code`!=data_exclusion$Code | data_demo$`Participant EASE Code`!=data_exclusion$Code)
# MSEL file had twins swapped within 5 pairs -> now matching to demographic and exclusion files
msel_matchingindx<-match(data_msel$`Participant EASE Code`,data_exclusion$Code)
data_msel<-data_msel[msel_matchingindx,]
#####################################################################################################################
### exclusion based on general criteria
#####################################################################################################################
excl_indx<-which(data_exclusion$`Exclusion version A`==1)
### additional exclusion criteria: exposure to Swedish
# select those who have Swedish as 2nd or 3rd language
lang2_indx<-which(data_demo$`Heard language 2`=='Swedish')
lang3_indx<-which(data_demo$`Heard language 3`=='Swedish')
# translate % exposure to language to numeric variable
data_demo$language2_home<-as.numeric(gsub("%","",data_demo$`Heard language 2- Proportion Heard by Child at home`))
data_demo$language2_else<-as.numeric(gsub("%","",data_demo$`Heard language 2- Proportion heard by child elsewhere`))
data_demo$language3_home<-as.numeric(gsub("%","",data_demo$`Heard language 3- Proportion Heard by Child at home`))
data_demo$language3_else<-as.numeric(gsub("%","",data_demo$`Heard language 3- Proportion heard by child elsewhere`))
# select those with exposure<10% NB here the indx goes over those who have Swedish as 2nd/3rd language, not whole sample
excl_indx_lang2<-which((data_demo$language2_else[lang2_indx]+data_demo$language2_home[lang2_indx])<0.1)
excl_indx_lang3<-which((data_demo$language3_else[lang3_indx]+data_demo$language3_home[lang3_indx])<0.1)
# exclusion indx reported to the whole sample
excl_indx_lang<-c(lang2_indx[excl_indx_lang2],lang3_indx[excl_indx_lang3])
# entire list exclusion criteria
to_exclude<-c(excl_indx,excl_indx_lang)
# clean data files from excluded participants
data_demo_clean<-data_demo[-to_exclude,]
data_msel_clean<-data_msel[-to_exclude,]
data_exclusion_clean<-data_exclusion[-to_exclude,]
# add dataset background info
data_background<- readxl::read_xlsx('Background_5m.xlsx')
backmatch<-match(data_background$Kod,data_msel_clean$`Participant EASE Code`)
data_background_matched<-data_background[which(!is.na(backmatch)),]
# double check background match
head(cbind(data_background_matched$Kod,data_msel_clean$`Participant EASE Code`))
which(data_background_matched$Kod!=data_msel_clean$`Participant EASE Code`)
# include only valid msel
indx_valid<-which(data_msel_clean$`Data validity code`=='Valid')
table(data_msel_clean$`Data validity code`)
indx_invalid<-which(data_msel_clean$`Data validity code`=='Invalid')
indx_unknown<-which(data_msel_clean$`Data validity code`=='Not-known')
data_msel_clean<-data_msel_clean[indx_valid,]
data_demo_clean<-data_demo_clean[indx_valid,]
data_background_matched<-data_background_matched[indx_valid,]
data_exclusion_clean<-data_exclusion[indx_valid,]
# add data on pregnancy
data_pregnancy<- readxl::read_xlsx('pregnancy_time_data.xlsx')
indx_preg<-match(data_msel_clean$`Participant EASE Code`,data_pregnancy$ID)
preg_week<-data_pregnancy$Gweeks[indx_preg]
preg_day<-data_pregnancy$Gdays[indx_preg]
preg_day[which(is.na(preg_day))]<-0
preg_term<-preg_week*7+preg_day
#####################################################################################################################
### dataset included in our analyses
#####################################################################################################################
data<-data.frame(cbind(data_msel_clean$`Participant EASE Code`,data_exclusion_clean$Zygosity,data_msel_clean$Gender,data_msel_clean$`Age at Date of Assessment (days)`,data_msel_clean$GM_raw_score,data_msel_clean$FM_raw_score,data_msel_clean$VR_raw_score,data_msel_clean$RL_raw_score,data_msel_clean$EL_raw_score,data_msel_clean$`Early Learning Composite Score`,preg_term,data_demo_clean$`Bio Mum Age`,data_demo_clean$`Bio Dad Age`,data_demo_clean$`A. Highest level of education`,data_demo_clean$`B. Highest level of education`,data_background_matched$`F16. Ungef?r hur h?g ?r familjens* gemensamma m?nadsinkomst innan skatt (l?n och andra ers?ttningar/bidrag)?`,data_demo_clean$TWAB,data_demo_clean$`Twin pair no.`))
names(data)<-c('id','zygosity','sex','age','GM','FM','VR','RL','EL','ELC','term_age','Mum_age','Dad_age','A_edu','B_edu','family_income','TWAB','Twinpair')
# parental education level to MAX within family
data$A_edu_num<-as.numeric(as.factor(data$A_edu))
data$A_edu_num[which(data$A_edu_num==4)]<-6
data$A_edu_num[which(data$A_edu_num==5)]<-4
data$A_edu_num[which(data$A_edu_num==6)]<-5
data$B_edu_num<-as.numeric(as.factor(data$B_edu))
data$B_edu_num[which(data$B_edu_num==4)]<-6
data$B_edu_num[which(data$B_edu_num==5)]<-4
data$B_edu_num[which(data$B_edu_num==6)]<-5
data$edu_mean<-rowMeans(cbind(data$A_edu_num,data$B_edu_num),na.rm = T)
# parental age to mean across mum and dad (NA.RM=TRUE for 3 missing Dad age)
data$Mum_age<-as.numeric(data$Mum_age)
data$Dad_age<-as.numeric(data$Dad_age)
data$parental_age<-rowMeans(cbind(data$Mum_age,data$Dad_age),na.rm = T)
#####################################################################################################################
##### basic data tidying
#####################################################################################################################
names(data);dim(data)
data<-data[,-c(12:15,19,20)]
table(data$zygosity)
# check how many pairs are incomplete by making a variable that counts
# the frequency of their pair ID:
data <- merge(data,data.frame(table(Twinpair=data$Twinpair)),by='Twinpair')
table(data$Freq) # check how many pair IDs appear only once
# carry out the exclusion + make sure you keep singletons as NA:
data$Twinpair[which(data$Freq==1)]
data_test<-rbind(data[1:5,],data[5:76,],data[76:91,],data[91:114,],data[114:115,],data[115:168,],data[168:177,],data[177:236,],data[236:361,],data[361:474,],data[474:521,],data[521:567,])
data_test <- merge(data_test,data.frame(table(id=data_test$id)),by='id')
naindx<-matrix(which(data_test$Freq.y==2),nrow=2)[1,]
data_test$GM[naindx]<-NA
data_test$FM[naindx]<-NA
data_test$VR[naindx]<-NA
data_test$RL[naindx]<-NA
data_test$EL[naindx]<-NA
data_test$ELC[naindx]<-NA
data_test$TWAB[naindx[which(data_test$TWAB[naindx]==1)]]<-2
data_test$TWAB[naindx[-which(data_test$TWAB[naindx]==1)]]<-1
data<-data_test
dim(data)
# remove the frequency variable:
data <- data[-c(17,18)]
#####################################################################################################################
##### transform data for twin analysis
#####################################################################################################################
# binary sex: 0=Females; 1=Males.
data$sex[which(data$sex=='Male')]<-1
data$sex[which(data$sex=='Female')]<-0
data$sex<-as.numeric(data$sex)
# ordinal discrete income
data$income<-as.numeric(as.factor(data$family_income))
data$income[which(data$income==2)]<-12
data$income[which(data$income==1)]<-2
data$income[which(data$income==11)]<-1
data$income[which(data$income==12)]<-11
data<-data[,-c(13)]
# numeric variables
vars <- colnames(data)[c(2,5:15)]
data[vars]<-lapply(data[vars],as.numeric)
### check the distribution of the Mullen scales:
vars <- colnames(data)[c(6:11)]
lapply(data[vars],psych::describe)
### control for the effects of covariates: sex, age, parental age, income, education.
data$age<-scale(data$age)
data$term_age<-scale(data$term_age)
data$parental_age<-scale(data$parental_age)
data$income<-scale(data$income)
data$edu_mean<-scale(data$edu_mean)
resGM <- resid(lm(scale(GM)~sex+age+term_age,data=data,na.action=na.exclude))
resFM <- resid(lm(scale(FM)~sex+age+term_age,data=data,na.action=na.exclude))
resVR <- resid(lm(scale(VR)~sex+age+term_age,data=data,na.action=na.exclude))
resRL <- resid(lm(scale(RL)~sex+age+term_age,data=data,na.action=na.exclude))
resEL <- resid(lm(scale(EL)~sex+age+term_age,data=data,na.action=na.exclude))
#resELC<- resid(lm(ELC~sex+age+term_age,data=data,na.action=na.exclude))
rawComposite<-data$GM+data$FM+data$VR+data$RL+data$EL
rawComposite<-15*scale(rawComposite)+100
resRAWC <- resid(lm(scale(rawComposite)~sex+age+term_age,data=data,na.action=na.exclude))
# standardize the residuals and add them to the data:
data$GMstd <- resGM
data$FMstd <- resFM
data$VRstd <- resVR
data$RLstd <- resRL
data$ELstd <- resEL
data$ELCstd <- resRAWC
# check that the standardization worked:
stand <- colnames(data)[c(17:22)]
lapply(data[stand],psych::describe)
#####################################################################################################################
######## Randomize twin order
#####################################################################################################################
### give each person a random number
set.seed(2022)
npairs <- nrow(data)/2
rand <- c(rep(4,npairs),rep(5,npairs))
data$rand <- sample(rand) # randomly choose 1 number for each person
### divide the twins into two subsets
twin1 <- subset(data,TWAB==1)
twin2 <- subset(data,TWAB==2)
### flip twin 2's random number
randVar <- data.frame(cbind(twin1$Twinpair,twin1$rand))
colnames(randVar) <- c('Twinpair','rand')
twin2 <- twin2[-c(23)]
twin2 <- merge(twin2,randVar,by='Twinpair')
# check
table(twin1$rand);table(twin2$rand)
# for twin 1, convert the random numbers to 0s and 1s:
twin1$rand[twin1$rand==4] <- 0
twin1$rand[twin1$rand==5] <- 1
# and then recode twin 2, but in the opposite direction to twin 1:
twin2$rand[twin2$rand==4] <- 1
twin2$rand[twin2$rand==5] <- 0
# check:
table(twin1$rand);table(twin2$rand)
# now join twin 1 and twin 2 back together:
data <- data.frame(rbind(twin1,twin2))
names(data);dim(data)
# sort the data by pairnr and check the first few rows:
data <- data[order(data[,2]),]
head(data)
# split the data by random number:
twin1 <- subset(data,rand==1)
twin2 <- subset(data,rand==0)
# check that the dimensions are the same:
dim(twin1);dim(twin2)
# add '1' to the end of the variables for twin 1 and '2' for twin 2:
twin1labs <- paste(colnames(twin1),1,sep='')
twin2labs <- paste(colnames(twin2),2,sep='')
names(twin1) <- twin1labs
names(twin2) <- twin2labs
# combine data so that 1 pair each line
dataD <- data.frame(cbind(twin1,twin2))
# remove unused variables:
dataD <- dataD[-c(6:16,23:39,46)]
# relabel a few variables:
names(dataD)[names(dataD)=='Twinpair1'] <- 'Twinpair'
names(dataD)[names(dataD)=='id1'] <- 'id'
names(dataD)[names(dataD)=='zygosity1'] <- 'zygosity'
names(dataD)[names(dataD)=='sex1'] <- 'sex'
names(dataD)[names(dataD)=='age1'] <- 'age'
#####################################################################################################################
#### Save data
#####################################################################################################################
write.csv(data,'phenotypic_data.csv')
write.csv(dataD,'twin_data.csv')