-
Notifications
You must be signed in to change notification settings - Fork 0
/
NIAGADS_data_cleaning_shell_script_using_bluelab_pipeline.sh
492 lines (343 loc) · 21 KB
/
NIAGADS_data_cleaning_shell_script_using_bluelab_pipeline.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
data.dir <- "/nfs/beluga0_home/DATA/NIAGADS/NG00020/data/"
initial_qc.dir <- "/nfs/beluga0_home/DATA/NIAGADS/NG00020/initial_qc/out/"
# 1. Convert plink files from build 36 to build 37
## produces:
### plink files (ignore): /nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/data
### updated-map gds (to be used for variant QC, PC and GRM generation, and association testing): /nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/data/D_LOAD_families_010413_aut.gds
```{r}
map_37 <- read.table("/nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/data/Human610-Quadv1_H.bed", header = FALSE, skip = 1)
#Load Plink map file to compair against
bim <- read.table("/nfs/beluga0_home/DATA/NIAGADS/NG00020/NG00020_genotype_files/LOAD_families_010413.bim")
head(bim)
# V1 V2 V3 V4 V5 V6
#1 10 rs35418599DB 0 59083 0 0
#2 10 cnvi0015449DB 0 78874 0 0
#Since variant IDs end in DB, and don't want to alter this file, will add DB to end of variant IDs in map_37
colnames(bim) <- c("chr", "variant.name", "cM", "pos", "allele_1", "allele_2")
colnames(map_37) <- c("chr", "start_pos", "end_pos", "variant.name")
#Note: variant.ids that we want (rsIDs) are further down in file
#Note: SNP position corresponds with end pos. Plink only wants a single position
head(map_37)
# chr start_pos end_pos variant.name
#1 9 139906358 139906359 200003
#2 9 139926401 139926402 200006
#3 2 220084901 220084902 200047
map_37$chr <- gsub("chr", "", map_37$chr)
map_37$variant.name <- paste0(map_37$variant.name, "DB")
sum(map_37$variant.name %in% bim$variant.name)/nrow(map_37)
#[1] 1 #All variants from map file are in bim file
sum(bim$variant.name %in% map_37$variant.name)/nrow(bim)
#[1] 0.9897101 #99% of variants from bim file are in map file
#What about autosomes only?
sum(bim[bim$chr != "X" & bim$chr != "XY" & bim$chr != "Y",]$variant.name %in% map_37[map_37$chr != "X" & map_37$chr != "Y",]$variant.name)/sum(bim$chr != "X" & bim$chr != "XY" & bim$chr != "Y")
#[1] 0.9902216
#Note: variants in gds that aren't in map file are unchanged when converting in plink, so
#Make a vector of variants not in build 37 to remove from plink files
exclude_SNPs <- bim[!(bim$variant.name %in% map_37$variant.name), "variant.name", drop = TRUE]
length(exclude_SNPs)
#[1] 6389
write(exclude_SNPs, "/nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/data/exclude_SNPs_plink.txt", ncolumns = 1)
#Generate plink friendly files
map_37_pos <- map_37[, c("variant.name", "end_pos")]
map_37_chr <- map_37[, c("variant.name", "chr")]
write.table(map_37_pos, "/nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/data/map_37_pos.txt", col.names = FALSE, quote = FALSE, row.names = FALSE)
write.table(map_37_chr, "/nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/data/map_37_chr.txt", col.names = FALSE, quote = FALSE, row.names = FALSE)
```
```{bash}
cd /nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/data
```
```{bash}
## Remove SNPs not in map 37 file
~/programs/plinkV1.9/plink --noweb --bfile /nfs/beluga0_home/DATA/NIAGADS/NG00020/NG00020_genotype_files/LOAD_families_010413 --exclude exclude_SNPs_plink.txt --make-bed --out LOAD_families_010413_only_build37_SNPs
#620901 variants loaded from .bim file.
#614512 variants remaining.
## update pos
~/programs/plinkV1.9/plink --noweb --bfile A_LOAD_families_010413_only_build37_SNPs --update-map map_37_pos.txt --make-bed --out B_LOAD_families_010413_new_pos
#614512 variants and 9396 people pass filters and QC.
#Note: variants will no be ordered by new positions until make-bed is run again!
## update chr
~/programs/plinkV1.9/plink --noweb --bfile B_LOAD_families_010413_new_pos --update-map map_37_chr.txt --update-chr --make-bed --out C_LOAD_families_010413_new_pos_chr
#614512 variants and 9396 people pass filters and QC.
## exclude non-autosomes
~/programs/plinkV1.9/plink --noweb --bfile C_LOAD_families_010413_new_pos_chr --autosome --make-bed --out D_LOAD_families_010413_aut
#594742 variants and 9396 people pass filters and QC.
```
```{bash}
#Convert to GDS
/acct/hkings/git/blue_pipeline/assoc_pipeline/plink_to_gds.R D_LOAD_families_010413_aut
```
# 2. Combine phenotype files and perform QC
## Produces:
### annotated phenotype dataframe (PCs will be added and will be used for assoc testing): /nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/pheno/out/pheno_adf.rds
### sample filter (to be used for generatign PCs and GRM): /nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/pheno/out/pass_pheno_qc.rds
### summary figures and tables
```{r}
#Note: Walk through script in R, don't execute as a singel script
/nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/pheno/pheno_qc.Rmd
#Output files are here: /nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/pheno/out
```
# 3. Variant and sample QC - check for missingness
## produces:
### summary of sample missingness: /nfs/beluga _home/ANALYSIS/NIAGADS/NG00020/hkings/initial_qc/out/missing_by_sample.rds
### sample missingness figures
### Note there are no samples that fail based on our missingngess filters. If there were, these would need to be intersected with those that fail phenotype QC
### summary of variant qc measures: /nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/initial_qc/out/variant_metrics.rds
### variant filter (to be fed to LD-pruning; doe not include HWE filtering, this will be added before association testing): /nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/initial_qc/out/vars_qc_keep.rds
### variant missingness figures
```{bash}
## sample QC
/acct/hkings/git/blue_pipeline/assoc_pipeline/sample_qc.R /nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/data/D_LOAD_families_010413_aut.gds --out_prefix /nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/initial_qc/out/
#The samples with 100% missingness are the "dummy" founder samples included for plink files
## variant QC
### Note: must include sampel filter because there are a lot of dummy samples and otherwise this will lead to high missingness
/acct/hkings/git/blue_pipeline/assoc_pipeline/variant_qc.R /nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/data/D_LOAD_families_010413_aut.gds --sample_id /nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/pheno/out/pass_pheno_qc.rds --out_prefix /nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/initial_qc/out/
```
#Extract variant and sample filters from dataframes
```{r samp_qc}
library(dplyr)
samp_qc <- readRDS("/nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/initial_qc/out/missing_by_sample.rds")
summary(samp_qc$missing.rate)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
#0.03986 0.04020 0.04201 0.46705 1.00000 1.00000
```
There are variants with high missingness...
```{r varmet}
varmet <- readRDS("/nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/initial_qc/out/variant_metrics.rds")
summary(varmet$missing.rate)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
#0.0000000 0.0002267 0.0004533 0.0409082 0.0009066 1.0000000
```
...as well as rare variants...
```{r maf}
summary(varmet$maf)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.0000 0.1044 0.2176 0.2286 0.3526 0.5000
nrow(varmet)
#[1] 594742
...so we'll create a variant keep list:
#Get counts:
nrow(varmet[!(varmet$missing.rate < 0.05),])
#[1] 24745 #excluded for missingness
nrow(varmet[varmet$maf < 0.05 & varmet$maf > 0,])
#[1] 49050 #excluded for MAF > 0.05
keep_vars <- filter(varmet, missing.rate < 0.05, !(maf < 0.05 & maf > 0))$variant.id
length(keep_vars)
#[1] 521088
saveRDS(keep_vars, "/nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/initial_qc/out/vars_qc_keep.rds")
```
# 4. Develop PCs and GRM
## produces: (all files in /nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/pcs_grm_pheno_only/out)
### pruned SNP list:pruned_snps.rds
### intermediate PC and GRM files (king, and A PC and GRM files)
### final PC files:b_pcair.rds (complete PC-Air object), b_pcair_pcs.rds (just the PCs to be merged with phenotype file for assoc. testing), b_pcair_unrels.rds, b_pcair_rels.rds
### final GRM files:b_pcr_mat.rds (relatedness matrix to be passed to association testing), b_pcrelate.rds (complete PC-Relate object)
### Plots:
#from PC-Relate: b_pcrelate_kinship.png (kinship plot)
#from PC-AiR: b_paracord.png, b_pairs.png (pairwise PC scatterplots), b_pc12.png (PC1xPC2), b_pc34.png (PC3xPC4), b_pc_scree.png (% variance explained by each PC)
```{bash}
cd /nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/pcs_grm_pheno_only/out
## run LD-pruning:
/nfs/beluga2_home/hkings/git/blue_pipeline/assoc_pipeline/ld_pruning.R /nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/data/D_LOAD_families_010413_aut.gds --variant_id /nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/initial_qc/out/vars_qc_keep.rds --sample_id /nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/pheno/out/pass_pheno_qc.rds > LD_pruning.log &
#Used defaults for:
#maf: 0.05
#missingness: 0.05
#r-threshold: sqrt(0.1)
#Did not excldue "problematic" PC regions like HLA or LCT
#95,639 markers are selected in total.
## Run King
/nfs/beluga2_home/hkings/git/blue_pipeline/assoc_pipeline/king_grm.R /nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/data/D_LOAD_families_010413_aut.gds --variant_id pruned_snps.rds --sample_id /nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/pheno/out/pass_pheno_qc.rds > King.log &
## Plot King
/nfs/beluga2_home/hkings/git/blue_pipeline/assoc_pipeline/kinship_plot.R king_out.rds --is_king --out_file king_kinship.png > King_plot.log &
## Run 1st iteration PC air
/nfs/beluga2_home/hkings/git/blue_pipeline/assoc_pipeline/pcair.R /nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/data/D_LOAD_families_010413_aut.gds king_grm.rds king_grm.rds --out_prefix a_ --variant_id pruned_snps.rds --sample_id /nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/pheno/out/pass_pheno_qc.rds --num_core 4 > a_PC_AiR.log &
#The first file is the kin_file and the secodn file is the div_file
#used defaults for:
#kin_thresh: 2 ^ (-9 / 2)
#div_thresh: 2 ^ (-9 / 2)
## Run 1st iteration PC-Relate
/nfs/beluga2_home/hkings/git/blue_pipeline/assoc_pipeline/pcrelate.R /nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/data/D_LOAD_families_010413_aut.gds a_pcair.rds --sample_block_size 6000 --variant_block 10000 --out_prefix a_ --n_pcs 4 --variant_id pruned_snps.rds --sample_id /nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/pheno/out/pass_pheno_qc.rds > a_PC_relate.log &
#used defaults for:
#scale_kin: 1
#sparse_thresh: 0, "Threshold for sparsifying GRM (will be multiplied by scale_kin)"
add_argument("--small_samp_correct",
"Flag to implement small-sample correction",
flag = TRUE) %>%
## Run 2nd iteraction PC-AiR
/nfs/beluga2_home/hkings/git/blue_pipeline/assoc_pipeline/pcair.R /nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/data/D_LOAD_families_010413_aut.gds a_pcr_mat.rds king_grm.rds --out_prefix b_ --variant_id pruned_snps.rds --sample_id /nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/pheno/out/pass_pheno_qc.rds --num_core 4 > b_PC_AiR.log &
#The first file is the kin_file and the second file is the div_file (note: that now the PC-Relate output is used as the kin_file)
#used defaults for:
#kin_thresh: 2 ^ (-9 / 2)
#div_thresh: 2 ^ (-9 / 2)
## Plot PCs
/nfs/beluga2_home/hkings/git/blue_pipeline/assoc_pipeline/pca_plots.R b_pcair.rds --phenotype_file /nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/pheno/out/pheno_adf.rds --group race_ethnicity --n_pairs 6 --out_prefix b_ > PC_plots.log &
## Run 2nd iteration PC-Relate
/nfs/beluga2_home/hkings/git/blue_pipeline/assoc_pipeline/pcrelate.R /nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/data/D_LOAD_families_010413_aut.gds b_pcair.rds --out_prefix b_ --sample_block_size 6000 --variant_block 10000 --n_pcs 4 --scale_kin 2 --variant_id pruned_snps.rds --sample_id /nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/pheno/out/pass_pheno_qc.rds > b_PC_relate.log &
#Note: scale_kin 2 argument added
## Plot PC-Relate
/nfs/beluga2_home/hkings/git/blue_pipeline/assoc_pipeline/kinship_plot.R b_pcrelate.rds --out_file b_pcrelate_kinship.png > pcrelate_plot.log &
```
# 5. Set up files for association tests: Exclude identical twins from sample keep vector and add PCs to annotated phenotype dataframe
```{r}
## 5a. Identify identical twins & generate keep_samples vector that excludes them
"%notin%" <- Negate("%in%")
#Read in PCrelate object and existing keep_samples vector
library(GENESIS)
mypcrel <- readRDS("/nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/pcs_grm_pheno_only/out/b_pcrelate.rds")
kinship <- mypcrel$kinBtwn
#Read in keep_samples file
keep_samples <- readRDS("/nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/pheno/out/pass_pheno_qc.rds")
#Find likely identical twins
identical_twins <- kinship[kinship$kin > 0.45, c("kin", "k2", "ID1", "ID2")]
identical_twins
saveRDS(identical_twins, "/nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/pheno/out/identical_twins.rds")
#Remove identical twins from keep_samples vector
exclude_twins <- kinship[kinship$kin > 0.45, "ID1"] #ID1 is always the lowest of the 2 number IDs
sum(keep_samples %in% exclude_twins)
#[1] 13
keep_samples_noTwins <- keep_samples[keep_samples %notin% exclude_twins]
saveRDS(keep_samples_noTwins, file = "/nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/pheno/out/pass_pheno_qc_noTwins.rds")
length(keep_samples_noTwins)
#[1] 4399
exclude_twins2 <- append(exclude_twins, kinship[kinship$kin > 0.45,
## 5b. Add PCs to annotated phenotype file
library(Biobase)
library(SeqArray)
#Read in phenotype file
annot <- readRDS("/nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/pheno/out/pheno_adf.rds")
#extract just phenotype info:
phenotype <- pData(annot)
# Get PCs
pca <- readRDS("/nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/pcs_grm_pheno_only/out/b_pcair.rds")
#get gds and extract sample IDs
gds <- seqOpen("/nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/data/D_LOAD_families_010413_aut.gds")
gds.id <- seqGetData(gds, "sample.id")
##The null model requires a sample ID column:
phenotype$sample.id <- phenotype$SUBJ_NO
##Read in PCA covariates -will read in 8 but expect to use 4 (based on: file:///C:/Users/hking/OneDrive/my%20documents/UW/thesis%20and%20research/NIAGADS/NG00020_setup.html)
pcs.df <- as.data.frame(pca$vectors[,1:8])
colnames(pcs.df) <- c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8")
pcs.df$sample.id <- row.names(pcs.df)
#Add PCA covariates to phenotype data of interest by subject number
#Only select columns of interest
merged_phen <- merge(phenotype[, c("sample.id", "SEX", "Case_Control", "ConType", "BirthYr_pheno", "AgeDeath", "Race", "Hispanic", "race_ethnicity", "E2", "E4", "E3", "E4_carrier", "E4_het_hom", "E2_carrier", "E2_het_hom", "age", "age_source", "surv", "devres", "martingale", "pass_qc")], pcs.df, by = "sample.id", all.x=TRUE)
#order phenotype df to match order of gds ids
merged_phen <- merged_phen[match(gds.id, merged_phen$sample.id),]
identical(as.character(merged_phen$sample.id), as.character(gds.id))
colnames(merged_phen)
##annotate data:
metadata <- data.frame(labelDescription = c(
"sample.id",
"sex (1= male, 0 = female)",
"case-control status",
"ConType",
"birth year",
"age at death",
"reported race",
"reported hispanic status",
"combined descriptor of race and Hispanic status",
"count of E2 allele",
"count of E4 allele",
"count of E3 allele",
"whether they are a carrier of the E4 allele (1 = heterozygous or homozygous for E4)",
"whether they are a heterozygote (0) or homozygote (1) for E4 allele; homozygote for non-E4 = NA",
"whether they are a carrier of the E2 allele (1 = heterozygous or homozygous for E2)",
"whether they are a heterozygote (0) or homozygote (1) for E2 allele; homozygote for non-E2 = NA",
"age censored; estimated age of symptom onset (AgeDem) for cases; AgeAtLastEval or (if not available) Sampled.Age for controls",
"first principal component",
"second principle component",
"third principle component",
"fourth principle component",
"fifth principle component",
"sixth principle component",
"seventh principle component",
"eighth principle component",
"survival time / censoring age",
"the source of the censoring age variable",
"deviance residuals from survival data",
"martingale residuals from survival data",
"Whether they passed phenotype QC filters, does not account for identical twins"
))
#Re-generate annotated dataframe
#(This is a bit round about... it would be better to just add PCs to already made annotated dataframe, but I couldn't figure out how to do that)
annot <- AnnotatedDataFrame(merged_phen, metadata)
#Save annot as an R object
saveRDS(annot, file = "/nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/pheno/out/pheno_for_assoc.rds")
```
# 6. Calculate and plot PC correlations
## produces: (all files are in /nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/pcs_grm_pheno_only/out)
### pc-correlation file: /nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/pcs_grm_pheno_only/out/snp_corr.rds
### pc-correlation plots: /nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/pcs_grm_pheno_only/out/pc_corr.png
### zoomed-in plots from PC2 correlation: /nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/pcs_grm_pheno_only/out/PC2_chr<>_corr_peak.png
```{BASH}
cd /nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/pcs_grm_pheno_only/out
R -q --vanilla --args --pca_file "b_pcair.rds" --gds_file "/nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/data/D_LOAD_families_010413_aut.gds" --block-size 60000 --outfile "snp_corr.rds" < /nfs/beluga2_home/hkings/NIAGADS/8_30_scripts/calculate_SNP_PC_corr.R > calculate_snp_pc_corr.log &
#Note: Walk through script in R, don't execute as a single script
#MUST UPDATE WHEN HWE COMPLETED
/nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/pcs_grm_pheno_only/plot_snp_pc_corr.R
```
# 7. Test HWE in set of unrelated controls, controling for ancestry through PCs
# 8. Association testing:
See README's in AAO, AD, E4_carrier, and E2_carrier
#Shared files for assoc testing:
#phenotype file: /nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/pheno/out/pheno_for_assoc.rds
#sample filter: /nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/pheno/out/pass_pheno_qc_noTwins.rds
#gds: /nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/data/D_LOAD_families_010413_aut.gds
#GRM: /nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/pcs_grm_pheno_only/out/b_pcr_mat.rds
#TEMPORARY variant filter (no HWE applied, so need to be updated):/nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/initial_qc/out/vars_qc_keep.rds
# 9. Additional data summary for paper:
```{r}
library(SeqArray)
pheno <- readRDS("/nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/pheno/out/pheno_for_assoc.rds")
pheno2 <- pData(pheno)
## 9a. Sample summaries
### Filter phenotype data based on sample filter
pheno2 <- pheno2[pheno2$sample.id %in% samps,]
samps <- readRDS("/nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/pheno/out/pass_pheno_qc_noTwins.rds")
length(samps)
#[1] 4399
table(pheno2$Case_Control)
# 0 1
#2221 2178
## 9b. Variant summaries - NOT DONE, need HWE
vars <- readRDS("vars_for_assoc_test")
```
## 9c. calculate type I error rate
cd /nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/calculate_typeI_error
```{r}
# Determine Bonferoni-corrected p-value threshold using genetic type I error calculator (GEC)
#What are variant IDs look like in bim file (column 2)?
head /home/DATA/NIAGADS/NG00020/NG00020_genotype_files/LOAD_families_010413.bim
#10 rs35418599DB 0 59083 0 0
#10 cnvi0015449DB 0 78874 0 0
#What do sample ids look like in fam file (column 2)?
head /home/DATA/NIAGADS/NG00020/NG00020_genotype_files/LOAD_families_010413.fam
#1600 160001 0 0 1 -9
#1600 160003 0 0 2 -9
#Convert variant filter into a line-seperated list by rsid and cnvi to match bin file
#--extract accepts list of variant IDs as 1 per line or space-seperated
library(SeqArray)
library(SeqVarTools)
gds <- seqOpen("/nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/data/D_LOAD_families_010413_aut.gds")
head(seqGetData(gds, "annotation/id"))
#[1] "rs35418599DB" "cnvi0015449DB"
keep_vars <- readRDS("vars_for_assoc_test")
seqSetFilter(gds, variant.id = keep_vars)
#Save a vector of filtered variant IDs
keep_vars_id <- seqGetData(gds, "annotation/id")
write(keep_vars_id, file = "vars_qc_keep_GEC.txt", ncolumns = 1)
#Save sample filter in format for plink
#--keep accepts a space/tab-delimited text file with family IDs in the first column and within-family IDs in the second column
keep_samps <- readRDS("/nfs/beluga0_home/ANALYSIS/NIAGADS/NG00020/hkings/pheno/out/pass_pheno_qc_noTwins.rds")
#For some reason, this seems to also require family IDs, so will add those from the fam file
fam <- read.table("/home/DATA/NIAGADS/NG00020/NG00020_genotype_files/LOAD_families_010413.fam")
IDs_keep <- fam[fam$V2 %in% keep_samps, c("V1", "V2")]
nrow(IDs_keep)
#[1] 4385
write.table(IDs_keep, file = "pass_pheno_qc_noTwins_GEC.txt", sep = " ", col.names = F, row.names = F)
```
# Remove non-QC passing variants and excluded samples from Plink binary files
~/programs/plinkV1.9/plink --bfile /home/DATA/NIAGADS/NG00020/NG00020_genotype_files/LOAD_families_010413 --extract vars_qc_keep_GEC.txt --keep pass_pheno_qc_noTwins_GEC.txt --make-bed --out plink_filt
```
# Calculate Type I error rate
java -jar gec.jar --effect-number --plink-binary plink_filt --genome --error 0.05 --out type_1_error_thresh
```