-
Notifications
You must be signed in to change notification settings - Fork 0
/
running_GWAS_user.R
executable file
·365 lines (324 loc) · 19.5 KB
/
running_GWAS_user.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
##################################################
## Project: DGRPool
## Script purpose: Running GWAS for user-submitted data
## Version: 1.0.0
## Date Created: 2023 Mar 29
## Date Modified: 2024 Aug 06
## Author: Vincent Gardeux (vincent.gardeux@epfl.ch)
##################################################
# Measure execution time
start_time = Sys.time()
# Global script options
options(echo = F)
args <- commandArgs(trailingOnly = TRUE)
## Libraries
suppressPackageStartupMessages(library(data.table)) # Fast reading of files
suppressPackageStartupMessages(library(stringr)) # String utilities
suppressPackageStartupMessages(library(ramwas)) # QQ-Plot & Manhattan plot
suppressPackageStartupMessages(library(car)) # Covariate analysis
suppressPackageStartupMessages(library(rstatix)) # Shapiro test
suppressPackageStartupMessages(library(Cairo)) # For the - in front of -log10(P) y-axis of the PDF outputs
suppressPackageStartupMessages(library(jsonlite)) # JSON output
## Functions
stop_quietly <- function(){
opt <- options(show.error.messages = F)
on.exit(options(opt))
q(save="no", status=1, runLast=FALSE)
}
error.json <- function(displayed) {
stats <- list()
stats$displayed_error = displayed
print(toJSON(stats, method="C", auto_unbox=T))
stop_quietly()
}
# Result file
results <- list()
# Parameters
if(is.null(args) | length(args) < 9){
message("Usage: running_GWAS_user.R parameters\n1. [Required] Path to phenotype file\n2. [Required] Path to PLINK genotype file prefix (bed)\n3. [Required] Path to covariate file\n4. [Required] Path to annotation file\n5. [Required] Output folder path\n6. [Required] Number of threads\n7. [Required] Minimum genotyping ratio\n8. [Required] Minimum MAF\n9. [Required] Filtering threshold for p-value\n10. [Optional] Name of phenotype to load (if file contains multiple phenotypes)")
error.json("You need to input at least 9 parameters: [Phenotype file path][Genotype file prefix][Covariate file path][Annotation file path][Output folder path][Number of threads][Genotype Ratio][Min MAF]")
}
results$pheno_file <- args[1]
results$genotype_file <- args[2]
results$covariate_file <- args[3]
results$annotation_file <- args[4]
results$output_folder <- args[5]
results$output_folder <- gsub(x = results$output_folder, pattern = "\\\\", replacement = "/")
if(!endsWith(results$output_folder, "/")) results$output_folder <- paste0(results$output_folder, "/")
if(!file.exists(results$output_folder)) dir.create(results$output_folder, mode = "755", showWarnings = FALSE)
results$nb_threads <- as.numeric(args[6])
results$genotype_ratio <- as.numeric(args[7])
results$min_maf <- as.numeric(args[8])
results$gwas_options <- paste0("--glm hide-covar --quantile-normalize --variance-standardize --geno ", results$genotype_ratio, " --maf ", results$min_maf) # --geno 0.2 --maf 0.01
results$filtering_pval <- as.numeric(args[9])
results$pheno_name <- NULL
results$pheno_column <- 3 # Default phenotype to read
results$which_sex <- "NA" # Default, in case no information
if(length(args) == 10){
# Check if there is a "name" parameter
results$pheno_name <- args[10]
}
if(length(args) > 10) error.json("You entered too many parameters: ", length(args))
# Read user-submitted phenotyping data
if(!file.exists(results$pheno_file)) error.json(paste0("This file: '",results$pheno_file,"' does not exist!"))
data.dgrp_pheno <- fread(results$pheno_file, sep = "\t", header = T, data.table = F, na.strings = "")
if(colnames(data.dgrp_pheno)[1] != "DGRP") error.json(paste0("First column of ",results$pheno_file," should be the 'DGRP' column"))
# Check sex column
if("sex" %in% colnames(data.dgrp_pheno)){
# Check if it's correctly in col 2
where_sex <- which(colnames(data.dgrp_pheno) == "sex")
if(length(where_sex) > 1) error.json(paste0("Phenotype 'sex' was found multiple times in your phenotyping file. It should be unique, and in column 2."))
if(where_sex != 2) error.json(paste0("Phenotype 'sex' was found in your phenotyping file, but not in the correct column (", where_sex, "). It should be in column 2."))
# Check content of column
data.dgrp_pheno[2][is.na(data.dgrp_pheno[2])] <- "NA"
results$which_sex <- names(table(data.dgrp_pheno[2]))
if(any(!results$which_sex %in% c("F", "M", "NA"))) error.json(paste0("Column 'sex' should only contain 'F', 'M', or 'NA' values."))
} else {
# If no sex column, I create one with only "NA" values
col_names <- colnames(data.dgrp_pheno)[2:ncol(data.dgrp_pheno)]
data.dgrp_pheno$sex <- rep("NA", nrow(data.dgrp_pheno))
data.dgrp_pheno <- data.dgrp_pheno[,c("DGRP", "sex", col_names)]
}
# Check pheno_name, if parameters is provided
if(!is.null(results$pheno_name)){
# Check if pheno_name is unique and in the file
results$pheno_column <- which(colnames(data.dgrp_pheno) == results$pheno_name)
if(length(results$pheno_column) == 0) error.json(paste0("Phenotype '",results$pheno_name,"' was not found in any column of ", results$pheno_file))
if(length(results$pheno_column) > 1) error.json(paste0("Phenotype '",results$pheno_name,"' was found in many column of ", results$pheno_file))
} else results$pheno_name <- colnames(data.dgrp_pheno)[results$pheno_column]
## Load covariates
# For Wolbachia: y or n
# For big inversions: ST (0), INV (1) or INV/ST (2)
dgrp.cov <- fread (results$covariate_file, header = T, sep = "\t")
rownames(dgrp.cov) <- dgrp.cov$IID
dgrp_lines_genotyped <- dgrp.cov$IID
message("Found ", length(dgrp_lines_genotyped), " genotyped DGRP lines")
message(ncol(dgrp.cov) - 2, " covariates to account for : [", paste0(colnames(dgrp.cov)[3:ncol(dgrp.cov)], collapse = ", "), "]")
## Load annotation
annotation.dr <- fread (results$annotation_file, header = FALSE, sep = "\t")
annotation.dr <- annotation.dr[,c("V1", "V3", "V4")]
colnames(annotation.dr) <- c("SNP", "gene_annotation", "regulatory_annotation")
## Create results
results$results <- list()
## Running GWAS
for(s in results$which_sex) {
results$results[[s]] <- list()
## Extracting phenotype
phenotype <- subset(data.dgrp_pheno, sex == s)[,results$pheno_column]
phenotype_name <- paste0(results$pheno_name, "_", s)
dgrp_name <- subset(data.dgrp_pheno, sex == s)[,"DGRP"]
## Transform to numeric
suppressWarnings(phenotype <<- as.numeric(phenotype))
## Fix weird names
phenotype_name <- gsub(phenotype_name, pattern = "[^a-zA-Z0-9]+", replacement = "_")
## Removing NAs
non_na <- which(!is.na(phenotype))
phenotype <- phenotype[non_na]
dgrp_name <- dgrp_name[non_na]
dgrp_name <- gsub(dgrp_name, pattern = "DGRP_", replacement = "line_")
dgrp_name <- gsub(dgrp_name, pattern = "line_0", replacement = "line_")
dgrp_name <- gsub(dgrp_name, pattern = "line_0", replacement = "line_")
results$results[[s]][["nb_used_dgrps"]] <- length(dgrp_name)
## Generating phenotype file for PLINK
plink.file_content <- data.frame(FID = dgrp_name, IID = dgrp_name)
plink.file_content[phenotype_name] <- phenotype
plink.file_content <- subset(plink.file_content, IID %in% dgrp_lines_genotyped) ## Restrict to genotyped samples
results$results[[s]][["plink_phenotype_file"]] <- paste0(results$output_folder, phenotype_name, ".pheno.plink2.tsv")
fwrite(x = format(plink.file_content, nsmall = 1), file = results$results[[s]][["plink_phenotype_file"]], sep = "\t", quote = F, row.names = F, col.names = T, scipen=50)
## Test for categorical phenotype
is_categorical <- F
tryCatch({
tmp.var <- as.numeric(plink.file_content[[phenotype_name]])
}, warning=function(cond) {
if(grepl(x = cond, pattern = "NAs introduced by coercion")) {
is_categorical <<- T
}
})
## Running GWAS
list_values <- unique(plink.file_content[,3])
if(is_categorical){
results$results[[s]][["error_message"]] <- paste0("ERROR in processing phenotype ", phenotype_name, " Categorical phenotype?")
message(results$results[[s]][["error_message"]])
} else if(length(list_values) == 0){
results$results[[s]][["error_message"]] <- paste0("ERROR in processing phenotype ", phenotype_name, " : No values.")
message(results$results[[s]][["error_message"]])
} else if(length(list_values) == 1){
results$results[[s]][["error_message"]] <- paste0("ERROR in processing phenotype ", phenotype_name, " : Non-variable phenotype.")
message(results$results[[s]][["error_message"]])
} else if(length(list_values) == 2 && all(sort(list_values) == c(0, 2))){
# Error: All samples for --glm phenotype 'S7_MB_32_M' are cases.
results$results[[s]][["error_message"]] <- paste0("ERROR in processing phenotype ", phenotype_name, " : Values are only in [0, 2]. This is a specific Case/Control coding for PLINK2, can you change these values?")
message(results$results[[s]][["error_message"]])
} else if(length(list_values) == 2 && all(sort(list_values) == c(0, 1))){
results$results[[s]][["error_message"]] <- paste0("ERROR in processing phenotype ", phenotype_name, " : Values are only in [0, 1]. This is a specific Case/Control coding for PLINK2, can you change these values?")
message(results$results[[s]][["error_message"]])
} else{
## Covariate analysis (Anova)
results$results[[s]][["covariate_anova_file"]] <- paste0(results$output_folder, phenotype_name, ".cov.anova.txt")
sink(results$results[[s]][["covariate_anova_file"]])
data.cov <- merge(plink.file_content, dgrp.cov, id = "family")
## Testing variance of covariates
# Generate formula
covars <- c()
if(length(unique(data.cov[,4])) > 1) covars <- c(covars, "factor(wolba)")
if(length(unique(data.cov[,5])) > 1) covars <- c(covars, "factor(In_2L_t)")
if(length(unique(data.cov[,6])) > 1) covars <- c(covars, "factor(In_2R_NS)")
if(length(unique(data.cov[,7])) > 1) covars <- c(covars, "factor(In_3R_P)")
if(length(unique(data.cov[,8])) > 1) covars <- c(covars, "factor(In_3R_K)")
if(length(unique(data.cov[,9])) > 1) covars <- c(covars, "factor(In_3R_Mo)")
if(length(covars) > 0){
fit <- lm(data = data.cov, formula = as.formula(paste0(phenotype_name, " ~ ", paste0(covars, collapse = " + "))))
if(deviance(fit) >= sqrt(.Machine$double.eps)){
if(any(is.na(fit$coefficients))){
results$results[[s]][["error_message"]] <- "ERROR: In Anova.III.lm(mod, error, singular.ok = singular.ok, ...) : there are aliased coefficients in the model"
message(results$results[[s]][["error_message"]])
} else {
print(Anova(fit, type = 'III'))
}
}
print(summary(fit))
} else {
results$results[[s]][["error_message"]] <- "ERROR: No invariant covariates??"
message(results$results[[s]][["error_message"]])
}
# Compute the normality test of Shapiro-Wilk
cat("Shapiro-Wilk test of normality:\n")
tryCatch( {
shapiro <- as.data.frame(shapiro_test(residuals(fit)))
cat(paste0('{\"shapiro\"={"statistic":', shapiro$statistic, ',"pvalue":', shapiro$p.value, '}}\n'))
if(shapiro$p.value > 0.05) cat("Interpretation: NORMAL OK (p > 0.05)\n")
if(shapiro$p.value <= 0.05) cat("Interpretation: NOT NORMAL (p <= 0.05)\n")
}, error = function(e) {cat("Shapiro test failed : all residuals values are identical\n")})
# Kruskal-Wallis test
cat("\nKruskal-Wallis test\n")
kruskal.results <- data.frame(covariate = "TOTO", chisquared = -1, df = -1, pvalue = -1)
for(cova in covars){
res <- kruskal.test(as.formula(paste0(phenotype_name, " ~ ", cova)), data = data.cov)
kruskal.results <- rbind(kruskal.results, data.frame(covariate = cova, chisquared = res$statistic, df = res$parameter, pvalue = res$p.value))
}
kruskal.results <- kruskal.results[-1,]
rownames(kruskal.results) <- NULL
cat(jsonlite::toJSON(kruskal.results))
# returns output to the console
sink()
# Process phenotypes
if(length(covars) + 2 >= nrow(plink.file_content)){
results$results[[s]][["error_message"]] <- paste0("ERROR in processing phenotype ", phenotype_name, " : # samples <= # predictor columns.")
message(results$results[[s]][["error_message"]])
} else {
# Run PLINK 2.x
std.out <- NULL
results$command_line_gwas <- paste0("plink2 --threads ", results$nb_threads, " ", results$gwas_options, " --covar ", results$covariate_file, " --bfile ", results$genotype_file, " --pheno \"", results$results[[s]]$plink_phenotype_file, "\" --out \"", results$output_folder, "PLINK2\"")
tryCatch(
{
std.out <<- system(results$command_line_gwas, intern = T, ignore.stderr = T, ignore.stdout = F)
},
warning=function(cond) {
if(grepl(x = cond, pattern = "had status 7")) {
results$results[[s]][["error_message"]] <- paste0("ERROR [HAD STATUS 7] in processing phenotype ", phenotype_name, " : # samples <= # predictor columns.")
message(results$results[[s]][["error_message"]])
} else {
# I redo it without catching the warnings... It's ugly I know
std.out <<- system(results$command_line_gwas, intern = T, ignore.stderr = T, ignore.stdout = F)
}
})
if(!is.null(std.out)) {
file_name <- std.out[sapply(std.out, function(x) startsWith(x = x, prefix = "Results written to"))]
## Rename output
if(length(file_name) > 0)
{
file_name <- gsub(x = file_name, pattern = "Results written to ", replacement = "")
file_name <- gsub(x = file_name, pattern = " \\.", replacement = "")
file.rename(file_name, gsub(x = file_name, pattern = "/PLINK2\\.", replacement = "/"))
file_name <- gsub(x = file_name, pattern = "/PLINK2\\.", replacement = "/")
file.rename(paste0(results$output_folder, "PLINK2.log"), paste0(file_name, ".log"))
results$results[[s]]$plink_full_output_file <- file_name
results$results[[s]]$plink_full_log_file <- paste0(file_name, ".log")
} else {
results$results[[s]]$plink_full_output_file <- NA
results$results[[s]]$plink_full_log_file <- NA
}
## Reading Output
plink_results <- NULL
col6 <- "BETA"
results$results[[s]]$gwas_file_suffix <- ".glm.linear"
if(endsWith(results$results[[s]]$plink_full_output_file, "glm.linear")){
# Numeric / continuous
plink_results <<- fread(results$results[[s]]$plink_full_output_file, data.table = F, nThread = results$nb_threads, showProgress = F)
results$results[[s]]$gwas_model <- "GLM Linear"
results$results[[s]]$gwas_file_suffix <- ".glm.linear"
results$results[[s]]$gwas_type <- "continuous (quantitative)"
} else if(endsWith(results$results[[s]]$plink_full_output_file, "glm.logistic.hybrid")){
# Categorical
plink_results <<- fread(results$results[[s]]$plink_full_output_file, data.table = F, nThread = results$nb_threads, showProgress = F)
col6 <<- "OR"
results$results[[s]]$gwas_model <- "GLM Logistic Hybrid"
results$results[[s]]$gwas_file_suffix <- ".glm.logistic.hybrid"
results$results[[s]]$gwas_type <- "categorical (qualitative)"
} else {
results$results[[s]][["error_message"]] <- paste0("ERROR? I don't recognize the output file:", results$results[[s]]$plink_full_output_file)
message(results$results[[s]][["error_message"]])
}
if(!is.null(plink_results)){
# Change column names and add FDR
plink_results$P <- as.numeric(plink_results$P)
plink_results <- subset(plink_results, TEST == "ADD")
plink_results <- plink_results[!is.na(plink_results$P),c("#CHROM", "POS", "ID", "REF", "ALT", col6, "P")]
plink_results$FDR_BH <- p.adjust(plink_results$P, method = "fdr")
## QQPLOT before filtering
results$results[[s]][["qq_plot"]] <- paste0(results$output_folder, phenotype_name, ".qqplot.png")
results$results[[s]][["qq_plot_pdf"]] <- paste0(results$output_folder, phenotype_name, ".qqplot.pdf")
plink_results$P[plink_results$P == 0] <- 1E-255 # For avoiding errors
png(results$results[[s]][["qq_plot"]], height = 500, width = 500)
qqPlotFast(plink_results$P, ci.level = NULL, col = "black", makelegend = F, lwd = 1, newplot = T)
dev.off()
# SVG is way too big. So let's go PDF
# I recompute the plot... I know it's not optimal but I don't know how to do it else
CairoPDF(results$results[[s]][["qq_plot_pdf"]], height = 7, width = 7)
qqPlotFast(plink_results$P, ci.level = NULL, col = "black", makelegend = F, lwd = 1, newplot = T)
dev.off()
## Manhattan plot
results$results[[s]][["manhattan_plot"]] <- paste0(results$output_folder, phenotype_name, ".manhattan.png")
results$results[[s]][["manhattan_plot_pdf"]] <- paste0(results$output_folder, phenotype_name, ".manhattan.pdf")
chroms <- factor(plink_results$`#CHROM`)
levels(chroms) <- c("2L", "2R", "3L", "3R", "X", "4")
m <- manPlotPrepare(pvalues = plink_results$P, chr=chroms, pos = plink_results$POS)
png(results$results[[s]][["manhattan_plot"]], height = 500, width = 500)
manPlotFast(man = m, lwd = 1, colorSet = c("black", "darkgrey"))
dev.off()
# SVG is way too big. So let's go PDF
# I recompute the plot... I know it's not optimal but I don't know how to do it else
CairoPDF(results$results[[s]][["manhattan_plot_pdf"]], height = 7, width = 7)
manPlotFast(man = m, lwd = 1, colorSet = c("black", "darkgrey"))
dev.off()
## Filtered top results and annotate them
plink_results$P[plink_results$P == 1E-255] <- 0 # Putting it back
plink_results <- merge(plink_results, annotation.dr, by.x = "ID", by.y = "SNP")
plink_results <- plink_results[,c("#CHROM", "POS", "ID", "REF", "ALT", col6, "P", "FDR_BH", "gene_annotation", "regulatory_annotation")]
plink_results <- plink_results[with(plink_results, order(P)),]
plink_results$`#CHROM`[plink_results$`#CHROM` == 1] <- "2L"
plink_results$`#CHROM`[plink_results$`#CHROM` == 2] <- "2R"
plink_results$`#CHROM`[plink_results$`#CHROM` == 3] <- "3L"
plink_results$`#CHROM`[plink_results$`#CHROM` == 4] <- "3R"
plink_results$`#CHROM`[plink_results$`#CHROM` == 5] <- "X"
plink_results$`#CHROM`[plink_results$`#CHROM` == 6] <- "4"
# Writing full annotated file
fwrite(x = plink_results, file = paste0(results$output_folder, phenotype_name, results$results[[s]]$gwas_file_suffix, ".fdr.annot.tsv"), sep = "\t", quote = F, row.names = F, col.names = T)
system(paste0("gzip -f \"", results$output_folder, phenotype_name, results$results[[s]]$gwas_file_suffix, ".fdr.annot.tsv\""))
results$results[[s]][["plink_output_file"]] <- paste0(results$output_folder, phenotype_name, results$results[[s]]$gwas_file_suffix, ".fdr.annot.tsv.gz")
# Filtering
plink_results <- subset(plink_results, P <= results$filtering_pval)
fwrite(x = plink_results, file = paste0(results$output_folder, phenotype_name, results$results[[s]]$gwas_file_suffix, ".top_", results$filtering_pval, ".annot.tsv"), sep = "\t", quote = F, row.names = F, col.names = T)
system(paste0("gzip -f \"", results$output_folder, phenotype_name, results$results[[s]]$gwas_file_suffix, ".top_", results$filtering_pval, ".annot.tsv\""))
results$results[[s]][["plink_filtered_output_file"]] <- paste0(results$output_folder, phenotype_name, results$results[[s]]$gwas_file_suffix, ".top_", results$filtering_pval, ".annot.tsv.gz")
# Run gene enrichment script on Flybase phenotypes (PYTHON)
#system(paste0("python ", input_gene_enrichment_script, " ", output_gwas_directory, " ", input_flybase_phenotype_file, " ", input_ncsu_annot_file, " ", fstudy, " ", fphenotype, " ", fsex, " ", input_filtering_pval, " ", output_gwas_directory, "/", phenotype_name, file_suffix, ".top_", input_filtering_pval, ".gene_enrichment.phenotypes.tsv"))
# Run gene enrichment script on GO terms (PYTHON)
#system(paste0("python ", input_gene_enrichment_script, " ", output_gwas_directory, " ", input_go_file, " ", input_ncsu_annot_file, " ", fstudy, " ", fphenotype, " ", fsex, " ", input_filtering_pval, " ", output_gwas_directory, "/", phenotype_name, file_suffix, ".top_", input_filtering_pval, ".gene_enrichment.go.tsv"))
}
}
}
}
}
# Output
print(toJSON(results, pretty=TRUE, auto_unbox=TRUE))