forked from Blue-Lab/assoc_pipeline
-
Notifications
You must be signed in to change notification settings - Fork 0
/
sample_qc.R
executable file
·44 lines (35 loc) · 1.27 KB
/
sample_qc.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#! /usr/bin/env Rscript
library(argparser)
library(magrittr)
# read arguments
argp <- arg_parser("Sample QC") %>%
add_argument("gds_file", help="GDS file") %>%
add_argument("--out_prefix", help="Prefix for output files",
default="") %>%
add_argument("--variant_id", help="File with vector of variant IDs") %>%
add_argument("--num_cores", help="Number of cores to utilize for parallel processing", default=1)
argv <- parse_args(argp)
# load libraries
library(SeqArray)
library(ggplot2)
# log versions and arguments for reproducibility
sessionInfo()
print(argv)
# open GDS file
gds <- seqOpen(argv$gds_file)
# select variants
if (!is.na(argv$variant_id)) {
variant.id <- readRDS(argv$variant_id)
seqSetFilter(gds, variant.id=variant.id)
}
# missing call rate
# the SeqVarTools missingGenotypeRate is merely a wrapper around seqMissing
missing.rate <- seqMissing(gds, per.variant=FALSE, parallel=argv$num_cores)
sample.id <- seqGetData(gds, "sample.id")
miss.df <- data.frame(sample.id, missing.rate, stringsAsFactors=FALSE)
saveRDS(miss.df, paste0(argv$out_prefix, "missing_by_sample.rds"))
# plot
p <- ggplot(miss.df, aes(missing.rate)) +
geom_histogram(binwidth=0.01, boundary=0)
ggsave(paste0(argv$out_prefix, "missing_by_sample.pdf"), plot=p)
seqClose(gds)