Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
config_used/
pipeline/resources/
pipeline/resources/*
!pipeline/resources/software/prscs
!pipeline/resources/data/prscs_ref/1kg/ldblk_1kg_eur/ldblk_1kg_chr23.hdf5
!pipeline/resources/data/prscs_ref/1kg/ldblk_1kg_eur/snpinfo_1kg_hm3
pipeline/.snakemake/
pipeline/test_data/
pipeline/logs/*
Expand Down
32 changes: 29 additions & 3 deletions Scripts/format_target/format_target.R
Original file line number Diff line number Diff line change
Expand Up @@ -69,20 +69,40 @@ if(nrow(target_snp) < nrow(ref)*0.9){
# Determine target genome build
###################

# Check what build(s) are in the reference file
build_columns <- grep("BP_GRCh", names(ref), value = TRUE)
cat("Available builds in reference:", build_columns, "\n")

target_build <- detect_build( ref = ref,
targ = target_snp,
log_file = log_file)

###################
# Extract overlapping variants in plink format and insert RSIDs
###################
# Fallback if detection fails (for Maria's purpose only)
if (is.na(target_build)) {
warning("Could not detect genome build from target and reference. Defaulting to GRCh37.")
target_build <- "GRCh37"
}

bp_col <- paste0("BP_", target_build)
if (!(bp_col %in% names(ref))) {
stop(paste("Column", bp_col, "not found in reference. Available columns:", paste(names(ref), collapse = ", ")))
}

# Subset relevent build from ref
ref_subset<-ref[, c("CHR","SNP",paste0("BP_",target_build),"A1","A2","IUPAC"), with=F]
names(ref_subset)[names(ref_subset) == paste0("BP_",target_build)]<-'BP'

###################
# Extract overlapping variants in plink format and insert RSIDs
###################

#Ensure target matches ref if in X format
target_snp$CHR[target_snp$CHR == "X"] <- "23"
target_snp$CHR<-as.numeric(target_snp$CHR)

# Merge target and ref by CHR and BP
ref_target<-merge(target_snp, ref_subset, by=c('CHR','BP'))
nrow(ref_target)

# Insert IUPAC codes
names(ref_target)[names(ref_target) == 'IUPAC']<-'IUPAC.y'
Expand All @@ -91,6 +111,7 @@ ref_target$IUPAC.x<-snp_iupac(ref_target$A1.x, ref_target$A2.x)
# Only retain variants that are non-ambiguous SNPs in the target
# Some might have been retained due to matching CHR and BP position with SNPs in reference
ref_target<-ref_target[!is.na(ref_target$IUPAC.x),]
nrow(ref_target)

# Identify variants that need to be flipped
flip <- detect_strand_flip(ref_target$IUPAC.x, ref_target$IUPAC.y)
Expand All @@ -111,6 +132,9 @@ if(nrow(ref_target) < nrow(ref)*0.7){
write.table(ref_target$SNP.x, paste0(tmp_dir,'/extract_list_1.txt'), col.names = F, row.names = F, quote=F)
write.table(ref_target$SNP.y, paste0(tmp_dir,'/extract_list_2.txt'), col.names = F, row.names = F, quote=F)

cat("After write.table, extract_list_2.txt contents:\n")
print(readLines(paste0(tmp_dir,'/extract_list_2.txt'), n = 6))

# First extract variants based on original ID
if(opt$format == 'plink1'){
system(paste0(opt$plink2," --bfile ",opt$target, " --extract ", tmp_dir,"/extract_list_1.txt --make-pgen 'pvar-cols=' --threads 1 --out ", tmp_dir,"/subset"))
Expand Down Expand Up @@ -138,6 +162,8 @@ if(sum(names(targ_psam) %in% c('FID', 'IID')) == 1){
# Now edit bim file to update IDs to reference IDs
targ_pvar<-fread(paste0(tmp_dir,'/subset.pvar'))
names(targ_pvar)<-c('CHR','BP','SNP','A2','A1')
targ_pvar$CHR[targ_pvar$CHR == "X"] <- "23"
targ_pvar$CHR<-as.numeric(targ_pvar$CHR)

# Update SNP with reference SNP value based on CHR:BP:IUPAC in the previously matched ref and target data
targ_pvar$IUPAC<-snp_iupac(targ_pvar$A1, targ_pvar$A2)
Expand Down
471 changes: 409 additions & 62 deletions docs/pipeline_1kg_hgdp_prep.Rmd

Large diffs are not rendered by default.

3 changes: 1 addition & 2 deletions functions/constants.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Set range of chromosome numbers to use by default
CHROMS <- 1:22
CHROMS <- 1:23

# Coordinates of high ld regions in build GRCh37
long_ld_coord<-do.call(rbind, list(
Expand Down Expand Up @@ -47,4 +47,3 @@ pgs_methods_noneur <- c('ptclump','lassosum','lassosum2','megaprs','prscs','dbsl

# Make vector indicating pgs_methods that are to be applied to gwas_groups
pgs_group_methods <- c('prscsx','xwing')

16 changes: 8 additions & 8 deletions functions/pgs.R
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ read_geno <- function(target, format) {
}

# Read in PLINK .bim file
read_bim<-function(dat, chr = 1:22){
read_bim<-function(dat, chr = 1:23){
bim<-NULL
for(i in chr){
bim<-rbind(bim, fread(paste0(dat, i,'.bim')))
Expand Down Expand Up @@ -230,7 +230,7 @@ read_pop_data <- function(x){
}

# Read in PLINK2 .pvar file
read_pvar<-function(dat, chr = 1:22){
read_pvar<-function(dat, chr = 1:23){
pvar<-NULL
for(i in chr){
pvar<-rbind(pvar, fread(paste0(dat, i,'.pvar')))
Expand Down Expand Up @@ -269,13 +269,13 @@ remove_regions<-function(dat, regions){
}

# Read in GWAS summary statistics
read_sumstats<-function(sumstats, chr = 1:22, log_file = NULL, extract = NULL, req_cols = c('CHR','BP','A1','A2','BETA','SE','P','FREQ')){
read_sumstats<-function(sumstats, chr = 1:23, log_file = NULL, extract = NULL, req_cols = c('CHR','BP','A1','A2','BETA','SE','P','FREQ')){
gwas<-fread(sumstats)

log_add(log_file = log_file, message = paste0('sumstats contains ',nrow(gwas),' variants.'))

# Retain requested chromosomes
if(!all(1:22 %in% chr)){
if(!all(1:23 %in% chr)){
if(!('CHR' %in% names(gwas))){
stop('Cannot filter sumstats by chromosome when CHR column is not present.')
}
Expand Down Expand Up @@ -352,7 +352,7 @@ ldsc <- function(sumstats, ldsc, hm3_snplist, munge_sumstats, ld_scores, pop_pre
}

# Match A1 and A2 match a reference
allele_match<-function(sumstats, ref_bim, chr = 1:22){
allele_match<-function(sumstats, ref_bim, chr = 1:23){
sumstats_ref<-merge(sumstats, ref_bim[, c('SNP','A1','A2'), with=F], by = 'SNP')
swap_index <- sumstats_ref$A1.x == sumstats_ref$A2.y & sumstats_ref$A2.x == sumstats_ref$A1.y
sumstats_ref$BETA[swap_index] <- -sumstats_ref$BETA[swap_index]
Expand All @@ -366,7 +366,7 @@ allele_match<-function(sumstats, ref_bim, chr = 1:22){
}

# Run DBSLMM
dbslmm <- function(dbslmm, plink = plink, ld_blocks, chr = 1:22, bfile, sumstats, h2, h2f = 1, nsnp, nindiv, log_file = NULL, ncores=1){
dbslmm <- function(dbslmm, plink = plink, ld_blocks, chr = 1:23, bfile, sumstats, h2, h2f = 1, nsnp, nindiv, log_file = NULL, ncores=1){
# Create temp directory
tmp_dir<-tempdir()

Expand Down Expand Up @@ -414,7 +414,7 @@ dbslmm <- function(dbslmm, plink = plink, ld_blocks, chr = 1:22, bfile, sumstats
}

# Calculate predictor correlations using LDAK
ldak_pred_cor<-function(bfile, ldak, n_cores, chr = 1:22, keep = NULL){
ldak_pred_cor<-function(bfile, ldak, n_cores, chr = 1:23, keep = NULL){
tmp_dir<-tempdir()
tmp_file<-tempfile()

Expand All @@ -436,7 +436,7 @@ ldak_pred_cor<-function(bfile, ldak, n_cores, chr = 1:22, keep = NULL){
}

# Read in external score file
read_score <- function(score, chr = 1:22, log_file = NULL){
read_score <- function(score, chr = 1:23, log_file = NULL){
# Read in score file
score <- fread(score)

Expand Down
18 changes: 9 additions & 9 deletions functions/plink.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/Rscript

print("Using chromosome X")
# Make a subset of plink1 binaries
plink_subset<-function(plink=NULL, plink2=NULL, chr = 1:22, keep = NULL, extract = NULL, bfile=NULL, pfile=NULL, out, memory = 4000, threads = 1, make_bed = F){
plink_subset<-function(plink=NULL, plink2=NULL, chr = 1:23, keep = NULL, extract = NULL, bfile=NULL, pfile=NULL, out, memory = 4000, threads = 1, make_bed = F){
if(is.null(bfile) & is.null(pfile)){
stop("bfile or pfile must be specified.")
}
Expand Down Expand Up @@ -56,7 +56,7 @@ plink_subset<-function(plink=NULL, plink2=NULL, chr = 1:22, keep = NULL, extract
}

# Return a list of variants surviving QC using plink
plink_qc_snplist<-function(bfile=NULL, pfile=NULL, plink=NULL, plink2=NULL, keep = NULL, chr = 1:22, threads = 1, memory = 4000, geno = NULL, maf = NULL, hwe = NULL){
plink_qc_snplist<-function(bfile=NULL, pfile=NULL, plink=NULL, plink2=NULL, keep = NULL, chr = 1:23, threads = 1, memory = 4000, geno = NULL, maf = NULL, hwe = NULL){
if(is.null(bfile) & is.null(pfile)){
stop("bfile or pfile must be specified.")
}
Expand Down Expand Up @@ -114,7 +114,7 @@ plink_qc_snplist<-function(bfile=NULL, pfile=NULL, plink=NULL, plink2=NULL, keep
}

# Merge plink files
plink_merge<-function(bfile=NULL, pfile=NULL, plink=NULL, plink2=NULL, chr = 1:22, extract = NULL, keep = NULL, flip = NULL, make_bed =F, memory = 4000, out, threads = 1){
plink_merge<-function(bfile=NULL, pfile=NULL, plink=NULL, plink2=NULL, chr = 1:23, extract = NULL, keep = NULL, flip = NULL, make_bed =F, memory = 4000, out, threads = 1){
if(is.null(bfile) & is.null(pfile)){
stop("bfile or pfile must be specified.")
}
Expand Down Expand Up @@ -188,7 +188,7 @@ plink_merge<-function(bfile=NULL, pfile=NULL, plink=NULL, plink2=NULL, chr = 1:2
}

# Perform PCA using plink files
plink_pca<-function(bfile=NULL, pfile=NULL, chr = 1:22, plink2, extract = NULL, keep = NULL, flip = NULL, memory = 4000, n_pc = 6, threads = 1){
plink_pca<-function(bfile=NULL, pfile=NULL, chr = 1:23, plink2, extract = NULL, keep = NULL, flip = NULL, memory = 4000, n_pc = 6, threads = 1){
if(is.null(bfile) & is.null(pfile)){
stop("bfile or pfile must be specified.")
}
Expand Down Expand Up @@ -231,7 +231,7 @@ plink_pca<-function(bfile=NULL, pfile=NULL, chr = 1:22, plink2, extract = NULL,
}

# Performing LD pruning
plink_prune<-function(bfile=NULL, pfile=NULL, keep = NULL, plink=NULL, plink2=NULL, chr = 1:22, extract = NULL, memory = 4000, threads = 1){
plink_prune<-function(bfile=NULL, pfile=NULL, keep = NULL, plink=NULL, plink2=NULL, chr = 1:23, extract = NULL, memory = 4000, threads = 1){
if(is.null(bfile) & is.null(pfile)){
stop("bfile or pfile must be specified.")
}
Expand Down Expand Up @@ -292,7 +292,7 @@ plink_prune<-function(bfile=NULL, pfile=NULL, keep = NULL, plink=NULL, plink2=NU
}

# Peforming LD-based clumping
plink_clump<-function(bfile=NULL, pfile=NULL, plink=NULL, plink2=NULL, chr = 1:22, sumstats, keep = NULL, memory = 4000, log_file = NULL, threads = 1){
plink_clump<-function(bfile=NULL, pfile=NULL, plink=NULL, plink2=NULL, chr = 1:23, sumstats, keep = NULL, memory = 4000, log_file = NULL, threads = 1){
if(is.null(bfile) & is.null(pfile)){
stop("bfile or pfile must be specified.")
}
Expand Down Expand Up @@ -356,7 +356,7 @@ plink_clump<-function(bfile=NULL, pfile=NULL, plink=NULL, plink2=NULL, chr = 1:2
}

# Generate kinship matrix and identify unrelated individuals (>2nd degree)
plink_king<-function(bfile=NULL, pfile=NULL, extract = NULL, chr = 1:22, plink2='plink2', out, keep=NULL, threads = 1){
plink_king<-function(bfile=NULL, pfile=NULL, extract = NULL, chr = 1:23, plink2='plink2', out, keep=NULL, threads = 1){
if(is.null(bfile) & is.null(pfile)){
stop("bfile or pfile must be specified.")
}
Expand Down Expand Up @@ -410,7 +410,7 @@ plink_king<-function(bfile=NULL, pfile=NULL, extract = NULL, chr = 1:22, plink2=
system(paste0('tail -n +2 ', tmp_dir, '/merged.king.cutoff.in.id > ', out, '.unrelated.keep'))
}

plink_score<-function(bfile=NULL, pfile=NULL, score, keep=NULL, extract=NULL, chr=1:22, frq=NULL, plink2=NULL, threads=1, fbm = F){
plink_score<-function(bfile=NULL, pfile=NULL, score, keep=NULL, extract=NULL, chr=1:23, frq=NULL, plink2=NULL, threads=1, fbm = F){
if(is.null(bfile) & is.null(pfile)){
stop("bfile or pfile must be specified.")
}
Expand Down
Loading