opain · whimsicalboo · Aug 27, 2025
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,8 @@
 config_used/
-pipeline/resources/
+pipeline/resources/*
+!pipeline/resources/software/prscs
+!pipeline/resources/data/prscs_ref/1kg/ldblk_1kg_eur/ldblk_1kg_chr23.hdf5
+!pipeline/resources/data/prscs_ref/1kg/ldblk_1kg_eur/snpinfo_1kg_hm3
 pipeline/.snakemake/
 pipeline/test_data/
 pipeline/logs/*

diff --git a/Scripts/format_target/format_target.R b/Scripts/format_target/format_target.R
@@ -69,20 +69,40 @@ if(nrow(target_snp) < nrow(ref)*0.9){
 # Determine target genome build
 ###################
 
+# Check what build(s) are in the reference file
+build_columns <- grep("BP_GRCh", names(ref), value = TRUE)
+cat("Available builds in reference:", build_columns, "\n")
+
 target_build <- detect_build( ref = ref,
                               targ = target_snp,
                               log_file = log_file)
 
-###################
-# Extract overlapping variants in plink format and insert RSIDs
-###################
+# Fallback if detection fails (for Maria's purpose only)
+if (is.na(target_build)) {
+  warning("Could not detect genome build from target and reference. Defaulting to GRCh37.")
+  target_build <- "GRCh37"
+}
+
+bp_col <- paste0("BP_", target_build)
+if (!(bp_col %in% names(ref))) {
+  stop(paste("Column", bp_col, "not found in reference. Available columns:", paste(names(ref), collapse = ", ")))
+}
 
 # Subset relevent build from ref
 ref_subset<-ref[, c("CHR","SNP",paste0("BP_",target_build),"A1","A2","IUPAC"), with=F]
 names(ref_subset)[names(ref_subset) == paste0("BP_",target_build)]<-'BP'
 
+###################
+# Extract overlapping variants in plink format and insert RSIDs
+###################
+
+#Ensure target matches ref if in X format
+target_snp$CHR[target_snp$CHR == "X"] <- "23"
+target_snp$CHR<-as.numeric(target_snp$CHR)
+
 # Merge target and ref by CHR and BP
 ref_target<-merge(target_snp, ref_subset, by=c('CHR','BP'))
+nrow(ref_target)
 
 # Insert IUPAC codes
 names(ref_target)[names(ref_target) == 'IUPAC']<-'IUPAC.y'
@@ -91,6 +111,7 @@ ref_target$IUPAC.x<-snp_iupac(ref_target$A1.x, ref_target$A2.x)
 # Only retain variants that are non-ambiguous SNPs in the target
 # Some might have been retained due to matching CHR and BP position with SNPs in reference
 ref_target<-ref_target[!is.na(ref_target$IUPAC.x),]
+nrow(ref_target)
 
 # Identify variants that need to be flipped
 flip <- detect_strand_flip(ref_target$IUPAC.x, ref_target$IUPAC.y)
@@ -111,6 +132,9 @@ if(nrow(ref_target) < nrow(ref)*0.7){
 write.table(ref_target$SNP.x, paste0(tmp_dir,'/extract_list_1.txt'), col.names = F, row.names = F, quote=F)
 write.table(ref_target$SNP.y, paste0(tmp_dir,'/extract_list_2.txt'), col.names = F, row.names = F, quote=F)
 
+cat("After write.table, extract_list_2.txt contents:\n")
+print(readLines(paste0(tmp_dir,'/extract_list_2.txt'), n = 6))
+
 # First extract variants based on original ID
 if(opt$format == 'plink1'){
   system(paste0(opt$plink2," --bfile ",opt$target, " --extract ", tmp_dir,"/extract_list_1.txt --make-pgen 'pvar-cols=' --threads 1 --out ", tmp_dir,"/subset"))
@@ -138,6 +162,8 @@ if(sum(names(targ_psam) %in% c('FID', 'IID')) == 1){
 # Now edit bim file to update IDs to reference IDs
 targ_pvar<-fread(paste0(tmp_dir,'/subset.pvar'))
 names(targ_pvar)<-c('CHR','BP','SNP','A2','A1')
+targ_pvar$CHR[targ_pvar$CHR == "X"] <- "23"
+targ_pvar$CHR<-as.numeric(targ_pvar$CHR)
 
 # Update SNP with reference SNP value based on CHR:BP:IUPAC in the previously matched ref and target data
 targ_pvar$IUPAC<-snp_iupac(targ_pvar$A1, targ_pvar$A2)

diff --git a/docs/pipeline_1kg_hgdp_prep.Rmd b/docs/pipeline_1kg_hgdp_prep.Rmd
diff --git a/functions/constants.R b/functions/constants.R
@@ -1,5 +1,5 @@
 # Set range of chromosome numbers to use by default
-CHROMS <- 1:22
+CHROMS <- 1:23
 
 # Coordinates of high ld regions in build GRCh37
 long_ld_coord<-do.call(rbind, list(
@@ -47,4 +47,3 @@ pgs_methods_noneur <- c('ptclump','lassosum','lassosum2','megaprs','prscs','dbsl
 
 # Make vector indicating pgs_methods that are to be applied to gwas_groups
 pgs_group_methods <- c('prscsx','xwing')
-
diff --git a/functions/pgs.R b/functions/pgs.R
@@ -202,7 +202,7 @@ read_geno <- function(target, format) {
 }
 
 # Read in PLINK .bim file
-read_bim<-function(dat, chr = 1:22){
+read_bim<-function(dat, chr = 1:23){
   bim<-NULL
   for(i in chr){
     bim<-rbind(bim, fread(paste0(dat, i,'.bim')))
@@ -230,7 +230,7 @@ read_pop_data <- function(x){
 }
 
 # Read in PLINK2 .pvar file
-read_pvar<-function(dat, chr = 1:22){
+read_pvar<-function(dat, chr = 1:23){
   pvar<-NULL
   for(i in chr){
     pvar<-rbind(pvar, fread(paste0(dat, i,'.pvar')))
@@ -269,13 +269,13 @@ remove_regions<-function(dat, regions){
 }
 
 # Read in GWAS summary statistics
-read_sumstats<-function(sumstats, chr = 1:22, log_file = NULL, extract = NULL, req_cols = c('CHR','BP','A1','A2','BETA','SE','P','FREQ')){
+read_sumstats<-function(sumstats, chr = 1:23, log_file = NULL, extract = NULL, req_cols = c('CHR','BP','A1','A2','BETA','SE','P','FREQ')){
   gwas<-fread(sumstats)
 
   log_add(log_file = log_file, message = paste0('sumstats contains ',nrow(gwas),' variants.'))
 
   # Retain requested chromosomes
-  if(!all(1:22 %in% chr)){
+  if(!all(1:23 %in% chr)){
     if(!('CHR' %in% names(gwas))){
       stop('Cannot filter sumstats by chromosome when CHR column is not present.')
     }
@@ -352,7 +352,7 @@ ldsc <- function(sumstats, ldsc, hm3_snplist, munge_sumstats, ld_scores, pop_pre
 }
 
 # Match A1 and A2 match a reference
-allele_match<-function(sumstats, ref_bim, chr = 1:22){
+allele_match<-function(sumstats, ref_bim, chr = 1:23){
   sumstats_ref<-merge(sumstats, ref_bim[, c('SNP','A1','A2'), with=F], by = 'SNP')
   swap_index <- sumstats_ref$A1.x == sumstats_ref$A2.y & sumstats_ref$A2.x == sumstats_ref$A1.y
   sumstats_ref$BETA[swap_index] <- -sumstats_ref$BETA[swap_index]
@@ -366,7 +366,7 @@ allele_match<-function(sumstats, ref_bim, chr = 1:22){
 }
 
 # Run DBSLMM
-dbslmm <- function(dbslmm, plink = plink, ld_blocks, chr = 1:22, bfile, sumstats, h2, h2f = 1, nsnp, nindiv, log_file = NULL, ncores=1){
+dbslmm <- function(dbslmm, plink = plink, ld_blocks, chr = 1:23, bfile, sumstats, h2, h2f = 1, nsnp, nindiv, log_file = NULL, ncores=1){
   # Create temp directory
   tmp_dir<-tempdir()
 
@@ -414,7 +414,7 @@ dbslmm <- function(dbslmm, plink = plink, ld_blocks, chr = 1:22, bfile, sumstats
 }
 
 # Calculate predictor correlations using LDAK
-ldak_pred_cor<-function(bfile, ldak, n_cores, chr = 1:22, keep = NULL){
+ldak_pred_cor<-function(bfile, ldak, n_cores, chr = 1:23, keep = NULL){
   tmp_dir<-tempdir()
   tmp_file<-tempfile()
 
@@ -436,7 +436,7 @@ ldak_pred_cor<-function(bfile, ldak, n_cores, chr = 1:22, keep = NULL){
 }
 
 # Read in external score file
-read_score <- function(score, chr = 1:22, log_file = NULL){
+read_score <- function(score, chr = 1:23, log_file = NULL){
 	# Read in score file
 	score <- fread(score)
 

diff --git a/functions/plink.R b/functions/plink.R
@@ -1,7 +1,7 @@
 #!/usr/bin/Rscript
-
+print("Using chromosome X")
 # Make a subset of plink1 binaries
-plink_subset<-function(plink=NULL, plink2=NULL, chr = 1:22, keep = NULL, extract = NULL, bfile=NULL, pfile=NULL, out, memory = 4000, threads = 1, make_bed = F){
+plink_subset<-function(plink=NULL, plink2=NULL, chr = 1:23, keep = NULL, extract = NULL, bfile=NULL, pfile=NULL, out, memory = 4000, threads = 1, make_bed = F){
   if(is.null(bfile) & is.null(pfile)){
     stop("bfile or pfile must be specified.")
   }
@@ -56,7 +56,7 @@ plink_subset<-function(plink=NULL, plink2=NULL, chr = 1:22, keep = NULL, extract
 }
 
 # Return a list of variants surviving QC using plink
-plink_qc_snplist<-function(bfile=NULL, pfile=NULL, plink=NULL, plink2=NULL, keep = NULL, chr = 1:22, threads = 1, memory = 4000, geno = NULL, maf = NULL, hwe = NULL){
+plink_qc_snplist<-function(bfile=NULL, pfile=NULL, plink=NULL, plink2=NULL, keep = NULL, chr = 1:23, threads = 1, memory = 4000, geno = NULL, maf = NULL, hwe = NULL){
   if(is.null(bfile) & is.null(pfile)){
     stop("bfile or pfile must be specified.")
   }
@@ -114,7 +114,7 @@ plink_qc_snplist<-function(bfile=NULL, pfile=NULL, plink=NULL, plink2=NULL, keep
 }
 
 # Merge plink files
-plink_merge<-function(bfile=NULL, pfile=NULL, plink=NULL, plink2=NULL, chr = 1:22, extract = NULL, keep = NULL, flip = NULL, make_bed =F, memory = 4000, out, threads = 1){
+plink_merge<-function(bfile=NULL, pfile=NULL, plink=NULL, plink2=NULL, chr = 1:23, extract = NULL, keep = NULL, flip = NULL, make_bed =F, memory = 4000, out, threads = 1){
   if(is.null(bfile) & is.null(pfile)){
     stop("bfile or pfile must be specified.")
   }
@@ -188,7 +188,7 @@ plink_merge<-function(bfile=NULL, pfile=NULL, plink=NULL, plink2=NULL, chr = 1:2
 }
 
 # Perform PCA using plink files
-plink_pca<-function(bfile=NULL, pfile=NULL, chr = 1:22, plink2, extract = NULL, keep = NULL, flip = NULL, memory = 4000, n_pc = 6, threads = 1){
+plink_pca<-function(bfile=NULL, pfile=NULL, chr = 1:23, plink2, extract = NULL, keep = NULL, flip = NULL, memory = 4000, n_pc = 6, threads = 1){
   if(is.null(bfile) & is.null(pfile)){
     stop("bfile or pfile must be specified.")
   }
@@ -231,7 +231,7 @@ plink_pca<-function(bfile=NULL, pfile=NULL, chr = 1:22, plink2, extract = NULL,
 }
 
 # Performing LD pruning
-plink_prune<-function(bfile=NULL, pfile=NULL, keep = NULL, plink=NULL, plink2=NULL, chr = 1:22, extract = NULL, memory = 4000, threads = 1){
+plink_prune<-function(bfile=NULL, pfile=NULL, keep = NULL, plink=NULL, plink2=NULL, chr = 1:23, extract = NULL, memory = 4000, threads = 1){
   if(is.null(bfile) & is.null(pfile)){
     stop("bfile or pfile must be specified.")
   }
@@ -292,7 +292,7 @@ plink_prune<-function(bfile=NULL, pfile=NULL, keep = NULL, plink=NULL, plink2=NU
 }
 
 # Peforming LD-based clumping
-plink_clump<-function(bfile=NULL, pfile=NULL, plink=NULL, plink2=NULL, chr = 1:22, sumstats, keep = NULL, memory = 4000, log_file = NULL, threads = 1){
+plink_clump<-function(bfile=NULL, pfile=NULL, plink=NULL, plink2=NULL, chr = 1:23, sumstats, keep = NULL, memory = 4000, log_file = NULL, threads = 1){
   if(is.null(bfile) & is.null(pfile)){
     stop("bfile or pfile must be specified.")
   }
@@ -356,7 +356,7 @@ plink_clump<-function(bfile=NULL, pfile=NULL, plink=NULL, plink2=NULL, chr = 1:2
 }
 
 # Generate kinship matrix and identify unrelated individuals (>2nd degree)
-plink_king<-function(bfile=NULL, pfile=NULL, extract = NULL, chr = 1:22, plink2='plink2', out, keep=NULL, threads = 1){
+plink_king<-function(bfile=NULL, pfile=NULL, extract = NULL, chr = 1:23, plink2='plink2', out, keep=NULL, threads = 1){
   if(is.null(bfile) & is.null(pfile)){
     stop("bfile or pfile must be specified.")
   }
@@ -410,7 +410,7 @@ plink_king<-function(bfile=NULL, pfile=NULL, extract = NULL, chr = 1:22, plink2=
   system(paste0('tail -n +2 ', tmp_dir, '/merged.king.cutoff.in.id > ', out, '.unrelated.keep'))
 }
 
-plink_score<-function(bfile=NULL, pfile=NULL, score, keep=NULL, extract=NULL, chr=1:22, frq=NULL, plink2=NULL, threads=1, fbm = F){
+plink_score<-function(bfile=NULL, pfile=NULL, score, keep=NULL, extract=NULL, chr=1:23, frq=NULL, plink2=NULL, threads=1, fbm = F){
   if(is.null(bfile) & is.null(pfile)){
     stop("bfile or pfile must be specified.")
   }