Skip to content

Commit

Permalink
polyRAD performs spectacularly poorly on prelim test most probably du…
Browse files Browse the repository at this point in the history
…e to not using it's calls prioir to simulating missing data as the expected data
  • Loading branch information
jeffersonfparil committed Oct 24, 2024
1 parent e564d44 commit 0152f8f
Showing 1 changed file with 43 additions and 34 deletions.
77 changes: 43 additions & 34 deletions res/perf_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,9 @@ fn_imputation_accuracy = function(fname_imputed, list_sim_missing, mat_idx_high_
}

### PolyRAD
### 2024-10-25
### PERFORMS SPECTACULARYLY POORLY USING THE AD FIELDS AS FREQUENCIES PER SE (~3X worse than imputef for cocksfoot at 0.01 maf ant 10% missing)!!!
### WILL NEED TO TEST USING ITS OWN CALLS TO SIMULATE MISSING DATA (I.E. AS EXPECTED ALLELE FREQUENCIES).
vec_polyrad_required_packages = c("polyRAD")
vec_polyrad_required_packages = vec_polyrad_required_packages[!(vec_polyrad_required_packages %in% installed.packages()[,"Package"])]
if (length(vec_polyrad_required_packages) > 0) {
Expand All @@ -343,7 +346,7 @@ if (length(vec_polyrad_required_packages) > 0) {
}
library(polyRAD)
library(VariantAnnotation)
fn_polyRAD = function(fname_vcf, ploidy=2, read_length_less_barcode=64) {
fn_polyRAD = function(fname_vcf, ploidy=2, read_length_less_barcode=64, output_format=c("tsv", "vcf")[1]) {
##############################
### TEST
# vcf = vcfR::read.vcfR("../misc/grape.vcf")
Expand All @@ -361,6 +364,7 @@ fn_polyRAD = function(fname_vcf, ploidy=2, read_length_less_barcode=64) {
# fname_vcf = list_sim_missing$fname_vcf
# ploidy = 4
# read_length_less_barcode = 64
# output_format=c("matrix", "vcf")[2]
##############################
### Adapted from: https://pmc.ncbi.nlm.nih.gov/articles/PMC6404598/
# prepare the VCF file for import
Expand All @@ -378,48 +382,53 @@ fn_polyRAD = function(fname_vcf, ploidy=2, read_length_less_barcode=64) {
# # estimate contamination rate
# myRAD = SetBlankTaxa(myRAD, c(“blank1”, “blank2”))
# myRAD = EstimateContaminationRate(myRAD)
# ### Use the recommended typical contamination rate
# myRAD = 1/1000
# # genotype estimation with pop. structure pipeline
# myRAD = IteratePopStructLD(myRAD, LDdist = 5e4)
list_out = polyRAD::IterateHWE(myRAD)
G = GetWeightedMeanGenotypes(list_out)
dim(G)
vec_q = sample(G, size=1000, replace=FALSE)
vec_q = c(vec_q, 1-vec_q)
txtplot::txtdensity(vec_q)

list_strsplit_loci_names = strsplit(vec_loci_names, ":")
vec_chromosome_names = unlist(lapply(list_strsplit_loci_names, FUN=function(x){x[1]}))
vec_positions = unlist(lapply(list_strsplit_loci_names, FUN=function(x){as.numeric(unlist(strsplit(x[2], "_"))[1])}))
vec_alleles = unlist(lapply(list_strsplit_loci_names, FUN=function(x){unlist(strsplit(x[2], "_"))[3]}))
# vec_alleles = unlist(lapply(list_strsplit_loci_names, FUN=function(x){unlist(strsplit(unlist(strsplit(x[2], "_"))[2], "/"))[1]}))
vec_ids = rownames(G)
df_out = data.frame(
chr=vec_chromosome_names,
pos=vec_positions,
allele=vec_alleles,
t(G),
check.names=FALSE
)
str(df_out)
fname_out = paste0("POLYRAD-IMPUTED-", round(runif(min=1e10, max=(1e11)-1, n=1)), ".tsv")
write.table(x=df_out, file=fname_out, sep="\t", row.names=FALSE, col.names=TRUE, quote=FALSE)
if (output_format == "vcf") {
vcf = RADdata2VCF(list_out)
fname_out = paste0(fname_vcf, "-POLYRAD_CALLs-", round(runif(min=1e10, max=(1e11)-1, n=1)), ".vcf")
} else {
G = GetWeightedMeanGenotypes(list_out)
dim(G)
head(colnames(G))
head(rownames(G))
vec_q = sample(G, size=1000, replace=FALSE)
vec_q = c(vec_q, 1-vec_q)
txtplot::txtdensity(vec_q)
### Use the reference allele frequencies (that's why we do 1-t(G)) and use the first allele (e.g. A/T, we use A)
list_strsplit_loci_names = strsplit(colnames(G), ":")
vec_chromosome_names = unlist(lapply(list_strsplit_loci_names, FUN=function(x){x[1]}))
vec_positions = unlist(lapply(list_strsplit_loci_names, FUN=function(x){as.numeric(unlist(strsplit(x[2], "_"))[1])}))
# vec_alleles = unlist(lapply(list_strsplit_loci_names, FUN=function(x){unlist(strsplit(x[2], "_"))[3]})) ### alternative alleles
vec_alleles = unlist(lapply(list_strsplit_loci_names, FUN=function(x){unlist(strsplit(unlist(strsplit(x[2], "_"))[2], "/"))[1]}))
vec_ids = rownames(G)
df_out = data.frame(
chr=vec_chromosome_names,
pos=vec_positions,
allele=vec_alleles,
1.00 - t(G),
check.names=FALSE
)
str(df_out)
fname_out = paste0("POLYRAD-IMPUTED-", round(runif(min=1e10, max=(1e11)-1, n=1)), ".tsv")
write.table(x=df_out, file=fname_out, sep="\t", row.names=FALSE, col.names=TRUE, quote=FALSE)
return(fname_out)
### Test imputation accuracy metrics:
# metrics = fn_imputation_accuracy(fname_imputed=fname_out,
# list_sim_missing=list_sim_missing,
# mat_idx_high_conf_data=mat_idx_high_conf_data,
# ploidy=ploidy,
# strict_boundaries=FALSE,
# mat_genotypes=mat_genotypes,
# n_threads=2)
}
# # free up memory (removes the alleleFreq)
# list_out = StripDown(list_out)
# str(list_out)
# list_out$alleleFreq
# export for GAPIT
# myGM_GD = ExportGAPIT(list_out)
### Test imputation accuracy metrics:
# metrics = fn_imputation_accuracy(fname_imputed=fname_out,
# list_sim_missing=list_sim_missing,
# mat_idx_high_conf_data=mat_idx_high_conf_data,
# ploidy=ploidy,
# strict_boundaries=strict_boundaries,
# mat_genotypes=mat_genotypes,
# n_threads=n_threads)
return(fname_out)
}

### Performance assessment function
Expand Down

0 comments on commit 0152f8f

Please sign in to comment.