polyRAD performs spectacularly poorly on prelim test most probably du…

…e to not using it's calls prioir to simulating missing data as the expected data
jeffersonfparil · Oct 24, 2024 · 0152f8f · 0152f8f
1 parent e564d44
commit 0152f8f
Showing 1 changed file with 43 additions and 34 deletions.
diff --git a/res/perf_functions.R b/res/perf_functions.R
@@ -327,6 +327,9 @@ fn_imputation_accuracy = function(fname_imputed, list_sim_missing, mat_idx_high_
 }
 
 ### PolyRAD
+### 2024-10-25
+### PERFORMS SPECTACULARYLY POORLY USING THE AD FIELDS AS FREQUENCIES PER SE (~3X worse than imputef for cocksfoot at 0.01 maf ant 10% missing)!!!
+### WILL NEED TO TEST USING ITS OWN CALLS TO SIMULATE MISSING DATA (I.E. AS EXPECTED ALLELE FREQUENCIES).
 vec_polyrad_required_packages = c("polyRAD")
 vec_polyrad_required_packages = vec_polyrad_required_packages[!(vec_polyrad_required_packages %in% installed.packages()[,"Package"])]
 if (length(vec_polyrad_required_packages) > 0) {
@@ -343,7 +346,7 @@ if (length(vec_polyrad_required_packages) > 0) {
 }
 library(polyRAD)
 library(VariantAnnotation)
-fn_polyRAD = function(fname_vcf, ploidy=2, read_length_less_barcode=64) {
+fn_polyRAD = function(fname_vcf, ploidy=2, read_length_less_barcode=64, output_format=c("tsv", "vcf")[1]) {
     ##############################
     ### TEST
     # vcf = vcfR::read.vcfR("../misc/grape.vcf")
@@ -361,6 +364,7 @@ fn_polyRAD = function(fname_vcf, ploidy=2, read_length_less_barcode=64) {
     # fname_vcf = list_sim_missing$fname_vcf
     # ploidy = 4
     # read_length_less_barcode = 64
+    # output_format=c("matrix", "vcf")[2]
     ##############################
     ### Adapted from: https://pmc.ncbi.nlm.nih.gov/articles/PMC6404598/
     # prepare the VCF file for import
@@ -378,48 +382,53 @@ fn_polyRAD = function(fname_vcf, ploidy=2, read_length_less_barcode=64) {
     # # estimate contamination rate
     # myRAD = SetBlankTaxa(myRAD, c(“blank1”, “blank2”))
     # myRAD = EstimateContaminationRate(myRAD)
-    # ### Use the recommended typical contamination rate
-    # myRAD = 1/1000
     # # genotype estimation with pop. structure pipeline
     # myRAD = IteratePopStructLD(myRAD, LDdist = 5e4)
     list_out = polyRAD::IterateHWE(myRAD)
-    G = GetWeightedMeanGenotypes(list_out)
-    dim(G)
-    vec_q = sample(G, size=1000, replace=FALSE)
-    vec_q = c(vec_q, 1-vec_q)
-    txtplot::txtdensity(vec_q)
-
-    list_strsplit_loci_names = strsplit(vec_loci_names, ":")
-    vec_chromosome_names = unlist(lapply(list_strsplit_loci_names, FUN=function(x){x[1]}))
-    vec_positions = unlist(lapply(list_strsplit_loci_names, FUN=function(x){as.numeric(unlist(strsplit(x[2], "_"))[1])}))
-    vec_alleles = unlist(lapply(list_strsplit_loci_names, FUN=function(x){unlist(strsplit(x[2], "_"))[3]}))
-    # vec_alleles = unlist(lapply(list_strsplit_loci_names, FUN=function(x){unlist(strsplit(unlist(strsplit(x[2], "_"))[2], "/"))[1]}))
-    vec_ids = rownames(G)
-    df_out = data.frame(
-        chr=vec_chromosome_names,
-        pos=vec_positions,
-        allele=vec_alleles,
-        t(G),
-        check.names=FALSE
-    )
-    str(df_out)
-    fname_out = paste0("POLYRAD-IMPUTED-", round(runif(min=1e10, max=(1e11)-1, n=1)), ".tsv")
-    write.table(x=df_out, file=fname_out, sep="\t", row.names=FALSE, col.names=TRUE, quote=FALSE)
+    if (output_format == "vcf") {
+        vcf = RADdata2VCF(list_out)
+        fname_out = paste0(fname_vcf, "-POLYRAD_CALLs-", round(runif(min=1e10, max=(1e11)-1, n=1)), ".vcf")
+    } else {
+        G = GetWeightedMeanGenotypes(list_out)
+        dim(G)
+        head(colnames(G))
+        head(rownames(G))
+        vec_q = sample(G, size=1000, replace=FALSE)
+        vec_q = c(vec_q, 1-vec_q)
+        txtplot::txtdensity(vec_q)
+        ### Use the reference allele frequencies (that's why we do 1-t(G)) and use the first allele (e.g. A/T, we use A)
+        list_strsplit_loci_names = strsplit(colnames(G), ":")
+        vec_chromosome_names = unlist(lapply(list_strsplit_loci_names, FUN=function(x){x[1]}))
+        vec_positions = unlist(lapply(list_strsplit_loci_names, FUN=function(x){as.numeric(unlist(strsplit(x[2], "_"))[1])}))
+        # vec_alleles = unlist(lapply(list_strsplit_loci_names, FUN=function(x){unlist(strsplit(x[2], "_"))[3]})) ### alternative alleles
+        vec_alleles = unlist(lapply(list_strsplit_loci_names, FUN=function(x){unlist(strsplit(unlist(strsplit(x[2], "_"))[2], "/"))[1]}))
+        vec_ids = rownames(G)
+        df_out = data.frame(
+            chr=vec_chromosome_names,
+            pos=vec_positions,
+            allele=vec_alleles,
+            1.00 - t(G),
+            check.names=FALSE
+        )
+        str(df_out)
+        fname_out = paste0("POLYRAD-IMPUTED-", round(runif(min=1e10, max=(1e11)-1, n=1)), ".tsv")
+        write.table(x=df_out, file=fname_out, sep="\t", row.names=FALSE, col.names=TRUE, quote=FALSE)
+        return(fname_out)
+        ### Test imputation accuracy metrics:
+        # metrics = fn_imputation_accuracy(fname_imputed=fname_out,
+        #     list_sim_missing=list_sim_missing,
+        #     mat_idx_high_conf_data=mat_idx_high_conf_data,
+        #     ploidy=ploidy,
+        #     strict_boundaries=FALSE,
+        #     mat_genotypes=mat_genotypes,
+        #     n_threads=2)
+    }
     # # free up memory (removes the alleleFreq)
     # list_out = StripDown(list_out)
     # str(list_out)
     # list_out$alleleFreq
     # export for GAPIT
     # myGM_GD = ExportGAPIT(list_out)
-    ### Test imputation accuracy metrics:
-    # metrics = fn_imputation_accuracy(fname_imputed=fname_out,
-    #     list_sim_missing=list_sim_missing,
-    #     mat_idx_high_conf_data=mat_idx_high_conf_data,
-    #     ploidy=ploidy,
-    #     strict_boundaries=strict_boundaries,
-    #     mat_genotypes=mat_genotypes,
-    #     n_threads=n_threads)
-    return(fname_out)
 }
 
 ### Performance assessment function