Skip to content

Commit

Permalink
adding allele freq distribution in the plots
Browse files Browse the repository at this point in the history
  • Loading branch information
jeffersonfparil committed Apr 16, 2024
1 parent be9c6d1 commit e5ecbad
Show file tree
Hide file tree
Showing 9 changed files with 19,571 additions and 23,192 deletions.
4,947 changes: 2,291 additions & 2,656 deletions res/cocksfoot-Concordance.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
5,075 changes: 2,333 additions & 2,742 deletions res/cocksfoot-Concordance_high_confidence_data.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
4,995 changes: 2,272 additions & 2,723 deletions res/cocksfoot-Mean_absolute_error.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
5,113 changes: 2,309 additions & 2,804 deletions res/cocksfoot-Mean_absolute_error_high_confidence_data.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
6,207 changes: 2,835 additions & 3,372 deletions res/grape-Concordance.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
6,297 changes: 2,853 additions & 3,444 deletions res/grape-Mean_absolute_error.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
131 changes: 70 additions & 61 deletions res/perf_plot.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,17 @@ plot_metrics = function(df, dataset, vec_2_metrics=c("mae_frequencies", "concord
# }
# }
# vec_2_metrics = c("mae_frequencies", "concordance_classes"); vec_2_metrics_labels = c("Mean absolute error", "Concordance")
# Sort algorithms according to increasing complexity and with LinkImpute at the bottom as it will not be assessed for polyploid and pool datasets
### Load allele frequencies
source("perf_functions.R")
vcf = vcfR::read.vcfR(paste0("../misc/", dataset, ".vcf"), verbose=TRUE)
mat_genotypes = fn_extract_allele_frequencies(vcf=vcf, min_depth=10, max_depth=200)$mat_genotypes
vec_mu_af = colMeans(mat_genotypes, na.rm=TRUE)
idx_af = which((vec_mu_af>=0.01) & (vec_mu_af<=(1-0.01)))
vec_af = as.vector(mat_genotypes[, idx_af])
vec_af = vec_af[!is.na(vec_af)]
vec_af = c(vec_af, 1-vec_af)
txtplot::txtdensity(vec_af)
### Sort algorithms according to increasing complexity and with LinkImpute at the bottom as it will not be assessed for polyploid and pool datasets
df$algorithm = as.character(df$algorithm)
df$algorithm[df$algorithm=="mvi"] = "a"
df$algorithm[df$algorithm=="aldknni_fixed"] = "b"
Expand All @@ -30,49 +40,58 @@ plot_metrics = function(df, dataset, vec_2_metrics=c("mae_frequencies", "concord
levels(df$algorithm)[levels(df$algorithm)=="b"] = "AFIXED"
levels(df$algorithm)[levels(df$algorithm)=="c"] = "AOPTIM"
levels(df$algorithm)[levels(df$algorithm)=="d"] = "LINKIMPUTE"

# agg_concordance = aggregate(concordance_classes ~ algorithm + maf + missing_rate, data=df, FUN=mean)
### Mean concordance
agg_concordance = aggregate(concordance_classes ~ algorithm, data=df, FUN=mean)
agg_concordance = agg_concordance[order(agg_concordance$concordance_classes, decreasing=TRUE), ]
print(agg_concordance)

# agg_duration = aggregate(duration_mins ~ algorithm + maf + missing_rate, data=df, FUN=mean)
### Mean computation time
agg_duration = aggregate(duration_mins ~ algorithm, data=df, FUN=mean)
agg_duration = agg_duration[order(agg_duration$duration_mins, decreasing=FALSE), ]
print(agg_duration)

### Plot colours
vec_algorithm = levels(df$algorithm)
print(paste(vec_algorithm, collapse=" | "))
vec_maf = sort(unique(df$maf))
vec_missing_rate = sort(unique(df$missing_rate))
vec_colours = c("#b2df8a", "#33a02c", "#1f78b4", "#a6cee3")
vec_colours = rep(vec_colours, times=ceiling(length(vec_algorithm)/length(vec_colours)))[1:length(vec_algorithm)]

#### Plot per dataset
vec_fnames_svg = c()
for (i in 1:length(vec_2_metrics)) {
# i = 1
metric = vec_2_metrics[i]
metric_label = vec_2_metrics_labels[i]
if (grepl("mae", metric) | grepl("rmse", metric)) {
eval(parse(text=paste0("ylim = c(0, max(df$", metric, ", na.rm=TRUE)+(2*sd(df$", metric, ", na.rm=TRUE)))")))
} else {
ylim = c(0, 1.17)
}
# if (grepl("mae", metric) | grepl("rmse", metric)) {
# eval(parse(text=paste0("ylim = c(0, max(df$", metric, ", na.rm=TRUE)+(2*sd(df$", metric, ", na.rm=TRUE)))")))
# } else {
# ylim = c(0, 1.17)
# }
fname_svg = paste0(dataset, "-", gsub(" ", "_", metric_label), ".svg")
vec_fnames_svg = c(vec_fnames_svg, fname_svg)
svg(fname_svg, width=15, height=9)
n_cols = 48
layout(matrix(c(rep(1, n_cols*(5/8)), rep(2, n_cols*(3/8)),
rep(3, n_cols*(5/8)), rep(4, n_cols*(3/8)),
rep(5:8, each=n_cols/4),
rep(9:12, each=n_cols/4)
), byrow=TRUE, ncol=n_cols))
layout(matrix(c(
1, 1, 1, 2,
1, 1, 1, 2,
1, 1, 1, 2,
1, 1, 1, 2,
3, 3, 3, 4,
3, 3, 3, 4,
3, 3, 3, 4,
3, 3, 3, 4,
5, 6, 7, 8,
5, 6, 7, 8,
5, 6, 7, 8,
9,10,11,12,
9,10,11,12,
9,10,11,12
), byrow=TRUE, ncol=4))
par(mar=c(5,5,5,1))
for (maf in vec_maf) {
# maf = vec_maf[1]
subdf = df[df$maf == maf, ]
df_algo_miss = expand.grid(algorithm=vec_algorithm, missing_rate=vec_missing_rate)
eval(parse(text=paste0("agg_mu = aggregate(", metric, " ~ algorithm + missing_rate, data=subdf, FUN=mean, na.rm=FALSE)")))
eval(parse(text=paste0("agg_sd = aggregate(", metric, " ~ algorithm + missing_rate, data=subdf, FUN=sd, na.rm=TRUE); agg_sd$", metric, "[is.na(agg_sd$", metric, ")] = 0")))
eval(parse(text=paste0("agg_sd = aggregate(", metric, " ~ algorithm + missing_rate, data=subdf, FUN=function(x){3*sd(x,na.rm=TRUE)}); agg_sd$", metric, "[is.na(agg_sd$", metric, ")] = 0")))
agg_mu = merge(df_algo_miss, agg_mu, by=c("algorithm", "missing_rate"), all=TRUE)
agg_sd = merge(df_algo_miss, agg_sd, by=c("algorithm", "missing_rate"), all=TRUE)
idx_sort = order(agg_mu$missing_rate)
Expand All @@ -88,37 +107,33 @@ plot_metrics = function(df, dataset, vec_2_metrics=c("mae_frequencies", "concord
mat_sd = mat_sd[idx_sort, ]
### Barplot
par(xpd=TRUE) ### xpd=TRUE allows us to place the legend outside the plot area
bplot = barplot(mat_mu, beside=TRUE, col=vec_colours, border=NA, ylim=ylim, main=paste0("maf = ", maf), xlab="Sparsity (missing/total)", ylab=metric_label, las=1)
bplot = barplot(mat_mu, beside=TRUE, col=vec_colours, ylim=c(0, max(mat_mu, na.rm=TRUE)+max(c(0.01, max(agg_sd[,3], na.rm=TRUE)))), border=NA, main=paste0("maf = ", maf), xlab="Sparsity (missing/total)", ylab=metric_label, las=1)
if (maf==min(vec_maf)) {
legend("topright", inset=c(-0.001, -0.5), legend=vec_algorithm, fill=vec_colours, bty="n", horiz=TRUE)
legend("topleft", inset=c(-0.001, -0.5), legend=vec_algorithm, fill=vec_colours, bty="n", horiz=TRUE)
legend("topright", inset=c(-0.001, -0.5), legend="Error bars refer to ± 3 standard deviations", bty="n", horiz=TRUE)
par(xpd=FALSE)
}
arrows(bplot, mat_mu-mat_sd,
bplot, mat_mu+mat_sd, length=0.05, angle=90, code=3)
grid()
text_pos = mat_mu+mat_sd
text_pos = text_pos + (0.1*text_pos)
if ((grepl("mae", metric) | grepl("rmse", metric)) == FALSE) {
text_pos[text_pos < 0.1] = 0.1 ### So that the labels are still visible
}
# text_pos = mat_mu+mat_sd
# text_pos = text_pos + (0.1*text_pos)
# if ((grepl("mae", metric) | grepl("rmse", metric)) == FALSE) {
# text_pos[text_pos < 0.1] = 0.1 ### So that the labels are still visible
# }
par(xpd=TRUE)
text((bplot-0.25), text_pos, labels=round(mat_mu,4), cex=0.9, srt=90, pos=4)
text((bplot-0.25), 0.0, labels=round(mat_mu,4), cex=0.9, srt=90, pos=4)
par(xpd=FALSE)
### Line plot
plot(x=range(agg_mu[,2], na.rm=TRUE), y=range(c(agg_mu[,3]-(2*agg_sd[,3]), agg_mu[,3]+(2*agg_sd[,3])), na.rm=TRUE),
main=paste0("maf = ", maf), xlab="Sparsity (missing/total)", ylab=metric_label, las=1, type="n")
grid()
for (algo in vec_algorithm) {
# algo = vec_algorithm[1]
idx = agg_mu$algorithm == algo
agg_mu_sub = agg_mu[idx, ]
agg_sd_sub = agg_sd[idx, ]
lines(x=agg_mu_sub[,2], y=agg_mu_sub[,3], col=vec_colours[vec_algorithm==algo], lwd=2)
points(x=agg_mu_sub[,2], y=agg_mu_sub[,3], col="black", bg=vec_colours[vec_algorithm==algo], pch=23)
arrows(x0=agg_mu_sub[,2], y0=agg_mu_sub[,3]-agg_sd_sub[,3],
x1=agg_mu_sub[,2], y1=agg_mu_sub[,3]+agg_sd_sub[,3], length=0.05, angle=90, code=3)
}
### Allele frequency distribution
idx_af = which((vec_mu_af>=maf) & (vec_mu_af<=(1-maf)))
vec_af = as.vector(mat_genotypes[, idx_af])
vec_af = vec_af[!is.na(vec_af)]
vec_af = c(vec_af, 1-vec_af)
d = density(vec_af)
plot(d, main=paste0("maf = ", maf), xlab="Allele frequency", ylab="Density", las=1, bty="n")
}

### FOR MAF=5% ONLY:

### Plot mae across expected freqs at 5% MAF
subdf = df[df$maf == 0.05, ]
q = seq(0, 1, by=0.1)
Expand All @@ -128,9 +143,10 @@ plot_metrics = function(df, dataset, vec_2_metrics=c("mae_frequencies", "concord
idx_col_mae = which(grepl("^mae_", colnames(subdf)))
plot(x=c(0,1), y=range(subdf[, idx_col_mae], na.rm=TRUE), type="n", xlab="Observed allele frequencies (5% MAF)", ylab="Imputation error\n(mean absolute error)", main=algo, las=1)
colour = vec_colours[vec_algorithm==algo]
for (missing_rate in vec_missing_rate) {
# missing_rate = vec_missing_rate[1]
eval(parse(text=paste0("colour = rgb(", paste(as.vector(col2rgb(colour)), collapse = "/256,"), "/256, alpha=", missing_rate, ")")))
for (i in 1:6) {
# i = 1
missing_rate = vec_missing_rate[i]
# eval(parse(text=paste0("colour = rgb(", paste(as.vector(col2rgb(colour)), collapse = "/256,"), "/256, alpha=", missing_rate, ")")))
idx = which((subdf$algorithm == algo) & (subdf$missing_rate == missing_rate))
if (length(idx) > 1) {
mae_across_freqs = NULL
Expand All @@ -152,19 +168,15 @@ plot_metrics = function(df, dataset, vec_2_metrics=c("mae_frequencies", "concord
# ### The grape dataset only includes the minor alleles, hence to get the full picture we need to unfold the alleles to represent the reference alleles
# lines(x=q[idx], y=(mae_across_freqs[idx] + rev(mae_across_freqs[idx]))/2, lwd=7*missing_rate, col=colour)
### The grape dataset uses the minor allele frequencies, hence to be uniform across all the datasets we used we take the additive inverse
lines(x=1.00-q[idx], y=mae_across_freqs[idx], lwd=7*missing_rate, col=colour)
lines(x=1.00-q[idx], y=mae_across_freqs[idx], lty=i, col=colour)
} else {
### The other datasets have all the alleles present and should expectedly generate a bell-ish curve or MAE across allele frequencies
lines(x=q[idx], y=mae_across_freqs[idx], lwd=7*missing_rate, col=colour)
lines(x=q[idx], y=mae_across_freqs[idx], lty=i, col=colour)
}
}
grid()
if (algo == vec_algorithm[1]) {
n0 = 1
n2 = length(unique(subdf$missing_rate))
n1 = floor(n2/2)
legend("topleft", legend=c("Sparsity", vec_missing_rate[n0:n1]), col=c(0, rgb(0,0,0,alpha=vec_missing_rate[n0:n1])), lwd=c(0, 7*vec_missing_rate[n0:n1]), bty="n")
legend("top", legend=c("", vec_missing_rate[(n1+1):n2]), col=c(0, rgb(0,0,0,alpha=vec_missing_rate[(n1+1):n2])), lwd=c(0, 7*vec_missing_rate[(n1+1):n2]), bty="n")
legend("topleft", legend=c("Sparsity", vec_missing_rate[1:6]), col=c(0, rep("black", 6)), lty=c(1,1:6), bty="n", cex=0.75)
}
par(mar=c(5, 4, 4, 2) + 0.1)
}
Expand All @@ -184,9 +196,10 @@ plot_metrics = function(df, dataset, vec_2_metrics=c("mae_frequencies", "concord
y_temp = data.frame(y=unlist(subdf[idx_temp, idx_col[seq(2, length(idx_col), by=2)]]), b=rep(1:n_bins, each=length(idx_temp)), m=rep(subdf$missing_rate[idx_temp], time=n_bins))
y_temp = aggregate(y ~ b + m, data=y_temp, FUN=mean, na.rm=TRUE)[,3]
plot(x=range(subdf[, idx_col[c(1, (length(idx_col)-1))]], na.rm=TRUE), y=range(y_temp, na.rm=TRUE), type="n", xlab="Observed variance in allele frequency per locus (5% MAF)", ylab="Imputation error\n(mean absolute error)", main=algo, las=1)
for (missing_rate in vec_missing_rate) {
# missing_rate = vec_missing_rate[1]
eval(parse(text=paste0("colour = rgb(", paste(as.vector(col2rgb(colour)), collapse = "/256,"), "/256, alpha=", missing_rate, ")")))
for (i in 1:6) {
# i = 1
missing_rate = vec_missing_rate[i]
# eval(parse(text=paste0("colour = rgb(", paste(as.vector(col2rgb(colour)), collapse = "/256,"), "/256, alpha=", missing_rate, ")")))
idx_row = which((subdf$algo == algo) & (subdf$missing_rate == missing_rate))
vec_bin = c()
vec_mae = c()
Expand All @@ -195,15 +208,11 @@ plot_metrics = function(df, dataset, vec_2_metrics=c("mae_frequencies", "concord
vec_bin = c(vec_bin, mean(subdf[idx_row, idx_col[2*(j-1)+1]]))
vec_mae = c(vec_mae, mean(subdf[idx_row, idx_col[2*j]]))
}
lines(x=vec_bin, y=vec_mae, lwd=7*missing_rate, col=colour)
lines(x=vec_bin, y=vec_mae, lty=i, col=colour)
}
grid()
if (algo == vec_algorithm[1]) {
n0 = 1
n2 = length(unique(subdf$missing_rate))
n1 = floor(n2/2)
legend("topleft", legend=c("Sparsity", vec_missing_rate[n0:n1]), col=c(0, rgb(0,0,0,alpha=vec_missing_rate[n0:n1])), lwd=c(0, 7*vec_missing_rate[n0:n1]), bty="n")
legend("top", legend=c("", vec_missing_rate[(n1+1):n2]), col=c(0, rgb(0,0,0,alpha=vec_missing_rate[(n1+1):n2])), lwd=c(0, 7*vec_missing_rate[(n1+1):n2]), bty="n")
legend("topleft", legend=c("Sparsity", vec_missing_rate[1:6]), col=c(0, rep("black", 6)), lty=c(1, 1:6), bty="n", cex=0.75)
}
par(mar=c(5, 4, 4, 2) +0.1)
}
Expand Down
Loading

0 comments on commit e5ecbad

Please sign in to comment.