|
| 1 | +library(Biostrings) |
| 2 | +library(DECIPHER) |
| 3 | +library(stringi) |
| 4 | +library(stringr) |
| 5 | +library(ape) |
| 6 | +library(igraph) |
| 7 | +library(reshape2) |
| 8 | +library(dplyr) |
| 9 | +library(uwot) |
| 10 | +library(RColorBrewer) |
| 11 | +library(ggplot2) |
| 12 | +library(ggthemes) |
| 13 | +library(plotly) |
| 14 | +library(ggnetwork) |
| 15 | + |
| 16 | +# Load color palettes |
| 17 | + |
| 18 | +kev_palette <- c( |
| 19 | + "dodgerblue2", "#E31A1C", |
| 20 | + "green4", |
| 21 | + "#6A3D9A", |
| 22 | + "#FF7F00", |
| 23 | + "black", "gold1", |
| 24 | + "skyblue2", "#FB9A99", |
| 25 | + "palegreen2", |
| 26 | + "#CAB2D6", |
| 27 | + "#FDBF6F", |
| 28 | + "gray70", "khaki2", |
| 29 | + "maroon", "orchid1", "deeppink1", "blue1", "steelblue4", |
| 30 | + "darkturquoise", "green1", "yellow4", "yellow3", |
| 31 | + "darkorange4", "brown" |
| 32 | +) |
| 33 | + |
| 34 | +qual_palettes = brewer.pal.info[brewer.pal.info$category == "qual", ] |
| 35 | +qual_vector = unlist(mapply(brewer.pal, qual_palettes$maxcolors, rownames(qual_palettes))) |
| 36 | + |
| 37 | +# Functions |
| 38 | + |
| 39 | +align_get <- function(fasta, align) { |
| 40 | + |
| 41 | + covid_seq <- readDNAStringSet(fasta) |
| 42 | + if (length(covid_seq) > 10) { |
| 43 | + stop("Too many sequences in fasta file, only up to 10 viral genomes allowed at a time.") |
| 44 | + } |
| 45 | + if (length(covid_seq) < 1) { |
| 46 | + stop("No sequences found in fasta file.") |
| 47 | + } |
| 48 | + if (length(which((lapply(covid_seq, length)) < 29000)) > 0) { |
| 49 | + stop("One or more sequences not complete (length < 29000 nucleotides).") |
| 50 | + } |
| 51 | + covid_align <- align |
| 52 | + covid_seq <- RemoveGaps(covid_seq, removeGaps = "all", processors = NULL) |
| 53 | + fasta_final <- AlignProfiles(covid_align, covid_seq) |
| 54 | + return(fasta_final) |
| 55 | + |
| 56 | +} |
| 57 | + |
| 58 | +dist_get <- function(align) { |
| 59 | + |
| 60 | + dec_dist <- dist.dna(as.DNAbin(align), model = "K80", as.matrix = TRUE, pairwise.deletion = FALSE) |
| 61 | + colnames(dec_dist) <- (str_split_fixed(colnames(dec_dist), fixed("."), 2)[,1]) |
| 62 | + rownames(dec_dist) <- (str_split_fixed(rownames(dec_dist), fixed("."), 2)[,1]) |
| 63 | + return(dec_dist) |
| 64 | + |
| 65 | +} |
| 66 | + |
| 67 | +umap_process <- function(covid_dist, meta_df) { |
| 68 | + |
| 69 | + acc_names = rownames(covid_dist) |
| 70 | + covid_dist <- dist(covid_dist) |
| 71 | + set.seed(2020) |
| 72 | + covid_umap <- uwot::umap(covid_dist, init = "spectral", metric = "cosine", n_neighbors = 50, min_dist = 0.001, spread = 40, local_connectivity = 10) |
| 73 | + covid_umap_df <- as.data.frame(covid_umap) |
| 74 | + umap_df_final <- data.frame("Accession" = acc_names, "UMAP_1" = covid_umap_df[,1], "UMAP_2" = covid_umap_df[,2]) |
| 75 | + umap_df_final <- merge(umap_df_final, meta_df) |
| 76 | + colnames(umap_df_final) <- c("Accession", "UMAP_1", "UMAP_2", "Region", "Country", "Date") |
| 77 | + return(umap_df_final) |
| 78 | + |
| 79 | +} |
| 80 | + |
| 81 | +mst_graph <- function(covid_dist, meta_data) { |
| 82 | + |
| 83 | + g <- graph.adjacency(covid_dist, mode = "undirected", weighted = TRUE, diag = FALSE) |
| 84 | + g_mst <- mst(g) |
| 85 | + acc_ordering <- match(meta_data[,1], names(V(g_mst)[[]])) |
| 86 | + meta_ordered <- meta_data[order(acc_ordering), ] |
| 87 | + |
| 88 | + g_mst_1 <- g_mst |
| 89 | + g_mst_2 <- g_mst |
| 90 | + g_mst_3 <- g_mst |
| 91 | + |
| 92 | + meta_colors_1 <- (kev_palette[1:length(unique(meta_ordered[,(ncol(meta_ordered)-2)]))])[factor(meta_ordered[, (ncol(meta_ordered)-2)])] |
| 93 | + meta_colors_2 <- (qual_vector[1:length(unique(meta_ordered[,(ncol(meta_ordered)-1)]))])[factor(meta_ordered[, (ncol(meta_ordered)-1)])] |
| 94 | + color_ramp <- colorRampPalette(c("dodgerblue2", "white", "firebrick1"))(max(meta_ordered[,(ncol(meta_ordered))])) |
| 95 | + meta_colors_3 <- color_ramp[meta_ordered[,(ncol(meta_ordered))]] |
| 96 | + |
| 97 | + V(g_mst_1)$color <- meta_colors_1 |
| 98 | + V(g_mst_2)$color <- meta_colors_2 |
| 99 | + V(g_mst_3)$color <- meta_colors_3 |
| 100 | + |
| 101 | + V(g_mst_1)$Region <- meta_ordered[,(ncol(meta_ordered)-2)] |
| 102 | + V(g_mst_2)$Country <- meta_ordered[,(ncol(meta_ordered)-1)] |
| 103 | + V(g_mst_3)$Date <- meta_ordered[,(ncol(meta_ordered))] |
| 104 | + |
| 105 | + lay <- layout_with_graphopt(g_mst_1, niter = 1000) |
| 106 | + |
| 107 | + g_mst_list <- list(g_mst_1, g_mst_2, g_mst_3, lay) |
| 108 | + return(g_mst_list) |
| 109 | + |
| 110 | +} |
| 111 | + |
| 112 | +snps_get <- function(alignment, metadata, positions) { |
| 113 | + |
| 114 | + align <- alignment |
| 115 | + pos <- paste(positions$Pos, positions$Gene, positions$Effect, sep = " ") |
| 116 | + acc <- metadata[,1] |
| 117 | + meta_vars <- metadata[,-1] |
| 118 | + align_df <- as.data.frame(as.matrix(align)) |
| 119 | + meta_order <- match(rownames(align_df), acc) |
| 120 | + align_df <- align_df[order(meta_order),] |
| 121 | + align_df$m3 <- meta_vars[,3] |
| 122 | + align_df$m2 <- meta_vars[,2] |
| 123 | + align_df$m1 <- meta_vars[,1] # Ensure ordering, cbind() likely reorders rownames and will reorder metadata |
| 124 | + |
| 125 | + freq_pct <- function(d_col) { |
| 126 | + freq_table <- table(d_col) |
| 127 | + pct_table <- as.table(sapply(freq_table, function(x) x/sum(freq_table))) |
| 128 | + return(pct_table) |
| 129 | + } |
| 130 | + |
| 131 | + table_get <- function(df, position, meta_num) { |
| 132 | + pos_full <- position |
| 133 | + pos_num <- as.numeric(str_split_fixed(position, " ", 3)[,1][1]) |
| 134 | + align_pos <- df[,c(pos_num, (ncol(df) - as.numeric(meta_num) + 1))] # Backward ordering |
| 135 | + colnames(align_pos) <- c("position", "meta") |
| 136 | + align_grouped <- group_by(.data = align_pos, meta) |
| 137 | + align_tables <- as.data.frame(do(.data = align_grouped, data.frame(val = freq_pct(.[,1])))) |
| 138 | + align_final <- data.frame("Position" = rep(pos_full, length(align_tables[,1])), "Meta" = align_tables[,1], "Allele" = align_tables[,2], "Freq" = align_tables[,3]) |
| 139 | + return(align_final) |
| 140 | + } |
| 141 | + |
| 142 | + meta_nums <- list(1, 2, 3) |
| 143 | + all_tables <- lapply(meta_nums, function(x) lapply(pos, function(y) table_get(align_df, y, x))) |
| 144 | + table_concat <- lapply(all_tables, function(x) base::Reduce(rbind, x)) |
| 145 | + return(table_concat) |
| 146 | + |
| 147 | +} |
| 148 | + |
| 149 | +umap_plotter <- function(umap_df) { |
| 150 | + |
| 151 | + p1 <- ggplot(data = umap_df, aes(x = UMAP_1, y = UMAP_2)) + |
| 152 | + theme_few() + |
| 153 | + geom_jitter(aes(fill = Region), size = 3, position = "jitter", colour = "black", pch = 21, stroke = 0.25) + |
| 154 | + scale_fill_manual(name = "", values = kev_palette[1:length(unique(umap_df$Region))]) + |
| 155 | + labs(x = "UMAP 1", y = "UMAP 2") + |
| 156 | + theme(axis.ticks.x = element_blank()) + |
| 157 | + theme(axis.ticks.y = element_blank()) + |
| 158 | + theme(axis.text.y = element_blank()) + |
| 159 | + theme(axis.text.x = element_blank()) + |
| 160 | + theme(axis.title.y = element_text(size = 16, face = "bold")) + |
| 161 | + theme(axis.title.x = element_text(size = 16, face = "bold")) + |
| 162 | + theme(legend.title = element_text(size = 15, face = "bold")) + |
| 163 | + theme(legend.text = element_text(size = 14)) |
| 164 | + |
| 165 | + p2 <- ggplot(data = umap_df, aes(x = UMAP_1, y = UMAP_2)) + |
| 166 | + theme_few() + |
| 167 | + geom_jitter(aes(fill = Country), size = 3, position = "jitter", colour = "black", pch = 21, stroke = 0.25) + |
| 168 | + scale_fill_manual(name = "", values = qual_vector[1:length(unique(umap_df$Country))]) + |
| 169 | + labs(x = "UMAP 1", y = "UMAP 2") + |
| 170 | + theme(axis.ticks.x = element_blank()) + |
| 171 | + theme(axis.ticks.y = element_blank()) + |
| 172 | + theme(axis.text.y = element_blank()) + |
| 173 | + theme(axis.text.x = element_blank()) + |
| 174 | + theme(axis.title.y = element_text(size = 16, face = "bold")) + |
| 175 | + theme(axis.title.x = element_text(size = 16, face = "bold")) + |
| 176 | + theme(legend.title = element_text(size = 15, face = "bold")) + |
| 177 | + theme(legend.text = element_text(size = 14)) |
| 178 | + |
| 179 | + p3 <- ggplot(data = umap_df, aes(x = UMAP_1, y = UMAP_2)) + |
| 180 | + theme_few() + |
| 181 | + geom_jitter(aes(color = Date), size = 3, position = "jitter", alpha = 1) + |
| 182 | + scale_color_gradient(name = "Days from \nfirst case", low = "#b92b27", high = "#1565C0") + |
| 183 | + labs(x = "UMAP 1", y = "UMAP 2") + |
| 184 | + theme(axis.ticks.x = element_blank()) + |
| 185 | + theme(axis.ticks.y = element_blank()) + |
| 186 | + theme(axis.text.y = element_blank()) + |
| 187 | + theme(axis.text.x = element_blank()) + |
| 188 | + theme(axis.title.y = element_text(size = 16, face = "bold")) + |
| 189 | + theme(axis.title.x = element_text(size = 16, face = "bold")) + |
| 190 | + theme(legend.title = element_text(size = 15, face = "bold")) + |
| 191 | + theme(legend.text = element_text(size = 14)) |
| 192 | + |
| 193 | + plot_list <- list(p1, p2, p3) |
| 194 | + return(plot_list) |
| 195 | + |
| 196 | +} |
| 197 | + |
| 198 | +mst_plotter <- function(mst_list, meta_df) { |
| 199 | + |
| 200 | + graph_m1 <- mst_list[[1]] |
| 201 | + graph_m2 <- mst_list[[2]] |
| 202 | + graph_m3 <- mst_list[[3]] |
| 203 | + lay <- mst_list[[4]] |
| 204 | + |
| 205 | + ggnet_1 <- ggnetwork(graph_m1, layout = lay) |
| 206 | + p1 <- ggplot(ggnet_1, aes(x = x, y = y, xend = xend, yend = yend)) + |
| 207 | + theme_few() + |
| 208 | + geom_edges(color = "gray") + |
| 209 | + geom_nodes(aes(fill = Region), size = 3) + |
| 210 | + scale_fill_manual(name = "", values = kev_palette[1:length(unique(meta_df$Region))]) + |
| 211 | + theme(axis.title.x = element_blank()) + |
| 212 | + theme(axis.title.y = element_blank()) + |
| 213 | + theme(axis.text.x = element_blank()) + |
| 214 | + theme(axis.text.y = element_blank()) + |
| 215 | + theme(axis.ticks.x = element_blank()) + |
| 216 | + theme(axis.ticks.y = element_blank()) + |
| 217 | + theme(axis.line.x = element_blank()) + |
| 218 | + theme(axis.line.y = element_blank()) + |
| 219 | + theme(legend.title = element_text(size = 15, face = "bold")) + |
| 220 | + theme(legend.text = element_text(size = 14)) |
| 221 | + |
| 222 | + ggnet_2 <- ggnetwork(graph_m2, layout = lay) |
| 223 | + p2 <- ggplot(ggnet_2, aes(x = x, y = y, xend = xend, yend = yend)) + |
| 224 | + theme_few() + |
| 225 | + geom_edges(color = "gray") + |
| 226 | + geom_nodes(aes(fill = Country), size = 3) + |
| 227 | + scale_fill_manual(name = "", values = qual_vector[1:length(unique(meta_df$Geo_Location))]) + |
| 228 | + theme(axis.title.x = element_blank()) + |
| 229 | + theme(axis.title.y = element_blank()) + |
| 230 | + theme(axis.text.x = element_blank()) + |
| 231 | + theme(axis.text.y = element_blank()) + |
| 232 | + theme(axis.ticks.x = element_blank()) + |
| 233 | + theme(axis.ticks.y = element_blank()) + |
| 234 | + theme(axis.line.x = element_blank()) + |
| 235 | + theme(axis.line.y = element_blank()) + |
| 236 | + theme(legend.title = element_text(size = 15, face = "bold")) + |
| 237 | + theme(legend.text = element_text(size = 14)) |
| 238 | + |
| 239 | + ggnet_3 <- ggnetwork(graph_m3, layout = lay) |
| 240 | + p3 <- ggplot(ggnet_3, aes(x = x, y = y, xend = xend, yend = yend)) + |
| 241 | + theme_few() + |
| 242 | + geom_edges(color = "gray") + |
| 243 | + geom_nodes(aes(color = Date), size = 3) + |
| 244 | + scale_color_continuous(name = "Days from \nfirst case", low = "#b92b27", high = "#1565C0") + |
| 245 | + theme(axis.title.x = element_blank()) + |
| 246 | + theme(axis.title.y = element_blank()) + |
| 247 | + theme(axis.text.x = element_blank()) + |
| 248 | + theme(axis.text.y = element_blank()) + |
| 249 | + theme(axis.ticks.x = element_blank()) + |
| 250 | + theme(axis.ticks.y = element_blank()) + |
| 251 | + theme(axis.line.x = element_blank()) + |
| 252 | + theme(axis.line.y = element_blank()) + |
| 253 | + theme(legend.title = element_text(size = 15, face = "bold")) + |
| 254 | + theme(legend.text = element_text(size = 14)) |
| 255 | + |
| 256 | + plot_list <- list(p1, p2, p3) |
| 257 | + return(plot_list) |
| 258 | + |
| 259 | +} |
| 260 | + |
| 261 | +snp_plotter <- function(snp_list, meta_df) { |
| 262 | + |
| 263 | + snps_1 <- snp_list[[1]] |
| 264 | + colnames(snps_1) <- c("Position", "Region", "Allele", "Freq") |
| 265 | + |
| 266 | + snps_2 <- snp_list[[2]] |
| 267 | + colnames(snps_2) <- c("Position", "Country", "Allele", "Freq") |
| 268 | + |
| 269 | + snps_3 <- snp_list[[3]] |
| 270 | + colnames(snps_3) <- c("Position", "Date", "Allele", "Freq") |
| 271 | + |
| 272 | + p1 <- ggplot(data = snps_1, aes(x = Allele, y = Freq)) + |
| 273 | + theme_few () + |
| 274 | + geom_bar(stat = "identity", position = "dodge2", aes(fill = Region), color = "black") + |
| 275 | + scale_fill_manual(name = "", values = kev_palette[1:length(unique(meta_df$Region))]) + |
| 276 | + facet_wrap(~Position, scales = "free_x") + |
| 277 | + labs(x = "Allele", y = "Frequency") + |
| 278 | + theme(axis.text.y = element_text(size = 12)) + |
| 279 | + theme(axis.text.x = element_text(size = 12)) + |
| 280 | + theme(axis.title.y = element_text(size = 16, face = "bold")) + |
| 281 | + theme(axis.title.x = element_text(size = 16, face = "bold")) + |
| 282 | + theme(legend.title = element_text(size = 15, face = "bold")) + |
| 283 | + theme(legend.text = element_text(size = 14)) + |
| 284 | + theme(strip.text = element_text(size = 12, face = "bold")) |
| 285 | + |
| 286 | + p2 <- ggplot(data = snps_2, aes(x = Allele, y = Freq)) + |
| 287 | + theme_few () + |
| 288 | + geom_bar(stat = "identity", position = "dodge2", aes(fill = Country), width = 1) + |
| 289 | + scale_fill_manual(name = "", values = qual_vector[1:length(unique(meta_df$Geo_Location))]) + |
| 290 | + facet_wrap(~Position, scales = "free") + |
| 291 | + labs(x = "Allele", y = "Frequency") + |
| 292 | + theme(axis.text.y = element_text(size = 12)) + |
| 293 | + theme(axis.text.x = element_text(size = 12)) + |
| 294 | + theme(axis.title.y = element_text(size = 16, face = "bold")) + |
| 295 | + theme(axis.title.x = element_text(size = 16, face = "bold")) + |
| 296 | + theme(legend.title = element_text(size = 15, face = "bold")) + |
| 297 | + theme(legend.text = element_text(size = 14)) + |
| 298 | + theme(strip.text = element_text(size = 12, face = "bold")) |
| 299 | + |
| 300 | + int_text = paste("Functionality currently not supported") |
| 301 | + |
| 302 | + p3 <- ggplot(data = snps_3, aes(x = Allele, y = Freq)) + |
| 303 | + theme_few () + |
| 304 | + geom_bar(stat = "identity", position = "dodge2", aes(color = Date)) + |
| 305 | + scale_color_continuous(name = "Days from \nfirst case", low = "#b92b27", high = "#1565C0") + |
| 306 | + facet_wrap(~Position, scales = "free_x") + |
| 307 | + labs(x = "Allele", y = "Frequency") + |
| 308 | + theme(axis.text.y = element_text(size = 12)) + |
| 309 | + theme(axis.text.x = element_text(size = 12)) + |
| 310 | + theme(axis.title.y = element_text(size = 16, face = "bold")) + |
| 311 | + theme(axis.title.x = element_text(size = 16, face = "bold")) + |
| 312 | + theme(legend.title = element_text(size = 15, face = "bold")) + |
| 313 | + theme(legend.text = element_text(size = 14)) + |
| 314 | + theme(strip.text = element_text(size = 12, face = "bold")) |
| 315 | + |
| 316 | + plot_list <- list(p1, p2, p3) |
| 317 | + return(plot_list) |
| 318 | + |
| 319 | +} |
0 commit comments