Skip to content

Commit c8dc681

Browse files
authored
Merge pull request #5 from hsmaan/release
Release
2 parents 7a2cc4e + e7bafb3 commit c8dc681

21 files changed

+5736
-134
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,6 @@
1515
*.vcf
1616
*.log
1717
*.frq
18+
*.dcf
19+
slurm*
1820

R/global.R

Lines changed: 319 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,319 @@
1+
library(Biostrings)
2+
library(DECIPHER)
3+
library(stringi)
4+
library(stringr)
5+
library(ape)
6+
library(igraph)
7+
library(reshape2)
8+
library(dplyr)
9+
library(uwot)
10+
library(RColorBrewer)
11+
library(ggplot2)
12+
library(ggthemes)
13+
library(plotly)
14+
library(ggnetwork)
15+
16+
# Load color palettes
17+
18+
kev_palette <- c(
19+
"dodgerblue2", "#E31A1C",
20+
"green4",
21+
"#6A3D9A",
22+
"#FF7F00",
23+
"black", "gold1",
24+
"skyblue2", "#FB9A99",
25+
"palegreen2",
26+
"#CAB2D6",
27+
"#FDBF6F",
28+
"gray70", "khaki2",
29+
"maroon", "orchid1", "deeppink1", "blue1", "steelblue4",
30+
"darkturquoise", "green1", "yellow4", "yellow3",
31+
"darkorange4", "brown"
32+
)
33+
34+
qual_palettes = brewer.pal.info[brewer.pal.info$category == "qual", ]
35+
qual_vector = unlist(mapply(brewer.pal, qual_palettes$maxcolors, rownames(qual_palettes)))
36+
37+
# Functions
38+
39+
align_get <- function(fasta, align) {
40+
41+
covid_seq <- readDNAStringSet(fasta)
42+
if (length(covid_seq) > 10) {
43+
stop("Too many sequences in fasta file, only up to 10 viral genomes allowed at a time.")
44+
}
45+
if (length(covid_seq) < 1) {
46+
stop("No sequences found in fasta file.")
47+
}
48+
if (length(which((lapply(covid_seq, length)) < 29000)) > 0) {
49+
stop("One or more sequences not complete (length < 29000 nucleotides).")
50+
}
51+
covid_align <- align
52+
covid_seq <- RemoveGaps(covid_seq, removeGaps = "all", processors = NULL)
53+
fasta_final <- AlignProfiles(covid_align, covid_seq)
54+
return(fasta_final)
55+
56+
}
57+
58+
dist_get <- function(align) {
59+
60+
dec_dist <- dist.dna(as.DNAbin(align), model = "K80", as.matrix = TRUE, pairwise.deletion = FALSE)
61+
colnames(dec_dist) <- (str_split_fixed(colnames(dec_dist), fixed("."), 2)[,1])
62+
rownames(dec_dist) <- (str_split_fixed(rownames(dec_dist), fixed("."), 2)[,1])
63+
return(dec_dist)
64+
65+
}
66+
67+
umap_process <- function(covid_dist, meta_df) {
68+
69+
acc_names = rownames(covid_dist)
70+
covid_dist <- dist(covid_dist)
71+
set.seed(2020)
72+
covid_umap <- uwot::umap(covid_dist, init = "spectral", metric = "cosine", n_neighbors = 50, min_dist = 0.001, spread = 40, local_connectivity = 10)
73+
covid_umap_df <- as.data.frame(covid_umap)
74+
umap_df_final <- data.frame("Accession" = acc_names, "UMAP_1" = covid_umap_df[,1], "UMAP_2" = covid_umap_df[,2])
75+
umap_df_final <- merge(umap_df_final, meta_df)
76+
colnames(umap_df_final) <- c("Accession", "UMAP_1", "UMAP_2", "Region", "Country", "Date")
77+
return(umap_df_final)
78+
79+
}
80+
81+
mst_graph <- function(covid_dist, meta_data) {
82+
83+
g <- graph.adjacency(covid_dist, mode = "undirected", weighted = TRUE, diag = FALSE)
84+
g_mst <- mst(g)
85+
acc_ordering <- match(meta_data[,1], names(V(g_mst)[[]]))
86+
meta_ordered <- meta_data[order(acc_ordering), ]
87+
88+
g_mst_1 <- g_mst
89+
g_mst_2 <- g_mst
90+
g_mst_3 <- g_mst
91+
92+
meta_colors_1 <- (kev_palette[1:length(unique(meta_ordered[,(ncol(meta_ordered)-2)]))])[factor(meta_ordered[, (ncol(meta_ordered)-2)])]
93+
meta_colors_2 <- (qual_vector[1:length(unique(meta_ordered[,(ncol(meta_ordered)-1)]))])[factor(meta_ordered[, (ncol(meta_ordered)-1)])]
94+
color_ramp <- colorRampPalette(c("dodgerblue2", "white", "firebrick1"))(max(meta_ordered[,(ncol(meta_ordered))]))
95+
meta_colors_3 <- color_ramp[meta_ordered[,(ncol(meta_ordered))]]
96+
97+
V(g_mst_1)$color <- meta_colors_1
98+
V(g_mst_2)$color <- meta_colors_2
99+
V(g_mst_3)$color <- meta_colors_3
100+
101+
V(g_mst_1)$Region <- meta_ordered[,(ncol(meta_ordered)-2)]
102+
V(g_mst_2)$Country <- meta_ordered[,(ncol(meta_ordered)-1)]
103+
V(g_mst_3)$Date <- meta_ordered[,(ncol(meta_ordered))]
104+
105+
lay <- layout_with_graphopt(g_mst_1, niter = 1000)
106+
107+
g_mst_list <- list(g_mst_1, g_mst_2, g_mst_3, lay)
108+
return(g_mst_list)
109+
110+
}
111+
112+
snps_get <- function(alignment, metadata, positions) {
113+
114+
align <- alignment
115+
pos <- paste(positions$Pos, positions$Gene, positions$Effect, sep = " ")
116+
acc <- metadata[,1]
117+
meta_vars <- metadata[,-1]
118+
align_df <- as.data.frame(as.matrix(align))
119+
meta_order <- match(rownames(align_df), acc)
120+
align_df <- align_df[order(meta_order),]
121+
align_df$m3 <- meta_vars[,3]
122+
align_df$m2 <- meta_vars[,2]
123+
align_df$m1 <- meta_vars[,1] # Ensure ordering, cbind() likely reorders rownames and will reorder metadata
124+
125+
freq_pct <- function(d_col) {
126+
freq_table <- table(d_col)
127+
pct_table <- as.table(sapply(freq_table, function(x) x/sum(freq_table)))
128+
return(pct_table)
129+
}
130+
131+
table_get <- function(df, position, meta_num) {
132+
pos_full <- position
133+
pos_num <- as.numeric(str_split_fixed(position, " ", 3)[,1][1])
134+
align_pos <- df[,c(pos_num, (ncol(df) - as.numeric(meta_num) + 1))] # Backward ordering
135+
colnames(align_pos) <- c("position", "meta")
136+
align_grouped <- group_by(.data = align_pos, meta)
137+
align_tables <- as.data.frame(do(.data = align_grouped, data.frame(val = freq_pct(.[,1]))))
138+
align_final <- data.frame("Position" = rep(pos_full, length(align_tables[,1])), "Meta" = align_tables[,1], "Allele" = align_tables[,2], "Freq" = align_tables[,3])
139+
return(align_final)
140+
}
141+
142+
meta_nums <- list(1, 2, 3)
143+
all_tables <- lapply(meta_nums, function(x) lapply(pos, function(y) table_get(align_df, y, x)))
144+
table_concat <- lapply(all_tables, function(x) base::Reduce(rbind, x))
145+
return(table_concat)
146+
147+
}
148+
149+
umap_plotter <- function(umap_df) {
150+
151+
p1 <- ggplot(data = umap_df, aes(x = UMAP_1, y = UMAP_2)) +
152+
theme_few() +
153+
geom_jitter(aes(fill = Region), size = 3, position = "jitter", colour = "black", pch = 21, stroke = 0.25) +
154+
scale_fill_manual(name = "", values = kev_palette[1:length(unique(umap_df$Region))]) +
155+
labs(x = "UMAP 1", y = "UMAP 2") +
156+
theme(axis.ticks.x = element_blank()) +
157+
theme(axis.ticks.y = element_blank()) +
158+
theme(axis.text.y = element_blank()) +
159+
theme(axis.text.x = element_blank()) +
160+
theme(axis.title.y = element_text(size = 16, face = "bold")) +
161+
theme(axis.title.x = element_text(size = 16, face = "bold")) +
162+
theme(legend.title = element_text(size = 15, face = "bold")) +
163+
theme(legend.text = element_text(size = 14))
164+
165+
p2 <- ggplot(data = umap_df, aes(x = UMAP_1, y = UMAP_2)) +
166+
theme_few() +
167+
geom_jitter(aes(fill = Country), size = 3, position = "jitter", colour = "black", pch = 21, stroke = 0.25) +
168+
scale_fill_manual(name = "", values = qual_vector[1:length(unique(umap_df$Country))]) +
169+
labs(x = "UMAP 1", y = "UMAP 2") +
170+
theme(axis.ticks.x = element_blank()) +
171+
theme(axis.ticks.y = element_blank()) +
172+
theme(axis.text.y = element_blank()) +
173+
theme(axis.text.x = element_blank()) +
174+
theme(axis.title.y = element_text(size = 16, face = "bold")) +
175+
theme(axis.title.x = element_text(size = 16, face = "bold")) +
176+
theme(legend.title = element_text(size = 15, face = "bold")) +
177+
theme(legend.text = element_text(size = 14))
178+
179+
p3 <- ggplot(data = umap_df, aes(x = UMAP_1, y = UMAP_2)) +
180+
theme_few() +
181+
geom_jitter(aes(color = Date), size = 3, position = "jitter", alpha = 1) +
182+
scale_color_gradient(name = "Days from \nfirst case", low = "#b92b27", high = "#1565C0") +
183+
labs(x = "UMAP 1", y = "UMAP 2") +
184+
theme(axis.ticks.x = element_blank()) +
185+
theme(axis.ticks.y = element_blank()) +
186+
theme(axis.text.y = element_blank()) +
187+
theme(axis.text.x = element_blank()) +
188+
theme(axis.title.y = element_text(size = 16, face = "bold")) +
189+
theme(axis.title.x = element_text(size = 16, face = "bold")) +
190+
theme(legend.title = element_text(size = 15, face = "bold")) +
191+
theme(legend.text = element_text(size = 14))
192+
193+
plot_list <- list(p1, p2, p3)
194+
return(plot_list)
195+
196+
}
197+
198+
mst_plotter <- function(mst_list, meta_df) {
199+
200+
graph_m1 <- mst_list[[1]]
201+
graph_m2 <- mst_list[[2]]
202+
graph_m3 <- mst_list[[3]]
203+
lay <- mst_list[[4]]
204+
205+
ggnet_1 <- ggnetwork(graph_m1, layout = lay)
206+
p1 <- ggplot(ggnet_1, aes(x = x, y = y, xend = xend, yend = yend)) +
207+
theme_few() +
208+
geom_edges(color = "gray") +
209+
geom_nodes(aes(fill = Region), size = 3) +
210+
scale_fill_manual(name = "", values = kev_palette[1:length(unique(meta_df$Region))]) +
211+
theme(axis.title.x = element_blank()) +
212+
theme(axis.title.y = element_blank()) +
213+
theme(axis.text.x = element_blank()) +
214+
theme(axis.text.y = element_blank()) +
215+
theme(axis.ticks.x = element_blank()) +
216+
theme(axis.ticks.y = element_blank()) +
217+
theme(axis.line.x = element_blank()) +
218+
theme(axis.line.y = element_blank()) +
219+
theme(legend.title = element_text(size = 15, face = "bold")) +
220+
theme(legend.text = element_text(size = 14))
221+
222+
ggnet_2 <- ggnetwork(graph_m2, layout = lay)
223+
p2 <- ggplot(ggnet_2, aes(x = x, y = y, xend = xend, yend = yend)) +
224+
theme_few() +
225+
geom_edges(color = "gray") +
226+
geom_nodes(aes(fill = Country), size = 3) +
227+
scale_fill_manual(name = "", values = qual_vector[1:length(unique(meta_df$Geo_Location))]) +
228+
theme(axis.title.x = element_blank()) +
229+
theme(axis.title.y = element_blank()) +
230+
theme(axis.text.x = element_blank()) +
231+
theme(axis.text.y = element_blank()) +
232+
theme(axis.ticks.x = element_blank()) +
233+
theme(axis.ticks.y = element_blank()) +
234+
theme(axis.line.x = element_blank()) +
235+
theme(axis.line.y = element_blank()) +
236+
theme(legend.title = element_text(size = 15, face = "bold")) +
237+
theme(legend.text = element_text(size = 14))
238+
239+
ggnet_3 <- ggnetwork(graph_m3, layout = lay)
240+
p3 <- ggplot(ggnet_3, aes(x = x, y = y, xend = xend, yend = yend)) +
241+
theme_few() +
242+
geom_edges(color = "gray") +
243+
geom_nodes(aes(color = Date), size = 3) +
244+
scale_color_continuous(name = "Days from \nfirst case", low = "#b92b27", high = "#1565C0") +
245+
theme(axis.title.x = element_blank()) +
246+
theme(axis.title.y = element_blank()) +
247+
theme(axis.text.x = element_blank()) +
248+
theme(axis.text.y = element_blank()) +
249+
theme(axis.ticks.x = element_blank()) +
250+
theme(axis.ticks.y = element_blank()) +
251+
theme(axis.line.x = element_blank()) +
252+
theme(axis.line.y = element_blank()) +
253+
theme(legend.title = element_text(size = 15, face = "bold")) +
254+
theme(legend.text = element_text(size = 14))
255+
256+
plot_list <- list(p1, p2, p3)
257+
return(plot_list)
258+
259+
}
260+
261+
snp_plotter <- function(snp_list, meta_df) {
262+
263+
snps_1 <- snp_list[[1]]
264+
colnames(snps_1) <- c("Position", "Region", "Allele", "Freq")
265+
266+
snps_2 <- snp_list[[2]]
267+
colnames(snps_2) <- c("Position", "Country", "Allele", "Freq")
268+
269+
snps_3 <- snp_list[[3]]
270+
colnames(snps_3) <- c("Position", "Date", "Allele", "Freq")
271+
272+
p1 <- ggplot(data = snps_1, aes(x = Allele, y = Freq)) +
273+
theme_few () +
274+
geom_bar(stat = "identity", position = "dodge2", aes(fill = Region), color = "black") +
275+
scale_fill_manual(name = "", values = kev_palette[1:length(unique(meta_df$Region))]) +
276+
facet_wrap(~Position, scales = "free_x") +
277+
labs(x = "Allele", y = "Frequency") +
278+
theme(axis.text.y = element_text(size = 12)) +
279+
theme(axis.text.x = element_text(size = 12)) +
280+
theme(axis.title.y = element_text(size = 16, face = "bold")) +
281+
theme(axis.title.x = element_text(size = 16, face = "bold")) +
282+
theme(legend.title = element_text(size = 15, face = "bold")) +
283+
theme(legend.text = element_text(size = 14)) +
284+
theme(strip.text = element_text(size = 12, face = "bold"))
285+
286+
p2 <- ggplot(data = snps_2, aes(x = Allele, y = Freq)) +
287+
theme_few () +
288+
geom_bar(stat = "identity", position = "dodge2", aes(fill = Country), width = 1) +
289+
scale_fill_manual(name = "", values = qual_vector[1:length(unique(meta_df$Geo_Location))]) +
290+
facet_wrap(~Position, scales = "free") +
291+
labs(x = "Allele", y = "Frequency") +
292+
theme(axis.text.y = element_text(size = 12)) +
293+
theme(axis.text.x = element_text(size = 12)) +
294+
theme(axis.title.y = element_text(size = 16, face = "bold")) +
295+
theme(axis.title.x = element_text(size = 16, face = "bold")) +
296+
theme(legend.title = element_text(size = 15, face = "bold")) +
297+
theme(legend.text = element_text(size = 14)) +
298+
theme(strip.text = element_text(size = 12, face = "bold"))
299+
300+
int_text = paste("Functionality currently not supported")
301+
302+
p3 <- ggplot(data = snps_3, aes(x = Allele, y = Freq)) +
303+
theme_few () +
304+
geom_bar(stat = "identity", position = "dodge2", aes(color = Date)) +
305+
scale_color_continuous(name = "Days from \nfirst case", low = "#b92b27", high = "#1565C0") +
306+
facet_wrap(~Position, scales = "free_x") +
307+
labs(x = "Allele", y = "Frequency") +
308+
theme(axis.text.y = element_text(size = 12)) +
309+
theme(axis.text.x = element_text(size = 12)) +
310+
theme(axis.title.y = element_text(size = 16, face = "bold")) +
311+
theme(axis.title.x = element_text(size = 16, face = "bold")) +
312+
theme(legend.title = element_text(size = 15, face = "bold")) +
313+
theme(legend.text = element_text(size = 14)) +
314+
theme(strip.text = element_text(size = 12, face = "bold"))
315+
316+
plot_list <- list(p1, p2, p3)
317+
return(plot_list)
318+
319+
}

0 commit comments

Comments
 (0)