-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrlm_moBV_V2.R
90 lines (67 loc) · 3.43 KB
/
rlm_moBV_V2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# set the working directory to where the contents of the script are
setwd("~/Desktop/nature_communications_script")
# load the required packages
# Only the phyloseq is necessary for the core calculation if using the biom format for input
library(ggplot2)
library(phyloseq)
library(ggpubr)
# load the data
clst.expt <- import_biom(BIOMfilename = "test_data/converted_otu.biom", parseFunction = parse_taxonomy_greengenes)
clst.expt_gen <- tax_glom(physeq = clst.expt, taxrank = "Genus")
taxa_names(clst.expt_gen) <- make.unique(tax_table(clst.expt_gen)[,"Genus"])
# Extract the OTU table
# If you have you'r data as a standard matrix with genus names as rows and samples as columns, you cna just import here
mbiome_data_in <- as.data.frame(as.matrix(otu_table(clst.expt_gen)))
# This is a function to make it easier to get specific taxa out
bug_returner <- function(bug = "Gardnerella", samp_order, df_in = mbiome_data_in){
if(sum(!samp_order%in%colnames(df_in))>1){
df_in[,samp_order[!samp_order%in%colnames(df_in)]] <- 0
}
get_bug <- grepl(bug, rownames(df_in))
return(colSums(df_in[get_bug, samp_order]))
}
# Importing the beta coefficients for each BV association ratio
molecuL <- read.csv(file = "dependencies/robust_regression_coefficient_map.csv", as.is = T)
rownames(molecuL) <- molecuL$bug2
# Making a df to store the scores
map_in <- as.data.frame(sample_names(clst.expt_gen))
colnames(map_in)[1] <- "SID"
# Setting up a vector to loop over
top_bogsels <- c("Lactobacillus", molecuL$bug2)
# now to get out beta coefficients
# load all of them pugs into the map
for(i in 1:length(top_bogsels)){
map_in[[top_bogsels[i]]] <- bug_returner(bug = top_bogsels[i], samp_order = map_in[[1]])
}
# Adding the entry for molBV
map_in$molBV <- NA
for(i in 1:dim(map_in)[1]){
calci_vec <- sapply(molecuL$bug2, FUN = function(x) molecuL[x,"intercept"] +
log((map_in[i,"Lactobacillus"]+1)/(map_in[i,x]+1))*molecuL[x,"beta"])
map_in$molBV[i] <- mean(calci_vec)
}
# truncate the results at molBV 10 to match Nugent range
# map_in$molBV <- ifelse(test = map_in$molBV>10, yes = 10, no = map_in$molBV)
# scaling based on Reviewer #2
map_in$molBV <- 10*(map_in$molBV - min(map_in$molBV, na.rm = T))/(max(map_in$molBV, na.rm = T)-min(map_in$molBV, na.rm = T))
# Write out a sample map with the molBV socre
write.table(x = map_in, file = "expected_output/map_with_molBV.txt", quote = F, sep = "\t", row.names = F)
##############################################################################################################
############ This part is just to visualize the distribution of molBV across Clinical BV diagnoses ###########
##############################################################################################################
# import clinical map
clinical_map <- read.csv(file = "test_data/Mt_Sinai_sample_map.txt", sep = "\t", as.is = T)
# merge the two dfs
full_set <- merge(x = map_in, y = clinical_map, by.x = "SID", by.y = "Cerv_SampleID")
# make a nice plotter function to see the distribution of molBV across amsel and nugent diagnoses
basic_boxes <- function(df_in=full_set, x_variable="Amsel_Diagnosis"){
p <- ggplot(df_in, aes_string(x = x_variable, y = "molBV", fill = x_variable)) +
geom_boxplot()+
theme_minimal() + stat_compare_means() +
ggtitle(label = paste("molBV across", x_variable))
print(p)
}
# plot for Amsel
basic_boxes()
# Plot for Nugent Diagnoses
basic_boxes(x_variable = "Nugent_diagnosis")