Author: Carly Moreno
March 30, 2025
New York University Abu Dhabi
_________________________________________________________________________________________________________
install.packages("tidyverse")
install.packages('caroline')
install.packages("ggtext")
install.packages("wesanderson")
install.packages("factoextra")
install_github("kassambara/factoextra")
install.packages("missMDA")
install.packages("FactoMineR")
library(tidyverse)
library(readxl)
library(ggplot2)
library(ggtext)
library(RColorBrewer)
library(wesanderson)
library(caroline)
library(tidyverse)
library(dplyr)
library(vegan)
library(devtools)
library(factoextra)
library("FactoMineR")
library(missMDA)
library(Hmisc)
library(corrplot)
# Make an environmental correlation plot
# Using all the data
env <- read.csv('Metadata_YB.csv', header = T, stringsAsFactors = F) %>%
select(c(7:14)) %>%
rename_all(~ gsub("_", " ", .)) %>%
rename('NO2+NO3' = 'No2.No3 umol L') %>%
rename_all(~ gsub("umol L", "", .)) %>%
rename_all(~ gsub("ODO mgL", "DO", .)) %>%
select(!3)
# Correlation matrix
ccc <- cor(env, method = 'pearson', use = 'complete.obs')
# Correlation coefficients and the p-values of the correlation for all possible pairs
res <- rcorr(as.matrix(ccc), type = 'pearson')
# Customize the color of the correlogram
color <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
COL2(diverging = c("RdBu", "BrBG", "PiYG", "PRGn", "PuOr", "RdYlBu"), n = 200)
png(filename = "CorrelationSelectedEnvReversedFinal_nopH_12pt.png", res = 300, units = 'in', width = 7, height = 7)
g <- corrplot(ccc, method="color", col=rev(color(200)),
type = "upper",
p.mat = res$P,
tl.col = "black",
tl.srt = 45,
tl.cex = 1.4,
cl.cex = 1.4,
cl.align.text ='l',
sig.level = c(0.001, 0.01, 0.05), pch.cex = 1.75,
insig = 'label_sig', pch.col = 'grey20',
diag = FALSE)
dev.off()# Using all the data, clean it up
env <- read.csv('Metadata_YB.csv', header = T, stringsAsFactors = F)
allenvnum <- env %>%
select(c(2:4,5,6,7,8,10:15)) %>%
filter(Sample.Type == 2) %>%
rename_all(~ gsub("_", " ", .)) %>%
rename('NO2+NO3' = 'No2.No3 umol L') %>%
rename_all(~ gsub("umol L", "", .)) %>%
rename_all(~ gsub("ODO mgL", "DO", .)) %>%
drop_na()
# The matrix
allenv <- allenvnum %>%
select(c(6:13))
# Compute PCA
env.pca <- prcomp(allenv, scale = T)
summary(env.pca)
# Visualize eigenvalues (scree plot). Show the percentage of variances explained by each principal component.
fviz_eig(env.pca)
# Graph of individuals. Individuals with a similar profile are grouped together.
fviz_pca_ind(env.pca,
col.ind = "cos2", # Color by the quality of representation
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping
)
# Graph of variables. Positive correlated variables point to the same side of the plot.
# Negative correlated variables point to opposite sides of the graph.
fviz_pca_var(env.pca,
col.var = "contrib", # Color by contributions to the PC
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping
)
# Biplot of individuals and variables
fviz_pca_biplot(env.pca, repel = TRUE,
col.var = "#2E9FDF", # Variables color
col.ind = "#696969" # Individuals color
)
# PCA results
# Eigenvalues
eig.val <- get_eigenvalue(env.pca)
eig.val
# Results for Variables
res.var <- get_pca_var(env.pca)
res.var$coord # Coordinates
res.var$contrib # Contributions to the PCs
res.var$cos2 # Quality of representation
# Results for individuals
res.ind <- get_pca_ind(env.pca)
res.ind$coord # Coordinates
res.ind$contrib # Contributions to the PCs
res.ind$cos2 # Quality of representation
var <- get_pca_var(env.pca)
# Color individuals by groups or some qualitative variable
groups <- as.factor(allenvnum$Season)
# Set the theme for all the following plots.
theme_set(theme_bw(base_size = 14))
png("PCAbiplot_EnvironmentalFactors_12pt.png", width = 2200, height = 1800, res = 300)
g <- fviz_pca_biplot(env.pca,
# Individuals (stations)
geom.ind = 'point',
pointsize = 2,
col.ind = groups,
palette = c('#696969',"#00AFBB", "#FC4E07", 'darkgreen') ,
addEllipses = TRUE, # Concentration ellipses
ellipse.type = "confidence", # ellipse.level = 0.95,
# Variable (environmental variable)
geom.var = c('arrow', 'text'),
labelsize = 4,
repel = T, # Avoid label overplotting
#label = 5,
ggtheme = theme(axis.text=element_text(size=16.5),
axis.title=element_text(size=12),
legend.text=element_text(size=16.5),
axis.title.x = element_text(size = 16.5),
axis.title.y = element_text(size = 16.5)))
print(g)
dev.off()
# PerMANOVA - partitioning the euclidean distance matrix by Season to determine if the differences are statistically different
adonis2(allenv ~ Season, data = allenvnum, method='eu')_________________________________________________________________________________________________________
cells <- read.csv('cell_counts.csv', header=T, stringsAsFactors=F) %>%
rename('Pseudo nitzschia sp.' = 'Pseudo.nitzschia_sp.')
diatoms <- as_tibble(cells) %>%
rename(date=Sampling_Date) %>%
rename_all(~ gsub("_", " ", .)) %>%
select(date, 'Pseudo nitzschia sp.','Chaetoceros sp.','Skeletonema sp.', 'Rhizosolenia sp.', 'unidentified diatoms',"Cyanobacteria","Dinoflagellates") %>%
pivot_longer(-date, names_to='taxon', values_to='counts')
diatoms$date <- as.Date(diatoms$date, "%d/%m/%Y")
diatoms$date <- as.character(diatoms$date)
diatoms
datelist <- as.factor(diatoms$date)
datelist
png("diatom_with_phyto_totalCellcounts_12pt.png", res=300, units='in', width=12, height=9)
g <- ggplot(diatoms, aes(x=rev(date), y=counts, fill=taxon)) +
geom_col(color='black', width=0.7) +
coord_flip() +
labs(x="Date",
y="Total counts (cells/mL)") +
theme_classic() +
scale_fill_manual(name=NULL,
breaks=c("Cyanobacteria","Dinoflagellates",'Pseudo nitzschia sp.','Chaetoceros sp.','Skeletonema sp.', 'Rhizosolenia sp.', 'unidentified diatoms'),
values=c('#771155', '#114477', '#117777','#774411', 'darkgrey','#88CCAA', '#777711',
'#AAAA44', '#DDDD77', '#AA7744', '#DDAA77', '#771122',
'#AA4455')) +
scale_x_discrete(breaks=datelist,
labels=rev(datelist)) +
scale_y_continuous(expand=c(0,0)) +
theme(legend.text=element_markdown(size = 14),
legend.key.size=unit(13, "pt"),
strip.background=element_blank(),
strip.placement='outside',
axis.text.x=element_text(angle=0, size = 14),
axis.text.y = element_text(size = 14),
axis.title.x = element_text(size = 16))
print(g)
dev.off()
ggsave("just_the_diatoms.tiff", width=6, height=5)data <- read.csv('YasBay_FCM.csv', header = T, stringsAsFactors = F, na = "NA") # included na = 'NA', to make it stored as an NA not a string
data$Date <- as.Date(data$Date, "%m/%d/%Y")
metadata$date <- as.character(metadata$date)
# Plot the data
ggplot(data, aes(x = Date, y = Synechococcus)) +
geom_line(color = "blue", size = 1) + # Line graph
geom_point(size = 2, color = 'blue') + # Add points for emphasis
scale_x_date(date_labels = "%b-%Y", date_breaks = "1 month") + # Format x-axis as Month-Year
labs(title = 'Synechoccus', x = "Date", y = 'cells/mL') +
theme_classic() +
theme(axis.text.x = element_text(angle = 0, hjust = 0.5)) # Rotate x-axis labels
ggsave("SynechoccosYB.tiff", width = 6, height = 5)
# Plot the data
ggplot(data, aes(x = Date, y = Heterotrophic.Prokaryotes)) +
geom_line(color = "darkorange", size = 1) + # Line graph
geom_point(size = 2, color = 'darkorange') + # Add points for emphasis
scale_x_date(date_labels = "%b-%Y", date_breaks = "1 month") + # Format x-axis as Month-Year
labs(title = 'Hetertrophic prokaryotes', x = "Date", y = 'cells/mL') +
theme_classic() +
theme(axis.text.x = element_text(angle = 0, hjust = 0.5)) # Rotate x-axis labels
ggsave("HeterProksYB.tiff", width = 6, height = 5)_________________________________________________________________________________________________________
install.packages("tidyverse")
install.packages('caroline')
install.packages("ggtext")
install.packages("wesanderson")
library(tidyverse)
library(readxl)
library(ggplot2)
library(ggtext)
library(RColorBrewer)
library(wesanderson)
library(caroline)
library(dplyr)
# Import the metadata
metadata <- read.csv('metadata.csv', header=T, stringsAsFactors=F, na="NA") %>% # included na='NA', to make it stored as an NA not a string
select(SampleID, Sample.Type, Date) %>%
rename(Filter=Sample.Type, sample=SampleID) %>%
rename_all(tolower) %>%
filter(!date == '23/02/2024') %>%
drop_na(filter)
metadata %>% count(filter)
metadata$date <- as.Date(metadata$date, "%d/%m/%Y")
metadata$date <- as.character(metadata$date)
metadata
datelist <- metadata$date
# ASV counts for each feature
merged <- read.csv('metadata_18S.csv', header=T, stringsAsFactors=F)
otu_counts <- as_tibble(merged %>%
rename(otu=id) %>%
select(-Sequence,-Confidence,-Taxon) %>%
pivot_longer(-otu,names_to='sample',values_to='count'))
otu_counts
# Taxonomy for each feature
taxonomy <- as_tibble(merged) %>%
rename(otu=id) %>%
select(otu, Taxon) %>%
rename_all(tolower) %>%
separate(taxon,
into=c("domain", "phylum", "class", "order", "family", "genus", "species"),
sep="; ",
fill='right') %>% # NA values on the right if it need to add information
mutate(domain=str_replace(domain, "d__", ""),
phylum=str_replace(phylum, "p__", ""),
class=str_replace(class, "c__", ""),
order=str_replace(order, "o__", ""),
family=str_replace(family, "f__", ""),
genus=str_replace(genus, "g__", ""),
species=str_replace(species, "s__", "")) %>%
filter(domain == 'Eukaryota' | domain == 'Unassigned') %>%
filter(!phylum %in% c('Vertebrata', 'Annelida', 'Arthropoda', 'Bryozoa' , 'Cnidaria', 'Ciliophora',
'Mollusca','Nemertea', 'Porifera', 'Echinodermata','Nematozoad','Rotifera',
'Tunicata','Gracilipodida','Entoprocta','Florideophycidae', 'Holozoa', 'Incertae_Sedis',
'Labyrinthulomycetes','Bicosoecida'))
taxonomy
# Join the dataframes with inner_join and create a relative abundance column
otu_rel_abund <- as_tibble(inner_join(metadata, otu_counts, by='sample')) %>% #as_tibble(anti_join(metadata, otu_counts)) # to see what is missing from metadata
inner_join(., taxonomy, by='otu') %>%
group_by(date) %>%
mutate(rel_abund=count / sum(count)) %>%
ungroup()
# Check to see if your relative abundances for each samples add up to 1
otu_rel_abund %>%
group_by(sample) %>%
dplyr::summarize(total=sum(rel_abund))
# Make the dataframe even tidyer
otu_rel_abund <- as_tibble(inner_join(metadata, otu_counts, by='sample')) %>% #as_tibble(anti_join(metadata, otu_counts)) # to see what is missing from metadata
inner_join(., taxonomy, by='otu') %>%
group_by(date) %>%
mutate(rel_abund=count / sum(count)) %>%
ungroup() %>%
pivot_longer(cols=c("domain", "phylum", "class", "order", "family", "genus", "species"),
names_to="level",
values_to="taxon") %>%
mutate(filter=factor(filter,
levels=c("2", "3", "10")))
taxon_rel_abund <- otu_rel_abund %>%
filter(level == 'phylum') %>%
group_by(date, taxon) %>%
dplyr::summarize(rel_abund=100*sum(rel_abund), .groups='drop') %>% # grouped phylums within each sample, gives us the rel abund of that phylum in that sample
mutate(taxon=replace_na(taxon, "Unclassified Eukaryota")) %>%
mutate(taxon=str_replace(taxon,
"Unclassified (.*)", "Unclassified *\\1*"),
taxon=str_replace(taxon, "^(\\S*)$", "*\\1*"))
taxon_rel_abund %>%
group_by(taxon) %>%
dplyr::summarize(max=max(rel_abund)) %>% # Group by taxon, and I want to look across the 3 filter sizes, so do summarize, and see what the maximum rel abund of each phylum across the three groups
arrange(desc(max))
taxon_pool <- taxon_rel_abund %>%
group_by(date, taxon) %>%
dplyr::summarize(mean=mean(rel_abund), .groups='drop') %>% # filter, taxon, and mean, 3 filters and 50 phyla
group_by(taxon) %>% # relative abund of all samples across filters
dplyr::summarize(pool=max(mean) < 10, # If the maximum mean value within each of the filter size is less than one percent, it will be pooled
mean=mean(mean), # the average relative abundance across the filter sizes, we make this new column to used in the mutate factor taxon step below
.groups='drop') # Now summarize, and pool everything that has a less than 1% max mean realtive abundance across all taxa and filter sizes
test <- inner_join(taxon_rel_abund, taxon_pool, by='taxon') %>%
mutate(taxon=if_else(pool, "Other", taxon)) %>% # if pool was true, we put Other, if it was false we put the taxon name
group_by(date, taxon) %>% # group by taxon, and pool the different phyla, with each filter size
dplyr::summarize(rel_abund=sum(rel_abund),
mean=min(mean),
.groups='drop') %>%
mutate(taxon=factor(taxon),
taxon=fct_reorder(taxon, mean, .desc=TRUE),
taxon=fct_shift(taxon, n=1)) # reorder legend, by reordering the factors, go back to pool line
png("euk_phylum_bar_rel_abund_all_dates_12pt.png", res=300, units='in', width=11, height=16)
g <- ggplot(test, aes(x=rev(date), y=rel_abund, fill=taxon)) +
geom_col(color='black', width=0.7) +
coord_flip() +
labs(x="Date",
y="Relative Abundance (%)") +
theme_classic() +
scale_fill_manual(name=NULL,
breaks=c("*Ascomycota*","*Chlorophyta*","*Cryptophyceae*","*Diatomea*","*Dinoflagellata*", "*Ochrophyta*",
"*Ochrophyta*", "*Peronosporomycetes*", "*Protalveolata*","*Prymnesiophyceae*",
"Unclassified *Eukaryota*", "Other"),
values=c('#771155', '#AA4488', '#CC99BB', '#114477', '#4477AA', '#77AADD', '#117777',
'#44AAAA', '#77CCCC', '#117744', '#44AA77', 'darkgrey', '#777711',
'#AAAA44', '#D1D1D1','#DDDD77', '#774411', '#AA7744', '#DDAA77', '#771122',
'#AA4455')) +
scale_x_discrete(breaks=datelist,
labels=rev(datelist)) +
scale_y_continuous(expand=c(0.0,3), ) +
theme(legend.position="bottom",
legend.text=element_markdown(size = 23),
legend.key.size=unit(16, "pt"),
strip.background=element_blank(),
strip.placement='outside',
axis.text.x=element_text(angle=0, size = 25),
axis.text.y = element_text(size = 25),
axis.title.x = element_text(size = 25)) +
guides(fill = guide_legend(nrow = 5, byrow = TRUE))
print(g)
dev.off()# Import the metadata
metadata <- read.csv('metadata.csv', header = T, stringsAsFactors = F, na = "NA") %>% # included na = 'NA', to make it stored as an NA not a string
select(SampleID, Sample.Type, Date) %>%
rename(Filter = Sample.Type, sample = SampleID) %>%
rename_all(tolower) %>%
drop_na(filter)
metadata %>% count(filter)
metadata$date <- as.Date(metadata$date, "%d/%m/%Y")
metadata$date <- as.character(metadata$date)
metadata
datelist <- metadata$date
# ASV counts for each feature
merged <- read.csv('metadata_16S_bac.csv', header = T, stringsAsFactors = F)
otu_counts <- as_tibble(merged %>%
rename(otu = id) %>%
select(-Sequence,-Confidence,-Taxon) %>%
pivot_longer(-otu,names_to = 'sample',values_to = 'count'))
otu_counts
# Taxonomy for each feature
taxonomy <- as_tibble(merged) %>%
rename(otu = id) %>%
select(otu, Taxon) %>%
rename_all(tolower) %>%
separate(taxon,
into = c("domain", "phylum", "class", "order", "family", "genus", "species"),
sep = "; ",
fill = 'right') %>% # NA values on the right if it need to add information
mutate(domain = str_replace(domain, "d__", ""),
phylum = str_replace(phylum, "p__", ""),
class = str_replace(class, "c__", ""),
order = str_replace(order, "o__", ""),
family = str_replace(family, "f__", ""),
genus = str_replace(genus, "g__", ""),
species = str_replace(species, "s__", "")) %>%
filter(domain == 'Bacteria' | domain == 'Unassigned')
taxonomy
# Join the dataframes with inner_join and create a relative abundance column
otu_rel_abund <- as_tibble(inner_join(metadata, otu_counts, by='sample')) %>% #as_tibble(anti_join(metadata, otu_counts)) # to see what is missing from metadata
inner_join(., taxonomy, by = 'otu') %>%
group_by(date) %>%
mutate(rel_abund = count / sum(count)) %>%
ungroup()
# Check to see if your relative abundances for each samples add up to 1
otu_rel_abund %>%
group_by(date) %>%
dplyr::summarize(total = sum(rel_abund))
# Make the dataframe even tidyer
otu_rel_abund <- as_tibble(inner_join(metadata, otu_counts, by='sample')) %>% #as_tibble(anti_join(metadata, otu_counts)) # to see what is missing from metadata
inner_join(., taxonomy, by = 'otu') %>%
group_by(date) %>%
mutate(rel_abund = count / sum(count)) %>%
ungroup() %>%
pivot_longer(cols = c("domain", "phylum", "class", "order", "family", "genus", "species"),
names_to = "level",
values_to = "taxon") %>%
mutate(filter = factor(filter,
levels = c("2", "3", "10")))
taxon_rel_abund <- otu_rel_abund %>%
filter(level == 'phylum') %>%
group_by(date, taxon) %>%
dplyr::summarize(rel_abund = 100*sum(rel_abund), .groups = 'drop') %>% # grouped phylums within each sample, gives us the rel abund of that phylum in that sample
mutate(taxon = replace_na(taxon, "Unclassified Bacteria")) %>%
mutate(taxon = str_replace(taxon,
"Unclassified (.*)", "Unclassified *\\1*"),
taxon = str_replace(taxon, "^(\\S*)$", "*\\1*"))
taxon_rel_abund %>%
group_by(taxon) %>%
dplyr::summarize(max = max(rel_abund)) %>% # Group by taxon, and I want to look across the 3 filter sizes, so do summarize, and see what the maximum rel abund of each phylum across the three groups
arrange(desc(max))
taxon_pool <- taxon_rel_abund %>%
group_by(date, taxon) %>%
dplyr::summarize(mean = mean(rel_abund), .groups = 'drop') %>% # filter, taxon, and mean, 3 filters and 50 phyla
group_by(taxon) %>% # relative abund of all samples across filters
dplyr::summarize(pool = max(mean) < 1, # If the maximum mean value within each of the filter size is less than one percent, it will be pooled
mean = mean(mean), # the average relative abundance across the filter sizes, we make this new column to used in the mutate factor taxon step below
.groups = 'drop') # Now summarize, and pool everything that has a less than 1% max mean realtive abundance across all taxa and filter sizes
test <- inner_join(taxon_rel_abund, taxon_pool, by = 'taxon') %>%
mutate(taxon = if_else(pool, "Other", taxon)) %>% # if pool was true, we put Other, if it was false we put the taxon name
group_by(date, taxon) %>% # group by taxon, and pool the different phyla, with each filter size
dplyr::summarize(rel_abund = sum(rel_abund),
mean = min(mean),
.groups = 'drop') %>%
mutate(taxon = factor(taxon),
taxon = fct_reorder(taxon, mean, .desc=TRUE),
taxon = fct_shift(taxon, n = 1)) # reorder legend, by reordering the factors, go back to pool line
png("bacteria_phylum_bar_rel_abund_all_dates_12pt.png", res = 300, units = 'in', width=11, height=16)
g <- ggplot(test, aes(x = rev(date), y = rel_abund, fill = taxon)) +
geom_col(color = 'black', width = 0.7) +
coord_flip() +
labs(x = "Date",
y = "Relative Abundance (%)") +
theme_classic() +
scale_fill_manual(name = NULL,
breaks = c("*Actinobacteriota*","*Bacteroidota*","*Bdellovibrionota*","*Chloroflexi*",
"*Cyanobacteria*","*Desulfobacterota*", "*Firmicutes*", "*Patescibacteria*",
"*Planctomycetota*","*Proteobacteria*","*SAR324_clade(Marine_group_B)*","*Verrucomicrobiota*", "Other"),
values = c('#771155', '#AA4488', '#114477', '#4477AA', '#77AADD', '#117777',
'#774411', '#44AAAA', '#77CCCC', '#44AA77', '#AA7744','#88CCAA','darkgray', '#AAAA44','#777711',
'#117744', '#771122', '#CC99BB', '#DDDD77','#DDAA77', '#AA4455')) +
scale_x_discrete(breaks = rev(datelist),
labels = datelist) +
scale_y_continuous(expand=c(0.0,3), ) +
theme(legend.position="bottom",
legend.text=element_markdown(size = 23),
legend.key.size=unit(16, "pt"),
strip.background=element_blank(),
strip.placement='outside',
axis.text.x=element_text(angle=0, size = 25),
axis.text.y = element_text(size = 25),
axis.title.x = element_text(size = 25)) +
guides(fill = guide_legend(nrow = 7, byrow = TRUE))
print(g)
dev.off()# Import the metadata
metadata <- read.csv('metadata.csv', header = T, stringsAsFactors = F, na = "NA") %>% # included na = 'NA', to make it stored as an NA not a string
select(SampleID, Sample.Type, Date) %>%
rename(Filter = Sample.Type, sample = SampleID) %>%
rename_all(tolower) %>%
drop_na(filter)
metadata %>% count(filter)
metadata$date <- as.Date(metadata$date, "%d/%m/%Y")
metadata$date <- as.character(metadata$date)
metadata
datelist <- metadata$date
# ASV counts for each feature
merged <- read.csv('metadata_16S_arc.csv', header = T, stringsAsFactors = F)
otu_counts <- as_tibble(merged %>%
rename(otu = id) %>%
select(-Sequence,-Confidence,-Taxon) %>%
pivot_longer(-otu,names_to = 'sample',values_to = 'count'))
otu_counts
# Taxonomy for each feature
taxonomy <- as_tibble(merged) %>%
rename(otu = id) %>%
select(otu, Taxon) %>%
rename_all(tolower) %>%
separate(taxon,
into = c("domain", "phylum", "class", "order", "family", "genus", "species"),
sep = "; ",
fill = 'right') %>% # NA values on the right if it need to add information
mutate(domain = str_replace(domain, "d__", ""),
phylum = str_replace(phylum, "p__", ""),
class = str_replace(class, "c__", ""),
order = str_replace(order, "o__", ""),
family = str_replace(family, "f__", ""),
genus = str_replace(genus, "g__", ""),
species = str_replace(species, "s__", "")) %>%
filter(domain == 'Archaea' | domain == 'Unassigned')
taxonomy
# Join the dataframes with inner_join and create a relative abundance column
otu_rel_abund <- as_tibble(inner_join(metadata, otu_counts, by='sample')) %>% #as_tibble(anti_join(metadata, otu_counts)) # to see what is missing from metadata
inner_join(., taxonomy, by = 'otu') %>%
group_by(date) %>%
mutate(rel_abund = count / sum(count)) %>%
ungroup()
# Check to see if your relative abundances for each samples add up to 1
otu_rel_abund %>%
group_by(date) %>%
summarize(total = sum(rel_abund))
# Make the dataframe even tidyer
otu_rel_abund <- as_tibble(inner_join(metadata, otu_counts, by='sample')) %>% #as_tibble(anti_join(metadata, otu_counts)) # to see what is missing from metadata
inner_join(., taxonomy, by = 'otu') %>%
group_by(date) %>%
mutate(rel_abund = count / sum(count)) %>%
ungroup() %>%
pivot_longer(cols = c("domain", "phylum", "class", "order", "family", "genus", "species"),
names_to = "level",
values_to = "taxon") %>%
mutate(filter = factor(filter,
levels = c("2", "3", "10")))
taxon_rel_abund <- otu_rel_abund %>%
filter(level == 'genus') %>%
group_by(date, taxon) %>%
summarize(rel_abund = 100*sum(rel_abund), .groups = 'drop') %>% # grouped phylums within each sample, gives us the rel abund of that phylum in that sample
mutate(taxon = replace_na(taxon, "Unclassified Archaea")) %>%
mutate(taxon = str_replace(taxon,
"Unclassified (.*)", "Unclassified *\\1*"),
taxon = str_replace(taxon, "^(\\S*)$", "*\\1*"))
taxon_rel_abund %>%
group_by(taxon) %>%
summarize(max = max(rel_abund)) %>% # Group by taxon, and I want to look across the 3 filter sizes, so do summarize, and see what the maximum rel abund of each phylum across the three groups
arrange(desc(max))
taxon_pool <- taxon_rel_abund %>%
group_by(date, taxon) %>%
summarize(mean = mean(rel_abund), .groups = 'drop') %>% # filter, taxon, and mean, 3 filters and 50 phyla
group_by(taxon) %>% # relative abund of all samples across filters
summarize(pool = max(mean) < 15, # If the maximum mean value within each of the filter size is less than one percent, it will be pooled
mean = mean(mean), # the average relative abundance across the filter sizes, we make this new column to used in the mutate factor taxon step below
.groups = 'drop') # Now summarize, and pool everything that has a less than 1% max mean realtive abundance across all taxa and filter sizes
test <- inner_join(taxon_rel_abund, taxon_pool, by = 'taxon') %>%
mutate(taxon = if_else(pool, "Other", taxon)) %>% # if pool was true, we put Other, if it was false we put the taxon name
group_by(date, taxon) %>% # group by taxon, and pool the different phyla, with each filter size
summarize(rel_abund = sum(rel_abund),
mean = min(mean),
.groups = 'drop') %>%
mutate(taxon = factor(taxon),
taxon = fct_reorder(taxon, mean, .desc=TRUE),
taxon = fct_shift(taxon, n = 1)) # reorder legend, by reordering the factors, go back to pool line
png("archaea_bar_rel_abund_all_dates.png", res = 300, units = 'in', width = 9, height = 8)
g <- ggplot(test, aes(x = rev(date), y = rel_abund, fill = taxon)) +
geom_col(color = 'black', width = 0.5) +
coord_flip() +
labs(x = "Date",
y = "Relative Abundance (%)") +
theme_classic() +
scale_fill_manual(name = NULL,
breaks = c("*Bathyarchaeia*","*Halococcus*","*Marine_Group_II*","*Methanobrevibacter*",
"*Methanosaeta*","*Nitrososphaeraceae*", "Unclassified *Archaea*","*uncultured*", "Other"),
values = c('#771155', '#AA4488', '#114477', '#4477AA', '#77AADD', '#117777',
'#774411', '#44AAAA','darkgray', '#77CCCC', '#44AA77', '#AA7744','#88CCAA', '#AAAA44','#777711',
'#117744', '#771122', '#CC99BB', '#DDDD77','#DDAA77', '#AA4455')) +
scale_x_discrete(breaks = datelist,
labels = rev(datelist)) +
scale_y_continuous(expand = c(0,0)) +
theme(legend.text = element_markdown(size = 12),
legend.key.size = unit(14, "pt"),
strip.background = element_blank(),
strip.placement = 'outside',
axis.text.x = element_text(angle = 0, size = 12))
print(g)
dev.off()
dev.off()
(device = "RStudioGD")# Import the metadata
metadata <- read.csv('metadata.csv', header=T, stringsAsFactors=F, na="NA") %>% # included na='NA', to make it stored as an NA not a string
select(SampleID, Sample.Type, Date) %>%
rename(Filter=Sample.Type, sample=SampleID) %>%
rename_all(tolower) %>%
drop_na(filter)
metadata %>% count(filter)
metadata$date <- as.Date(metadata$date, "%d/%m/%Y")
metadata$date <- as.character(metadata$date)
metadata
datelist <- as.factor(metadata$date)
datelist <- datelist
# ASV counts for each feature
merged <- read.csv('metadata_18S.csv', header=T, stringsAsFactors=F)
otu_counts <- as_tibble(merged %>%
rename(otu=id) %>%
select(-Sequence,-Confidence,-Taxon) %>%
pivot_longer(-otu,names_to='sample',values_to='count'))
otu_counts
# Taxonomy for each feature
taxonomy <- as_tibble(merged) %>%
rename(otu=id) %>%
select(otu, Taxon) %>%
rename_all(tolower) %>%
separate(taxon,
into=c("domain", "phylum", "class", "order", "family", "genus", "species"),
sep="; ",
fill='right') %>% # NA values on the right if it need to add information
mutate(domain=str_replace(domain, "d__", ""),
phylum=str_replace(phylum, "p__", ""),
class=str_replace(class, "c__", ""),
order=str_replace(order, "o__", ""),
family=str_replace(family, "f__", ""),
genus=str_replace(genus, "g__", ""),
species=str_replace(species, "s__", "")) %>%
filter(phylum == 'Diatomea')
# Join the dataframes with inner_join and create a relative abundance column
otu_rel_abund <- as_tibble(inner_join(metadata, otu_counts, by='sample')) %>% #as_tibble(anti_join(metadata, otu_counts)) # to see what is missing from metadata
inner_join(., taxonomy, by='otu') %>%
group_by(date) %>%
mutate(rel_abund=count / sum(count)) %>%
ungroup()
# Check to see if your relative abundances for each samples add up to 1
otu_rel_abund %>%
group_by(date) %>%
summarize(total=sum(rel_abund))
# Make the dataframe even tidyer
otu_rel_abund <- as_tibble(inner_join(metadata, otu_counts, by='sample')) %>% #as_tibble(anti_join(metadata, otu_counts)) # to see what is missing from metadata
inner_join(., taxonomy, by='otu') %>%
group_by(date) %>%
mutate(rel_abund=count / sum(count)) %>%
ungroup() %>%
pivot_longer(cols=c("domain", "phylum", "class", "order", "family", "genus", "species"),
names_to="level",
values_to="taxon") %>%
mutate(filter=factor(filter,
levels=c("2", "3", "10")))
taxon_rel_abund <- otu_rel_abund %>%
filter(level == 'genus') %>%
group_by(date, taxon) %>%
summarize(rel_abund=100*sum(rel_abund), .groups='drop') %>% # grouped phylums within each sample, gives us the rel abund of that phylum in that sample
mutate(taxon=replace_na(taxon, "Unclassified Diatomea")) %>%
mutate(taxon=str_replace(taxon,
"Unclassified (.*)", "Unclassified *\\1*"),
taxon=str_replace(taxon, "^(\\S*)$", "*\\1*"))
taxon_rel_abund %>%
group_by(taxon) %>%
summarize(max=max(rel_abund)) %>% # Group by taxon, and I want to look across the 3 filter sizes, so do summarize, and see what the maximum rel abund of each phylum across the three groups
arrange(desc(max))
taxon_pool <- taxon_rel_abund %>%
group_by(date, taxon) %>%
summarize(mean=mean(rel_abund), .groups='drop') %>% # filter, taxon, and mean, 3 filters and 50 phyla
group_by(taxon) %>% # relative abund of all samples across filters
summarize(pool=max(mean) < 30, # If the maximum mean value within each of the filter size is less than one percent, it will be pooled
mean=mean(mean), # the average relative abundance across the filter sizes, we make this new column to used in the mutate factor taxon step below
.groups='drop') # Now summarize, and pool everything that has a less than 1% max mean realtive abundance across all taxa and filter sizes
test <- inner_join(taxon_rel_abund, taxon_pool, by='taxon') %>%
mutate(taxon=if_else(pool, "Other", taxon)) %>% # if pool was true, we put Other, if it was false we put the taxon name
group_by(date, taxon) %>% # group by taxon, and pool the different phyla, with each filter size
summarize(rel_abund=sum(rel_abund),
mean=min(mean),
.groups='drop') %>%
mutate(taxon=factor(taxon),
taxon=fct_reorder(taxon, mean, .desc=TRUE),
taxon=fct_shift(taxon, n=1)) # reorder legend, by reordering the factors, go back to pool line
png("diatom_phylum_bar_rel_abund_all_dates.png", res=300, units='in', width=9, height=8)
g <- ggplot(test, aes(x=rev(date), y=rel_abund, fill=taxon)) +
geom_col(color='black', width=0.5) +
coord_flip() +
labs(y="Relative Abundance (%)") +
theme_classic() +
scale_fill_manual(name=NULL,
breaks=c("*Arcocellulus*","*Cerataulina*","*Chaetoceros*","*Cyclotella*",
"*Cylindrotheca*","*Leptocylindrus*","*Nitzschia*","*Pseudo-nitzschia*",
"*Skeletonema*","*Thalassiosira*","Unclassified *Diatomea*","Other"),
values=c('#771155', '#AA4488', '#CC99BB', '#114477', '#4477AA', '#77AADD', '#117777',
'#44AAAA', '#77CCCC', '#117744', '#44AA77', '#D1D1D1','#88CCAA', '#AAAA44','#777711',
'#DDDD77','#DDAA77', '#774411', '#AA7744', '#771122',
'#AA4455')) +
scale_x_discrete(breaks=datelist,
labels=rev(datelist)) +
scale_y_continuous(expand=c(0,0)) +
theme(legend.text=element_markdown(size = 12),
legend.key.size=unit(14, "pt"),
strip.background=element_blank(),
strip.placement='outside',
axis.text.x=element_text(angle=0, size = 12))
print(g)
dev.off()# Import the metadata
metadata <- read.csv('metadata.csv', header = T, stringsAsFactors = F, na = "NA") %>% # included na = 'NA', to make it stored as an NA not a string
select(SampleID, Sample.Type, Date) %>%
rename(Filter = Sample.Type, sample = SampleID) %>%
rename_all(tolower) %>%
drop_na(filter)
metadata %>% count(filter)
metadata$date <- as.Date(metadata$date, "%d/%m/%Y")
metadata$date <- as.character(metadata$date)
metadata
datelist <- metadata$date
# ASV counts for each feature
merged <- read.csv('metadata_16S_bac.csv', header = T, stringsAsFactors = F)
otu_counts <- as_tibble(merged %>%
rename(otu = id) %>%
select(-Sequence,-Confidence,-Taxon) %>%
pivot_longer(-otu,names_to = 'sample',values_to = 'count'))
otu_counts
# Taxonomy for each feature
taxonomy <- as_tibble(merged) %>%
rename(otu = id) %>%
select(otu, Taxon) %>%
rename_all(tolower) %>%
separate(taxon,
into = c("domain", "phylum", "class", "order", "family", "genus", "species"),
sep = "; ",
fill = 'right') %>% # NA values on the right if it need to add information
mutate(domain = str_replace(domain, "d__", ""),
phylum = str_replace(phylum, "p__", ""),
class = str_replace(class, "c__", ""),
order = str_replace(order, "o__", ""),
family = str_replace(family, "f__", ""),
genus = str_replace(genus, "g__", ""),
species = str_replace(species, "s__", "")) %>%
filter(phylum == 'Proteobacteria' | phylum == 'Unassigned')
taxonomy
# Join the dataframes with inner_join and create a relative abundance column
otu_rel_abund <- as_tibble(inner_join(metadata, otu_counts, by='sample')) %>% #as_tibble(anti_join(metadata, otu_counts)) # to see what is missing from metadata
inner_join(., taxonomy, by = 'otu') %>%
group_by(date) %>%
mutate(rel_abund = count / sum(count)) %>%
ungroup()
# Check to see if your relative abundances for each samples add up to 1
otu_rel_abund %>%
group_by(date) %>%
summarize(total = sum(rel_abund))
# Make the dataframe even tidyer
otu_rel_abund <- as_tibble(inner_join(metadata, otu_counts, by='sample')) %>% #as_tibble(anti_join(metadata, otu_counts)) # to see what is missing from metadata
inner_join(., taxonomy, by = 'otu') %>%
group_by(date) %>%
mutate(rel_abund = count / sum(count)) %>%
ungroup() %>%
pivot_longer(cols = c("domain", "phylum", "class", "order", "family", "genus", "species"),
names_to = "level",
values_to = "taxon") %>%
mutate(filter = factor(filter,
levels = c("2", "3", "10")))
taxon_rel_abund <- otu_rel_abund %>%
filter(level == 'family') %>%
group_by(date, taxon) %>%
summarize(rel_abund = 100*sum(rel_abund), .groups = 'drop') %>% # grouped phylums within each sample, gives us the rel abund of that phylum in that sample
mutate(taxon = replace_na(taxon, "Unclassified Bacteria")) %>%
mutate(taxon = str_replace(taxon,
"Unclassified (.*)", "Unclassified *\\1*"),
taxon = str_replace(taxon, "^(\\S*)$", "*\\1*"))
taxon_rel_abund %>%
group_by(taxon) %>%
summarize(max = max(rel_abund)) %>% # Group by taxon, and I want to look across the 3 filter sizes, so do summarize, and see what the maximum rel abund of each phylum across the three groups
arrange(desc(max))
taxon_pool <- taxon_rel_abund %>%
group_by(date, taxon) %>%
summarize(mean = mean(rel_abund), .groups = 'drop') %>% # filter, taxon, and mean, 3 filters and 50 phyla
group_by(taxon) %>% # relative abund of all samples across filters
summarize(pool = max(mean) < 10, # If the maximum mean value within each of the filter size is less than one percent, it will be pooled
mean = mean(mean), # the average relative abundance across the filter sizes, we make this new column to used in the mutate factor taxon step below
.groups = 'drop') # Now summarize, and pool everything that has a less than 1% max mean realtive abundance across all taxa and filter sizes
test <- inner_join(taxon_rel_abund, taxon_pool, by = 'taxon') %>%
mutate(taxon = if_else(pool, "Other", taxon)) %>% # if pool was true, we put Other, if it was false we put the taxon name
group_by(date, taxon) %>% # group by taxon, and pool the different phyla, with each filter size
summarize(rel_abund = sum(rel_abund),
mean = min(mean),
.groups = 'drop') %>%
mutate(taxon = factor(taxon),
taxon = fct_reorder(taxon, mean, .desc=TRUE),
taxon = fct_shift(taxon, n = 1)) # reorder legend, by reordering the factors, go back to pool line
png("bacteria_Proteobacteria_phylum_bar_rel_abund_all_dates_top12.png", res = 300, units = 'in', width = 9, height = 8)
g <- ggplot(test, aes(x = rev(date), y = rel_abund, fill = taxon)) +
geom_col(color = 'black', width = 0.5) +
coord_flip() +
labs(x = "Date",
y = "Relative Abundance (%)") +
theme_classic() +
scale_fill_manual(name = NULL,
breaks = c("*AEGEAN-169_marine_group*", "*Alcanivoracaceae1*", "*Algiphilaceae*", "*Alteromonadaceae*",
"*Arenicellaceae*","*Burkholderiaceae*", "*Cellvibrionaceae*", "*Nitriliruptoraceae*",
"*Clade_I*","*Colwelliaceae*","*Comamonadaceae*","*Halieaceae*",
"*Halomonadaceae*","*Hyphomicrobiaceae*","*Kangiellaceae*","*Legionellaceae*",
"*Marinobacteraceae*", "*Micavibrionaceae*","*Moraxellaceae*","*Pseudoalteromonadaceae*",
"*Pseudomonadaceae*","*Rhizobiaceae*", "*Rhodobacteraceae*", "*SAR116_clade*","*SAR86_clade*",
"*Salinisphaeraceae*","*Solimonadaceae*", "*Sphingomonadaceae*","*Spongiibacteraceae*",
"*Stappiaceae*", "*Vibrionaceae*","*Unknown_Family*", "Unclassified *Bacteria*", "Other"),
# breaks = c("*AEGEAN-169_marine_group*", "*Alcanivoracaceae1*", "*Algiphilaceae*", "*Alteromonadaceae*",
# "*Arenicellaceae*","*Clade_I*","*Halieaceae*","*Halomonadaceae*",
# "*Pseudoalteromonadaceae*", "*Rhodobacteraceae*", "*SAR86_clade*","*Solimonadaceae*",
# "*Sphingomonadaceae*", "*Stappiaceae*", "*Unknown_Family*","Other"),
values = c("#771155", "#AA4488", "#CC99BB", "#114477",
"#4477AA", "#77AADD", "#117777", "#44AAAA",
"#77CCCC", "#117744", "#44AA77", "#88CCAA",
"#777711", "#AAAA44", "#DDDD77", "#774411",
"#AA7744", "#DDAA77", "#771122", "#AA4455",
"#DD7788", "#005A32", "purple", "green",
"#CAE0AB", "#F7EE55", "#F6C141", "#F1932D",
"#E8601C", "#DC050C","#99000D", "#000000",
"yellow")) +
scale_x_discrete(breaks = datelist,
labels = rev(datelist)) +
scale_y_continuous(expand = c(0,0)) +
theme(legend.text = element_markdown(size = 12),
legend.key.size = unit(14, "pt"),
strip.background = element_blank(),
strip.placement = 'outside',
axis.text.x = element_text(angle = 0, size = 12))
print(g)
dev.off()
dev.off()
(device = "RStudioGD")# Import the metadata
metadata <- read.csv('metadata.csv', header = T, stringsAsFactors = F, na = "NA") %>% # included na = 'NA', to make it stored as an NA not a string
select(SampleID, Sample.Type, Date) %>%
rename(Filter = Sample.Type, sample = SampleID) %>%
rename_all(tolower) %>%
drop_na(filter)
metadata %>% count(filter)
metadata$date <- as.Date(metadata$date, "%d/%m/%Y")
metadata$date <- as.character(metadata$date)
metadata
datelist <- metadata$date
# ASV counts for each feature
merged <- read.csv('metadata_16S_bac.csv', header = T, stringsAsFactors = F)
otu_counts <- as_tibble(merged %>%
rename(otu = id) %>%
select(-Sequence,-Confidence,-Taxon) %>%
pivot_longer(-otu,names_to = 'sample',values_to = 'count'))
otu_counts
# Taxonomy for each feature
taxonomy <- as_tibble(merged) %>%
rename(otu = id) %>%
select(otu, Taxon) %>%
rename_all(tolower) %>%
separate(taxon,
into = c("domain", "phylum", "class", "order", "family", "genus", "species"),
sep = "; ",
fill = 'right') %>% # NA values on the right if it need to add information
mutate(domain = str_replace(domain, "d__", ""),
phylum = str_replace(phylum, "p__", ""),
class = str_replace(class, "c__", ""),
order = str_replace(order, "o__", ""),
family = str_replace(family, "f__", ""),
genus = str_replace(genus, "g__", ""),
species = str_replace(species, "s__", "")) %>%
filter(phylum == 'Bacteroidota' | domain == 'Unassigned')
taxonomy
# Join the dataframes with inner_join and create a relative abundance column
otu_rel_abund <- as_tibble(inner_join(metadata, otu_counts, by='sample')) %>% #as_tibble(anti_join(metadata, otu_counts)) # to see what is missing from metadata
inner_join(., taxonomy, by = 'otu') %>%
group_by(date) %>%
mutate(rel_abund = count / sum(count)) %>%
ungroup()
# Check to see if your relative abundances for each samples add up to 1
otu_rel_abund %>%
group_by(date) %>%
summarize(total = sum(rel_abund))
# Make the dataframe even tidyer
otu_rel_abund <- as_tibble(inner_join(metadata, otu_counts, by='sample')) %>% #as_tibble(anti_join(metadata, otu_counts)) # to see what is missing from metadata
inner_join(., taxonomy, by = 'otu') %>%
group_by(date) %>%
mutate(rel_abund = count / sum(count)) %>%
ungroup() %>%
pivot_longer(cols = c("domain", "phylum", "class", "order", "family", "genus", "species"),
names_to = "level",
values_to = "taxon") %>%
mutate(filter = factor(filter,
levels = c("2", "3", "10")))
taxon_rel_abund <- otu_rel_abund %>%
filter(level == 'family') %>%
group_by(date, taxon) %>%
summarize(rel_abund = 100*sum(rel_abund), .groups = 'drop') %>% # grouped phylums within each sample, gives us the rel abund of that phylum in that sample
mutate(taxon = replace_na(taxon, "Unclassified Bacteria")) %>%
mutate(taxon = str_replace(taxon,
"Unclassified (.*)", "Unclassified *\\1*"),
taxon = str_replace(taxon, "^(\\S*)$", "*\\1*"))
taxon_rel_abund %>%
group_by(taxon) %>%
summarize(max = max(rel_abund)) %>% # Group by taxon, and I want to look across the 3 filter sizes, so do summarize, and see what the maximum rel abund of each phylum across the three groups
arrange(desc(max))
taxon_pool <- taxon_rel_abund %>%
group_by(date, taxon) %>%
summarize(mean = mean(rel_abund), .groups = 'drop') %>% # filter, taxon, and mean, 3 filters and 50 phyla
group_by(taxon) %>% # relative abund of all samples across filters
summarize(pool = max(mean) < 2, # If the maximum mean value within each of the filter size is less than one percent, it will be pooled
mean = mean(mean), # the average relative abundance across the filter sizes, we make this new column to used in the mutate factor taxon step below
.groups = 'drop') # Now summarize, and pool everything that has a less than 1% max mean realtive abundance across all taxa and filter sizes
test <- inner_join(taxon_rel_abund, taxon_pool, by = 'taxon') %>%
mutate(taxon = if_else(pool, "Other", taxon)) %>% # if pool was true, we put Other, if it was false we put the taxon name
group_by(date, taxon) %>% # group by taxon, and pool the different phyla, with each filter size
summarize(rel_abund = sum(rel_abund),
mean = min(mean),
.groups = 'drop') %>%
mutate(taxon = factor(taxon),
taxon = fct_reorder(taxon, mean, .desc=TRUE),
taxon = fct_shift(taxon, n = 1)) # reorder legend, by reordering the factors, go back to pool line
png("bacteria_Bacteroidota_phylum_bar_rel_abund_all_dates.png", res = 300, units = 'in', width = 9, height = 8)
g <- ggplot(test, aes(x = rev(date), y = rel_abund, fill = taxon)) +
geom_col(color = 'black', width = 0.5) +
coord_flip() +
labs(x = "Date",
y = "Relative Abundance (%)") +
theme_classic() +
scale_fill_manual(name = NULL,
breaks = c("*Balneolaceae*","*Crocinitomicaceae*","*Cryomorphaceae*","*Cyclobacteriaceae*",
"*Flavobacteriaceae*","*NS11-12_marine_group*", "*NS9_marine_group*", "*Rhodothermaceae*",
"*Saprospiraceae**","*Schleiferiaceae**","*Weeksellaceae*","*uncultured*", "Other","Unclassified *Bacteria*"),
values = c('#771155', '#AA4488', '#114477', '#4477AA', '#77AADD', '#117777',
'#774411', '#44AAAA', '#77CCCC', '#44AA77', '#AA7744','#88CCAA','darkgray', '#AAAA44','#777711',
'#117744', '#771122', '#CC99BB', '#DDDD77','#DDAA77', '#AA4455')) +
scale_x_discrete(breaks = datelist,
labels = rev(datelist)) +
scale_y_continuous(expand = c(0,0)) +
theme(legend.text = element_markdown(size = 12),
legend.key.size = unit(14, "pt"),
strip.background = element_blank(),
strip.placement = 'outside',
axis.text.x = element_text(angle = 0, size = 12))
print(g)
dev.off()_________________________________________________________________________________________________________
remotes::install_github("gavinsimpson/ggvegan")
library(remotes)
library(ggvegan)
library(vegan)
library(ggplot2)
library(tidyverse)
# Eukaryotes
# ASV counts for each feature into a matrix
merged <- read.csv('metadata_18S.csv', header=T, stringsAsFactors=F)
otumat <- merged %>%
select(-Sequence,-Confidence,-Taxon) %>%
column_to_rownames(var="id")
otumatdf <- as.data.frame(t(otumat))
otumatdf$SampleID <- rownames(otumatdf)
# Taxonomy
taxonomy <- merged %>%
rename(otu=id) %>%
select(otu, Taxon) %>%
rename_all(toupper) %>%
separate(TAXON,
into=c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"),
sep="; ",
fill='right') %>% # NA values on the right if it need to add information
mutate(Domain=str_replace(Domain, "d__", ""),
Phylum=str_replace(Phylum, "p__", ""),
Class=str_replace(Class, "c__", ""),
Order=str_replace(Order, "o__", ""),
Family=str_replace(Family, "f__", ""),
Genus=str_replace(Genus, "g__", ""),
Species=str_replace(Species, "s__", "")) %>%
mutate(Domain=str_replace(Domain, "Unassigned", "Unknown")) %>%
filter(Domain == 'Eukaryota' | Domain == 'Unassigned') %>%
filter(!Phylum %in% c('Vertebrata', 'Annelida', 'Arthropoda', 'Bryozoa' , 'Cnidaria', 'Ciliophora',
'Mollusca','Nemertea', 'Porifera', 'Echinodermata','Nematozoad','Rotifera',
'Tunicata','Gracilipodida','Entoprocta','Florideophycidae', 'Holozoa',
'Incertae_Sedis','Bicosoecida','Pavlovophyceae'))
taxonomy
rownames(taxonomy) <- taxonomy$OTU
taxmat <- taxonomy[,-1]
taxmat <- as.matrix(taxmat)
taxmat
class(taxmat)
taxmatrans <- as.matrix(t(taxmat))
# Input sample metadata
envs <- read_csv('Metadata_YB.csv')
allenvnum <- envs %>%
select(c(2:8,10:15)) %>%
#filter(Sample.Type == 2) %>%
#rename_all(~ gsub("_", " ", .)) %>%
rename('NO2+NO3'='No2.No3_umol_L') %>%
rename_all(~ gsub("_umol_L", "", .)) %>%
rename_all(~ gsub("ODO_mgL", "DO", .))
# Merge the otumat datafram with the environment dataframe
datas <- merge(allenvnum, otumatdf, by='SampleID') %>%
na.omit()
rownames(datas) <- datas$SampleID
colnames(datas) <- make.names(colnames(datas))
# make Habitat an ordered variable
allenvnum$Season <- ordered(allenvnum$Season, c("spring", "summer", "autumn", "winter"))
abund.mat <- datas[,14:28118]
# store computed indices in a new data frame called 'indices'
index <- datas[,c("Season","Sample.Type")]
index$Richness <- rowSums(abund.mat > 0)
index$Shannon <- diversity(abund.mat) # shannon is default
index$Rarefied <- c(rarefy(abund.mat[1:28105,], sample=91))
png("euks_diversity_12pt.png", res=300, units='in', width=15, height=8)
par(mar=c(3,5,3,2), mfrow=c(1,2))
colors=c('#ED7D31','#548235','#C00000', '#4472C4')
boxplot(Richness ~ Season, data=index, boxwex=.7, col=colors,
cex.axis=2, cex.lab=2, ylab="Richness")
boxplot(Shannon ~ Season, data=index, boxwex=.7, col=colors,
cex.axis=2, cex.lab=2, ylab="Shannon diversity")
dev.off()
# try anova with linear model (lm) and aov model (preferred)
mod.richness <- lm(Richness ~ Season, data=index)
mod.shannon <- lm(Shannon ~ Season, data=index)
aov.richness <- aov(Richness ~ Season, data=index)
aov.shannon <- aov(Shannon ~ Season, data=index)
envs <- read_csv('Metadata_YB.csv')
allenvnum <- envs %>%
select(c(2:8,10:15)) %>%
#filter(Sample.Type == 2) %>%
#rename_all(~ gsub("_", " ", .)) %>%
rename('NO2+NO3'='No2.No3_umol_L') %>%
rename_all(~ gsub("_umol_L", "", .)) %>%
rename_all(~ gsub("ODO_mgL", "DO", .))
# ASV counts for each feature into a matrix
merged <- read.csv('metadata_18S.csv', header=T, stringsAsFactors=F)
otumat <- merged %>%
select(-Sequence,-Confidence,-Taxon) %>%
column_to_rownames(var="id")
otumatdf <- as.data.frame(t(otumat))
dist <- vegdist(otumatdf, method = "bray")
metaMDS(dist)
# Copy the very first part of the Randi Griffith tutorial for the NMDS
install.packages("remotes")
remotes::install_github("gavinsimpson/ggvegan")
library(remotes)
library(ggvegan)
library(vegan)
library(ggplot2)
library(tidyverse)
# Eukaryotes
# ASV counts for each feature into a matrix
merged <- read.csv('metadata_18S.csv', header=T, stringsAsFactors=F)
otumat <- merged %>%
select(-Sequence,-Confidence,-Taxon) %>%
column_to_rownames(var="id")
otumatdf <- as.data.frame(t(otumat))
otumatdf$SampleID <- rownames(otumatdf)
# Taxonomy
taxonomy <- merged %>%
rename(otu=id) %>%
select(otu, Taxon) %>%
rename_all(toupper) %>%
separate(TAXON,
into=c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"),
sep="; ",
fill='right') %>% # NA values on the right if it need to add information
mutate(Domain=str_replace(Domain, "d__", ""),
Phylum=str_replace(Phylum, "p__", ""),
Class=str_replace(Class, "c__", ""),
Order=str_replace(Order, "o__", ""),
Family=str_replace(Family, "f__", ""),
Genus=str_replace(Genus, "g__", ""),
Species=str_replace(Species, "s__", "")) %>%
mutate(Domain=str_replace(Domain, "Unassigned", "Unknown")) %>%
filter(Domain == 'Eukaryota' | Domain == 'Unassigned') %>%
filter(!Phylum %in% c('Vertebrata', 'Annelida', 'Arthropoda', 'Bryozoa' , 'Cnidaria', 'Ciliophora',
'Mollusca','Nemertea', 'Porifera', 'Echinodermata','Nematozoad','Rotifera',
'Tunicata','Gracilipodida','Entoprocta','Florideophycidae', 'Holozoa',
'Incertae_Sedis','Bicosoecida','Pavlovophyceae'))
taxonomy
rownames(taxonomy) <- taxonomy$OTU
taxmat <- taxonomy[,-1]
taxmat <- as.matrix(taxmat)
taxmat
class(taxmat)
taxmatrans <- as.matrix(t(taxmat))
# NMDS
envs <- read_csv('Metadata_YB.csv')
allenvnum <- envs %>%
select(c(2:8,10:15)) %>%
#filter(Sample.Type == 2) %>%
#rename_all(~ gsub("_", " ", .)) %>%
rename('NO2+NO3'='No2.No3_umol_L') %>%
rename_all(~ gsub("_umol_L", "", .)) %>%
rename_all(~ gsub("ODO_mgL", "DO", .))
# Merge the otumat datafram with the environment dataframe
datas <- merge(allenvnum, otumatdf, by='SampleID') %>%
na.omit()
rownames(datas) <- datas$SampleID
colnames(datas) <- make.names(colnames(datas))
# The ASVs (rows = sample)
abund.mat <- datas[,14:28118]
# The predictor variables (rows = sample)
fijado <- datas[,6:13] %>%
na.omit() %>%
select(-'NO2.NO3')
# convert abund.mat to a matrix
m_com = as.matrix(abund.mat)
# Perform the NMDS ordination
set.seed(19870912)
#dist <- avgdist(m_com, dmethod = "bray", sample = 2500)
dist <- vegdist(m_com, method = "bray")
nmds = metaMDS(dist)
# nmds <- dist %>%
# metaMDS(trace = F) %>%
# ordiplot(type = "t") %>%
# text("sites")
nmds_stress <- nmds$stress
# stress = 0.17
# Run the envfit function with our environmental data
en = envfit(nmds, fijado, permutations = 999, na.rm = TRUE)
en
# Plot with base R
plot(nmds)
plot(en)
# Extract the data scores for ggplot
data.scores = as.data.frame(scores(nmds))
data.scores$SampleID <- rownames(data.scores)
data.scores <- merge(data.scores, allenvnum, by = 'SampleID')
en_coord_cont = as.data.frame(scores(en, "vectors")) * ordiArrowMul(en)
png("NMDS_Euks.png", res=300, units='in', width=9, height=8)
ggplot(data.scores, aes(x = NMDS1, y = NMDS2)) +
geom_point(data = data.scores,
aes(colour = as.factor(Season), shape = as.factor(Sample.Type)),
show.legend = T, size = 3) +
scale_color_manual(name = "Seasons",
values=colors,
labels = c('Autumn','Spring','Summer','Winter')) +
scale_shape_manual(name = "Size fraction",
values=c(15, 16, 17),
breaks = c('2', '3', '10'),
labels = c('0.2 um', '3.0 um', '10.0 um')) +
geom_segment(data = en_coord_cont,
aes(x = 0, y = 0,
xend = NMDS1*.6, yend = NMDS2*.6),
arrow = arrow(length = unit(.6, 'picas')), colour="red") +
geom_text(data = en_coord_cont, aes(x = NMDS1*.63, y = NMDS2*.63), colour = "black",
fontface = "bold", label = row.names(en_coord_cont)) +
# stat_ellipse(data = data.scores,
# aes(color=as.factor(Season), show.legend = F)) +
xlab("NMDS1") +
ylab("NMDS2") +
theme_bw(base_size = 15) +
theme(axis.text.y = element_text(color="black", size=17, angle=0),
axis.text.x = element_text(color="black", size=17, angle=0),
axis.title.x = element_text(color="black", size=20, angle=0),
axis.title.y = element_text(color="black", size=20, angle=90),
legend.text=element_markdown(),
legend.key.size=unit(15, "pt"),
strip.background=element_blank(),
strip.placement='outside',
panel.border = element_rect(colour = "black", fill = NA))
dev.off()remotes::install_github("gavinsimpson/ggvegan")
library(remotes)
library(ggvegan)
library(vegan)
library(ggplot2)
library(tidyverse)
# Bacteria
# ASV counts for each feature into a matrix
merged <- read.csv('metadata_16S_bac.csv', header=T, stringsAsFactors=F)
otumat <- merged %>%
select(-Sequence,-Confidence,-Taxon) %>%
column_to_rownames(var="id")
otumatdf <- as.data.frame(t(otumat))
otumatdf$SampleID <- rownames(otumatdf)
# Taxonomy
taxonomy <- merged %>%
rename(otu = id) %>%
select(otu, Taxon) %>%
rename_all(tolower) %>%
separate(taxon,
into = c("domain", "phylum", "class", "order", "family", "genus", "species"),
sep = "; ",
fill = 'right') %>% # NA values on the right if it need to add information
mutate(domain = str_replace(domain, "d__", ""),
phylum = str_replace(phylum, "p__", ""),
class = str_replace(class, "c__", ""),
order = str_replace(order, "o__", ""),
family = str_replace(family, "f__", ""),
genus = str_replace(genus, "g__", ""),
species = str_replace(species, "s__", "")) %>%
filter(domain == 'Bacteria' | domain == 'Unassigned')
taxonomy
rownames(taxonomy) <- taxonomy$otu
taxmat <- taxonomy[,-1]
taxmat <- as.matrix(taxmat)
taxmat
class(taxmat)
taxmatrans <- as.matrix(t(taxmat))
# Input sample metadata
envs <- read_csv('Metadata_YB.csv')
allenvnum <- envs %>%
select(c(2:8,10:15)) %>%
#filter(Sample.Type == 2) %>%
#rename_all(~ gsub("_", " ", .)) %>%
rename('NO2+NO3'='No2.No3_umol_L') %>%
rename_all(~ gsub("_umol_L", "", .)) %>%
rename_all(~ gsub("ODO_mgL", "DO", .))
# Merge the otumat datafram with the environment dataframe
datas <- merge(allenvnum, otumatdf, by='SampleID') %>%
na.omit()
rownames(datas) <- datas$SampleID
colnames(datas) <- make.names(colnames(datas))
# make Habitat an ordered variable
allenvnum$Season <- ordered(allenvnum$Season, c("spring", "summer", "autumn", "winter"))
abund.mat <- datas[,14:83966]
# store computed indices in a new data frame called 'indices'
index <- datas[,c("Season","Sample.Type")]
index$Richness <- rowSums(abund.mat > 0)
index$Shannon <- diversity(abund.mat) # shannon is default
#index$Rarefied <- c(rarefy(abund.mat[1:28105,], sample=91))
png("bac_diversity_12pt.png", res=300, units='in', width=15, height=8)
par(mar=c(3,5,3,2), mfrow=c(1,2))
colors = c('#ED7D31','#548235','#C00000', '#4472C4')
boxplot(Richness ~ Season, data=index, boxwex=.7, col=colors,
cex.axis=2, cex.lab=2, ylab="Richness")
boxplot(Shannon ~ Season, data=index, boxwex=.7, col=colors,
cex.axis=2, cex.lab=2, ylab="Shannon diversity")
dev.off()
# try anova with linear model (lm) and aov model (preferred)
mod.richness <- lm(Richness ~ Season, data=index)
mod.shannon <- lm(Shannon ~ Season, data=index)
aov.richness <- aov(Richness ~ Season, data=index)
aov.shannon <- aov(Shannon ~ Season, data=index)
# NMDS
# Install and load the following packages
install.packages("vegan")
install.packages("ape")
install.packages("dplyr")
library(vegan)
library(ape)
library(dplyr)
# ASV counts for each feature into a matrix
merged <- read.csv('metadata_16S_bac.csv', header=T, stringsAsFactors=F)
otumat <- merged %>%
select(-Sequence,-Confidence,-Taxon) %>%
column_to_rownames(var="id")
otumatdf <- as.data.frame(t(otumat))
otumatdf$SampleID <- rownames(otumatdf)
# Taxonomy
taxonomy <- merged %>%
rename(otu = id) %>%
select(otu, Taxon) %>%
rename_all(tolower) %>%
separate(taxon,
into = c("domain", "phylum", "class", "order", "family", "genus", "species"),
sep = "; ",
fill = 'right') %>% # NA values on the right if it need to add information
mutate(domain = str_replace(domain, "d__", ""),
phylum = str_replace(phylum, "p__", ""),
class = str_replace(class, "c__", ""),
order = str_replace(order, "o__", ""),
family = str_replace(family, "f__", ""),
genus = str_replace(genus, "g__", ""),
species = str_replace(species, "s__", "")) %>%
filter(domain == 'Bacteria' | domain == 'Unassigned')
taxonomy
rownames(taxonomy) <- taxonomy$otu
taxmat <- taxonomy[,-1]
taxmat <- as.matrix(taxmat)
taxmat
class(taxmat)
taxmatrans <- as.matrix(t(taxmat))
# Input sample metadata
envs <- read_csv('Metadata_YB.csv')
allenvnum <- envs %>%
select(c(2:8,10:15)) %>%
#filter(Sample.Type == 2) %>%
#rename_all(~ gsub("_", " ", .)) %>%
rename('NO2+NO3'='No2.No3_umol_L') %>%
rename_all(~ gsub("_umol_L", "", .)) %>%
rename_all(~ gsub("ODO_mgL", "DO", .))
# Merge the otumat datafram with the environment dataframe
datas <- merge(allenvnum, otumatdf, by='SampleID') %>%
na.omit()
rownames(datas) <- datas$SampleID
colnames(datas) <- make.names(colnames(datas))
# make Habitat an ordered variable
allenvnum$Season <- ordered(allenvnum$Season, c("spring", "summer", "autumn", "winter"))
abund.mat <- datas[,14:83966]
# Merge the otumat datafram with the environment dataframe
datas <- merge(allenvnum, otumatdf, by='SampleID') %>%
na.omit()
rownames(datas) <- datas$SampleID
colnames(datas) <- make.names(colnames(datas))
# The ASVs (rows = sample)
abund.mat <- datas[,14:83966]
# The predictor variables (rows = sample)
fijado <- datas[,6:13] %>%
na.omit() %>%
select(-'NO2.NO3')
# convert abund.mat to a matrix
m_com = as.matrix(abund.mat)
# Perform the NMDS ordination
set.seed(19870912)
#dist <- avgdist(m_com, dmethod = "bray", sample = 10000)
dist <- vegdist(m_com, method = "bray")
nmds = metaMDS(dist)
# nmds <- dist %>%
# metaMDS(trace = F) %>%
# ordiplot(type = "t") %>%
# text("sites")
nmds_stress <- nmds$stress
# Run the envfit function with our environmental data
en = envfit(nmds, fijado, permutations = 999, na.rm = TRUE)
en
# Plot with base R
plot(nmds)
plot(en)
# Extract the data scores for ggplot
data.scores = as.data.frame(scores(nmds))
#data.scores = as.data.frame(scores(nmds)$sites)
data.scores$SampleID <- rownames(data.scores)
data.scores <- merge(data.scores, allenvnum, by = 'SampleID')
en_coord_cont = as.data.frame(scores(en, "vectors")) * ordiArrowMul(en)
png("NMDS_Bac.png", res=300, units='in', width=9, height=8)
ggplot(data.scores, aes(x = NMDS1, y = NMDS2)) +
geom_point(data = data.scores,
aes(colour = as.factor(Season), shape = as.factor(Sample.Type)),
show.legend = T, size = 3) +
scale_color_manual(name = "Seasons",
values=colors,
labels = c('Autumn','Spring','Summer','Winter')) +
scale_shape_manual(name = "Size fraction",
values=c(15, 16, 17),
breaks = c('2', '3', '10'),
labels = c('0.2 um', '3.0 um', '10.0 um')) +
geom_segment(data = en_coord_cont,
aes(x = 0, y = 0,
xend = NMDS1*.6, yend = NMDS2*.6),
arrow = arrow(length = unit(.6, 'picas')), colour="red") +
geom_text(data = en_coord_cont, aes(x = NMDS1*.64, y = NMDS2*.64), colour = "black",
fontface = "bold", label = row.names(en_coord_cont)) +
xlab("NMDS1") +
ylab("NMDS2") +
theme_bw(base_size = 15) +
theme(axis.text.y = element_text(color="black", size=17, angle=0),
axis.text.x = element_text(color="black", size=17, angle=0),
axis.title.x = element_text(color="black", size=20, angle=0),
axis.title.y = element_text(color="black", size=20, angle=90),
legend.text=element_markdown(),
legend.key.size=unit(15, "pt"),
strip.background=element_blank(),
strip.placement='outside',
panel.border = element_rect(colour = "black", fill = NA))
dev.off()remotes::install_github("gavinsimpson/ggvegan")
library(remotes)
library(ggvegan)
library(vegan)
library(ggplot2)
library(tidyverse)
# Eukaryotes
# ASV counts for each feature into a matrix
merged <- read.csv('metadata_16S_arc.csv', header=T, stringsAsFactors=F)
otumat <- merged %>%
select(-Sequence,-Confidence,-Taxon) %>%
column_to_rownames(var="id")
otumatdf <- as.data.frame(t(otumat))
otumatdf$SampleID <- rownames(otumatdf)
# Taxonomy
taxonomy <- merged %>%
rename(otu = id) %>%
select(otu, Taxon) %>%
rename_all(tolower) %>%
separate(taxon,
into = c("domain", "phylum", "class", "order", "family", "genus", "species"),
sep = "; ",
fill = 'right') %>% # NA values on the right if it need to add information
mutate(domain = str_replace(domain, "d__", ""),
phylum = str_replace(phylum, "p__", ""),
class = str_replace(class, "c__", ""),
order = str_replace(order, "o__", ""),
family = str_replace(family, "f__", ""),
genus = str_replace(genus, "g__", ""),
species = str_replace(species, "s__", "")) %>%
filter(domain == 'Archaea' | domain == 'Unassigned')
taxonomy
rownames(taxonomy) <- taxonomy$OTU
taxmat <- taxonomy[,-1]
taxmat <- as.matrix(taxmat)
taxmat
class(taxmat)
taxmatrans <- as.matrix(t(taxmat))
# Input sample metadata
envs <- read_csv('Metadata_YB.csv')
allenvnum <- envs %>%
select(c(2:8,10:15)) %>%
#filter(Sample.Type == 2) %>%
#rename_all(~ gsub("_", " ", .)) %>%
rename('NO2+NO3'='No2.No3_umol_L') %>%
rename_all(~ gsub("_umol_L", "", .)) %>%
rename_all(~ gsub("ODO_mgL", "DO", .))
# Merge the otumat datafram with the environment dataframe
datas <- merge(allenvnum, otumatdf, by='SampleID') %>%
na.omit()
rownames(datas) <- datas$SampleID
colnames(datas) <- make.names(colnames(datas))
# make Habitat an ordered variable
allenvnum$Season <- ordered(allenvnum$Season, c("spring", "summer", "autumn", "winter"))
abund.mat <- datas[,14:9586]
# store computed indices in a new data frame called 'indices'
index <- datas[,c("Season","Sample.Type")]
index$Richness <- rowSums(abund.mat > 0)
index$Shannon <- diversity(abund.mat) # shannon is default
index$Rarefied <- c(rarefy(abund.mat[1:28105,], sample=91))
png("arc_diversity.png", res=300, units='in', width=9, height=8)
par(mar=c(3,4,3,2), mfrow=c(2,2))
colors=c('#771155','#114477','#117777', '#AA7744')
boxplot(Richness ~ Season, data=index, boxwex=.7, col=colors,
cex.axis=1, ylab="Richness")
boxplot(Shannon ~ Season, data=index, boxwex=.7, col=colors,
cex.axis=1, ylab="Shannon diversity")
dev.off()
# try anova with linear model (lm) and aov model (preferred)
mod.richness <- lm(Richness ~ Season, data=index)
mod.shannon <- lm(Shannon ~ Season, data=index)
aov.richness <- aov(Richness ~ Season, data=index)
aov.shannon <- aov(Shannon ~ Season, data=index)
install.packages("vegan")
install.packages("ape")
install.packages("dplyr")
library(vegan)
library(ape)
library(dplyr)
library(tidyverse)
# ASV counts for each feature into a matrix
merged <- read.csv('metadata_16S_arc.csv', header=T, stringsAsFactors=F)
otumat <- merged %>%
select(-Sequence,-Confidence,-Taxon) %>%
column_to_rownames(var="id")
otumatdf <- as.data.frame(t(otumat))
otumatdf$SampleID <- rownames(otumatdf)
# Taxonomy
taxonomy <- merged %>%
rename(otu = id) %>%
select(otu, Taxon) %>%
rename_all(tolower) %>%
separate(taxon,
into = c("domain", "phylum", "class", "order", "family", "genus", "species"),
sep = "; ",
fill = 'right') %>% # NA values on the right if it need to add information
mutate(domain = str_replace(domain, "d__", ""),
phylum = str_replace(phylum, "p__", ""),
class = str_replace(class, "c__", ""),
order = str_replace(order, "o__", ""),
family = str_replace(family, "f__", ""),
genus = str_replace(genus, "g__", ""),
species = str_replace(species, "s__", "")) %>%
filter(domain == 'Archaea' | domain == 'Unassigned')
taxonomy
rownames(taxonomy) <- taxonomy$OTU
taxmat <- taxonomy[,-1]
taxmat <- as.matrix(taxmat)
taxmat
class(taxmat)
taxmatrans <- as.matrix(t(taxmat))
# Input sample metadata
envs <- read_csv('Metadata_YB.csv')
allenvnum <- envs %>%
select(c(2:8,10:15)) %>%
#filter(Sample.Type == 2) %>%
#rename_all(~ gsub("_", " ", .)) %>%
rename('NO2+NO3'='No2.No3_umol_L') %>%
rename_all(~ gsub("_umol_L", "", .)) %>%
rename_all(~ gsub("ODO_mgL", "DO", .))
# Merge the otumat datafram with the environment dataframe
datas <- merge(allenvnum, otumatdf, by='SampleID') %>%
na.omit()
rownames(datas) <- datas$SampleID
colnames(datas) <- make.names(colnames(datas))
# make Habitat an ordered variable
allenvnum$Season <- ordered(allenvnum$Season, c("spring", "summer", "autumn", "winter"))
abund.mat <- datas[,14:9598]
# Merge the otumat datafram with the environment dataframe
datas <- merge(allenvnum, otumatdf, by='SampleID') %>%
na.omit()
rownames(datas) <- datas$SampleID
colnames(datas) <- make.names(colnames(datas))
# make Habitat an ordered variable
allenvnum$Season <- ordered(allenvnum$Season, c("spring", "summer", "autumn", "winter"))
abund.mat <- datas[,14:9598]
# Merge the otumat datafram with the environment dataframe
datas <- merge(allenvnum, otumatdf, by='SampleID') %>%
na.omit()
rownames(datas) <- datas$SampleID
colnames(datas) <- make.names(colnames(datas))
# The ASVs (rows = sample)
abund.mat <- datas[,14:9598]
# The predictor variables (rows = sample)
fijado <- datas[,6:13] %>%
na.omit() %>%
select(-'NO2.NO3')
# convert abund.mat to a matrix
m_com = as.matrix(abund.mat)
# Perform the NMDS ordination
set.seed(19870912)
#dist <- avgdist(m_com, dmethod = "bray", sample = 10000)
dist <- vegdist(m_com, method = "bray")
nmds = metaMDS(dist)
nmds_stress <- nmds$stress
# Run the envfit function with our environmental data
en = envfit(nmds, fijado, permutations = 999, na.rm = TRUE)
en
# Plot with base R
plot(nmds)
plot(en)
# Extract the data scores for ggplot
data.scores = as.data.frame(scores(nmds))
#data.scores = as.data.frame(scores(nmds)$sites)
data.scores$SampleID <- rownames(data.scores)
data.scores <- merge(data.scores, allenvnum, by = 'SampleID')
en_coord_cont = as.data.frame(scores(en, "vectors")) * ordiArrowMul(en)
png("NMDS_Arc.png", res=300, units='in', width=9, height=8)
ggplot(data.scores, aes(x = NMDS1, y = NMDS2)) +
geom_point(data = data.scores,
aes(colour = as.factor(Season), shape = as.factor(Sample.Type)),
show.legend = T, size = 3) +
scale_color_manual(name = "Seasons",
values=colors,
labels = c('Autumn','Spring','Summer','Winter')) +
scale_shape_manual(name = "Size fraction",
values=c(15, 16, 17),
breaks = c('2', '3', '10'),
labels = c('0.2 um', '3.0 um', '10.0 um')) +
geom_segment(data = en_coord_cont,
aes(x = 0, y = 0,
xend = NMDS1*.3, yend = NMDS2*.3),
arrow = arrow(length = unit(.6, 'picas')), colour="red") +
geom_text(data = en_coord_cont, aes(x = NMDS1*.34, y = NMDS2*.34), colour = "black",
fontface = "bold", label = row.names(en_coord_cont)) +
#stat_ellipse(aes(color=Season), show.legend = F) +
xlab("NMDS1") +
ylab("NMDS2") +
theme_bw(base_size = 15) +
theme(axis.text.y = element_text(color="black", size=17, angle=0),
axis.text.x = element_text(color="black", size=17, angle=0),
axis.title.x = element_text(color="black", size=20, angle=0),
axis.title.y = element_text(color="black", size=20, angle=90),
legend.text=element_markdown(),
legend.key.size=unit(15, "pt"),
strip.background=element_blank(),
strip.placement='outside',
panel.border = element_rect(colour = "black", fill = NA))
dev.off()_________________________________________________________________________________________________________
# Load the pheatmap package
install.packages('matrixStats')
library(pheatmap)
library(tidyverse)
library(readxl)
library(ggplot2)
library(ggtext)
library(matrixStats)
library(RColorBrewer)
library(caroline)
# Input sample metadata
envs <- read_csv('Metadata_YB.csv')
allenvnum <- envs %>%
select(c(2:8,10:15)) %>%
rename('NO2+NO3'='No2.No3_umol_L') %>%
rename_all(~ gsub("_umol_L", "", .)) %>%
rename_all(~ gsub("ODO_mgL", "DO", .)) %>%
filter(!SampleID == 'NA') %>%
filter(!SampleID == 'BB10')
# Annotation column: a data frame where the row names amtch the column names of the data_matrix
rownames(allenvnum) <- allenvnum$SampleID
# ASV counts for each feature into a matrix
merged <- read.csv('MetaCycPathways.csv', header=T, stringsAsFactors=F) %>%
column_to_rownames(var='Description') %>%
select(-pathway)
# Check to see if they're relative abundance counts
# count <- rbind(counts, colSums(counts[1:90]))
merge_mat <- as.matrix(merged)
### Log2 normalization and then determine the genes that have the most variance
log_merge_mat <- as.matrix(log2(merged+1))
# Get the top most 50 variable genes
rv <- rowVars(merge_mat)
idx <- order(-rv)[1:40]
rv <- rowVars(log_merge_mat)
idx <- order(-rv)[1:40]
# Create column annotations for the sample designations.
anncol <- data.frame(Var1 = factor(1:90, labels = c('1')))
rownames(anncol) <- allenvnum$SampleID
anncol$Season <- as.factor(allenvnum$Season)
anncol$Filter <- as.factor(allenvnum$Sample.Type)
anncol <- anncol[,-1]
anncol <- anncol[order(anncol$Season),]
anncol
# Change color of Seasons and filter size
# select my row annotation colors. Name the color vector clas2
my_colour = list(
anncol,
Filter = c('2' = 'yellow', '3' = "#FC4E07", '10' = '#696969'),
Season = c(spring = '#AA4488',autumn = '#4477AA', summer = '#117777', winter = '#AA7744'))
# Reorder columns in merge_mat based on column order of annotation column, and Z-score the values
mat_ordered <- merge_mat[, rownames(anncol)]
mat_orderedz <- scale(mat_ordered)
head(mat_orderedz)
# set the color create color palette
paletteLength <- 100
my_color <- rev(brewer.pal(11, "RdBu"))
my_color = colorRampPalette(my_color)(paletteLength)
myBreaks <- c(seq(min(mat_orderedz), 0, length.out=ceiling(paletteLength/2) + 1),
seq(max(mat_orderedz)/paletteLength, max(mat_orderedz), length.out=floor(paletteLength/2)))
# Create the heatmap with annotation
png("metacycPie2log.png", res=300, units='in', width=18, height=14)
pheatmap(mat_orderedz[idx,],
annotation_col = anncol,
annotation_colors = my_colour,
main = "MetaCyc annotations",
breaks = myBreaks,
color = my_color,
cellwidth = 6,
cellheight = 6,
fontsize_row = 6,
fontsize_col = 6,
#boarder_color = 'black',
cluster_rows = TRUE,
cluster_cols = TRUE,
cutree_cols = 6)
dev.off()_________________________________________________________________________________________________________
# Install packages (only once on each device)
setwd("/Users/carlymoreno/Dropbox/Yas_bay")
# Required packages
install.packages("devtools")
install.packages("BiocManager")
if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
# download gfortran dev package
# https://cran.r-project.org/bin/macosx/tools/
# Install NetCoMi
devtools::install_github("stefpeschel/NetCoMi",
dependencies = c("Depends", "Imports", "LinkingTo"),
repos = c("https://cloud.r-project.org/",
BiocManager::repositories()))
install.packages('ape','limma','tidyverse','phyloseq')
###############################################################################
# If previous installation fails, with warning that packages cannot be built, try first modifying the R Makevars file in the computer terminal
# ~/.R/Makevars - custom configuration to fix SpiecEasi install on macOS ARM64
# Uses Homebrew GCC/gfortran and overrides system R Makeconf which points to old /opt/gfortran
#nano ~/.R/Makevars # copy and paste the 10 lines below
# Fortran compilers - point to Homebrew-installed gfortran-15
FC = /opt/homebrew/bin/gfortran-15
F77 = /opt/homebrew/bin/gfortran-15
# Fortran libraries - use Homebrew gcc libs only, no /opt/gfortran
FLIBS = -L/opt/homebrew/opt/gcc/lib/gcc/15 -lgfortran -lquadmath -lm
# C and C++ compilers - Homebrew gcc/g++ 15
CC = /opt/homebrew/bin/gcc-15
CXX = /opt/homebrew/bin/g++-15
# Compiler flags for C and C++ (enable OpenMP)
CFLAGS = -g -O2 -fopenmp
CXXFLAGS = -g -O2 -fopenmp
# Linker flags - libomp and gcc lib paths from Homebrew
LDFLAGS = -L/opt/homebrew/opt/libomp/lib -lomp -L/opt/homebrew/opt/gcc/lib/gcc/15
CPPFLAGS = -I/opt/homebrew/opt/libomp/include
# Force R to use the above Fortran libs during shared library linking
SHLIB_FCLD = $(FLIBS)
# close Terminal and R, and then reopen R
# And then these installation steps in R:
# Required packages
install.packages("devtools")
install.packages("BiocManager")
if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
install.packages("pkgbuild")
pkgbuild::check_build_tools(debug = TRUE)
# Since two of NetCoMi's dependencies are only available on GitHub,
# it is recommended to install them first:
devtools::install_github("zdk123/SpiecEasi")
devtools::install_github("GraceYoon/SPRING")
# Install NetCoMi
devtools::install_github("stefpeschel/NetCoMi",
repos = c("https://cloud.r-project.org/",
BiocManager::repositories()))
###############################################################################
# Start here
library(devtools)
library('BiocManager')
library(NetCoMi)
library(limma)
library(igraph)
library(ggplot2)
library(phyloseq)
library(tidyverse)
library(ape)
###############################################################################
# Make a phyloseq object
# Eukaryotes
# ASV counts for each feature into a matrix
merged <- read.csv('metadata_18S.csv', header = T, stringsAsFactors = F)
otumat <- merged %>%
rename(otu = id) %>%
select(-Sequence,-Confidence,-Taxon)
rownames(otumat) <- otumat$otu
otumat <- otumat[,-1]
otumat <- as.matrix(otumat)
otumat
class(otumat)
otumatrans <- as.matrix(t(otumat))
# Taxonomy
taxonomy <- merged %>%
rename(otu = id) %>%
select(otu, Taxon) %>%
rename_all(toupper) %>%
separate(TAXON,
into = c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"),
sep = "; ",
fill = 'right') %>% # NA values on the right if it need to add information
mutate(Domain = str_replace(Domain, "d__", ""),
Phylum = str_replace(Phylum, "p__", ""),
Class = str_replace(Class, "c__", ""),
Order = str_replace(Order, "o__", ""),
Family = str_replace(Family, "f__", ""),
Genus = str_replace(Genus, "g__", ""),
Species = str_replace(Species, "s__", "")) %>%
mutate(Domain = str_replace(Domain, "Unassigned", "Unknown")) %>%
filter(Domain == 'Eukaryota' | Domain == 'Unassigned') %>%
filter(!Phylum %in% c('Vertebrata', 'Annelida', 'Arthropoda', 'Bryozoa' , 'Cnidaria', 'Ciliophora',
'Mollusca','Nemertea', 'Porifera', 'Echinodermata','Nematozoad','Rotifera',
'Tunicata','Gracilipodida','Entoprocta','Florideophycidae', 'Holozoa',
'Incertae_Sedis','Bicosoecida','Pavlovophyceae'))
taxonomy
rownames(taxonomy) <- taxonomy$OTU
taxmat <- taxonomy[,-1]
taxmat <- as.matrix(taxmat)
taxmat
class(taxmat)
taxmatrans <- as.matrix(t(taxmat))
# Combine into phyloseq object
OTU = otu_table(otumat, taxa_are_rows = TRUE)
OTU
TAX = tax_table(taxmat)
TAX
physeq = phyloseq(OTU, TAX)
physeq
plot_bar(physeq, fill = "Phylum")
# transposed
OTUtrans = otu_table(otumatrans, taxa_are_rows = FALSE)
OTUtrans
TAXtrans = tax_table(taxmatrans)
TAXtrans
physeq1 = phyloseq(OTUtrans, TAX)
physeq1
plot_bar(physeq1, fill = "Phylum")
# Input sample metadata
env <- read.csv('Metadata_YB.csv', header = T, stringsAsFactors = F)
allenvnum <- env %>%
select(c(2:8,10:15)) %>%
filter(Sample.Type == 2) %>%
rename_all(~ gsub("_", " ", .)) %>%
rename('NO2+NO3' = 'No2.No3 umol L') %>%
rename_all(~ gsub("umol L", "", .)) %>%
rename_all(~ gsub("ODO mgL", "DO", .)) %>%
drop_na()
rownames(allenvnum) <- allenvnum$SampleID
allenvnum <- allenvnum[,c(-1,-2)]
sampledata = sample_data(allenvnum)
# Create a phylogentic tree
random_tree = rtree(ntaxa(physeq), rooted=TRUE, tip.label=taxa_names(physeq))
plot(random_tree)
# Merge new data with current phyloseq object:
physeq_euks = merge_phyloseq(physeq, sampledata, random_tree)
physeq_euks
plot_tree(physeq1, color="Season", label.tips="taxa_names", ladderize="left", plot.margin=0.3)
plot_heatmap(physeq_euks)
plot_heatmap(physeq_euks, taxa.label="Phylum")
saveRDS(physeq_euks, "physeq_euks.rds")
# Make a phyloseq object
# Bacteria
# ASV counts for each feature into a matrix
merged <- read.csv('metadata_16S_bac.csv', header = T, stringsAsFactors = F)
otumat <- merged %>%
rename(otu = id) %>%
select(-Sequence,-Confidence,-Taxon)
rownames(otumat) <- otumat$otu
otumat <- otumat[,-1]
otumat <- as.matrix(otumat)
otumat
class(otumat)
otumatrans <- as.matrix(t(otumat))
# Taxonomy
taxonomy <- merged %>%
rename(otu = id) %>%
select(otu, Taxon) %>%
rename_all(toupper) %>%
separate(TAXON,
into = c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"),
sep = "; ",
fill = 'right') %>% # NA values on the right if it need to add information
mutate(Domain = str_replace(Domain, "d__", ""),
Phylum = str_replace(Phylum, "p__", ""),
Class = str_replace(Class, "c__", ""),
Order = str_replace(Order, "o__", ""),
Family = str_replace(Family, "f__", ""),
Genus = str_replace(Genus, "g__", ""),
Species = str_replace(Species, "s__", "")) %>%
filter(Domain == 'Bacteria') %>%
filter(!Family %in% c('Chloroplast', 'Mitochondria'))
taxonomy
rownames(taxonomy) <- taxonomy$OTU
taxmat <- taxonomy[,-1]
taxmat <- as.matrix(taxmat)
taxmat
class(taxmat)
taxmatrans <- as.matrix(t(taxmat))
# Combine into phyloseq object
OTU = otu_table(otumat, taxa_are_rows = TRUE)
OTU
TAX = tax_table(taxmat)
TAX
physeq = phyloseq(OTU, TAX)
physeq
plot_bar(physeq, fill = "Phylum")
# transposed
OTUtrans = otu_table(otumatrans, taxa_are_rows = FALSE)
OTUtrans
TAXtrans = tax_table(taxmatrans)
TAXtrans
physeq1 = phyloseq(OTUtrans, TAX)
physeq1
plot_bar(physeq1, fill = "Phylum")
# Input sample metadata
env <- read.csv('Metadata_YB.csv', header = T, stringsAsFactors = F)
allenvnum <- env %>%
select(c(2:8,10:15)) %>%
filter(Sample.Type == 2) %>%
rename_all(~ gsub("_", " ", .)) %>%
rename('NO2+NO3' = 'No2.No3 umol L') %>%
rename_all(~ gsub("umol L", "", .)) %>%
rename_all(~ gsub("ODO mgL", "DO", .)) %>%
drop_na()
rownames(allenvnum) <- allenvnum$SampleID
allenvnum <- allenvnum[,c(-1,-2)]
sampledata = sample_data(allenvnum)
# Create a phylogentic tree
install.packages('ape')
library(ape)
random_tree = rtree(ntaxa(physeq), rooted=TRUE, tip.label=taxa_names(physeq))
plot(random_tree)
# Merge new data with current phyloseq object:
physeq_bac = merge_phyloseq(physeq, sampledata, random_tree)
physeq_bac
plot_tree(physeq_bac, color="Season", label.tips="taxa_names", ladderize="left", plot.margin=0.3)
plot_heatmap(physeq_bac)
plot_heatmap(physeq_bac, taxa.label="Phylum")
saveRDS(physeq_bac, "physeq_bac.rds")
###############################################################################
# Start here to make one network with eukaryotes and bacteria for the entire year
### Eukaryotes
pseuk <- readRDS('physeq_euks.rds')
# zero taxa 제거
ps.f = prune_samples(sample_sums(pseuk) > 0, pseuk)
ps.f = prune_taxa(rowSums(otu_table(ps.f)) > 0, ps.f) # 14733 taxa and 22 samples
# Agglomerate to genus level
ps_genus <- tax_glom(ps.f, taxrank = "Class") # 161 orders and 22 samples
# Taxonomic table
taxtab <- as(tax_table(ps_genus), "matrix")
# necomi 형식으로 변환 및 Genus level 설정
ps_euks_renamed <- NetCoMi::renameTaxa(ps_genus,
pat = "<name>",
substPat = "<name>_<subst_name>(<subst_R>)",
numDupli = "Class")
sum_euks_mat <- as.matrix(t(otu_table(ps_euks_renamed)@.Data))
sum_euks_df <- as.data.frame(otu_table(ps_euks_renamed)@.Data)
sum_euks_df$OTU <- rownames(sum_euks_df)
sum_euks_df <- merge(sum_euks_df, taxonomy, by = 'OTU') %>%
drop_na()
sum_euks_df <- sum_euks_df %>%
filter(!Family %in% c('uncultured','Labyrinthulomycetes'))
rownames(sum_euks_df) <- sum_euks_df$Class
sum_euks_df <- as.matrix(t(sum_euks_df[,c(-1, -c(24:30))]))
### Bacteria
psbac <- readRDS("physeq_bac.rds")
# zero taxa 제거
ps.b = prune_samples(sample_sums(psbac) > 0, psbac)
ps.b = prune_taxa(rowSums(otu_table(ps.b)) > 0, ps.b) # 21585 taxa and 22 samples
# Agglomerate to genus level
ps_bacgenus <- tax_glom(ps.b, taxrank = "Class") # 413 taxa and 22 samples
# Taxonomic table
taxtab <- as(tax_table(ps_bacgenus), "matrix")
# necomi 형식으로 변환 및 Genus level 설정
ps_bac_genus_renamed <- NetCoMi::renameTaxa(ps_bacgenus,
pat = "<name>",
substPat = "<name>_<subst_name>(<subst_R>)",
numDupli = "Class")
sum_bac_mat <- as.matrix(t(otu_table(ps_bac_genus_renamed)@.Data))
sum_bac_df <- as.data.frame(otu_table(ps_bac_genus_renamed)@.Data)
sum_bac_df$OTU <- rownames(sum_bac_df)
sum_bac_df <- merge(sum_bac_df, taxonomy, by = 'OTU') %>%
drop_na()
sum_bac_df <- sum_bac_df %>%
filter(!Family %in% c('uncultured'))
rownames(sum_bac_df) <- sum_bac_df$Class
sum_bac_df <- as.matrix(t(sum_bac_df[,c(-1, -c(24:30))]))
# Summer (group 1 bac and euks)
set.seed(123456)
spiec <- multi.spiec.easi(list(sum_euks_df, sum_bac_df),
method='mb', nlambda=40,
lambda.min.ratio=1e-1,
pulsar.params = list(thresh = 0.05))
assoMat1 <- SpiecEasi::symBeta(SpiecEasi::getOptBeta(spiec), mode = "ave")
assoMat1 <- as.matrix(assoMat1)
# Get taxa names
taxnames <- c(taxa_names(ps_genus), taxa_names(ps_bacgenus))
taxnames <- c(colnames(sum_euks_df), colnames(sum_bac_df))
colnames(assoMat1) <- rownames(assoMat1) <- taxnames
diag(assoMat1) <- 1
all <- netConstruct(data = assoMat1,
dataType = "condDependence",
sparsMethod = "none")
netall <- netAnalyze(all, hubPar = "eigenvector")
nodeCols <- c(rep("darkturquoise", 57), rep("orange", 73))
names(nodeCols) <- taxnames
png("CrossDomain_EuksBac.png", res = 300, units = 'in', width = 10, height = 8)
plot(netall,
layoutGroup = "union",
nodeColor = "colorVec",
colorVec = nodeCols,
nodeSize = "eigen",
nodeSizeSpread = 5,
labelScale = FALSE,
posCol = "darkgreen",
negCol = "red",
cexNodes = 2, # size of node
cexHubs = 2.2, # Size of the central node among nodes
cexLabels = .5, # Size of the regular nodes labesl
cexHubLabels = .7, # Size of the central node among nodes
cexTitle = 3)
legend(.7, 1, cex = .7, pt.cex = 2,
legend = c("Eukaryotes", "Bacteria"), col = c("lightblue", "orange"),
bty = "n", pch = 16)
dev.off()
summary(netall)library(devtools)
library('BiocManager')
library(NetCoMi)
library(limma)
library(igraph)
library(ggplot2)
library(phyloseq)
library(tidyverse)
library(ape)
# Make a phyloseq object
# Eukaryotes
# ASV counts for each feature into a matrix
merged <- read.csv('metadata_18S.csv', header = T, stringsAsFactors = F)
otumat <- merged %>%
rename(otu = id) %>%
select(-Sequence,-Confidence,-Taxon)
rownames(otumat) <- otumat$otu
otumat <- otumat[,-1]
otumat <- as.matrix(otumat)
otumat
class(otumat)
otumatrans <- as.matrix(t(otumat))
# Taxonomy
taxonomy <- merged %>%
rename(otu = id) %>%
select(otu, Taxon) %>%
rename_all(toupper) %>%
separate(TAXON,
into = c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"),
sep = "; ",
fill = 'right') %>% # NA values on the right if it need to add information
mutate(Domain = str_replace(Domain, "d__", ""),
Phylum = str_replace(Phylum, "p__", ""),
Class = str_replace(Class, "c__", ""),
Order = str_replace(Order, "o__", ""),
Family = str_replace(Family, "f__", ""),
Genus = str_replace(Genus, "g__", ""),
Species = str_replace(Species, "s__", "")) %>%
mutate(Domain = str_replace(Domain, "Unassigned", "Unknown")) %>%
filter(Domain == 'Eukaryota' | Domain == 'Unassigned') %>%
filter(!Phylum %in% c('Vertebrata', 'Annelida', 'Arthropoda', 'Bryozoa' , 'Cnidaria', 'Ciliophora',
'Mollusca','Nemertea', 'Porifera', 'Echinodermata','Nematozoad','Rotifera',
'Tunicata','Gracilipodida','Entoprocta','Florideophycidae', 'Holozoa',
'Incertae_Sedis','Bicosoecida','Pavlovophyceae'))
taxonomy
rownames(taxonomy) <- taxonomy$OTU
taxmat <- taxonomy[,-1]
taxmat <- as.matrix(taxmat)
taxmat
class(taxmat)
taxmatrans <- as.matrix(t(taxmat))
# Combine into phyloseq object
OTU = otu_table(otumat, taxa_are_rows = TRUE)
OTU
TAX = tax_table(taxmat)
TAX
physeq = phyloseq(OTU, TAX)
physeq
plot_bar(physeq, fill = "Phylum")
# transposed
OTUtrans = otu_table(otumatrans, taxa_are_rows = FALSE)
OTUtrans
TAXtrans = tax_table(taxmatrans)
TAXtrans
physeq1 = phyloseq(OTUtrans, TAX)
physeq1
plot_bar(physeq1, fill = "Phylum")
# Input sample metadata
env <- read.csv('Metadata_YB.csv', header = T, stringsAsFactors = F)
allenvnum <- env %>%
select(c(2:8,10:15)) %>%
filter(Sample.Type == 2) %>%
rename_all(~ gsub("_", " ", .)) %>%
rename('NO2+NO3' = 'No2.No3 umol L') %>%
rename_all(~ gsub("umol L", "", .)) %>%
rename_all(~ gsub("ODO mgL", "DO", .)) %>%
drop_na()
rownames(allenvnum) <- allenvnum$SampleID
allenvnum <- allenvnum[,c(-1,-2)]
sampledata = sample_data(allenvnum)
# Create a phylogentic tree
install.packages('ape')
library(ape)
random_tree = rtree(ntaxa(physeq), rooted=TRUE, tip.label=taxa_names(physeq))
plot(random_tree)
# Merge new data with current phyloseq object:
physeq1 = merge_phyloseq(physeq, sampledata, random_tree)
physeq1
plot_tree(physeq1, color="Season", label.tips="taxa_names", ladderize="left", plot.margin=0.3)
plot_heatmap(physeq1)
plot_heatmap(physeq1, taxa.label="phylum")
saveRDS(physeq1, "physeq1.rds")
###############################################################################
# Start the tutorial and incorporate my data
# Summer time
# 1.import data
ps <- readRDS("physeq1.rds")
summer = subset_samples(ps, Season %in% c("summer")) #Extract only summer microbiome samples
# 2. processing data
Taxa100 = names(sort(taxa_sums(summer), decreasing = TRUE)[1:50]) # Top 100 taxa extract
summer.1 = prune_taxa(Taxa100, summer)
# 3. filtering
summer_genus <- tax_glom(summer.1, taxrank = "Genus") # Agglomerate to genus level
taxtabsummer <- as(tax_table(summer_genus), "matrix")
# 4. Rename non-unique names using rename function
summer_genus_renamed <- NetCoMi::renameTaxa(summer_genus,
pat = "<name>",
substPat = "<name>_<subst_name>(<subst_R>)",
numDupli = "Genus")
# 5. NetConstruct Functionto construct a network using spearman correlation. Write measure
# as "spearman", normalization as "clr", and zero value as "multiRepl" method.
netsum_genus <- netConstruct(summer_genus_renamed, # phyloseq
taxRank = "Genus", # 기준 tax level
filtTax = "highestFreq",
filtTaxPar = list(highestfreq = 50),
zeroMethod = "pseudo",
zeroPar = list(pseudocount = 0.5), # 기준 tax level
measure = "spearman", # spearman correlation 으로 계산해보자
normMethod = "clr", # transformation method to deal with zero, tutorial pairs this with Pearson method
sparsMethod = "threshold",
thresh = 0.3,
verbose = 3,
seed = 1234)
sum_props_genus <- netAnalyze(netsum_genus,
clustMethod = "cluster_fast_greedy",
weightDeg = TRUE,
normDeg = FALSE,
gcmHeat = FALSE,
hubPar = "eigenvector")
plotHeat(mat = sum_props_genus$graphletLCC$gcm1,
pmat = sum_props_genus$graphletLCC$pAdjust1,
type = "mixed",
title = "GCM",
colorLim = c(-1, 1),
mar = c(2, 0, 2, 0))
# 6. Information of the created network
summary(sum_props_genus, numbNodes = 5L)
# 7. plot layout setting up
graph4 <- igraph::graph_from_adjacency_matrix(netsum_genus$adjaMat1,
weighted = TRUE)
set.seed(42) # fixing the shape
lay_fr <- igraph::layout_with_fr(graph4)
rownames(lay_fr) <- rownames(netsum_genus$adjaMat1) # Row names of the layout matrix must match the node names
# 8. Drawing a network plot 그리기
png("euks_genera_50_AI_summer_network.png", res = 300, units = 'in', width = 10, height = 8)
plot(sum_props_genus, # Network Ingredients
layout = "spring", # Network plot 의 layout # "spring" also available
labelLength = 10, # label length
repulsion = 0.84,
cexNodes = 0.9, # size of node
cexHubs = 1.1, # Size of the central node among nodes
colorVec = phylcol, # edge color
posCol = "darkturquoise", # positive correlation edge color
negCol = "orange", # negative correlation edge color
cexLabels = 1.8, # node label size
title1 = "Summer Top 50 Eukaryote genera (Spearman)", # Title
showTitle = TRUE, # Show title
cexTitle = 1.5)
# Get phyla names
taxtab <- as(tax_table(summer_genus_renamed), "matrix")
phyla <- as.factor(gsub("p__", "", taxtab[, "Phylum"]))
names(phyla) <- taxtab[, "Phylum"]
# index color
phylcol <- c("cyan", "blue3", "red", "lawngreen", "yellow", "deeppink")
phylcol_transp <- colToTransp(phylcol, 60)
# Phylum color index
legend(-1.4, 1.7, cex = 1, pt.cex = 1, title = "Phylum:",
legend=levels(phyla), col = phylcol_transp, bty = "n", pch = 12)
# correlation color index 0.4, 1.1,
legend(.8, 1.2, cex = 1, title = "estimated correlation:",
legend = c("+","-"), lty = 1, lwd = 3, col = c("darkturquoise","orange"),
bty = "n", horiz = FALSE,
bg = grey)
dev.off()
###############################################################################
# Wintertime
# 1.import data
ps <- readRDS("physeq1.rds")
winter = subset_samples(ps, Season %in% c("winter")) #Extract only winter microbiome samples
# 2. processing data
Taxa100 = names(sort(taxa_sums(winter), decreasing = TRUE)[1:50]) # Top 100 taxa extract
winter.1 = prune_taxa(Taxa100, winter)
# 3. filtering
winter_genus <- tax_glom(winter.1, taxrank = "Genus") # Agglomerate to genus level
taxtabwinter <- as(tax_table(winter_genus), "matrix")
# 4. Rename non-unique names using rename function
winter_genus_renamed <- NetCoMi::renameTaxa(winter_genus,
pat = "<name>",
substPat = "<name>_<subst_name>(<subst_R>)",
numDupli = "Genus")
# 5. NetConstruct Functionto construct a network using spearman correlation. Write measure
# as "spearman", normalization as "clr", and zero value as "multiRepl" method.
netwinter_genus <- netConstruct(winter_genus_renamed, # phyloseq
taxRank = "Genus",
filtTax = "highestFreq",
filtTaxPar = list(highestfreq = 50),
zeroMethod = "pseudo",
zeroPar = list(pseudocount = 0.5), # 기준 tax level
measure = "spearman", # spearman correlation 으로 계산해보자
normMethod = "clr", # transformation
sparsMethod = "threshold",
thresh = 0.3,
verbose = 3,
seed = 1234)
# taxRank = "Genus", # 기준 tax level
# measure = "spearman", # spearman correlation 으로 계산해보자
# zeroMethod = "multRepl",
# normMethod = "clr", # transformation
# sparsMethod = "threshold",
# thresh = 0.3,
# verbose = 3,
# seed = 1234)
winter_props_genus <- netAnalyze(netwinter_genus,
clustMethod = "cluster_fast_greedy",
weightDeg = TRUE,
normDeg = FALSE,
gcmHeat = FALSE,
hubPar = "eigenvector")
plotHeat(mat = winter_props_genus$graphletLCC$gcm1,
pmat = winter_props_genus$graphletLCC$pAdjust1,
type = "mixed",
title = "GCM",
colorLim = c(-1, 1),
mar = c(2, 0, 2, 0))
# 6. Information of the created network
summary(winter_props_genus, numbNodes = 5L)
# 7. plot layout setting up
graph5 <- igraph::graph_from_adjacency_matrix(netwinter_genus$adjaMat1,
weighted = TRUE)
set.seed(42) # fixing the shape
lay_fr <- igraph::layout_with_fr(graph5)
rownames(lay_fr) <- rownames(netwinter_genus$adjaMat1) # Row names of the layout matrix must match the node names
# 8. Drawing a network plot 그리기
png("euks_winter_top50_network.png", res = 300, units = 'in', width = 10, height = 8)
plot(winter_props_genus, # Network Ingredients
layout = "spring", # Network plot 의 layout # "spring" also available
labelLength = 10, # label length
cexNodes = 0.9, # size of node
cexHubs = 1.1, # Size of the central node among nodes
colorVec = phylcol, # edge color
posCol = "darkturquoise", # positive correlation edge color
negCol = "orange", # negative correlation edge color
cexLabels = 1.8, # node label size
title1 = "Winter Eukaryote genera (Spearman)", # Title
showTitle = TRUE, # Show title
cexTitle = 1.5, # Title size
mar = c(5,5,5,5))
# Get phyla names
taxtab <- as(tax_table(winter_genus_renamed), "matrix")
phyla <- as.factor(gsub("p__", "", taxtab[, "Phylum"]))
names(phyla) <- taxtab[, "Phylum"]
# index color
phylcol <- c("cyan", "blue3", "red", "lawngreen", "yellow", "deeppink")
phylcol_transp <- colToTransp(phylcol, 60)
# Phylum color index
legend(-1.1, 1.1, cex = 1, pt.cex = 1, title = "Phylum:",
legend=levels(phyla), col = phylcol_transp, bty = "n", pch = 16)
# correlation color index 0.4, 1.1,
legend(.7, -.3, cex = 1, title = "estimated correlation:",
legend = c("+","-"), lty = 1, lwd = 3, col = c("darkturquoise","orange"),
bty = "n", horiz = FALSE,
bg = grey)
dev.off()
###############################################################################
# Summer versus winter eukaryotic networks
ps <- readRDS("physeq1.rds")
ps.pt <- ps %>%
subset_samples(Season %in% c("summer", "winter")) # 19083 taxa and 12 samples
# zero taxa 제거
ps.f = prune_samples(sample_sums(ps.pt)>0, ps.pt)
ps.f = prune_taxa(rowSums(otu_table(ps.f)) > 0, ps.f) # 3090 taxa and 12 samples
# Agglomerate to genus level
ps_genus <- tax_glom(ps.f, taxrank = "Family")
# Taxonomic table
taxtab <- as(tax_table(ps_genus), "matrix")
# necomi 형식으로 변환 및 Genus level 설정
ps_genus_renamed <- NetCoMi::renameTaxa(ps_genus,
pat = "<name>",
substPat = "<name>_<subst_name>(<subst_R>)",
numDupli = "Family")
# 각 부위에 대한 샘플 추출
ps_genus_renamed_summer <- phyloseq::subset_samples(ps_genus_renamed,
Season == "summer")
ps_genus_renamed_winter <- phyloseq::subset_samples(ps_genus_renamed,
Season == "winter")
ps_genus.sparcc <- netConstruct(
data = ps_genus_renamed_summer,
data2 = ps_genus_renamed_winter,
filtTax = "highestFreq", # Top을 뽑는 기준
filtTaxPar = list(highestFreq = 50), # Top 100
taxRank = "Family", # Genus level
measure = "sparcc", # sparcc
measurePar = list(nlambda=20, rep.num=10),
normMethod = "clr", # transformation
zeroMethod = "none", # zero 값 보정
sparsMethod = "threshold",
adjust = "adaptBH", # p-value 값 보정
thresh = 0.3,
dissFunc = "signed",
verbose = 2,
seed = 42)
ps_props_sparcc <- netAnalyze(ps_genus.sparcc,
clustMethod = "cluster_fast_greedy",
hubPar = "eigenvector", # Hub node 판별방법
normDeg = FALSE)
summary(ps_props_sparcc, groupNames = c("summer", "winter"))
# Compute layout
ps_graph <- igraph::graph_from_adjacency_matrix(ps_genus.sparcc$adjaMat1,
weighted = TRUE)
set.seed(42)
ps_lay_fr <- igraph::layout_with_fr(ps_graph)
# Row names of the layout matrix must match the node names
rownames(ps_lay_fr) <- rownames(ps_genus.sparcc$adjaMat1)
# Get phyla names
taxtab <- as(tax_table(ps_genus_renamed), "matrix")
phyla <- as.factor(taxtab[, "Phylum"])
names(phyla) <- taxtab[, "Family"]
table(phyla)
# Define phylum colors
library(RColorBrewer)
phylcol <- c(brewer.pal(12, "Paired"), brewer.pal(12, "Set3"))
# Colors used in the legend should be equally transparent as in the plot
phylcol_transp <- colToTransp(phylcol, 60)
png("euks_family_50_summer_vs_network.png", res = 300, units = 'in', width = 10, height = 8)
plot(ps_props_sparcc,
sameLayout = TRUE,
#nodeColor = "cluster",
#nodeSize = "mclr",
#labelScale = FALSE,
nodeSize = "eigenvector", # "clr",
nodeColor = "cluster", #"feature",
#labelLength = 8, # label length
repulsion = 0.85,
#layoutGroup = "union",
cexNodes = 1.5, # size of node
cexHubs = 1.1, # Size of the central node among nodes
featVecCol = phyla,
colorVec = phylcol,
cexLabels = 2.5,
posCol = "darkturquoise",
negCol = "orange",
title1 = "Network on genus level with sparcc correlations",
showTitle = TRUE,
groupNames = c("Summer","Winter"))
#legend(-1.2, 10, cex = 1.5, pt.cex = 2.5, title = "Phylum:",
# legend=levels(phyla), col = phylcol_transp, bty = "n", pch = 16)
legend(0.7, 1.3, cex = 2.2, title = "estimated correlation:",
legend = c("+","-"), lty = 1, lwd = 3, col = c("darkturquoise","orange"),
bty = "n", horiz = TRUE)
dev.off()
comp_season <- netCompare(ps_props_sparcc,
permTest = FALSE,
verbose = FALSE,
seed = 123456)
summary(comp_season,
groupNames = c("summer", "winter"),
showCentr = c("degree", "between", "closeness"),
numbNodes = 5)
plot(ps_props_sparcc,
sameLayout = TRUE,
layoutGroup = "union",
#colorVec = col,
borderCol = "gray40",
nodeSize = "degree",
cexNodes = 0.9,
nodeSizeSpread = 3,
edgeTranspLow = 80,
edgeTranspHigh = 50,
groupNames = c("summer", "winter"),
showTitle = TRUE,
cexTitle = 2.8,
mar = c(1,1,3,1),
repulsion = 0.9,
labels = FALSE,
rmSingles = "inboth",
nodeFilter = "clustMin",
nodeFilterPar = 10,
nodeTransp = 50,
hubTransp = 30)library(devtools)
library('BiocManager')
library(NetCoMi)
library(limma)
library(igraph)
library(ggplot2)
library(phyloseq)
library(tidyverse)
library(ape)
###############################################################################
# Make a phyloseq object
# Bacteria
# ASV counts for each feature into a matrix
merged <- read.csv('metadata_16S_bac.csv', header = T, stringsAsFactors = F)
otumat <- merged %>%
rename(otu = id) %>%
select(-Sequence,-Confidence,-Taxon)
rownames(otumat) <- otumat$otu
otumat <- otumat[,-1]
otumat <- as.matrix(otumat)
otumat
class(otumat)
otumatrans <- as.matrix(t(otumat))
# Taxonomy
taxonomy <- merged %>%
rename(otu = id) %>%
select(otu, Taxon) %>%
rename_all(toupper) %>%
separate(TAXON,
into = c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"),
sep = "; ",
fill = 'right') %>% # NA values on the right if it need to add information
mutate(Domain = str_replace(Domain, "d__", ""),
Phylum = str_replace(Phylum, "p__", ""),
Class = str_replace(Class, "c__", ""),
Order = str_replace(Order, "o__", ""),
Family = str_replace(Family, "f__", ""),
Genus = str_replace(Genus, "g__", ""),
Species = str_replace(Species, "s__", "")) %>%
filter(Domain == 'Bacteria') %>%
filter(!Family %in% c('Chloroplast', 'Mitochondria'))
taxonomy
rownames(taxonomy) <- taxonomy$OTU
taxmat <- taxonomy[,-1]
taxmat <- as.matrix(taxmat)
taxmat
class(taxmat)
taxmatrans <- as.matrix(t(taxmat))
# Combine into phyloseq object
OTU = otu_table(otumat, taxa_are_rows = TRUE)
OTU
TAX = tax_table(taxmat)
TAX
physeq = phyloseq(OTU, TAX)
physeq
plot_bar(physeq, fill = "phylum")
# transposed
OTUtrans = otu_table(otumatrans, taxa_are_rows = FALSE)
OTUtrans
TAXtrans = tax_table(taxmatrans)
TAXtrans
physeq1 = phyloseq(OTUtrans, TAX)
physeq1
plot_bar(physeq1, fill = "Phylum")
# Input sample metadata
env <- read.csv('Metadata_YB.csv', header = T, stringsAsFactors = F)
allenvnum <- env %>%
select(c(2:8,10:15)) %>%
filter(Sample.Type == 2) %>%
rename_all(~ gsub("_", " ", .)) %>%
rename('NO2+NO3' = 'No2.No3 umol L') %>%
rename_all(~ gsub("umol L", "", .)) %>%
rename_all(~ gsub("ODO mgL", "DO", .)) %>%
drop_na()
rownames(allenvnum) <- allenvnum$SampleID
allenvnum <- allenvnum[,c(-1,-2)]
sampledata = sample_data(allenvnum)
# Create a phylogentic tree
install.packages('ape')
library(ape)
random_tree = rtree(ntaxa(physeq), rooted=TRUE, tip.label=taxa_names(physeq))
plot(random_tree)
# Merge new data with current phyloseq object:
physeq1 = merge_phyloseq(physeq, sampledata, random_tree)
physeq1
plot_tree(physeq1, color="Season", label.tips="taxa_names", ladderize="left", plot.margin=0.3)
plot_heatmap(physeq1)
plot_heatmap(physeq1, taxa.label="phylum")
saveRDS(physeq1, "physeq1.rds")
###############################################################################
# Start the tutorial and incorporate my data
# Summer versus winter eukaryotic networks
ps <- readRDS("physeq1.rds")
ps.pt <- ps %>%
subset_samples(Season %in% c("summer", "winter")) # 19083 taxa and 12 samples
# zero taxa 제거
ps.f = prune_samples(sample_sums(ps.pt)>0, ps.pt)
ps.f = prune_taxa(rowSums(otu_table(ps.f)) > 0, ps.f) # 3090 taxa and 12 samples
# Agglomerate to genus level
ps_genus <- tax_glom(ps.f, taxrank = "Family")
# Taxonomic table
taxtab <- as(tax_table(ps_genus), "matrix")
# necomi 형식으로 변환 및 Genus level 설정
ps_genus_renamed <- NetCoMi::renameTaxa(ps_genus,
pat = "<name>",
substPat = "<name>_<subst_name>(<subst_R>)",
numDupli = "Family")
# 각 부위에 대한 샘플 추출
ps_genus_renamed_summer <- phyloseq::subset_samples(ps_genus_renamed,
Season == "summer")
ps_genus_renamed_winter <- phyloseq::subset_samples(ps_genus_renamed,
Season == "winter")
ps_genus.sparcc <- netConstruct(
data = ps_genus_renamed_summer,
data2 = ps_genus_renamed_winter,
filtTax = "highestFreq", # Top을 뽑는 기준
filtTaxPar = list(highestFreq = 50), # Top 100
taxRank = "Family", # Genus level
measure = "sparcc", # sparcc
measurePar = list(nlambda=20, rep.num=10),
normMethod = "clr", # transformation
zeroMethod = "none", # zero 값 보정
sparsMethod = "threshold",
adjust = "adaptBH", # p-value 값 보정
thresh = 0.3,
dissFunc = "signed",
verbose = 2,
seed = 42)
ps_props_sparcc <- netAnalyze(ps_genus.sparcc,
clustMethod = "cluster_fast_greedy",
hubPar = "eigenvector", # Hub node 판별방법
normDeg = FALSE)
summary(ps_props_sparcc, groupNames = c("summer", "winter"))
# Compute layout
ps_graph <- igraph::graph_from_adjacency_matrix(ps_genus.sparcc$adjaMat1,
weighted = TRUE)
set.seed(42)
ps_lay_fr <- igraph::layout_with_fr(ps_graph)
# Row names of the layout matrix must match the node names
rownames(ps_lay_fr) <- rownames(ps_genus.sparcc$adjaMat1)
# Get phyla names
taxtab <- as(tax_table(ps_genus_renamed), "matrix")
phyla <- as.factor(taxtab[, "Phylum"])
names(phyla) <- taxtab[, "Family"]
table(phyla)
# Define phylum colors
library(RColorBrewer)
phylcol <- c(brewer.pal(12, "Paired"), brewer.pal(12, "Set3"))
# Colors used in the legend should be equally transparent as in the plot
phylcol_transp <- colToTransp(phylcol, 60)
png("bac_family_50_summer_vs_network.png", res = 300, units = 'in', width = 10, height = 8)
plot(ps_props_sparcc,
sameLayout = TRUE,
#nodeColor = "cluster",
#nodeSize = "mclr",
#labelScale = FALSE,
nodeSize = "eigenvector", # "clr",
nodeColor = "cluster", #"feature",
#labelLength = 8, # label length
repulsion = 0.84,
cexNodes = 1.5, # size of node
cexHubs = 1.1, # Size of the central node among nodes
featVecCol = phyla,
colorVec = phylcol,
cexLabels = 2.5,
posCol = "darkturquoise",
negCol = "orange",
title1 = "Network on genus level with sparcc correlations",
showTitle = TRUE,
groupNames = c("Summer","Winter"))
#legend(-1.2, 10, cex = 1.5, pt.cex = 2.5, title = "Phylum:",
# legend=levels(phyla), col = phylcol_transp, bty = "n", pch = 16)
legend(0.7, 1.3, cex = 2.2, title = "estimated correlation:",
legend = c("+","-"), lty = 1, lwd = 3, col = c("darkturquoise","orange"),
bty = "n", horiz = TRUE)
dev.off()
comp_season <- netCompare(ps_props_sparcc,
permTest = FALSE,
verbose = FALSE,
seed = 123456)
summary(comp_season,
groupNames = c("summer", "winter"),
showCentr = c("degree", "between", "closeness"),
numbNodes = 5)