Skip to content

Latest commit

 

History

History
3168 lines (2515 loc) · 112 KB

File metadata and controls

3168 lines (2515 loc) · 112 KB

Yas Bay code

Author: Carly Moreno

March 30, 2025

New York University Abu Dhabi

_________________________________________________________________________________________________________

Environmental parameters

Correlation plot (Figure 1C)

install.packages("tidyverse")
install.packages('caroline')
install.packages("ggtext")
install.packages("wesanderson")
install.packages("factoextra")
install_github("kassambara/factoextra")
install.packages("missMDA")
install.packages("FactoMineR")
library(tidyverse)
library(readxl)
library(ggplot2)
library(ggtext)
library(RColorBrewer)
library(wesanderson)
library(caroline)
library(tidyverse)
library(dplyr)
library(vegan)
library(devtools)
library(factoextra)
library("FactoMineR")
library(missMDA)
library(Hmisc)
library(corrplot)

# Make an environmental correlation plot

# Using all the data
env <- read.csv('Metadata_YB.csv', header = T, stringsAsFactors = F) %>% 
  select(c(7:14)) %>% 
  rename_all(~ gsub("_", " ", .)) %>% 
  rename('NO2+NO3' = 'No2.No3 umol L') %>% 
  rename_all(~ gsub("umol L", "", .)) %>% 
  rename_all(~ gsub("ODO mgL", "DO", .)) %>% 
  select(!3)
  
# Correlation matrix 
ccc <- cor(env, method = 'pearson', use = 'complete.obs')

# Correlation coefficients and the p-values of the correlation for all possible pairs
res <- rcorr(as.matrix(ccc), type = 'pearson')

# Customize the color of the correlogram
color <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
COL2(diverging = c("RdBu", "BrBG", "PiYG", "PRGn", "PuOr", "RdYlBu"), n = 200)

png(filename = "CorrelationSelectedEnvReversedFinal_nopH_12pt.png", res = 300, units = 'in', width = 7, height = 7)
g <- corrplot(ccc, method="color", col=rev(color(200)),  
         type = "upper", 
         p.mat = res$P, 
         tl.col = "black", 
         tl.srt = 45,
         tl.cex = 1.4,
         cl.cex = 1.4,
         cl.align.text ='l', 
         sig.level = c(0.001, 0.01, 0.05), pch.cex = 1.75,
         insig = 'label_sig', pch.col = 'grey20',
         diag = FALSE) 
dev.off()

Environmental PCA-biplot (Figure 1D)

# Using all the data, clean it up 
env <- read.csv('Metadata_YB.csv', header = T, stringsAsFactors = F) 

allenvnum <- env %>% 
  select(c(2:4,5,6,7,8,10:15)) %>% 
  filter(Sample.Type == 2) %>% 
  rename_all(~ gsub("_", " ", .)) %>% 
  rename('NO2+NO3' = 'No2.No3 umol L') %>% 
  rename_all(~ gsub("umol L", "", .)) %>% 
  rename_all(~ gsub("ODO mgL", "DO", .)) %>% 
  drop_na()

# The matrix
allenv <- allenvnum %>% 
  select(c(6:13))

# Compute PCA 
env.pca <- prcomp(allenv, scale = T)
summary(env.pca)

# Visualize eigenvalues (scree plot). Show the percentage of variances explained by each principal component.
fviz_eig(env.pca)

# Graph of individuals. Individuals with a similar profile are grouped together.
fviz_pca_ind(env.pca,
             col.ind = "cos2", # Color by the quality of representation
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE     # Avoid text overlapping
)

# Graph of variables. Positive correlated variables point to the same side of the plot. 
# Negative correlated variables point to opposite sides of the graph.

fviz_pca_var(env.pca,
             col.var = "contrib", # Color by contributions to the PC
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE     # Avoid text overlapping
)

# Biplot of individuals and variables
fviz_pca_biplot(env.pca, repel = TRUE,
                col.var = "#2E9FDF", # Variables color
                col.ind = "#696969"  # Individuals color
)

# PCA results
# Eigenvalues
eig.val <- get_eigenvalue(env.pca)
eig.val

# Results for Variables
res.var <- get_pca_var(env.pca)
res.var$coord          # Coordinates
res.var$contrib        # Contributions to the PCs
res.var$cos2           # Quality of representation 
# Results for individuals
res.ind <- get_pca_ind(env.pca)
res.ind$coord          # Coordinates
res.ind$contrib        # Contributions to the PCs
res.ind$cos2           # Quality of representation 

var <- get_pca_var(env.pca)

# Color individuals by groups or some qualitative variable

groups <- as.factor(allenvnum$Season)

# Set the theme for all the following plots.
theme_set(theme_bw(base_size = 14))

png("PCAbiplot_EnvironmentalFactors_12pt.png",  width = 2200, height = 1800, res = 300)
g <- fviz_pca_biplot(env.pca, 
                     # Individuals (stations)
                     geom.ind = 'point', 
                     pointsize = 2,
                     col.ind = groups,
                     palette = c('#696969',"#00AFBB", "#FC4E07",  'darkgreen') ,
                     addEllipses = TRUE,                               # Concentration ellipses 
                     ellipse.type = "confidence",        # ellipse.level = 0.95,
                     
                     # Variable (environmental variable)
                     geom.var = c('arrow', 'text'),
                     labelsize = 4,
                     repel = T,                                        # Avoid label overplotting
                     #label = 5,
                     ggtheme = theme(axis.text=element_text(size=16.5), 
                                     axis.title=element_text(size=12),
                                     legend.text=element_text(size=16.5),
                                     axis.title.x = element_text(size = 16.5),
                                     axis.title.y = element_text(size = 16.5))) 
print(g)
dev.off()

# PerMANOVA - partitioning the euclidean distance matrix by Season to determine if the differences are statistically different

adonis2(allenv ~ Season, data = allenvnum, method='eu')

_________________________________________________________________________________________________________

Cell counts

Diatom cell counts (Figure 2A)

cells <- read.csv('cell_counts.csv', header=T, stringsAsFactors=F) %>% 
  rename('Pseudo nitzschia sp.' = 'Pseudo.nitzschia_sp.')
  
diatoms <- as_tibble(cells) %>%  
  rename(date=Sampling_Date) %>% 
  rename_all(~ gsub("_", " ", .)) %>% 
  select(date, 'Pseudo nitzschia sp.','Chaetoceros sp.','Skeletonema sp.', 'Rhizosolenia sp.', 'unidentified diatoms',"Cyanobacteria","Dinoflagellates") %>% 
  pivot_longer(-date, names_to='taxon', values_to='counts')

diatoms$date <- as.Date(diatoms$date, "%d/%m/%Y") 
diatoms$date <- as.character(diatoms$date)
diatoms
datelist <- as.factor(diatoms$date)
datelist

png("diatom_with_phyto_totalCellcounts_12pt.png", res=300, units='in', width=12, height=9)
g <- ggplot(diatoms, aes(x=rev(date), y=counts, fill=taxon)) +
  geom_col(color='black', width=0.7) +
  coord_flip() + 
  labs(x="Date",
       y="Total counts (cells/mL)") +
  theme_classic() + 
  scale_fill_manual(name=NULL,
                    breaks=c("Cyanobacteria","Dinoflagellates",'Pseudo nitzschia sp.','Chaetoceros sp.','Skeletonema sp.', 'Rhizosolenia sp.', 'unidentified diatoms'),
                    values=c('#771155',  '#114477', '#117777','#774411', 'darkgrey','#88CCAA', '#777711', 
                               '#AAAA44', '#DDDD77',  '#AA7744', '#DDAA77', '#771122', 
                               '#AA4455')) +
  scale_x_discrete(breaks=datelist,
                   labels=rev(datelist)) +
  scale_y_continuous(expand=c(0,0)) +
  theme(legend.text=element_markdown(size = 14),
        legend.key.size=unit(13, "pt"),
        strip.background=element_blank(), 
        strip.placement='outside',
        axis.text.x=element_text(angle=0, size = 14), 
        axis.text.y = element_text(size = 14),
        axis.title.x = element_text(size = 16))
print(g)
dev.off()

ggsave("just_the_diatoms.tiff", width=6, height=5)

Synechoccus and heterotrophic bacteria (Supplemental figure 5A and B)

data <- read.csv('YasBay_FCM.csv', header = T, stringsAsFactors = F, na = "NA")  # included na = 'NA', to make it stored as an NA not a string

data$Date <- as.Date(data$Date, "%m/%d/%Y") 
metadata$date <- as.character(metadata$date)

# Plot the data
ggplot(data, aes(x = Date, y = Synechococcus)) +
  geom_line(color = "blue", size = 1) +  # Line graph
  geom_point(size = 2, color = 'blue') +  # Add points for emphasis
  scale_x_date(date_labels = "%b-%Y", date_breaks = "1 month") +  # Format x-axis as Month-Year
  labs(title = 'Synechoccus', x = "Date", y = 'cells/mL') +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 0, hjust = 0.5))  # Rotate x-axis labels

ggsave("SynechoccosYB.tiff", width = 6, height = 5)

# Plot the data
ggplot(data, aes(x = Date, y = Heterotrophic.Prokaryotes)) +
  geom_line(color = "darkorange", size = 1) +  # Line graph
  geom_point(size = 2, color = 'darkorange') +  # Add points for emphasis
  scale_x_date(date_labels = "%b-%Y", date_breaks = "1 month") +  # Format x-axis as Month-Year
  labs(title = 'Hetertrophic prokaryotes', x = "Date", y = 'cells/mL') +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 0, hjust = 0.5))  # Rotate x-axis labels


ggsave("HeterProksYB.tiff", width = 6, height = 5)

_________________________________________________________________________________________________________

Relative abundance data

Eukaryotic relative abundance (Figure 3A)

install.packages("tidyverse")
install.packages('caroline')
install.packages("ggtext")
install.packages("wesanderson")

library(tidyverse)
library(readxl)
library(ggplot2)
library(ggtext)
library(RColorBrewer)
library(wesanderson)
library(caroline)
library(dplyr)

# Import the metadata
metadata <- read.csv('metadata.csv', header=T, stringsAsFactors=F, na="NA") %>%   # included na='NA', to make it stored as an NA not a string
  select(SampleID, Sample.Type, Date) %>% 
  rename(Filter=Sample.Type, sample=SampleID) %>% 
  rename_all(tolower) %>% 
  filter(!date == '23/02/2024') %>% 
  drop_na(filter)

metadata %>% count(filter)
metadata$date <- as.Date(metadata$date, "%d/%m/%Y") 
metadata$date <- as.character(metadata$date)
metadata

datelist <- metadata$date

# ASV counts for each feature
merged <- read.csv('metadata_18S.csv', header=T, stringsAsFactors=F) 

otu_counts <- as_tibble(merged %>% 
                          rename(otu=id) %>% 
                          select(-Sequence,-Confidence,-Taxon) %>% 
                          pivot_longer(-otu,names_to='sample',values_to='count'))

otu_counts

# Taxonomy for each feature
taxonomy <- as_tibble(merged) %>% 
  rename(otu=id) %>% 
  select(otu, Taxon) %>% 
  rename_all(tolower) %>% 
  separate(taxon,
           into=c("domain", "phylum", "class", "order", "family", "genus", "species"),
           sep="; ",
           fill='right') %>%    # NA values on the right if it need to add information
  mutate(domain=str_replace(domain, "d__", ""),
         phylum=str_replace(phylum, "p__", ""),
         class=str_replace(class, "c__", ""),
         order=str_replace(order, "o__", ""),
         family=str_replace(family, "f__", ""),
         genus=str_replace(genus, "g__", ""),
         species=str_replace(species, "s__", "")) %>%
  filter(domain == 'Eukaryota' | domain == 'Unassigned') %>%
  filter(!phylum %in% c('Vertebrata', 'Annelida', 'Arthropoda', 'Bryozoa' , 'Cnidaria', 'Ciliophora',
                        'Mollusca','Nemertea', 'Porifera', 'Echinodermata','Nematozoad','Rotifera',
                        'Tunicata','Gracilipodida','Entoprocta','Florideophycidae', 'Holozoa', 'Incertae_Sedis', 
                        'Labyrinthulomycetes','Bicosoecida'))

taxonomy

# Join the dataframes with inner_join and create a relative abundance column

otu_rel_abund <- as_tibble(inner_join(metadata, otu_counts, by='sample')) %>% #as_tibble(anti_join(metadata, otu_counts))    # to see what is missing from metadata
  inner_join(., taxonomy, by='otu') %>% 
  group_by(date) %>% 
  mutate(rel_abund=count / sum(count)) %>% 
  ungroup()

# Check to see if your relative abundances for each samples add up to 1
otu_rel_abund %>% 
  group_by(sample) %>% 
  dplyr::summarize(total=sum(rel_abund))

# Make the dataframe even tidyer

otu_rel_abund <- as_tibble(inner_join(metadata, otu_counts, by='sample')) %>% #as_tibble(anti_join(metadata, otu_counts))    # to see what is missing from metadata
  inner_join(., taxonomy, by='otu') %>% 
  group_by(date) %>% 
  mutate(rel_abund=count / sum(count)) %>% 
  ungroup() %>% 
  pivot_longer(cols=c("domain", "phylum", "class", "order", "family", "genus", "species"),
               names_to="level",
               values_to="taxon") %>% 
  mutate(filter=factor(filter,
                         levels=c("2", "3", "10")))

taxon_rel_abund <- otu_rel_abund %>% 
  filter(level == 'phylum') %>% 
  group_by(date, taxon) %>% 
  dplyr::summarize(rel_abund=100*sum(rel_abund), .groups='drop') %>%       # grouped phylums within each sample, gives us the rel abund of that phylum in that sample
  mutate(taxon=replace_na(taxon, "Unclassified Eukaryota")) %>% 
  mutate(taxon=str_replace(taxon,
                             "Unclassified (.*)", "Unclassified *\\1*"),
         taxon=str_replace(taxon, "^(\\S*)$", "*\\1*")) 

taxon_rel_abund %>% 
  group_by(taxon) %>% 
  dplyr::summarize(max=max(rel_abund)) %>%    # Group by taxon, and I want to look across the 3 filter sizes, so do summarize, and see what the maximum rel abund of each phylum across the three groups
  arrange(desc(max))

taxon_pool <- taxon_rel_abund %>% 
  group_by(date, taxon) %>% 
  dplyr::summarize(mean=mean(rel_abund), .groups='drop') %>%   # filter, taxon, and mean, 3 filters and 50 phyla
  group_by(taxon) %>%                                       # relative abund of all samples across filters
  dplyr::summarize(pool=max(mean) < 10,                           # If the maximum mean value within each of the filter size is less than one percent, it will be pooled
            mean=mean(mean),  # the average relative abundance across the filter sizes, we make this new column to used in the mutate factor taxon step below
            .groups='drop')    # Now summarize, and pool everything that has a less than 1% max mean realtive abundance across all taxa and filter sizes

test <- inner_join(taxon_rel_abund, taxon_pool, by='taxon') %>% 
  mutate(taxon=if_else(pool, "Other", taxon)) %>%   # if pool was true, we put Other, if it was false we put the taxon name
  group_by(date, taxon) %>%                         # group by taxon, and pool the different phyla, with each filter size
  dplyr::summarize(rel_abund=sum(rel_abund), 
            mean=min(mean),
            .groups='drop') %>% 
  mutate(taxon=factor(taxon),
         taxon=fct_reorder(taxon, mean, .desc=TRUE),
         taxon=fct_shift(taxon, n=1)) # reorder legend, by reordering the factors, go back to pool line

png("euk_phylum_bar_rel_abund_all_dates_12pt.png", res=300, units='in', width=11, height=16)
g <- ggplot(test, aes(x=rev(date), y=rel_abund, fill=taxon)) +
  geom_col(color='black', width=0.7) +
  coord_flip() + 
  labs(x="Date",
       y="Relative Abundance (%)") +
  theme_classic() + 
  scale_fill_manual(name=NULL,
                    breaks=c("*Ascomycota*","*Chlorophyta*","*Cryptophyceae*","*Diatomea*","*Dinoflagellata*", "*Ochrophyta*", 
                               "*Ochrophyta*", "*Peronosporomycetes*", "*Protalveolata*","*Prymnesiophyceae*",
                               "Unclassified *Eukaryota*", "Other"),
                    values=c('#771155', '#AA4488', '#CC99BB', '#114477', '#4477AA', '#77AADD', '#117777', 
                               '#44AAAA', '#77CCCC', '#117744', '#44AA77', 'darkgrey', '#777711', 
                               '#AAAA44', '#D1D1D1','#DDDD77', '#774411', '#AA7744', '#DDAA77', '#771122', 
                               '#AA4455')) +
  scale_x_discrete(breaks=datelist,
                   labels=rev(datelist)) +
  scale_y_continuous(expand=c(0.0,3), ) +
  theme(legend.position="bottom",
        legend.text=element_markdown(size = 23),
        legend.key.size=unit(16, "pt"),
        strip.background=element_blank(), 
        strip.placement='outside',
        axis.text.x=element_text(angle=0, size = 25),
        axis.text.y = element_text(size = 25),
        axis.title.x = element_text(size = 25)) +
  guides(fill = guide_legend(nrow = 5, byrow = TRUE))

print(g)
dev.off()

Bacteria relative abundance (Figure 3B)

# Import the metadata
metadata <- read.csv('metadata.csv', header = T, stringsAsFactors = F, na = "NA") %>%   # included na = 'NA', to make it stored as an NA not a string
  select(SampleID, Sample.Type, Date) %>% 
  rename(Filter = Sample.Type, sample = SampleID) %>% 
  rename_all(tolower) %>% 
  drop_na(filter)

metadata %>% count(filter)
metadata$date <- as.Date(metadata$date, "%d/%m/%Y") 
metadata$date <- as.character(metadata$date)
metadata

datelist <- metadata$date

# ASV counts for each feature
merged <- read.csv('metadata_16S_bac.csv', header = T, stringsAsFactors = F) 

otu_counts <- as_tibble(merged %>% 
                          rename(otu = id) %>% 
                          select(-Sequence,-Confidence,-Taxon) %>% 
                          pivot_longer(-otu,names_to = 'sample',values_to = 'count'))

otu_counts

# Taxonomy for each feature
taxonomy <- as_tibble(merged) %>% 
  rename(otu = id) %>% 
  select(otu, Taxon) %>% 
  rename_all(tolower) %>% 
  separate(taxon,
           into = c("domain", "phylum", "class", "order", "family", "genus", "species"),
           sep = "; ",
           fill = 'right') %>%    # NA values on the right if it need to add information
  mutate(domain = str_replace(domain, "d__", ""),
         phylum = str_replace(phylum, "p__", ""),
         class = str_replace(class, "c__", ""),
         order = str_replace(order, "o__", ""),
         family = str_replace(family, "f__", ""),
         genus = str_replace(genus, "g__", ""),
         species = str_replace(species, "s__", "")) %>% 
  filter(domain == 'Bacteria' | domain == 'Unassigned')

taxonomy

# Join the dataframes with inner_join and create a relative abundance column

otu_rel_abund <- as_tibble(inner_join(metadata, otu_counts, by='sample')) %>% #as_tibble(anti_join(metadata, otu_counts))    # to see what is missing from metadata
  inner_join(., taxonomy, by = 'otu') %>% 
  group_by(date) %>% 
  mutate(rel_abund = count / sum(count)) %>% 
  ungroup()

# Check to see if your relative abundances for each samples add up to 1
otu_rel_abund %>% 
  group_by(date) %>% 
  dplyr::summarize(total = sum(rel_abund))

# Make the dataframe even tidyer

otu_rel_abund <- as_tibble(inner_join(metadata, otu_counts, by='sample')) %>% #as_tibble(anti_join(metadata, otu_counts))    # to see what is missing from metadata
  inner_join(., taxonomy, by = 'otu') %>% 
  group_by(date) %>% 
  mutate(rel_abund = count / sum(count)) %>% 
  ungroup() %>% 
  pivot_longer(cols = c("domain", "phylum", "class", "order", "family", "genus", "species"),
               names_to = "level",
               values_to = "taxon") %>% 
  mutate(filter = factor(filter,
                         levels = c("2", "3", "10")))

taxon_rel_abund <- otu_rel_abund %>% 
  filter(level == 'phylum') %>% 
  group_by(date, taxon) %>% 
  dplyr::summarize(rel_abund = 100*sum(rel_abund), .groups = 'drop') %>%       # grouped phylums within each sample, gives us the rel abund of that phylum in that sample
  mutate(taxon = replace_na(taxon, "Unclassified Bacteria")) %>% 
  mutate(taxon = str_replace(taxon,
                             "Unclassified (.*)", "Unclassified *\\1*"),
         taxon = str_replace(taxon, "^(\\S*)$", "*\\1*")) 

taxon_rel_abund %>% 
  group_by(taxon) %>% 
  dplyr::summarize(max = max(rel_abund)) %>%    # Group by taxon, and I want to look across the 3 filter sizes, so do summarize, and see what the maximum rel abund of each phylum across the three groups
  arrange(desc(max))

taxon_pool <- taxon_rel_abund %>% 
  group_by(date, taxon) %>% 
  dplyr::summarize(mean = mean(rel_abund), .groups = 'drop') %>%   # filter, taxon, and mean, 3 filters and 50 phyla
  group_by(taxon) %>%                                       # relative abund of all samples across filters
  dplyr::summarize(pool = max(mean) < 1,                           # If the maximum mean value within each of the filter size is less than one percent, it will be pooled
            mean = mean(mean),  # the average relative abundance across the filter sizes, we make this new column to used in the mutate factor taxon step below
            .groups = 'drop')    # Now summarize, and pool everything that has a less than 1% max mean realtive abundance across all taxa and filter sizes

test <- inner_join(taxon_rel_abund, taxon_pool, by = 'taxon') %>% 
  mutate(taxon = if_else(pool, "Other", taxon)) %>%   # if pool was true, we put Other, if it was false we put the taxon name
  group_by(date, taxon) %>%                         # group by taxon, and pool the different phyla, with each filter size
  dplyr::summarize(rel_abund = sum(rel_abund), 
            mean = min(mean),
            .groups = 'drop') %>% 
  mutate(taxon = factor(taxon),
         taxon = fct_reorder(taxon, mean, .desc=TRUE),
         taxon = fct_shift(taxon, n = 1)) # reorder legend, by reordering the factors, go back to pool line

png("bacteria_phylum_bar_rel_abund_all_dates_12pt.png", res = 300, units = 'in', width=11, height=16)
g <- ggplot(test, aes(x = rev(date), y = rel_abund, fill = taxon)) +
  geom_col(color = 'black', width = 0.7) +
  coord_flip() + 
  labs(x = "Date",
       y = "Relative Abundance (%)") +
  theme_classic() + 
  scale_fill_manual(name = NULL,
                    breaks = c("*Actinobacteriota*","*Bacteroidota*","*Bdellovibrionota*","*Chloroflexi*",
                               "*Cyanobacteria*","*Desulfobacterota*", "*Firmicutes*", "*Patescibacteria*",
                               "*Planctomycetota*","*Proteobacteria*","*SAR324_clade(Marine_group_B)*","*Verrucomicrobiota*", "Other"),
                    values = c('#771155', '#AA4488',  '#114477', '#4477AA', '#77AADD', '#117777', 
                               '#774411', '#44AAAA', '#77CCCC', '#44AA77', '#AA7744','#88CCAA','darkgray', '#AAAA44','#777711', 
                               '#117744',  '#771122', '#CC99BB', '#DDDD77','#DDAA77', '#AA4455')) +
  scale_x_discrete(breaks = rev(datelist),
                   labels = datelist) +
  scale_y_continuous(expand=c(0.0,3), ) +
theme(legend.position="bottom",
      legend.text=element_markdown(size = 23),
      legend.key.size=unit(16, "pt"),
      strip.background=element_blank(), 
      strip.placement='outside',
      axis.text.x=element_text(angle=0, size = 25),
      axis.text.y = element_text(size = 25),
      axis.title.x = element_text(size = 25)) +
  guides(fill = guide_legend(nrow = 7, byrow = TRUE))

print(g)
dev.off()

Archaeal relative abundance (Supplemental figure 8)

# Import the metadata
metadata <- read.csv('metadata.csv', header = T, stringsAsFactors = F, na = "NA") %>%   # included na = 'NA', to make it stored as an NA not a string
  select(SampleID, Sample.Type, Date) %>% 
  rename(Filter = Sample.Type, sample = SampleID) %>% 
  rename_all(tolower) %>% 
  drop_na(filter)

metadata %>% count(filter)
metadata$date <- as.Date(metadata$date, "%d/%m/%Y") 
metadata$date <- as.character(metadata$date)
metadata

datelist <- metadata$date

# ASV counts for each feature
merged <- read.csv('metadata_16S_arc.csv', header = T, stringsAsFactors = F) 

otu_counts <- as_tibble(merged %>% 
                          rename(otu = id) %>% 
                          select(-Sequence,-Confidence,-Taxon) %>% 
                          pivot_longer(-otu,names_to = 'sample',values_to = 'count'))

otu_counts

# Taxonomy for each feature
taxonomy <- as_tibble(merged) %>% 
  rename(otu = id) %>% 
  select(otu, Taxon) %>% 
  rename_all(tolower) %>% 
  separate(taxon,
           into = c("domain", "phylum", "class", "order", "family", "genus", "species"),
           sep = "; ",
           fill = 'right') %>%    # NA values on the right if it need to add information
  mutate(domain = str_replace(domain, "d__", ""),
         phylum = str_replace(phylum, "p__", ""),
         class = str_replace(class, "c__", ""),
         order = str_replace(order, "o__", ""),
         family = str_replace(family, "f__", ""),
         genus = str_replace(genus, "g__", ""),
         species = str_replace(species, "s__", "")) %>% 
  filter(domain == 'Archaea' | domain == 'Unassigned')

taxonomy

# Join the dataframes with inner_join and create a relative abundance column

otu_rel_abund <- as_tibble(inner_join(metadata, otu_counts, by='sample')) %>% #as_tibble(anti_join(metadata, otu_counts))    # to see what is missing from metadata
  inner_join(., taxonomy, by = 'otu') %>% 
  group_by(date) %>% 
  mutate(rel_abund = count / sum(count)) %>% 
  ungroup()

# Check to see if your relative abundances for each samples add up to 1
otu_rel_abund %>% 
  group_by(date) %>% 
  summarize(total = sum(rel_abund))

# Make the dataframe even tidyer

otu_rel_abund <- as_tibble(inner_join(metadata, otu_counts, by='sample')) %>% #as_tibble(anti_join(metadata, otu_counts))    # to see what is missing from metadata
  inner_join(., taxonomy, by = 'otu') %>% 
  group_by(date) %>% 
  mutate(rel_abund = count / sum(count)) %>% 
  ungroup() %>% 
  pivot_longer(cols = c("domain", "phylum", "class", "order", "family", "genus", "species"),
               names_to = "level",
               values_to = "taxon") %>% 
  mutate(filter = factor(filter,
                         levels = c("2", "3", "10")))

taxon_rel_abund <- otu_rel_abund %>% 
  filter(level == 'genus') %>% 
  group_by(date, taxon) %>% 
  summarize(rel_abund = 100*sum(rel_abund), .groups = 'drop') %>%       # grouped phylums within each sample, gives us the rel abund of that phylum in that sample
  mutate(taxon = replace_na(taxon, "Unclassified Archaea")) %>% 
  mutate(taxon = str_replace(taxon,
                             "Unclassified (.*)", "Unclassified *\\1*"),
         taxon = str_replace(taxon, "^(\\S*)$", "*\\1*")) 

taxon_rel_abund %>% 
  group_by(taxon) %>% 
  summarize(max = max(rel_abund)) %>%    # Group by taxon, and I want to look across the 3 filter sizes, so do summarize, and see what the maximum rel abund of each phylum across the three groups
  arrange(desc(max))

taxon_pool <- taxon_rel_abund %>% 
  group_by(date, taxon) %>% 
  summarize(mean = mean(rel_abund), .groups = 'drop') %>%   # filter, taxon, and mean, 3 filters and 50 phyla
  group_by(taxon) %>%                                       # relative abund of all samples across filters
  summarize(pool = max(mean) < 15,                           # If the maximum mean value within each of the filter size is less than one percent, it will be pooled
            mean = mean(mean),  # the average relative abundance across the filter sizes, we make this new column to used in the mutate factor taxon step below
            .groups = 'drop')    # Now summarize, and pool everything that has a less than 1% max mean realtive abundance across all taxa and filter sizes

test <- inner_join(taxon_rel_abund, taxon_pool, by = 'taxon') %>% 
  mutate(taxon = if_else(pool, "Other", taxon)) %>%   # if pool was true, we put Other, if it was false we put the taxon name
  group_by(date, taxon) %>%                         # group by taxon, and pool the different phyla, with each filter size
  summarize(rel_abund = sum(rel_abund), 
            mean = min(mean),
            .groups = 'drop') %>% 
  mutate(taxon = factor(taxon),
         taxon = fct_reorder(taxon, mean, .desc=TRUE),
         taxon = fct_shift(taxon, n = 1)) # reorder legend, by reordering the factors, go back to pool line

png("archaea_bar_rel_abund_all_dates.png", res = 300, units = 'in', width = 9, height = 8)
g <- ggplot(test, aes(x = rev(date), y = rel_abund, fill = taxon)) +
  geom_col(color = 'black', width = 0.5) +
  coord_flip() + 
  labs(x = "Date",
       y = "Relative Abundance (%)") +
  theme_classic() + 
  scale_fill_manual(name = NULL,
                    breaks = c("*Bathyarchaeia*","*Halococcus*","*Marine_Group_II*","*Methanobrevibacter*",
                               "*Methanosaeta*","*Nitrososphaeraceae*", "Unclassified *Archaea*","*uncultured*", "Other"),
                    values = c('#771155', '#AA4488',  '#114477', '#4477AA', '#77AADD', '#117777', 
                               '#774411', '#44AAAA','darkgray', '#77CCCC', '#44AA77', '#AA7744','#88CCAA', '#AAAA44','#777711', 
                               '#117744',  '#771122', '#CC99BB', '#DDDD77','#DDAA77', '#AA4455')) +
  scale_x_discrete(breaks = datelist,
                   labels = rev(datelist)) +
  scale_y_continuous(expand = c(0,0)) +
  theme(legend.text = element_markdown(size = 12),
        legend.key.size = unit(14, "pt"),
        strip.background = element_blank(), 
        strip.placement = 'outside',
        axis.text.x = element_text(angle = 0, size = 12))

print(g)
dev.off()

dev.off() 
(device = "RStudioGD")

Diatom relative abundance (Supplemental figure 6)

# Import the metadata
metadata <- read.csv('metadata.csv', header=T, stringsAsFactors=F, na="NA") %>%   # included na='NA', to make it stored as an NA not a string
  select(SampleID, Sample.Type, Date) %>% 
  rename(Filter=Sample.Type, sample=SampleID) %>% 
  rename_all(tolower) %>% 
  drop_na(filter)

metadata %>% count(filter)
metadata$date <- as.Date(metadata$date, "%d/%m/%Y") 
metadata$date <- as.character(metadata$date)
metadata

datelist <- as.factor(metadata$date)
datelist <- datelist

# ASV counts for each feature
merged <- read.csv('metadata_18S.csv', header=T, stringsAsFactors=F) 

otu_counts <- as_tibble(merged %>% 
                          rename(otu=id) %>% 
                          select(-Sequence,-Confidence,-Taxon) %>% 
                          pivot_longer(-otu,names_to='sample',values_to='count'))

otu_counts

# Taxonomy for each feature
taxonomy <- as_tibble(merged) %>% 
  rename(otu=id) %>% 
  select(otu, Taxon) %>% 
  rename_all(tolower) %>% 
  separate(taxon,
           into=c("domain", "phylum", "class", "order", "family", "genus", "species"),
           sep="; ",
           fill='right') %>%    # NA values on the right if it need to add information
  mutate(domain=str_replace(domain, "d__", ""),
         phylum=str_replace(phylum, "p__", ""),
         class=str_replace(class, "c__", ""),
         order=str_replace(order, "o__", ""),
         family=str_replace(family, "f__", ""),
         genus=str_replace(genus, "g__", ""),
         species=str_replace(species, "s__", "")) %>% 
  filter(phylum == 'Diatomea')

# Join the dataframes with inner_join and create a relative abundance column

otu_rel_abund <- as_tibble(inner_join(metadata, otu_counts, by='sample')) %>% #as_tibble(anti_join(metadata, otu_counts))    # to see what is missing from metadata
  inner_join(., taxonomy, by='otu') %>% 
  group_by(date) %>% 
  mutate(rel_abund=count / sum(count)) %>% 
  ungroup()

# Check to see if your relative abundances for each samples add up to 1
otu_rel_abund %>% 
  group_by(date) %>% 
  summarize(total=sum(rel_abund))

# Make the dataframe even tidyer

otu_rel_abund <- as_tibble(inner_join(metadata, otu_counts, by='sample')) %>% #as_tibble(anti_join(metadata, otu_counts))    # to see what is missing from metadata
  inner_join(., taxonomy, by='otu') %>% 
  group_by(date) %>% 
  mutate(rel_abund=count / sum(count)) %>% 
  ungroup() %>% 
  pivot_longer(cols=c("domain", "phylum", "class", "order", "family", "genus", "species"),
               names_to="level",
               values_to="taxon") %>% 
  mutate(filter=factor(filter,
                         levels=c("2", "3", "10")))

taxon_rel_abund <- otu_rel_abund %>% 
  filter(level == 'genus') %>% 
  group_by(date, taxon) %>% 
  summarize(rel_abund=100*sum(rel_abund), .groups='drop') %>%       # grouped phylums within each sample, gives us the rel abund of that phylum in that sample
  mutate(taxon=replace_na(taxon, "Unclassified Diatomea")) %>% 
  mutate(taxon=str_replace(taxon,
                             "Unclassified (.*)", "Unclassified *\\1*"),
         taxon=str_replace(taxon, "^(\\S*)$", "*\\1*")) 

taxon_rel_abund %>% 
  group_by(taxon) %>% 
  summarize(max=max(rel_abund)) %>%    # Group by taxon, and I want to look across the 3 filter sizes, so do summarize, and see what the maximum rel abund of each phylum across the three groups
  arrange(desc(max))

taxon_pool <- taxon_rel_abund %>% 
  group_by(date, taxon) %>% 
  summarize(mean=mean(rel_abund), .groups='drop') %>%   # filter, taxon, and mean, 3 filters and 50 phyla
  group_by(taxon) %>%                                       # relative abund of all samples across filters
  summarize(pool=max(mean) < 30,                           # If the maximum mean value within each of the filter size is less than one percent, it will be pooled
            mean=mean(mean),  # the average relative abundance across the filter sizes, we make this new column to used in the mutate factor taxon step below
            .groups='drop')    # Now summarize, and pool everything that has a less than 1% max mean realtive abundance across all taxa and filter sizes

test <- inner_join(taxon_rel_abund, taxon_pool, by='taxon') %>% 
  mutate(taxon=if_else(pool, "Other", taxon)) %>%   # if pool was true, we put Other, if it was false we put the taxon name
  group_by(date, taxon) %>%                         # group by taxon, and pool the different phyla, with each filter size
  summarize(rel_abund=sum(rel_abund), 
            mean=min(mean),
            .groups='drop') %>% 
  mutate(taxon=factor(taxon),
         taxon=fct_reorder(taxon, mean, .desc=TRUE),
         taxon=fct_shift(taxon, n=1)) # reorder legend, by reordering the factors, go back to pool line

png("diatom_phylum_bar_rel_abund_all_dates.png", res=300, units='in', width=9, height=8)
g <- ggplot(test, aes(x=rev(date), y=rel_abund, fill=taxon)) +
  geom_col(color='black', width=0.5) +
  coord_flip() + 
  labs(y="Relative Abundance (%)") +
  theme_classic() + 
  scale_fill_manual(name=NULL,
                    breaks=c("*Arcocellulus*","*Cerataulina*","*Chaetoceros*","*Cyclotella*",
                               "*Cylindrotheca*","*Leptocylindrus*","*Nitzschia*","*Pseudo-nitzschia*",
                               "*Skeletonema*","*Thalassiosira*","Unclassified *Diatomea*","Other"),
                   values=c('#771155', '#AA4488', '#CC99BB', '#114477', '#4477AA', '#77AADD', '#117777', 
                               '#44AAAA', '#77CCCC', '#117744', '#44AA77', '#D1D1D1','#88CCAA', '#AAAA44','#777711',  
                               '#DDDD77','#DDAA77', '#774411', '#AA7744',  '#771122', 
                               '#AA4455')) +
  scale_x_discrete(breaks=datelist,
                   labels=rev(datelist)) +
  scale_y_continuous(expand=c(0,0)) +
  theme(legend.text=element_markdown(size = 12),
        legend.key.size=unit(14, "pt"),
        strip.background=element_blank(), 
        strip.placement='outside',
        axis.text.x=element_text(angle=0, size = 12))
print(g)
dev.off()

Proteobactera relative abundance (Supplemental figure 7A)

# Import the metadata
metadata <- read.csv('metadata.csv', header = T, stringsAsFactors = F, na = "NA") %>%   # included na = 'NA', to make it stored as an NA not a string
  select(SampleID, Sample.Type, Date) %>% 
  rename(Filter = Sample.Type, sample = SampleID) %>% 
  rename_all(tolower) %>% 
  drop_na(filter)

metadata %>% count(filter)
metadata$date <- as.Date(metadata$date, "%d/%m/%Y") 
metadata$date <- as.character(metadata$date)
metadata

datelist <- metadata$date

# ASV counts for each feature
merged <- read.csv('metadata_16S_bac.csv', header = T, stringsAsFactors = F) 

otu_counts <- as_tibble(merged %>% 
                          rename(otu = id) %>% 
                          select(-Sequence,-Confidence,-Taxon) %>% 
                          pivot_longer(-otu,names_to = 'sample',values_to = 'count'))

otu_counts

# Taxonomy for each feature
taxonomy <- as_tibble(merged) %>% 
  rename(otu = id) %>% 
  select(otu, Taxon) %>% 
  rename_all(tolower) %>% 
  separate(taxon,
           into = c("domain", "phylum", "class", "order", "family", "genus", "species"),
           sep = "; ",
           fill = 'right') %>%    # NA values on the right if it need to add information
  mutate(domain = str_replace(domain, "d__", ""),
         phylum = str_replace(phylum, "p__", ""),
         class = str_replace(class, "c__", ""),
         order = str_replace(order, "o__", ""),
         family = str_replace(family, "f__", ""),
         genus = str_replace(genus, "g__", ""),
         species = str_replace(species, "s__", "")) %>% 
  filter(phylum == 'Proteobacteria' | phylum == 'Unassigned')

taxonomy

# Join the dataframes with inner_join and create a relative abundance column

otu_rel_abund <- as_tibble(inner_join(metadata, otu_counts, by='sample')) %>% #as_tibble(anti_join(metadata, otu_counts))    # to see what is missing from metadata
  inner_join(., taxonomy, by = 'otu') %>% 
  group_by(date) %>% 
  mutate(rel_abund = count / sum(count)) %>% 
  ungroup()

# Check to see if your relative abundances for each samples add up to 1
otu_rel_abund %>% 
  group_by(date) %>% 
  summarize(total = sum(rel_abund))

# Make the dataframe even tidyer

otu_rel_abund <- as_tibble(inner_join(metadata, otu_counts, by='sample')) %>% #as_tibble(anti_join(metadata, otu_counts))    # to see what is missing from metadata
  inner_join(., taxonomy, by = 'otu') %>% 
  group_by(date) %>% 
  mutate(rel_abund = count / sum(count)) %>% 
  ungroup() %>% 
  pivot_longer(cols = c("domain", "phylum", "class", "order", "family", "genus", "species"),
               names_to = "level",
               values_to = "taxon") %>% 
  mutate(filter = factor(filter,
                         levels = c("2", "3", "10")))

taxon_rel_abund <- otu_rel_abund %>% 
  filter(level == 'family') %>% 
  group_by(date, taxon) %>% 
  summarize(rel_abund = 100*sum(rel_abund), .groups = 'drop') %>%       # grouped phylums within each sample, gives us the rel abund of that phylum in that sample
  mutate(taxon = replace_na(taxon, "Unclassified Bacteria")) %>% 
  mutate(taxon = str_replace(taxon,
                             "Unclassified (.*)", "Unclassified *\\1*"),
         taxon = str_replace(taxon, "^(\\S*)$", "*\\1*")) 

taxon_rel_abund %>% 
  group_by(taxon) %>% 
  summarize(max = max(rel_abund)) %>%    # Group by taxon, and I want to look across the 3 filter sizes, so do summarize, and see what the maximum rel abund of each phylum across the three groups
  arrange(desc(max))

taxon_pool <- taxon_rel_abund %>% 
  group_by(date, taxon) %>% 
  summarize(mean = mean(rel_abund), .groups = 'drop') %>%   # filter, taxon, and mean, 3 filters and 50 phyla
  group_by(taxon) %>%                                       # relative abund of all samples across filters
  summarize(pool = max(mean) < 10,                           # If the maximum mean value within each of the filter size is less than one percent, it will be pooled
            mean = mean(mean),  # the average relative abundance across the filter sizes, we make this new column to used in the mutate factor taxon step below
            .groups = 'drop')    # Now summarize, and pool everything that has a less than 1% max mean realtive abundance across all taxa and filter sizes


test <- inner_join(taxon_rel_abund, taxon_pool, by = 'taxon') %>% 
  mutate(taxon = if_else(pool, "Other", taxon)) %>%   # if pool was true, we put Other, if it was false we put the taxon name
  group_by(date, taxon) %>%                         # group by taxon, and pool the different phyla, with each filter size
  summarize(rel_abund = sum(rel_abund), 
            mean = min(mean),
            .groups = 'drop') %>% 
  mutate(taxon = factor(taxon),
         taxon = fct_reorder(taxon, mean, .desc=TRUE),
         taxon = fct_shift(taxon, n = 1)) # reorder legend, by reordering the factors, go back to pool line

png("bacteria_Proteobacteria_phylum_bar_rel_abund_all_dates_top12.png", res = 300, units = 'in', width = 9, height = 8)
g <- ggplot(test, aes(x = rev(date), y = rel_abund, fill = taxon)) +
  geom_col(color = 'black', width = 0.5) +
  coord_flip() + 
  labs(x = "Date",
       y = "Relative Abundance (%)") +
  theme_classic() + 
  scale_fill_manual(name = NULL,
                    breaks = c("*AEGEAN-169_marine_group*", "*Alcanivoracaceae1*", "*Algiphilaceae*", "*Alteromonadaceae*",
                               "*Arenicellaceae*","*Burkholderiaceae*", "*Cellvibrionaceae*", "*Nitriliruptoraceae*",
                               "*Clade_I*","*Colwelliaceae*","*Comamonadaceae*","*Halieaceae*",
                               "*Halomonadaceae*","*Hyphomicrobiaceae*","*Kangiellaceae*","*Legionellaceae*",
                               "*Marinobacteraceae*", "*Micavibrionaceae*","*Moraxellaceae*","*Pseudoalteromonadaceae*",
                               "*Pseudomonadaceae*","*Rhizobiaceae*", "*Rhodobacteraceae*", "*SAR116_clade*","*SAR86_clade*",
                               "*Salinisphaeraceae*","*Solimonadaceae*", "*Sphingomonadaceae*","*Spongiibacteraceae*",
                               "*Stappiaceae*", "*Vibrionaceae*","*Unknown_Family*", "Unclassified *Bacteria*", "Other"),
                    # breaks = c("*AEGEAN-169_marine_group*", "*Alcanivoracaceae1*", "*Algiphilaceae*", "*Alteromonadaceae*",
                    #            "*Arenicellaceae*","*Clade_I*","*Halieaceae*","*Halomonadaceae*",
                    #            "*Pseudoalteromonadaceae*", "*Rhodobacteraceae*", "*SAR86_clade*","*Solimonadaceae*", 
                    #            "*Sphingomonadaceae*", "*Stappiaceae*", "*Unknown_Family*","Other"),
                    values = c("#771155", "#AA4488", "#CC99BB", "#114477",
                               "#4477AA", "#77AADD", "#117777", "#44AAAA", 
                               "#77CCCC", "#117744", "#44AA77", "#88CCAA",
                               "#777711", "#AAAA44", "#DDDD77", "#774411", 
                               "#AA7744", "#DDAA77", "#771122", "#AA4455", 
                               "#DD7788", "#005A32", "purple", "green", 
                               "#CAE0AB", "#F7EE55", "#F6C141", "#F1932D", 
                               "#E8601C", "#DC050C","#99000D", "#000000",
                               "yellow")) +
  scale_x_discrete(breaks = datelist,
                   labels = rev(datelist)) +
  scale_y_continuous(expand = c(0,0)) +
  theme(legend.text = element_markdown(size = 12),
        legend.key.size = unit(14, "pt"),
        strip.background = element_blank(), 
        strip.placement = 'outside',
        axis.text.x = element_text(angle = 0, size = 12))

print(g)
dev.off()

dev.off() 
(device = "RStudioGD")

Bacteroidota relative abundance (Supplemental figure 7B)

# Import the metadata
metadata <- read.csv('metadata.csv', header = T, stringsAsFactors = F, na = "NA") %>%   # included na = 'NA', to make it stored as an NA not a string
  select(SampleID, Sample.Type, Date) %>% 
  rename(Filter = Sample.Type, sample = SampleID) %>% 
  rename_all(tolower) %>% 
  drop_na(filter)

metadata %>% count(filter)
metadata$date <- as.Date(metadata$date, "%d/%m/%Y") 
metadata$date <- as.character(metadata$date)
metadata

datelist <- metadata$date

# ASV counts for each feature
merged <- read.csv('metadata_16S_bac.csv', header = T, stringsAsFactors = F) 

otu_counts <- as_tibble(merged %>% 
                          rename(otu = id) %>% 
                          select(-Sequence,-Confidence,-Taxon) %>% 
                          pivot_longer(-otu,names_to = 'sample',values_to = 'count'))

otu_counts

# Taxonomy for each feature
taxonomy <- as_tibble(merged) %>% 
  rename(otu = id) %>% 
  select(otu, Taxon) %>% 
  rename_all(tolower) %>% 
  separate(taxon,
           into = c("domain", "phylum", "class", "order", "family", "genus", "species"),
           sep = "; ",
           fill = 'right') %>%    # NA values on the right if it need to add information
  mutate(domain = str_replace(domain, "d__", ""),
         phylum = str_replace(phylum, "p__", ""),
         class = str_replace(class, "c__", ""),
         order = str_replace(order, "o__", ""),
         family = str_replace(family, "f__", ""),
         genus = str_replace(genus, "g__", ""),
         species = str_replace(species, "s__", "")) %>% 
  filter(phylum == 'Bacteroidota' | domain == 'Unassigned')

taxonomy

# Join the dataframes with inner_join and create a relative abundance column

otu_rel_abund <- as_tibble(inner_join(metadata, otu_counts, by='sample')) %>% #as_tibble(anti_join(metadata, otu_counts))    # to see what is missing from metadata
  inner_join(., taxonomy, by = 'otu') %>% 
  group_by(date) %>% 
  mutate(rel_abund = count / sum(count)) %>% 
  ungroup()

# Check to see if your relative abundances for each samples add up to 1
otu_rel_abund %>% 
  group_by(date) %>% 
  summarize(total = sum(rel_abund))

# Make the dataframe even tidyer

otu_rel_abund <- as_tibble(inner_join(metadata, otu_counts, by='sample')) %>% #as_tibble(anti_join(metadata, otu_counts))    # to see what is missing from metadata
  inner_join(., taxonomy, by = 'otu') %>% 
  group_by(date) %>% 
  mutate(rel_abund = count / sum(count)) %>% 
  ungroup() %>% 
  pivot_longer(cols = c("domain", "phylum", "class", "order", "family", "genus", "species"),
               names_to = "level",
               values_to = "taxon") %>% 
  mutate(filter = factor(filter,
                         levels = c("2", "3", "10")))

taxon_rel_abund <- otu_rel_abund %>% 
  filter(level == 'family') %>% 
  group_by(date, taxon) %>% 
  summarize(rel_abund = 100*sum(rel_abund), .groups = 'drop') %>%       # grouped phylums within each sample, gives us the rel abund of that phylum in that sample
  mutate(taxon = replace_na(taxon, "Unclassified Bacteria")) %>% 
  mutate(taxon = str_replace(taxon,
                             "Unclassified (.*)", "Unclassified *\\1*"),
         taxon = str_replace(taxon, "^(\\S*)$", "*\\1*")) 

taxon_rel_abund %>% 
  group_by(taxon) %>% 
  summarize(max = max(rel_abund)) %>%    # Group by taxon, and I want to look across the 3 filter sizes, so do summarize, and see what the maximum rel abund of each phylum across the three groups
  arrange(desc(max))

taxon_pool <- taxon_rel_abund %>% 
  group_by(date, taxon) %>% 
  summarize(mean = mean(rel_abund), .groups = 'drop') %>%   # filter, taxon, and mean, 3 filters and 50 phyla
  group_by(taxon) %>%                                       # relative abund of all samples across filters
  summarize(pool = max(mean) < 2,                           # If the maximum mean value within each of the filter size is less than one percent, it will be pooled
            mean = mean(mean),  # the average relative abundance across the filter sizes, we make this new column to used in the mutate factor taxon step below
            .groups = 'drop')    # Now summarize, and pool everything that has a less than 1% max mean realtive abundance across all taxa and filter sizes

test <- inner_join(taxon_rel_abund, taxon_pool, by = 'taxon') %>% 
  mutate(taxon = if_else(pool, "Other", taxon)) %>%   # if pool was true, we put Other, if it was false we put the taxon name
  group_by(date, taxon) %>%                         # group by taxon, and pool the different phyla, with each filter size
  summarize(rel_abund = sum(rel_abund), 
            mean = min(mean),
            .groups = 'drop') %>% 
  mutate(taxon = factor(taxon),
         taxon = fct_reorder(taxon, mean, .desc=TRUE),
         taxon = fct_shift(taxon, n = 1)) # reorder legend, by reordering the factors, go back to pool line

png("bacteria_Bacteroidota_phylum_bar_rel_abund_all_dates.png", res = 300, units = 'in', width = 9, height = 8)
g <- ggplot(test, aes(x = rev(date), y = rel_abund, fill = taxon)) +
  geom_col(color = 'black', width = 0.5) +
  coord_flip() + 
  labs(x = "Date",
       y = "Relative Abundance (%)") +
  theme_classic() + 
  scale_fill_manual(name = NULL,
                    breaks = c("*Balneolaceae*","*Crocinitomicaceae*","*Cryomorphaceae*","*Cyclobacteriaceae*",
                               "*Flavobacteriaceae*","*NS11-12_marine_group*", "*NS9_marine_group*", "*Rhodothermaceae*",
                               "*Saprospiraceae**","*Schleiferiaceae**","*Weeksellaceae*","*uncultured*", "Other","Unclassified *Bacteria*"),
                    values = c('#771155', '#AA4488',  '#114477', '#4477AA', '#77AADD', '#117777', 
                               '#774411', '#44AAAA', '#77CCCC', '#44AA77', '#AA7744','#88CCAA','darkgray', '#AAAA44','#777711', 
                               '#117744',  '#771122', '#CC99BB', '#DDDD77','#DDAA77', '#AA4455')) +
  scale_x_discrete(breaks = datelist,
                   labels = rev(datelist)) +
  scale_y_continuous(expand = c(0,0)) +
  theme(legend.text = element_markdown(size = 12),
        legend.key.size = unit(14, "pt"),
        strip.background = element_blank(), 
        strip.placement = 'outside',
        axis.text.x = element_text(angle = 0, size = 12))

print(g)
dev.off()

_________________________________________________________________________________________________________

Community diversity and structure

Eukaryote diversity metrics (Figure 4A, B, C)

remotes::install_github("gavinsimpson/ggvegan")
library(remotes)
library(ggvegan)
library(vegan)
library(ggplot2)
library(tidyverse)

# Eukaryotes

# ASV counts for each feature into a matrix
merged <- read.csv('metadata_18S.csv', header=T, stringsAsFactors=F)

otumat <- merged %>%
  select(-Sequence,-Confidence,-Taxon) %>% 
  column_to_rownames(var="id") 

otumatdf <- as.data.frame(t(otumat))
otumatdf$SampleID <- rownames(otumatdf)

# Taxonomy
taxonomy <- merged %>%
  rename(otu=id) %>%
  select(otu, Taxon) %>%
  rename_all(toupper) %>%
  separate(TAXON,
           into=c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"),
           sep="; ",
           fill='right') %>%    # NA values on the right if it need to add information
  mutate(Domain=str_replace(Domain, "d__", ""),
         Phylum=str_replace(Phylum, "p__", ""),
         Class=str_replace(Class, "c__", ""),
         Order=str_replace(Order, "o__", ""),
         Family=str_replace(Family, "f__", ""),
         Genus=str_replace(Genus, "g__", ""),
         Species=str_replace(Species, "s__", "")) %>%
  mutate(Domain=str_replace(Domain, "Unassigned", "Unknown")) %>% 
  filter(Domain == 'Eukaryota' | Domain == 'Unassigned') %>%
  filter(!Phylum %in% c('Vertebrata', 'Annelida', 'Arthropoda', 'Bryozoa' , 'Cnidaria', 'Ciliophora',
                        'Mollusca','Nemertea', 'Porifera', 'Echinodermata','Nematozoad','Rotifera',
                        'Tunicata','Gracilipodida','Entoprocta','Florideophycidae', 'Holozoa', 
                        'Incertae_Sedis','Bicosoecida','Pavlovophyceae'))

taxonomy
rownames(taxonomy) <- taxonomy$OTU
taxmat <- taxonomy[,-1]
taxmat <- as.matrix(taxmat)
taxmat
class(taxmat)

taxmatrans <- as.matrix(t(taxmat))

# Input sample metadata

envs <- read_csv('Metadata_YB.csv') 

allenvnum <- envs %>% 
  select(c(2:8,10:15)) %>% 
  #filter(Sample.Type == 2) %>% 
  #rename_all(~ gsub("_", " ", .)) %>% 
  rename('NO2+NO3'='No2.No3_umol_L') %>% 
  rename_all(~ gsub("_umol_L", "", .)) %>% 
  rename_all(~ gsub("ODO_mgL", "DO", .))

# Merge the otumat datafram with the environment dataframe

datas <- merge(allenvnum, otumatdf, by='SampleID') %>% 
  na.omit()

rownames(datas) <- datas$SampleID
colnames(datas) <- make.names(colnames(datas))

# make Habitat an ordered variable
allenvnum$Season <- ordered(allenvnum$Season, c("spring", "summer", "autumn", "winter"))

abund.mat <- datas[,14:28118]

# store computed indices in a new data frame called 'indices'
index <- datas[,c("Season","Sample.Type")]

index$Richness <- rowSums(abund.mat > 0)
index$Shannon <- diversity(abund.mat) # shannon is default
index$Rarefied <- c(rarefy(abund.mat[1:28105,], sample=91))

png("euks_diversity_12pt.png", res=300, units='in', width=15, height=8)
par(mar=c(3,5,3,2), mfrow=c(1,2))
colors=c('#ED7D31','#548235','#C00000', '#4472C4')
boxplot(Richness ~ Season, data=index, boxwex=.7, col=colors, 
        cex.axis=2, cex.lab=2, ylab="Richness")
boxplot(Shannon ~ Season, data=index, boxwex=.7, col=colors, 
        cex.axis=2, cex.lab=2, ylab="Shannon diversity")
dev.off()

# try anova with linear model (lm) and aov model (preferred)
mod.richness <- lm(Richness ~ Season, data=index)
mod.shannon <- lm(Shannon ~ Season, data=index)

aov.richness <- aov(Richness ~ Season, data=index)
aov.shannon <- aov(Shannon ~ Season, data=index)

envs <- read_csv('Metadata_YB.csv') 

allenvnum <- envs %>% 
  select(c(2:8,10:15)) %>% 
  #filter(Sample.Type == 2) %>% 
  #rename_all(~ gsub("_", " ", .)) %>% 
  rename('NO2+NO3'='No2.No3_umol_L') %>% 
  rename_all(~ gsub("_umol_L", "", .)) %>% 
  rename_all(~ gsub("ODO_mgL", "DO", .))

# ASV counts for each feature into a matrix
merged <- read.csv('metadata_18S.csv', header=T, stringsAsFactors=F)

otumat <- merged %>%
  select(-Sequence,-Confidence,-Taxon) %>% 
  column_to_rownames(var="id") 

otumatdf <- as.data.frame(t(otumat))
dist <- vegdist(otumatdf,  method = "bray")

metaMDS(dist)



# Copy the very first part of the Randi Griffith tutorial for the NMDS

install.packages("remotes")
remotes::install_github("gavinsimpson/ggvegan")
library(remotes)
library(ggvegan)
library(vegan)
library(ggplot2)
library(tidyverse)

# Eukaryotes

# ASV counts for each feature into a matrix
merged <- read.csv('metadata_18S.csv', header=T, stringsAsFactors=F)

otumat <- merged %>%
  select(-Sequence,-Confidence,-Taxon) %>% 
  column_to_rownames(var="id") 

otumatdf <- as.data.frame(t(otumat))
otumatdf$SampleID <- rownames(otumatdf)

# Taxonomy
taxonomy <- merged %>%
  rename(otu=id) %>%
  select(otu, Taxon) %>%
  rename_all(toupper) %>%
  separate(TAXON,
           into=c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"),
           sep="; ",
           fill='right') %>%    # NA values on the right if it need to add information
  mutate(Domain=str_replace(Domain, "d__", ""),
         Phylum=str_replace(Phylum, "p__", ""),
         Class=str_replace(Class, "c__", ""),
         Order=str_replace(Order, "o__", ""),
         Family=str_replace(Family, "f__", ""),
         Genus=str_replace(Genus, "g__", ""),
         Species=str_replace(Species, "s__", "")) %>%
  mutate(Domain=str_replace(Domain, "Unassigned", "Unknown")) %>% 
  filter(Domain == 'Eukaryota' | Domain == 'Unassigned') %>%
  filter(!Phylum %in% c('Vertebrata', 'Annelida', 'Arthropoda', 'Bryozoa' , 'Cnidaria', 'Ciliophora',
                        'Mollusca','Nemertea', 'Porifera', 'Echinodermata','Nematozoad','Rotifera',
                        'Tunicata','Gracilipodida','Entoprocta','Florideophycidae', 'Holozoa', 
                        'Incertae_Sedis','Bicosoecida','Pavlovophyceae'))

taxonomy
rownames(taxonomy) <- taxonomy$OTU
taxmat <- taxonomy[,-1]
taxmat <- as.matrix(taxmat)
taxmat
class(taxmat)

taxmatrans <- as.matrix(t(taxmat))

# NMDS

envs <- read_csv('Metadata_YB.csv') 

allenvnum <- envs %>% 
  select(c(2:8,10:15)) %>% 
  #filter(Sample.Type == 2) %>% 
  #rename_all(~ gsub("_", " ", .)) %>% 
  rename('NO2+NO3'='No2.No3_umol_L') %>% 
  rename_all(~ gsub("_umol_L", "", .)) %>% 
  rename_all(~ gsub("ODO_mgL", "DO", .))

# Merge the otumat datafram with the environment dataframe

datas <- merge(allenvnum, otumatdf, by='SampleID') %>% 
  na.omit()

rownames(datas) <- datas$SampleID
colnames(datas) <- make.names(colnames(datas))

# The ASVs (rows = sample)
abund.mat <- datas[,14:28118]

# The predictor variables (rows = sample)
fijado <- datas[,6:13] %>% 
  na.omit() %>% 
  select(-'NO2.NO3')

# convert abund.mat to a matrix
m_com = as.matrix(abund.mat)

# Perform the NMDS ordination
set.seed(19870912)
#dist <- avgdist(m_com,  dmethod = "bray", sample = 2500)
dist <- vegdist(m_com,  method = "bray")
nmds = metaMDS(dist)
# nmds <- dist %>%
#   metaMDS(trace = F) %>%
#   ordiplot(type = "t") %>%
#   text("sites")
nmds_stress <- nmds$stress
# stress = 0.17

# Run the envfit function with our environmental data
en = envfit(nmds, fijado, permutations = 999, na.rm = TRUE)
en

# Plot with base R
plot(nmds)
plot(en)

# Extract the data scores for ggplot
data.scores = as.data.frame(scores(nmds))
data.scores$SampleID <- rownames(data.scores)
data.scores <- merge(data.scores, allenvnum, by = 'SampleID')

en_coord_cont = as.data.frame(scores(en, "vectors")) * ordiArrowMul(en)

png("NMDS_Euks.png", res=300, units='in', width=9, height=8)
ggplot(data.scores, aes(x = NMDS1, y = NMDS2)) +
  geom_point(data = data.scores, 
             aes(colour = as.factor(Season), shape = as.factor(Sample.Type)), 
             show.legend = T, size = 3) +
  scale_color_manual(name = "Seasons", 
                     values=colors,
                     labels = c('Autumn','Spring','Summer','Winter')) +
  scale_shape_manual(name = "Size fraction", 
                     values=c(15, 16, 17),
                     breaks = c('2', '3', '10'),
                     labels = c('0.2 um', '3.0 um', '10.0 um')) +
  geom_segment(data = en_coord_cont, 
               aes(x = 0, y = 0, 
                   xend = NMDS1*.6, yend = NMDS2*.6), 
               arrow = arrow(length = unit(.6, 'picas')), colour="red") +
  geom_text(data = en_coord_cont, aes(x = NMDS1*.63, y = NMDS2*.63), colour = "black", 
          fontface = "bold", label = row.names(en_coord_cont)) + 
  # stat_ellipse(data = data.scores,
  #              aes(color=as.factor(Season), show.legend = F)) +
  xlab("NMDS1") + 
  ylab("NMDS2") +
  theme_bw(base_size = 15) +
  theme(axis.text.y = element_text(color="black", size=17, angle=0),
        axis.text.x = element_text(color="black",  size=17, angle=0),
        axis.title.x = element_text(color="black", size=20, angle=0),
        axis.title.y = element_text(color="black", size=20, angle=90),
        legend.text=element_markdown(),
        legend.key.size=unit(15, "pt"),
        strip.background=element_blank(), 
        strip.placement='outside',
        panel.border = element_rect(colour = "black", fill = NA))

dev.off()

Bacterial diversity metrics (Figure 4A, B, C)

remotes::install_github("gavinsimpson/ggvegan")
library(remotes)
library(ggvegan)
library(vegan)
library(ggplot2)
library(tidyverse)

# Bacteria

# ASV counts for each feature into a matrix
merged <- read.csv('metadata_16S_bac.csv', header=T, stringsAsFactors=F)

otumat <- merged %>%
  select(-Sequence,-Confidence,-Taxon) %>% 
  column_to_rownames(var="id") 

otumatdf <- as.data.frame(t(otumat))
otumatdf$SampleID <- rownames(otumatdf)

# Taxonomy
taxonomy <- merged %>% 
  rename(otu = id) %>% 
  select(otu, Taxon) %>% 
  rename_all(tolower) %>% 
  separate(taxon,
           into = c("domain", "phylum", "class", "order", "family", "genus", "species"),
           sep = "; ",
           fill = 'right') %>%    # NA values on the right if it need to add information
  mutate(domain = str_replace(domain, "d__", ""),
         phylum = str_replace(phylum, "p__", ""),
         class = str_replace(class, "c__", ""),
         order = str_replace(order, "o__", ""),
         family = str_replace(family, "f__", ""),
         genus = str_replace(genus, "g__", ""),
         species = str_replace(species, "s__", "")) %>% 
  filter(domain == 'Bacteria' | domain == 'Unassigned')

taxonomy
rownames(taxonomy) <- taxonomy$otu
taxmat <- taxonomy[,-1]
taxmat <- as.matrix(taxmat)
taxmat
class(taxmat)

taxmatrans <- as.matrix(t(taxmat))

# Input sample metadata

envs <- read_csv('Metadata_YB.csv') 

allenvnum <- envs %>% 
  select(c(2:8,10:15)) %>% 
  #filter(Sample.Type == 2) %>% 
  #rename_all(~ gsub("_", " ", .)) %>% 
  rename('NO2+NO3'='No2.No3_umol_L') %>% 
  rename_all(~ gsub("_umol_L", "", .)) %>% 
  rename_all(~ gsub("ODO_mgL", "DO", .))


# Merge the otumat datafram with the environment dataframe

datas <- merge(allenvnum, otumatdf, by='SampleID') %>% 
  na.omit()

rownames(datas) <- datas$SampleID
colnames(datas) <- make.names(colnames(datas))

# make Habitat an ordered variable
allenvnum$Season <- ordered(allenvnum$Season, c("spring", "summer", "autumn", "winter"))

abund.mat <- datas[,14:83966]

# store computed indices in a new data frame called 'indices'
index <- datas[,c("Season","Sample.Type")]

index$Richness <- rowSums(abund.mat > 0)
index$Shannon <- diversity(abund.mat) # shannon is default
#index$Rarefied <- c(rarefy(abund.mat[1:28105,], sample=91))

png("bac_diversity_12pt.png", res=300, units='in', width=15, height=8)
par(mar=c(3,5,3,2), mfrow=c(1,2))
colors = c('#ED7D31','#548235','#C00000', '#4472C4')
boxplot(Richness ~ Season, data=index, boxwex=.7, col=colors, 
        cex.axis=2, cex.lab=2, ylab="Richness")
boxplot(Shannon ~ Season, data=index, boxwex=.7, col=colors, 
        cex.axis=2, cex.lab=2, ylab="Shannon diversity")
dev.off()

# try anova with linear model (lm) and aov model (preferred)
mod.richness <- lm(Richness ~ Season, data=index)
mod.shannon <- lm(Shannon ~ Season, data=index)

aov.richness <- aov(Richness ~ Season, data=index)
aov.shannon <- aov(Shannon ~ Season, data=index)

# NMDS
# Install and load the following packages
install.packages("vegan")
install.packages("ape")
install.packages("dplyr")

library(vegan)
library(ape)
library(dplyr)

# ASV counts for each feature into a matrix
merged <- read.csv('metadata_16S_bac.csv', header=T, stringsAsFactors=F)

otumat <- merged %>%
  select(-Sequence,-Confidence,-Taxon) %>% 
  column_to_rownames(var="id") 

otumatdf <- as.data.frame(t(otumat))
otumatdf$SampleID <- rownames(otumatdf)

# Taxonomy
taxonomy <- merged %>% 
  rename(otu = id) %>% 
  select(otu, Taxon) %>% 
  rename_all(tolower) %>% 
  separate(taxon,
           into = c("domain", "phylum", "class", "order", "family", "genus", "species"),
           sep = "; ",
           fill = 'right') %>%    # NA values on the right if it need to add information
  mutate(domain = str_replace(domain, "d__", ""),
         phylum = str_replace(phylum, "p__", ""),
         class = str_replace(class, "c__", ""),
         order = str_replace(order, "o__", ""),
         family = str_replace(family, "f__", ""),
         genus = str_replace(genus, "g__", ""),
         species = str_replace(species, "s__", "")) %>% 
  filter(domain == 'Bacteria' | domain == 'Unassigned')

taxonomy
rownames(taxonomy) <- taxonomy$otu
taxmat <- taxonomy[,-1]
taxmat <- as.matrix(taxmat)
taxmat
class(taxmat)

taxmatrans <- as.matrix(t(taxmat))

# Input sample metadata

envs <- read_csv('Metadata_YB.csv') 

allenvnum <- envs %>% 
  select(c(2:8,10:15)) %>% 
  #filter(Sample.Type == 2) %>% 
  #rename_all(~ gsub("_", " ", .)) %>% 
  rename('NO2+NO3'='No2.No3_umol_L') %>% 
  rename_all(~ gsub("_umol_L", "", .)) %>% 
  rename_all(~ gsub("ODO_mgL", "DO", .))

# Merge the otumat datafram with the environment dataframe

datas <- merge(allenvnum, otumatdf, by='SampleID') %>% 
  na.omit()

rownames(datas) <- datas$SampleID
colnames(datas) <- make.names(colnames(datas))

# make Habitat an ordered variable
allenvnum$Season <- ordered(allenvnum$Season, c("spring", "summer", "autumn", "winter"))

abund.mat <- datas[,14:83966]

# Merge the otumat datafram with the environment dataframe

datas <- merge(allenvnum, otumatdf, by='SampleID') %>% 
  na.omit()

rownames(datas) <- datas$SampleID
colnames(datas) <- make.names(colnames(datas))

# The ASVs (rows = sample)
abund.mat <- datas[,14:83966]

# The predictor variables (rows = sample)
fijado <- datas[,6:13] %>% 
  na.omit() %>% 
  select(-'NO2.NO3')

# convert abund.mat to a matrix
m_com = as.matrix(abund.mat)

# Perform the NMDS ordination
set.seed(19870912)
#dist <- avgdist(m_com,  dmethod = "bray", sample = 10000)
dist <- vegdist(m_com,  method = "bray")
nmds = metaMDS(dist)
# nmds <- dist %>%
#   metaMDS(trace = F) %>%
#   ordiplot(type = "t") %>%
#   text("sites")
nmds_stress <- nmds$stress

# Run the envfit function with our environmental data
en = envfit(nmds, fijado, permutations = 999, na.rm = TRUE)
en

# Plot with base R
plot(nmds)
plot(en)

# Extract the data scores for ggplot
data.scores = as.data.frame(scores(nmds))
#data.scores = as.data.frame(scores(nmds)$sites)
data.scores$SampleID <- rownames(data.scores)
data.scores <- merge(data.scores, allenvnum, by = 'SampleID')

en_coord_cont = as.data.frame(scores(en, "vectors")) * ordiArrowMul(en)

png("NMDS_Bac.png", res=300, units='in', width=9, height=8)
ggplot(data.scores, aes(x = NMDS1, y = NMDS2)) +
  geom_point(data = data.scores, 
             aes(colour = as.factor(Season), shape = as.factor(Sample.Type)), 
             show.legend = T, size = 3) +
  scale_color_manual(name = "Seasons", 
                     values=colors,
                     labels = c('Autumn','Spring','Summer','Winter')) +
  scale_shape_manual(name = "Size fraction", 
                     values=c(15, 16, 17),
                     breaks = c('2', '3', '10'),
                     labels = c('0.2 um', '3.0 um', '10.0 um')) +
  geom_segment(data = en_coord_cont, 
               aes(x = 0, y = 0, 
                   xend = NMDS1*.6, yend = NMDS2*.6), 
               arrow = arrow(length = unit(.6, 'picas')), colour="red") +
  geom_text(data = en_coord_cont, aes(x = NMDS1*.64, y = NMDS2*.64), colour = "black", 
            fontface = "bold", label = row.names(en_coord_cont)) + 
  xlab("NMDS1") + 
  ylab("NMDS2") +
  theme_bw(base_size = 15) +
  theme(axis.text.y = element_text(color="black", size=17, angle=0),
        axis.text.x = element_text(color="black",  size=17, angle=0),
        axis.title.x = element_text(color="black", size=20, angle=0),
        axis.title.y = element_text(color="black", size=20, angle=90),
        legend.text=element_markdown(),
        legend.key.size=unit(15, "pt"),
        strip.background=element_blank(), 
        strip.placement='outside',
        panel.border = element_rect(colour = "black", fill = NA))

dev.off()

Archaeal diversity metrics (Figure 4A, B, C)

remotes::install_github("gavinsimpson/ggvegan")
library(remotes)
library(ggvegan)
library(vegan)
library(ggplot2)
library(tidyverse)

# Eukaryotes

# ASV counts for each feature into a matrix
merged <- read.csv('metadata_16S_arc.csv', header=T, stringsAsFactors=F)

otumat <- merged %>%
  select(-Sequence,-Confidence,-Taxon) %>% 
  column_to_rownames(var="id") 

otumatdf <- as.data.frame(t(otumat))
otumatdf$SampleID <- rownames(otumatdf)

# Taxonomy
taxonomy <- merged %>% 
  rename(otu = id) %>% 
  select(otu, Taxon) %>% 
  rename_all(tolower) %>% 
  separate(taxon,
           into = c("domain", "phylum", "class", "order", "family", "genus", "species"),
           sep = "; ",
           fill = 'right') %>%    # NA values on the right if it need to add information
  mutate(domain = str_replace(domain, "d__", ""),
         phylum = str_replace(phylum, "p__", ""),
         class = str_replace(class, "c__", ""),
         order = str_replace(order, "o__", ""),
         family = str_replace(family, "f__", ""),
         genus = str_replace(genus, "g__", ""),
         species = str_replace(species, "s__", "")) %>% 
  filter(domain == 'Archaea' | domain == 'Unassigned')

taxonomy
rownames(taxonomy) <- taxonomy$OTU
taxmat <- taxonomy[,-1]
taxmat <- as.matrix(taxmat)
taxmat
class(taxmat)

taxmatrans <- as.matrix(t(taxmat))

# Input sample metadata

envs <- read_csv('Metadata_YB.csv') 

allenvnum <- envs %>% 
  select(c(2:8,10:15)) %>% 
  #filter(Sample.Type == 2) %>% 
  #rename_all(~ gsub("_", " ", .)) %>% 
  rename('NO2+NO3'='No2.No3_umol_L') %>% 
  rename_all(~ gsub("_umol_L", "", .)) %>% 
  rename_all(~ gsub("ODO_mgL", "DO", .))


# Merge the otumat datafram with the environment dataframe

datas <- merge(allenvnum, otumatdf, by='SampleID') %>% 
  na.omit()

rownames(datas) <- datas$SampleID
colnames(datas) <- make.names(colnames(datas))

# make Habitat an ordered variable
allenvnum$Season <- ordered(allenvnum$Season, c("spring", "summer", "autumn", "winter"))

abund.mat <- datas[,14:9586]

# store computed indices in a new data frame called 'indices'
index <- datas[,c("Season","Sample.Type")]

index$Richness <- rowSums(abund.mat > 0)
index$Shannon <- diversity(abund.mat) # shannon is default
index$Rarefied <- c(rarefy(abund.mat[1:28105,], sample=91))

png("arc_diversity.png", res=300, units='in', width=9, height=8)
par(mar=c(3,4,3,2), mfrow=c(2,2))
colors=c('#771155','#114477','#117777', '#AA7744')
boxplot(Richness ~ Season, data=index, boxwex=.7, col=colors, 
        cex.axis=1, ylab="Richness")
boxplot(Shannon ~ Season, data=index, boxwex=.7, col=colors, 
        cex.axis=1, ylab="Shannon diversity")
dev.off()

# try anova with linear model (lm) and aov model (preferred)
mod.richness <- lm(Richness ~ Season, data=index)
mod.shannon <- lm(Shannon ~ Season, data=index)

aov.richness <- aov(Richness ~ Season, data=index)
aov.shannon <- aov(Shannon ~ Season, data=index)

install.packages("vegan")
install.packages("ape")
install.packages("dplyr")

library(vegan)
library(ape)
library(dplyr)
library(tidyverse)


# ASV counts for each feature into a matrix
merged <- read.csv('metadata_16S_arc.csv', header=T, stringsAsFactors=F)

otumat <- merged %>%
  select(-Sequence,-Confidence,-Taxon) %>% 
  column_to_rownames(var="id") 

otumatdf <- as.data.frame(t(otumat))
otumatdf$SampleID <- rownames(otumatdf)

# Taxonomy
taxonomy <- merged %>% 
  rename(otu = id) %>% 
  select(otu, Taxon) %>% 
  rename_all(tolower) %>% 
  separate(taxon,
           into = c("domain", "phylum", "class", "order", "family", "genus", "species"),
           sep = "; ",
           fill = 'right') %>%    # NA values on the right if it need to add information
  mutate(domain = str_replace(domain, "d__", ""),
         phylum = str_replace(phylum, "p__", ""),
         class = str_replace(class, "c__", ""),
         order = str_replace(order, "o__", ""),
         family = str_replace(family, "f__", ""),
         genus = str_replace(genus, "g__", ""),
         species = str_replace(species, "s__", "")) %>% 
  filter(domain == 'Archaea' | domain == 'Unassigned')

taxonomy
rownames(taxonomy) <- taxonomy$OTU
taxmat <- taxonomy[,-1]
taxmat <- as.matrix(taxmat)
taxmat
class(taxmat)

taxmatrans <- as.matrix(t(taxmat))

# Input sample metadata

envs <- read_csv('Metadata_YB.csv') 

allenvnum <- envs %>% 
  select(c(2:8,10:15)) %>% 
  #filter(Sample.Type == 2) %>% 
  #rename_all(~ gsub("_", " ", .)) %>% 
  rename('NO2+NO3'='No2.No3_umol_L') %>% 
  rename_all(~ gsub("_umol_L", "", .)) %>% 
  rename_all(~ gsub("ODO_mgL", "DO", .))


# Merge the otumat datafram with the environment dataframe

datas <- merge(allenvnum, otumatdf, by='SampleID') %>% 
  na.omit()

rownames(datas) <- datas$SampleID
colnames(datas) <- make.names(colnames(datas))

# make Habitat an ordered variable
allenvnum$Season <- ordered(allenvnum$Season, c("spring", "summer", "autumn", "winter"))

abund.mat <- datas[,14:9598]

# Merge the otumat datafram with the environment dataframe

datas <- merge(allenvnum, otumatdf, by='SampleID') %>% 
  na.omit()

rownames(datas) <- datas$SampleID
colnames(datas) <- make.names(colnames(datas))

# make Habitat an ordered variable
allenvnum$Season <- ordered(allenvnum$Season, c("spring", "summer", "autumn", "winter"))

abund.mat <- datas[,14:9598]

# Merge the otumat datafram with the environment dataframe

datas <- merge(allenvnum, otumatdf, by='SampleID') %>% 
  na.omit()

rownames(datas) <- datas$SampleID
colnames(datas) <- make.names(colnames(datas))

# The ASVs (rows = sample)
abund.mat <- datas[,14:9598]

# The predictor variables (rows = sample)
fijado <- datas[,6:13] %>% 
  na.omit() %>% 
  select(-'NO2.NO3')

# convert abund.mat to a matrix
m_com = as.matrix(abund.mat)

# Perform the NMDS ordination
set.seed(19870912)
#dist <- avgdist(m_com,  dmethod = "bray", sample = 10000)
dist <- vegdist(m_com,  method = "bray")
nmds = metaMDS(dist)
nmds_stress <- nmds$stress

# Run the envfit function with our environmental data
en = envfit(nmds, fijado, permutations = 999, na.rm = TRUE)
en

# Plot with base R
plot(nmds)
plot(en)

# Extract the data scores for ggplot
data.scores = as.data.frame(scores(nmds))
#data.scores = as.data.frame(scores(nmds)$sites)
data.scores$SampleID <- rownames(data.scores)
data.scores <- merge(data.scores, allenvnum, by = 'SampleID')

en_coord_cont = as.data.frame(scores(en, "vectors")) * ordiArrowMul(en)

png("NMDS_Arc.png", res=300, units='in', width=9, height=8)
ggplot(data.scores, aes(x = NMDS1, y = NMDS2)) +
  geom_point(data = data.scores, 
             aes(colour = as.factor(Season), shape = as.factor(Sample.Type)), 
             show.legend = T, size = 3) +
  scale_color_manual(name = "Seasons", 
                     values=colors,
                     labels = c('Autumn','Spring','Summer','Winter')) +
  scale_shape_manual(name = "Size fraction", 
                     values=c(15, 16, 17),
                     breaks = c('2', '3', '10'),
                     labels = c('0.2 um', '3.0 um', '10.0 um')) +
  geom_segment(data = en_coord_cont, 
               aes(x = 0, y = 0, 
                   xend = NMDS1*.3, yend = NMDS2*.3), 
               arrow = arrow(length = unit(.6, 'picas')), colour="red") +
  geom_text(data = en_coord_cont, aes(x = NMDS1*.34, y = NMDS2*.34), colour = "black", 
            fontface = "bold", label = row.names(en_coord_cont)) + 
  #stat_ellipse(aes(color=Season), show.legend = F) +
  xlab("NMDS1") + 
  ylab("NMDS2") +
  theme_bw(base_size = 15) +
  theme(axis.text.y = element_text(color="black", size=17, angle=0),
        axis.text.x = element_text(color="black",  size=17, angle=0),
        axis.title.x = element_text(color="black", size=20, angle=0),
        axis.title.y = element_text(color="black", size=20, angle=90),
        legend.text=element_markdown(),
        legend.key.size=unit(15, "pt"),
        strip.background=element_blank(), 
        strip.placement='outside',
        panel.border = element_rect(colour = "black", fill = NA))

dev.off()

_________________________________________________________________________________________________________

Metacyc biological pathway

Diatom relative abundance (Supplemental figure )

# Load the pheatmap package
install.packages('matrixStats')
library(pheatmap)
library(tidyverse)
library(readxl)
library(ggplot2)
library(ggtext)
library(matrixStats)
library(RColorBrewer)
library(caroline)

# Input sample metadata

envs <- read_csv('Metadata_YB.csv') 

allenvnum <- envs %>% 
  select(c(2:8,10:15)) %>% 
  rename('NO2+NO3'='No2.No3_umol_L') %>% 
  rename_all(~ gsub("_umol_L", "", .)) %>% 
  rename_all(~ gsub("ODO_mgL", "DO", .)) %>% 
  filter(!SampleID == 'NA') %>% 
  filter(!SampleID == 'BB10')

# Annotation column: a data frame where the row names amtch the column names of the data_matrix

rownames(allenvnum) <- allenvnum$SampleID

# ASV counts for each feature into a matrix
merged <- read.csv('MetaCycPathways.csv', header=T, stringsAsFactors=F) %>%
  column_to_rownames(var='Description') %>% 
  select(-pathway)

# Check to see if they're relative abundance counts
# count <- rbind(counts, colSums(counts[1:90]))

merge_mat <- as.matrix(merged)

### Log2 normalization and then determine the genes that have the most variance
log_merge_mat <- as.matrix(log2(merged+1))

# Get the top most 50 variable genes
rv <- rowVars(merge_mat)
idx <- order(-rv)[1:40]

rv <- rowVars(log_merge_mat)
idx <- order(-rv)[1:40]

# Create column annotations for the sample designations. 

anncol <- data.frame(Var1 = factor(1:90, labels = c('1')))
rownames(anncol) <- allenvnum$SampleID
anncol$Season <- as.factor(allenvnum$Season)
anncol$Filter <- as.factor(allenvnum$Sample.Type)
anncol <- anncol[,-1]
anncol <- anncol[order(anncol$Season),]
anncol

# Change color of Seasons and filter size
# select my row annotation colors. Name the color vector clas2
my_colour = list(
  anncol,
  Filter = c('2' = 'yellow', '3' = "#FC4E07", '10' = '#696969'),
  Season = c(spring = '#AA4488',autumn = '#4477AA', summer = '#117777', winter = '#AA7744'))


# Reorder columns in merge_mat based on column order of annotation column, and Z-score the values

mat_ordered <- merge_mat[, rownames(anncol)]
mat_orderedz <- scale(mat_ordered)
head(mat_orderedz)

# set the color create color palette
paletteLength <- 100

my_color <- rev(brewer.pal(11, "RdBu"))
my_color = colorRampPalette(my_color)(paletteLength)

myBreaks <- c(seq(min(mat_orderedz), 0, length.out=ceiling(paletteLength/2) + 1), 
              seq(max(mat_orderedz)/paletteLength, max(mat_orderedz), length.out=floor(paletteLength/2)))

# Create the heatmap with annotation

png("metacycPie2log.png", res=300, units='in', width=18, height=14)
pheatmap(mat_orderedz[idx,], 
         annotation_col = anncol,
         annotation_colors = my_colour,
         main = "MetaCyc annotations",
         breaks = myBreaks,
         color = my_color,
         cellwidth = 6, 
         cellheight = 6, 
         fontsize_row = 6, 
         fontsize_col = 6, 
         #boarder_color = 'black', 
         cluster_rows = TRUE, 
         cluster_cols = TRUE,
         cutree_cols = 6)
dev.off()

_________________________________________________________________________________________________________

Network co-occurrence analysis

Cross-domain network analysis (Figure 5A)

# Install packages (only once on each device)

setwd("/Users/carlymoreno/Dropbox/Yas_bay")

# Required packages
install.packages("devtools")
install.packages("BiocManager")

if (!require("BiocManager", quietly = TRUE))
  install.packages("BiocManager")

# download gfortran dev package
# https://cran.r-project.org/bin/macosx/tools/

# Install NetCoMi
devtools::install_github("stefpeschel/NetCoMi", 
                         dependencies = c("Depends", "Imports", "LinkingTo"),
                         repos = c("https://cloud.r-project.org/",
                                   BiocManager::repositories()))

install.packages('ape','limma','tidyverse','phyloseq')

###############################################################################

# If previous installation fails, with warning that packages cannot be built, try first modifying the R Makevars file in the computer terminal


# ~/.R/Makevars - custom configuration to fix SpiecEasi install on macOS ARM64
# Uses Homebrew GCC/gfortran and overrides system R Makeconf which points to old /opt/gfortran

#nano ~/.R/Makevars     # copy and paste the 10 lines below

# Fortran compilers - point to Homebrew-installed gfortran-15
FC = /opt/homebrew/bin/gfortran-15
F77 = /opt/homebrew/bin/gfortran-15

# Fortran libraries - use Homebrew gcc libs only, no /opt/gfortran
FLIBS = -L/opt/homebrew/opt/gcc/lib/gcc/15 -lgfortran -lquadmath -lm

# C and C++ compilers - Homebrew gcc/g++ 15
CC = /opt/homebrew/bin/gcc-15
CXX = /opt/homebrew/bin/g++-15

# Compiler flags for C and C++ (enable OpenMP)
CFLAGS = -g -O2 -fopenmp
CXXFLAGS = -g -O2 -fopenmp

# Linker flags - libomp and gcc lib paths from Homebrew
LDFLAGS = -L/opt/homebrew/opt/libomp/lib -lomp -L/opt/homebrew/opt/gcc/lib/gcc/15
CPPFLAGS = -I/opt/homebrew/opt/libomp/include

# Force R to use the above Fortran libs during shared library linking
SHLIB_FCLD = $(FLIBS)

# close Terminal and R, and then reopen R

# And then these installation steps in R:

# Required packages
install.packages("devtools")
install.packages("BiocManager")

if (!require("BiocManager", quietly = TRUE))
  install.packages("BiocManager")

install.packages("pkgbuild")
pkgbuild::check_build_tools(debug = TRUE)

# Since two of NetCoMi's dependencies are only available on GitHub, 
# it is recommended to install them first:
devtools::install_github("zdk123/SpiecEasi")
devtools::install_github("GraceYoon/SPRING")

# Install NetCoMi
devtools::install_github("stefpeschel/NetCoMi", 
                         repos = c("https://cloud.r-project.org/",
                                   BiocManager::repositories()))


###############################################################################

# Start here

library(devtools)
library('BiocManager')
library(NetCoMi)
library(limma)
library(igraph)
library(ggplot2)
library(phyloseq)
library(tidyverse)
library(ape)

###############################################################################

# Make a phyloseq object

# Eukaryotes

# ASV counts for each feature into a matrix
merged <- read.csv('metadata_18S.csv', header = T, stringsAsFactors = F)

otumat <- merged %>%
  rename(otu = id) %>%
  select(-Sequence,-Confidence,-Taxon) 

rownames(otumat) <- otumat$otu
otumat <- otumat[,-1]
otumat <- as.matrix(otumat)
otumat
class(otumat)
otumatrans <- as.matrix(t(otumat))

# Taxonomy
taxonomy <- merged %>%
  rename(otu = id) %>%
  select(otu, Taxon) %>%
  rename_all(toupper) %>%
  separate(TAXON,
           into = c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"),
           sep = "; ",
           fill = 'right') %>%    # NA values on the right if it need to add information
  mutate(Domain = str_replace(Domain, "d__", ""),
         Phylum = str_replace(Phylum, "p__", ""),
         Class = str_replace(Class, "c__", ""),
         Order = str_replace(Order, "o__", ""),
         Family = str_replace(Family, "f__", ""),
         Genus = str_replace(Genus, "g__", ""),
         Species = str_replace(Species, "s__", "")) %>%
  mutate(Domain = str_replace(Domain, "Unassigned", "Unknown")) %>% 
  filter(Domain == 'Eukaryota' | Domain == 'Unassigned') %>%
  filter(!Phylum %in% c('Vertebrata', 'Annelida', 'Arthropoda', 'Bryozoa' , 'Cnidaria', 'Ciliophora',
                        'Mollusca','Nemertea', 'Porifera', 'Echinodermata','Nematozoad','Rotifera',
                        'Tunicata','Gracilipodida','Entoprocta','Florideophycidae', 'Holozoa', 
                        'Incertae_Sedis','Bicosoecida','Pavlovophyceae'))

taxonomy
rownames(taxonomy) <- taxonomy$OTU
taxmat <- taxonomy[,-1]
taxmat <- as.matrix(taxmat)
taxmat
class(taxmat)

taxmatrans <- as.matrix(t(taxmat))

# Combine into phyloseq object

OTU = otu_table(otumat, taxa_are_rows = TRUE)
OTU

TAX = tax_table(taxmat)
TAX

physeq = phyloseq(OTU, TAX)
physeq

plot_bar(physeq, fill = "Phylum")

# transposed
OTUtrans = otu_table(otumatrans, taxa_are_rows = FALSE)
OTUtrans

TAXtrans = tax_table(taxmatrans)
TAXtrans

physeq1 = phyloseq(OTUtrans, TAX)
physeq1

plot_bar(physeq1, fill = "Phylum")

# Input sample metadata

env <- read.csv('Metadata_YB.csv', header = T, stringsAsFactors = F) 

allenvnum <- env %>% 
  select(c(2:8,10:15)) %>% 
  filter(Sample.Type == 2) %>% 
  rename_all(~ gsub("_", " ", .)) %>% 
  rename('NO2+NO3' = 'No2.No3 umol L') %>% 
  rename_all(~ gsub("umol L", "", .)) %>% 
  rename_all(~ gsub("ODO mgL", "DO", .)) %>% 
  drop_na()

rownames(allenvnum) <- allenvnum$SampleID
allenvnum <- allenvnum[,c(-1,-2)]

sampledata = sample_data(allenvnum)

# Create a phylogentic tree
random_tree = rtree(ntaxa(physeq), rooted=TRUE, tip.label=taxa_names(physeq))
plot(random_tree)

# Merge new data with current phyloseq object:
physeq_euks = merge_phyloseq(physeq, sampledata, random_tree)
physeq_euks

plot_tree(physeq1, color="Season", label.tips="taxa_names", ladderize="left", plot.margin=0.3)

plot_heatmap(physeq_euks)
plot_heatmap(physeq_euks, taxa.label="Phylum")

saveRDS(physeq_euks, "physeq_euks.rds")

# Make a phyloseq object

# Bacteria

# ASV counts for each feature into a matrix
merged <- read.csv('metadata_16S_bac.csv', header = T, stringsAsFactors = F)

otumat <- merged %>%
  rename(otu = id) %>%
  select(-Sequence,-Confidence,-Taxon) 

rownames(otumat) <- otumat$otu
otumat <- otumat[,-1]
otumat <- as.matrix(otumat)
otumat
class(otumat)
otumatrans <- as.matrix(t(otumat))

# Taxonomy
taxonomy <- merged %>% 
  rename(otu = id) %>% 
  select(otu, Taxon) %>% 
  rename_all(toupper) %>% 
  separate(TAXON,
           into = c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"),
           sep = "; ",
           fill = 'right') %>%    # NA values on the right if it need to add information
  mutate(Domain = str_replace(Domain, "d__", ""),
         Phylum = str_replace(Phylum, "p__", ""),
         Class = str_replace(Class, "c__", ""),
         Order = str_replace(Order, "o__", ""),
         Family = str_replace(Family, "f__", ""),
         Genus = str_replace(Genus, "g__", ""),
         Species = str_replace(Species, "s__", "")) %>% 
  filter(Domain == 'Bacteria') %>% 
  filter(!Family %in% c('Chloroplast', 'Mitochondria'))

taxonomy
rownames(taxonomy) <- taxonomy$OTU
taxmat <- taxonomy[,-1]
taxmat <- as.matrix(taxmat)
taxmat
class(taxmat)

taxmatrans <- as.matrix(t(taxmat))

# Combine into phyloseq object

OTU = otu_table(otumat, taxa_are_rows = TRUE)
OTU

TAX = tax_table(taxmat)
TAX

physeq = phyloseq(OTU, TAX)
physeq

plot_bar(physeq, fill = "Phylum")

# transposed
OTUtrans = otu_table(otumatrans, taxa_are_rows = FALSE)
OTUtrans

TAXtrans = tax_table(taxmatrans)
TAXtrans

physeq1 = phyloseq(OTUtrans, TAX)
physeq1

plot_bar(physeq1, fill = "Phylum")

# Input sample metadata

env <- read.csv('Metadata_YB.csv', header = T, stringsAsFactors = F) 

allenvnum <- env %>% 
  select(c(2:8,10:15)) %>% 
  filter(Sample.Type == 2) %>% 
  rename_all(~ gsub("_", " ", .)) %>% 
  rename('NO2+NO3' = 'No2.No3 umol L') %>% 
  rename_all(~ gsub("umol L", "", .)) %>% 
  rename_all(~ gsub("ODO mgL", "DO", .)) %>% 
  drop_na()

rownames(allenvnum) <- allenvnum$SampleID
allenvnum <- allenvnum[,c(-1,-2)]

sampledata = sample_data(allenvnum)

# Create a phylogentic tree
install.packages('ape')
library(ape)

random_tree = rtree(ntaxa(physeq), rooted=TRUE, tip.label=taxa_names(physeq))
plot(random_tree)

# Merge new data with current phyloseq object:
physeq_bac = merge_phyloseq(physeq, sampledata, random_tree)
physeq_bac

plot_tree(physeq_bac, color="Season", label.tips="taxa_names", ladderize="left", plot.margin=0.3)

plot_heatmap(physeq_bac)
plot_heatmap(physeq_bac, taxa.label="Phylum")

saveRDS(physeq_bac, "physeq_bac.rds")

###############################################################################

# Start here to make one network with eukaryotes and bacteria for the entire year

### Eukaryotes
pseuk <- readRDS('physeq_euks.rds')

# zero taxa 제거
ps.f = prune_samples(sample_sums(pseuk) > 0, pseuk) 
ps.f = prune_taxa(rowSums(otu_table(ps.f)) > 0, ps.f)  #  14733 taxa and 22 samples

# Agglomerate to genus level
ps_genus <- tax_glom(ps.f, taxrank = "Class")   # 161 orders and 22 samples

# Taxonomic table
taxtab <- as(tax_table(ps_genus), "matrix")

# necomi 형식으로 변환 및 Genus level 설정 
ps_euks_renamed <- NetCoMi::renameTaxa(ps_genus,
                                        pat = "<name>",
                                        substPat = "<name>_<subst_name>(<subst_R>)",
                                        numDupli = "Class")

sum_euks_mat <- as.matrix(t(otu_table(ps_euks_renamed)@.Data))

sum_euks_df <- as.data.frame(otu_table(ps_euks_renamed)@.Data)
sum_euks_df$OTU <- rownames(sum_euks_df)

sum_euks_df <- merge(sum_euks_df, taxonomy, by = 'OTU') %>% 
  drop_na()

sum_euks_df <- sum_euks_df %>% 
  filter(!Family %in% c('uncultured','Labyrinthulomycetes'))

rownames(sum_euks_df) <- sum_euks_df$Class

sum_euks_df <- as.matrix(t(sum_euks_df[,c(-1, -c(24:30))]))


### Bacteria
psbac <- readRDS("physeq_bac.rds")

# zero taxa 제거
ps.b = prune_samples(sample_sums(psbac) > 0, psbac) 
ps.b = prune_taxa(rowSums(otu_table(ps.b)) > 0, ps.b)  #  21585 taxa and 22 samples

# Agglomerate to genus level
ps_bacgenus <- tax_glom(ps.b, taxrank = "Class")     #  413 taxa and 22 samples

# Taxonomic table
taxtab <- as(tax_table(ps_bacgenus), "matrix")

# necomi 형식으로 변환 및 Genus level 설정 
ps_bac_genus_renamed <- NetCoMi::renameTaxa(ps_bacgenus,
                                            pat = "<name>",
                                            substPat = "<name>_<subst_name>(<subst_R>)",
                                            numDupli = "Class")

sum_bac_mat <- as.matrix(t(otu_table(ps_bac_genus_renamed)@.Data))

sum_bac_df <- as.data.frame(otu_table(ps_bac_genus_renamed)@.Data)
sum_bac_df$OTU <- rownames(sum_bac_df)

sum_bac_df <- merge(sum_bac_df, taxonomy, by = 'OTU') %>% 
  drop_na()

sum_bac_df <- sum_bac_df %>% 
  filter(!Family %in% c('uncultured'))

rownames(sum_bac_df) <- sum_bac_df$Class

sum_bac_df <- as.matrix(t(sum_bac_df[,c(-1, -c(24:30))]))

# Summer (group 1 bac and euks)

set.seed(123456)

spiec <- multi.spiec.easi(list(sum_euks_df, sum_bac_df), 
                                 method='mb', nlambda=40, 
                                 lambda.min.ratio=1e-1, 
                                 pulsar.params = list(thresh = 0.05))

assoMat1 <- SpiecEasi::symBeta(SpiecEasi::getOptBeta(spiec), mode = "ave")

assoMat1 <- as.matrix(assoMat1)

# Get taxa names
taxnames <- c(taxa_names(ps_genus), taxa_names(ps_bacgenus))
taxnames <- c(colnames(sum_euks_df), colnames(sum_bac_df))

colnames(assoMat1) <- rownames(assoMat1) <- taxnames
diag(assoMat1) <- 1

all <- netConstruct(data = assoMat1,
                    dataType = "condDependence", 
                    sparsMethod = "none")

netall <- netAnalyze(all, hubPar = "eigenvector")

nodeCols <- c(rep("darkturquoise", 57), rep("orange", 73))
names(nodeCols) <- taxnames

png("CrossDomain_EuksBac.png", res = 300, units = 'in', width = 10, height = 8)
plot(netall, 
     layoutGroup = "union",
     nodeColor = "colorVec", 
     colorVec = nodeCols,
     nodeSize = "eigen", 
     nodeSizeSpread = 5,
     labelScale = FALSE,
     posCol = "darkgreen", 
     negCol = "red",
     cexNodes = 2,  # size of node
     cexHubs = 2.2,   # Size of the central node among nodes
     cexLabels = .5, # Size of the regular nodes labesl
     cexHubLabels = .7, # Size of the central node among nodes
     cexTitle = 3)

legend(.7, 1, cex = .7, pt.cex = 2, 
       legend = c("Eukaryotes", "Bacteria"), col = c("lightblue", "orange"), 
       bty = "n", pch = 16) 

dev.off()

summary(netall)

Summer and winter eukaryotic network analysis (Figure 5B)

library(devtools)
library('BiocManager')
library(NetCoMi)
library(limma)
library(igraph)
library(ggplot2)
library(phyloseq)
library(tidyverse)
library(ape)

# Make a phyloseq object

# Eukaryotes

# ASV counts for each feature into a matrix
merged <- read.csv('metadata_18S.csv', header = T, stringsAsFactors = F)

otumat <- merged %>%
  rename(otu = id) %>%
  select(-Sequence,-Confidence,-Taxon) 

rownames(otumat) <- otumat$otu
otumat <- otumat[,-1]
otumat <- as.matrix(otumat)
otumat
class(otumat)
otumatrans <- as.matrix(t(otumat))

# Taxonomy
taxonomy <- merged %>%
  rename(otu = id) %>%
  select(otu, Taxon) %>%
  rename_all(toupper) %>%
  separate(TAXON,
           into = c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"),
           sep = "; ",
           fill = 'right') %>%    # NA values on the right if it need to add information
  mutate(Domain = str_replace(Domain, "d__", ""),
         Phylum = str_replace(Phylum, "p__", ""),
         Class = str_replace(Class, "c__", ""),
         Order = str_replace(Order, "o__", ""),
         Family = str_replace(Family, "f__", ""),
         Genus = str_replace(Genus, "g__", ""),
         Species = str_replace(Species, "s__", "")) %>%
  mutate(Domain = str_replace(Domain, "Unassigned", "Unknown")) %>% 
  filter(Domain == 'Eukaryota' | Domain == 'Unassigned') %>%
  filter(!Phylum %in% c('Vertebrata', 'Annelida', 'Arthropoda', 'Bryozoa' , 'Cnidaria', 'Ciliophora',
                        'Mollusca','Nemertea', 'Porifera', 'Echinodermata','Nematozoad','Rotifera',
                        'Tunicata','Gracilipodida','Entoprocta','Florideophycidae', 'Holozoa', 
                        'Incertae_Sedis','Bicosoecida','Pavlovophyceae'))

taxonomy
rownames(taxonomy) <- taxonomy$OTU
taxmat <- taxonomy[,-1]
taxmat <- as.matrix(taxmat)
taxmat
class(taxmat)

taxmatrans <- as.matrix(t(taxmat))

# Combine into phyloseq object

OTU = otu_table(otumat, taxa_are_rows = TRUE)
OTU

TAX = tax_table(taxmat)
TAX

physeq = phyloseq(OTU, TAX)
physeq

plot_bar(physeq, fill = "Phylum")

# transposed
OTUtrans = otu_table(otumatrans, taxa_are_rows = FALSE)
OTUtrans

TAXtrans = tax_table(taxmatrans)
TAXtrans

physeq1 = phyloseq(OTUtrans, TAX)
physeq1

plot_bar(physeq1, fill = "Phylum")

# Input sample metadata

env <- read.csv('Metadata_YB.csv', header = T, stringsAsFactors = F) 

allenvnum <- env %>% 
  select(c(2:8,10:15)) %>% 
  filter(Sample.Type == 2) %>% 
  rename_all(~ gsub("_", " ", .)) %>% 
  rename('NO2+NO3' = 'No2.No3 umol L') %>% 
  rename_all(~ gsub("umol L", "", .)) %>% 
  rename_all(~ gsub("ODO mgL", "DO", .)) %>% 
  drop_na()

rownames(allenvnum) <- allenvnum$SampleID
allenvnum <- allenvnum[,c(-1,-2)]

sampledata = sample_data(allenvnum)

# Create a phylogentic tree
install.packages('ape')
library(ape)

random_tree = rtree(ntaxa(physeq), rooted=TRUE, tip.label=taxa_names(physeq))
plot(random_tree)

# Merge new data with current phyloseq object:
physeq1 = merge_phyloseq(physeq, sampledata, random_tree)
physeq1

plot_tree(physeq1, color="Season", label.tips="taxa_names", ladderize="left", plot.margin=0.3)

plot_heatmap(physeq1)
plot_heatmap(physeq1, taxa.label="phylum")

saveRDS(physeq1, "physeq1.rds")

###############################################################################

# Start the tutorial and incorporate my data

# Summer time 

# 1.import data
ps <- readRDS("physeq1.rds")
summer = subset_samples(ps, Season %in% c("summer")) #Extract only summer microbiome samples

# 2. processing data
Taxa100 = names(sort(taxa_sums(summer), decreasing = TRUE)[1:50]) # Top 100 taxa extract
summer.1 = prune_taxa(Taxa100, summer)

# 3. filtering
summer_genus <- tax_glom(summer.1, taxrank = "Genus") # Agglomerate to genus level
taxtabsummer <- as(tax_table(summer_genus), "matrix")


# 4. Rename non-unique names using rename function
summer_genus_renamed <- NetCoMi::renameTaxa(summer_genus,
                                         pat = "<name>",
                                         substPat = "<name>_<subst_name>(<subst_R>)",
                                         numDupli = "Genus")

# 5. NetConstruct Functionto construct a network using spearman correlation. Write measure 
# as "spearman", normalization as "clr", and zero value as "multiRepl" method.

netsum_genus <- netConstruct(summer_genus_renamed,       # phyloseq
                          taxRank = "Genus",       # 기준 tax level
                          filtTax = "highestFreq",
                          filtTaxPar = list(highestfreq = 50),
                          zeroMethod = "pseudo", 
                          zeroPar = list(pseudocount = 0.5),  # 기준 tax level
                          measure = "spearman",    # spearman correlation 으로 계산해보자
                          normMethod = "clr",      # transformation method to deal with zero, tutorial pairs this with Pearson method
                          sparsMethod = "threshold",
                          thresh = 0.3,
                          verbose = 3,
                          seed = 1234)

sum_props_genus <- netAnalyze(netsum_genus, 
                              clustMethod = "cluster_fast_greedy",
                              weightDeg = TRUE,
                              normDeg = FALSE,
                              gcmHeat = FALSE,
                              hubPar = "eigenvector")

plotHeat(mat = sum_props_genus$graphletLCC$gcm1,
         pmat = sum_props_genus$graphletLCC$pAdjust1,
         type = "mixed",
         title = "GCM", 
         colorLim = c(-1, 1),
         mar = c(2, 0, 2, 0))

# 6. Information of the created network
summary(sum_props_genus, numbNodes = 5L)

# 7. plot layout setting up
graph4 <- igraph::graph_from_adjacency_matrix(netsum_genus$adjaMat1, 
                                              weighted = TRUE)
set.seed(42) # fixing the shape
lay_fr <- igraph::layout_with_fr(graph4)

rownames(lay_fr) <- rownames(netsum_genus$adjaMat1) # Row names of the layout matrix must match the node names

# 8. Drawing a network plot 그리기 
png("euks_genera_50_AI_summer_network.png", res = 300, units = 'in', width = 10, height = 8)
plot(sum_props_genus,              # Network Ingredients
     layout = "spring",          # Network plot 의 layout # "spring" also available
     labelLength = 10,         # label length
     repulsion = 0.84,
     cexNodes = 0.9,  # size of node
     cexHubs = 1.1,   # Size of the central node among nodes
     colorVec =  phylcol,      # edge color 
     posCol = "darkturquoise", # positive correlation edge color
     negCol = "orange",        # negative correlation edge color
     cexLabels = 1.8,            # node label size
     title1 = "Summer Top 50 Eukaryote genera (Spearman)",  # Title
     showTitle = TRUE,         # Show title 
     cexTitle = 1.5)     

# Get phyla names
taxtab <- as(tax_table(summer_genus_renamed), "matrix")
phyla <- as.factor(gsub("p__", "", taxtab[, "Phylum"]))
names(phyla) <- taxtab[, "Phylum"]

# index color 
phylcol <- c("cyan", "blue3", "red", "lawngreen", "yellow", "deeppink")
phylcol_transp <- colToTransp(phylcol, 60)

# Phylum color index
legend(-1.4, 1.7, cex = 1, pt.cex = 1, title = "Phylum:",
       legend=levels(phyla), col = phylcol_transp, bty = "n", pch = 12)

# correlation color index  0.4, 1.1,
legend(.8, 1.2, cex = 1, title = "estimated correlation:",
       legend = c("+","-"), lty = 1, lwd = 3, col = c("darkturquoise","orange"), 
       bty = "n", horiz = FALSE,
       bg = grey)

dev.off()

###############################################################################

# Wintertime

# 1.import data
ps <- readRDS("physeq1.rds")
winter = subset_samples(ps, Season %in% c("winter")) #Extract only winter microbiome samples

# 2. processing data
Taxa100 = names(sort(taxa_sums(winter), decreasing = TRUE)[1:50]) # Top 100 taxa extract
winter.1 = prune_taxa(Taxa100, winter)

# 3. filtering
winter_genus <- tax_glom(winter.1, taxrank = "Genus") # Agglomerate to genus level
taxtabwinter <- as(tax_table(winter_genus), "matrix")


# 4. Rename non-unique names using rename function
winter_genus_renamed <- NetCoMi::renameTaxa(winter_genus,
                                            pat = "<name>",
                                            substPat = "<name>_<subst_name>(<subst_R>)",
                                            numDupli = "Genus")

# 5. NetConstruct Functionto construct a network using spearman correlation. Write measure 
# as "spearman", normalization as "clr", and zero value as "multiRepl" method.

netwinter_genus <- netConstruct(winter_genus_renamed,       # phyloseq
                             taxRank = "Genus", 
                             filtTax = "highestFreq",
                             filtTaxPar = list(highestfreq = 50),
                             zeroMethod = "pseudo", 
                             zeroPar = list(pseudocount = 0.5),  # 기준 tax level
                             measure = "spearman",    # spearman correlation 으로 계산해보자
                             normMethod = "clr",      # transformation
                             sparsMethod = "threshold",
                             thresh = 0.3,
                             verbose = 3,
                             seed = 1234)

# taxRank = "Genus",       # 기준 tax level
# measure = "spearman",    # spearman correlation 으로 계산해보자
# zeroMethod = "multRepl",
# normMethod = "clr",      # transformation
# sparsMethod = "threshold",
# thresh = 0.3,
# verbose = 3,
# seed = 1234)

winter_props_genus <- netAnalyze(netwinter_genus, 
                                 clustMethod = "cluster_fast_greedy",
                                 weightDeg = TRUE,
                                 normDeg = FALSE,
                                 gcmHeat = FALSE,
                                 hubPar = "eigenvector")

plotHeat(mat = winter_props_genus$graphletLCC$gcm1,
         pmat = winter_props_genus$graphletLCC$pAdjust1,
         type = "mixed",
         title = "GCM", 
         colorLim = c(-1, 1),
         mar = c(2, 0, 2, 0))

# 6. Information of the created network

summary(winter_props_genus, numbNodes = 5L)

# 7. plot layout setting up

graph5 <- igraph::graph_from_adjacency_matrix(netwinter_genus$adjaMat1, 
                                              weighted = TRUE)
set.seed(42) # fixing the shape
lay_fr <- igraph::layout_with_fr(graph5)

rownames(lay_fr) <- rownames(netwinter_genus$adjaMat1) # Row names of the layout matrix must match the node names

# 8. Drawing a network plot 그리기 
png("euks_winter_top50_network.png", res = 300, units = 'in', width = 10, height = 8)
plot(winter_props_genus,              # Network Ingredients
     layout = "spring",          # Network plot 의 layout # "spring" also available
     labelLength = 10,         # label length
     cexNodes = 0.9,  # size of node
     cexHubs = 1.1,   # Size of the central node among nodes
     colorVec =  phylcol,      # edge color 
     posCol = "darkturquoise", # positive correlation edge color
     negCol = "orange",        # negative correlation edge color
     cexLabels = 1.8,            # node label size
     title1 = "Winter Eukaryote genera (Spearman)",  # Title
     showTitle = TRUE,         # Show title 
     cexTitle = 1.5,           # Title size
     mar = c(5,5,5,5))      

# Get phyla names
taxtab <- as(tax_table(winter_genus_renamed), "matrix")
phyla <- as.factor(gsub("p__", "", taxtab[, "Phylum"]))
names(phyla) <- taxtab[, "Phylum"]

# index color 
phylcol <- c("cyan", "blue3", "red", "lawngreen", "yellow", "deeppink")
phylcol_transp <- colToTransp(phylcol, 60)

# Phylum color index
legend(-1.1, 1.1, cex = 1, pt.cex = 1, title = "Phylum:", 
        legend=levels(phyla), col = phylcol_transp, bty = "n", pch = 16) 

# correlation color index  0.4, 1.1,
legend(.7, -.3, cex = 1, title = "estimated correlation:",
       legend = c("+","-"), lty = 1, lwd = 3, col = c("darkturquoise","orange"), 
       bty = "n", horiz = FALSE,
       bg = grey)


dev.off()

###############################################################################

# Summer versus winter eukaryotic networks

ps <- readRDS("physeq1.rds")

ps.pt <- ps %>% 
  subset_samples(Season %in% c("summer", "winter")) # 19083 taxa and 12 samples 

# zero taxa 제거
ps.f = prune_samples(sample_sums(ps.pt)>0, ps.pt) 
ps.f = prune_taxa(rowSums(otu_table(ps.f)) > 0, ps.f)  #  3090 taxa and 12 samples

# Agglomerate to genus level
ps_genus <- tax_glom(ps.f, taxrank = "Family")

# Taxonomic table
taxtab <- as(tax_table(ps_genus), "matrix")

# necomi 형식으로 변환 및 Genus level 설정 
ps_genus_renamed <- NetCoMi::renameTaxa(ps_genus,
                                        pat = "<name>",
                                        substPat = "<name>_<subst_name>(<subst_R>)",
                                        numDupli = "Family")

# 각 부위에 대한 샘플 추출 
ps_genus_renamed_summer <- phyloseq::subset_samples(ps_genus_renamed, 
                                                  Season == "summer")
ps_genus_renamed_winter <- phyloseq::subset_samples(ps_genus_renamed, 
                                                    Season == "winter")

ps_genus.sparcc <- netConstruct(
  data = ps_genus_renamed_summer,
  data2 = ps_genus_renamed_winter,
  filtTax = "highestFreq",               # Top을 뽑는 기준
  filtTaxPar = list(highestFreq  = 50), # Top 100
  taxRank = "Family",                     # Genus level 
  measure = "sparcc",                    # sparcc
  measurePar = list(nlambda=20, rep.num=10),
  
  normMethod = "clr",                    # transformation
  zeroMethod = "none",                   # zero 값 보정 
  sparsMethod = "threshold", 
  adjust = "adaptBH",                    # p-value 값 보정 
  thresh = 0.3,
  
  dissFunc = "signed",
  verbose = 2,
  seed = 42)

ps_props_sparcc <- netAnalyze(ps_genus.sparcc, 
                              clustMethod = "cluster_fast_greedy",
                              hubPar = "eigenvector",           # Hub node 판별방법
                              normDeg = FALSE)

summary(ps_props_sparcc, groupNames = c("summer", "winter"))

# Compute layout
ps_graph <- igraph::graph_from_adjacency_matrix(ps_genus.sparcc$adjaMat1,
                                                weighted = TRUE)

set.seed(42)
ps_lay_fr <- igraph::layout_with_fr(ps_graph)

# Row names of the layout matrix must match the node names
rownames(ps_lay_fr) <- rownames(ps_genus.sparcc$adjaMat1)


# Get phyla names
taxtab <- as(tax_table(ps_genus_renamed), "matrix")
phyla <- as.factor(taxtab[, "Phylum"])
names(phyla) <- taxtab[, "Family"]
table(phyla)

# Define phylum colors
library(RColorBrewer)
phylcol <- c(brewer.pal(12, "Paired"), brewer.pal(12, "Set3"))

# Colors used in the legend should be equally transparent as in the plot
phylcol_transp <- colToTransp(phylcol, 60)

png("euks_family_50_summer_vs_network.png", res = 300, units = 'in', width = 10, height = 8)
plot(ps_props_sparcc,
     sameLayout = TRUE, 
     #nodeColor = "cluster",
     #nodeSize = "mclr",
     #labelScale = FALSE,
     nodeSize = "eigenvector",  # "clr",
     nodeColor = "cluster",     #"feature",
     #labelLength = 8,         # label length
     repulsion = 0.85,
     #layoutGroup = "union",
     cexNodes = 1.5,  # size of node
     cexHubs = 1.1,   # Size of the central node among nodes
     featVecCol = phyla, 
     colorVec =  phylcol,
     cexLabels = 2.5,
     posCol = "darkturquoise", 
     negCol = "orange",
     title1 = "Network on genus level with sparcc correlations", 
     showTitle = TRUE,
     groupNames = c("Summer","Winter"))

#legend(-1.2, 10, cex = 1.5, pt.cex = 2.5, title = "Phylum:", 
#       legend=levels(phyla), col = phylcol_transp, bty = "n", pch = 16) 

legend(0.7, 1.3, cex = 2.2, title = "estimated correlation:",
       legend = c("+","-"), lty = 1, lwd = 3, col = c("darkturquoise","orange"), 
       bty = "n", horiz = TRUE)
dev.off()

comp_season <- netCompare(ps_props_sparcc, 
                          permTest = FALSE, 
                          verbose = FALSE,
                          seed = 123456)

summary(comp_season, 
        groupNames = c("summer", "winter"),
        showCentr = c("degree", "between", "closeness"), 
        numbNodes = 5)

plot(ps_props_sparcc, 
     sameLayout = TRUE, 
     layoutGroup = "union", 
     #colorVec = col,
     borderCol = "gray40", 
     nodeSize = "degree", 
     cexNodes = 0.9, 
     nodeSizeSpread = 3, 
     edgeTranspLow = 80, 
     edgeTranspHigh = 50,
     groupNames = c("summer", "winter"), 
     showTitle = TRUE, 
     cexTitle = 2.8,
     mar = c(1,1,3,1), 
     repulsion = 0.9, 
     labels = FALSE, 
     rmSingles = "inboth",
     nodeFilter = "clustMin", 
     nodeFilterPar = 10, 
     nodeTransp = 50, 
     hubTransp = 30)

Summer and winter bacterial network analysis (Supplemental figure 10)

library(devtools)
library('BiocManager')
library(NetCoMi)
library(limma)
library(igraph)
library(ggplot2)
library(phyloseq)
library(tidyverse)
library(ape)

###############################################################################

# Make a phyloseq object

# Bacteria

# ASV counts for each feature into a matrix
merged <- read.csv('metadata_16S_bac.csv', header = T, stringsAsFactors = F)

otumat <- merged %>%
  rename(otu = id) %>%
  select(-Sequence,-Confidence,-Taxon) 

rownames(otumat) <- otumat$otu
otumat <- otumat[,-1]
otumat <- as.matrix(otumat)
otumat
class(otumat)
otumatrans <- as.matrix(t(otumat))

# Taxonomy
taxonomy <- merged %>% 
  rename(otu = id) %>% 
  select(otu, Taxon) %>% 
  rename_all(toupper) %>% 
  separate(TAXON,
           into = c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"),
           sep = "; ",
           fill = 'right') %>%    # NA values on the right if it need to add information
  mutate(Domain = str_replace(Domain, "d__", ""),
         Phylum = str_replace(Phylum, "p__", ""),
         Class = str_replace(Class, "c__", ""),
         Order = str_replace(Order, "o__", ""),
         Family = str_replace(Family, "f__", ""),
         Genus = str_replace(Genus, "g__", ""),
         Species = str_replace(Species, "s__", "")) %>% 
  filter(Domain == 'Bacteria') %>% 
  filter(!Family %in% c('Chloroplast', 'Mitochondria'))

taxonomy
rownames(taxonomy) <- taxonomy$OTU
taxmat <- taxonomy[,-1]
taxmat <- as.matrix(taxmat)
taxmat
class(taxmat)

taxmatrans <- as.matrix(t(taxmat))

# Combine into phyloseq object

OTU = otu_table(otumat, taxa_are_rows = TRUE)
OTU

TAX = tax_table(taxmat)
TAX

physeq = phyloseq(OTU, TAX)
physeq

plot_bar(physeq, fill = "phylum")

# transposed
OTUtrans = otu_table(otumatrans, taxa_are_rows = FALSE)
OTUtrans

TAXtrans = tax_table(taxmatrans)
TAXtrans

physeq1 = phyloseq(OTUtrans, TAX)
physeq1

plot_bar(physeq1, fill = "Phylum")

# Input sample metadata

env <- read.csv('Metadata_YB.csv', header = T, stringsAsFactors = F) 

allenvnum <- env %>% 
  select(c(2:8,10:15)) %>% 
  filter(Sample.Type == 2) %>% 
  rename_all(~ gsub("_", " ", .)) %>% 
  rename('NO2+NO3' = 'No2.No3 umol L') %>% 
  rename_all(~ gsub("umol L", "", .)) %>% 
  rename_all(~ gsub("ODO mgL", "DO", .)) %>% 
  drop_na()

rownames(allenvnum) <- allenvnum$SampleID
allenvnum <- allenvnum[,c(-1,-2)]

sampledata = sample_data(allenvnum)

# Create a phylogentic tree
install.packages('ape')
library(ape)

random_tree = rtree(ntaxa(physeq), rooted=TRUE, tip.label=taxa_names(physeq))
plot(random_tree)

# Merge new data with current phyloseq object:
physeq1 = merge_phyloseq(physeq, sampledata, random_tree)
physeq1

plot_tree(physeq1, color="Season", label.tips="taxa_names", ladderize="left", plot.margin=0.3)

plot_heatmap(physeq1)
plot_heatmap(physeq1, taxa.label="phylum")

saveRDS(physeq1, "physeq1.rds")

###############################################################################

 # Start the tutorial and incorporate my data

# Summer versus winter eukaryotic networks

ps <- readRDS("physeq1.rds")

ps.pt <- ps %>% 
  subset_samples(Season %in% c("summer", "winter")) # 19083 taxa and 12 samples 

# zero taxa 제거
ps.f = prune_samples(sample_sums(ps.pt)>0, ps.pt) 
ps.f = prune_taxa(rowSums(otu_table(ps.f)) > 0, ps.f)  #  3090 taxa and 12 samples

# Agglomerate to genus level
ps_genus <- tax_glom(ps.f, taxrank = "Family")

# Taxonomic table
taxtab <- as(tax_table(ps_genus), "matrix")

# necomi 형식으로 변환 및 Genus level 설정 
ps_genus_renamed <- NetCoMi::renameTaxa(ps_genus,
                                        pat = "<name>",
                                        substPat = "<name>_<subst_name>(<subst_R>)",
                                        numDupli = "Family")

# 각 부위에 대한 샘플 추출 
ps_genus_renamed_summer <- phyloseq::subset_samples(ps_genus_renamed, 
                                                    Season == "summer")
ps_genus_renamed_winter <- phyloseq::subset_samples(ps_genus_renamed, 
                                                    Season == "winter")

ps_genus.sparcc <- netConstruct(
  data = ps_genus_renamed_summer,
  data2 = ps_genus_renamed_winter,
  filtTax = "highestFreq",               # Top을 뽑는 기준
  filtTaxPar = list(highestFreq  = 50), # Top 100
  taxRank = "Family",                     # Genus level 
  measure = "sparcc",                    # sparcc
  measurePar = list(nlambda=20, rep.num=10),
  
  normMethod = "clr",                    # transformation
  zeroMethod = "none",                   # zero 값 보정 
  sparsMethod = "threshold", 
  adjust = "adaptBH",                    # p-value 값 보정 
  thresh = 0.3,
  
  dissFunc = "signed",
  verbose = 2,
  seed = 42)

ps_props_sparcc <- netAnalyze(ps_genus.sparcc, 
                              clustMethod = "cluster_fast_greedy",
                              hubPar = "eigenvector",           # Hub node 판별방법
                              normDeg = FALSE)

summary(ps_props_sparcc, groupNames = c("summer", "winter"))

# Compute layout
ps_graph <- igraph::graph_from_adjacency_matrix(ps_genus.sparcc$adjaMat1,
                                                weighted = TRUE)

set.seed(42)
ps_lay_fr <- igraph::layout_with_fr(ps_graph)

# Row names of the layout matrix must match the node names
rownames(ps_lay_fr) <- rownames(ps_genus.sparcc$adjaMat1)

# Get phyla names
taxtab <- as(tax_table(ps_genus_renamed), "matrix")
phyla <- as.factor(taxtab[, "Phylum"])
names(phyla) <- taxtab[, "Family"]
table(phyla)

# Define phylum colors
library(RColorBrewer)
phylcol <- c(brewer.pal(12, "Paired"), brewer.pal(12, "Set3"))

# Colors used in the legend should be equally transparent as in the plot
phylcol_transp <- colToTransp(phylcol, 60)
png("bac_family_50_summer_vs_network.png", res = 300, units = 'in', width = 10, height = 8)
plot(ps_props_sparcc,
     sameLayout = TRUE, 
     #nodeColor = "cluster",
     #nodeSize = "mclr",
     #labelScale = FALSE,
     nodeSize = "eigenvector",  # "clr",
     nodeColor = "cluster",     #"feature",
     #labelLength = 8,         # label length
     repulsion = 0.84,
     cexNodes = 1.5,  # size of node
     cexHubs = 1.1,   # Size of the central node among nodes
     featVecCol = phyla, 
     colorVec =  phylcol,
     cexLabels = 2.5,
     posCol = "darkturquoise", 
     negCol = "orange",
     title1 = "Network on genus level with sparcc correlations", 
     showTitle = TRUE,
     groupNames = c("Summer","Winter"))

#legend(-1.2, 10, cex = 1.5, pt.cex = 2.5, title = "Phylum:", 
#       legend=levels(phyla), col = phylcol_transp, bty = "n", pch = 16) 

legend(0.7, 1.3, cex = 2.2, title = "estimated correlation:",
       legend = c("+","-"), lty = 1, lwd = 3, col = c("darkturquoise","orange"), 
       bty = "n", horiz = TRUE)
dev.off()

comp_season <- netCompare(ps_props_sparcc, 
                          permTest = FALSE, 
                          verbose = FALSE,
                          seed = 123456)

summary(comp_season, 
        groupNames = c("summer", "winter"),
        showCentr = c("degree", "between", "closeness"), 
        numbNodes = 5)