Yas Bay code

Author: Carly Moreno

March 30, 2025

New York University Abu Dhabi

_________________________________________________________________________________________________________

Environmental parameters

Correlation plot (Figure 1C)

install.packages("tidyverse")
install.packages('caroline')
install.packages("ggtext")
install.packages("wesanderson")
install.packages("factoextra")
install_github("kassambara/factoextra")
install.packages("missMDA")
install.packages("FactoMineR")
library(tidyverse)
library(readxl)
library(ggplot2)
library(ggtext)
library(RColorBrewer)
library(wesanderson)
library(caroline)
library(tidyverse)
library(dplyr)
library(vegan)
library(devtools)
library(factoextra)
library("FactoMineR")
library(missMDA)
library(Hmisc)
library(corrplot)

# Make an environmental correlation plot

# Using all the data
env <- read.csv('Metadata_YB.csv', header = T, stringsAsFactors = F) %>% 
  select(c(7:14)) %>% 
  rename_all(~ gsub("_", " ", .)) %>% 
  rename('NO2+NO3' = 'No2.No3 umol L') %>% 
  rename_all(~ gsub("umol L", "", .)) %>% 
  rename_all(~ gsub("ODO mgL", "DO", .)) %>% 
  select(!3)
  
# Correlation matrix 
ccc <- cor(env, method = 'pearson', use = 'complete.obs')

# Correlation coefficients and the p-values of the correlation for all possible pairs
res <- rcorr(as.matrix(ccc), type = 'pearson')

# Customize the color of the correlogram
color <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
COL2(diverging = c("RdBu", "BrBG", "PiYG", "PRGn", "PuOr", "RdYlBu"), n = 200)

png(filename = "CorrelationSelectedEnvReversedFinal_nopH_12pt.png", res = 300, units = 'in', width = 7, height = 7)
g <- corrplot(ccc, method="color", col=rev(color(200)),  
         type = "upper", 
         p.mat = res$P, 
         tl.col = "black", 
         tl.srt = 45,
         tl.cex = 1.4,
         cl.cex = 1.4,
         cl.align.text ='l', 
         sig.level = c(0.001, 0.01, 0.05), pch.cex = 1.75,
         insig = 'label_sig', pch.col = 'grey20',
         diag = FALSE) 
dev.off()

Environmental PCA-biplot (Figure 1D)

# Using all the data, clean it up 
env <- read.csv('Metadata_YB.csv', header = T, stringsAsFactors = F) 

allenvnum <- env %>% 
  select(c(2:4,5,6,7,8,10:15)) %>% 
  filter(Sample.Type == 2) %>% 
  rename_all(~ gsub("_", " ", .)) %>% 
  rename('NO2+NO3' = 'No2.No3 umol L') %>% 
  rename_all(~ gsub("umol L", "", .)) %>% 
  rename_all(~ gsub("ODO mgL", "DO", .)) %>% 
  drop_na()

# The matrix
allenv <- allenvnum %>% 
  select(c(6:13))

# Compute PCA 
env.pca <- prcomp(allenv, scale = T)
summary(env.pca)

# Visualize eigenvalues (scree plot). Show the percentage of variances explained by each principal component.
fviz_eig(env.pca)

# Graph of individuals. Individuals with a similar profile are grouped together.
fviz_pca_ind(env.pca,
             col.ind = "cos2", # Color by the quality of representation
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE     # Avoid text overlapping
)

# Graph of variables. Positive correlated variables point to the same side of the plot. 
# Negative correlated variables point to opposite sides of the graph.

fviz_pca_var(env.pca,
             col.var = "contrib", # Color by contributions to the PC
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE     # Avoid text overlapping
)

# Biplot of individuals and variables
fviz_pca_biplot(env.pca, repel = TRUE,
                col.var = "#2E9FDF", # Variables color
                col.ind = "#696969"  # Individuals color
)

# PCA results
# Eigenvalues
eig.val <- get_eigenvalue(env.pca)
eig.val

# Results for Variables
res.var <- get_pca_var(env.pca)
res.var$coord          # Coordinates
res.var$contrib        # Contributions to the PCs
res.var$cos2           # Quality of representation 
# Results for individuals
res.ind <- get_pca_ind(env.pca)
res.ind$coord          # Coordinates
res.ind$contrib        # Contributions to the PCs
res.ind$cos2           # Quality of representation 

var <- get_pca_var(env.pca)

# Color individuals by groups or some qualitative variable

groups <- as.factor(allenvnum$Season)

# Set the theme for all the following plots.
theme_set(theme_bw(base_size = 14))

png("PCAbiplot_EnvironmentalFactors_12pt.png",  width = 2200, height = 1800, res = 300)
g <- fviz_pca_biplot(env.pca, 
                     # Individuals (stations)
                     geom.ind = 'point', 
                     pointsize = 2,
                     col.ind = groups,
                     palette = c('#696969',"#00AFBB", "#FC4E07",  'darkgreen') ,
                     addEllipses = TRUE,                               # Concentration ellipses 
                     ellipse.type = "confidence",        # ellipse.level = 0.95,
                     
                     # Variable (environmental variable)
                     geom.var = c('arrow', 'text'),
                     labelsize = 4,
                     repel = T,                                        # Avoid label overplotting
                     #label = 5,
                     ggtheme = theme(axis.text=element_text(size=16.5), 
                                     axis.title=element_text(size=12),
                                     legend.text=element_text(size=16.5),
                                     axis.title.x = element_text(size = 16.5),
                                     axis.title.y = element_text(size = 16.5))) 
print(g)
dev.off()

# PerMANOVA - partitioning the euclidean distance matrix by Season to determine if the differences are statistically different

adonis2(allenv ~ Season, data = allenvnum, method='eu')

_________________________________________________________________________________________________________

Cell counts

Diatom cell counts (Figure 2A)

cells <- read.csv('cell_counts.csv', header=T, stringsAsFactors=F) %>% 
  rename('Pseudo nitzschia sp.' = 'Pseudo.nitzschia_sp.')
  
diatoms <- as_tibble(cells) %>%  
  rename(date=Sampling_Date) %>% 
  rename_all(~ gsub("_", " ", .)) %>% 
  select(date, 'Pseudo nitzschia sp.','Chaetoceros sp.','Skeletonema sp.', 'Rhizosolenia sp.', 'unidentified diatoms',"Cyanobacteria","Dinoflagellates") %>% 
  pivot_longer(-date, names_to='taxon', values_to='counts')

diatoms$date <- as.Date(diatoms$date, "%d/%m/%Y") 
diatoms$date <- as.character(diatoms$date)
diatoms
datelist <- as.factor(diatoms$date)
datelist

png("diatom_with_phyto_totalCellcounts_12pt.png", res=300, units='in', width=12, height=9)
g <- ggplot(diatoms, aes(x=rev(date), y=counts, fill=taxon)) +
  geom_col(color='black', width=0.7) +
  coord_flip() + 
  labs(x="Date",
       y="Total counts (cells/mL)") +
  theme_classic() + 
  scale_fill_manual(name=NULL,
                    breaks=c("Cyanobacteria","Dinoflagellates",'Pseudo nitzschia sp.','Chaetoceros sp.','Skeletonema sp.', 'Rhizosolenia sp.', 'unidentified diatoms'),
                    values=c('#771155',  '#114477', '#117777','#774411', 'darkgrey','#88CCAA', '#777711', 
                               '#AAAA44', '#DDDD77',  '#AA7744', '#DDAA77', '#771122', 
                               '#AA4455')) +
  scale_x_discrete(breaks=datelist,
                   labels=rev(datelist)) +
  scale_y_continuous(expand=c(0,0)) +
  theme(legend.text=element_markdown(size = 14),
        legend.key.size=unit(13, "pt"),
        strip.background=element_blank(), 
        strip.placement='outside',
        axis.text.x=element_text(angle=0, size = 14), 
        axis.text.y = element_text(size = 14),
        axis.title.x = element_text(size = 16))
print(g)
dev.off()

ggsave("just_the_diatoms.tiff", width=6, height=5)

Synechoccus and heterotrophic bacteria (Supplemental figure 5A and B)

data <- read.csv('YasBay_FCM.csv', header = T, stringsAsFactors = F, na = "NA")  # included na = 'NA', to make it stored as an NA not a string

data$Date <- as.Date(data$Date, "%m/%d/%Y") 
metadata$date <- as.character(metadata$date)

# Plot the data
ggplot(data, aes(x = Date, y = Synechococcus)) +
  geom_line(color = "blue", size = 1) +  # Line graph
  geom_point(size = 2, color = 'blue') +  # Add points for emphasis
  scale_x_date(date_labels = "%b-%Y", date_breaks = "1 month") +  # Format x-axis as Month-Year
  labs(title = 'Synechoccus', x = "Date", y = 'cells/mL') +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 0, hjust = 0.5))  # Rotate x-axis labels

ggsave("SynechoccosYB.tiff", width = 6, height = 5)

# Plot the data
ggplot(data, aes(x = Date, y = Heterotrophic.Prokaryotes)) +
  geom_line(color = "darkorange", size = 1) +  # Line graph
  geom_point(size = 2, color = 'darkorange') +  # Add points for emphasis
  scale_x_date(date_labels = "%b-%Y", date_breaks = "1 month") +  # Format x-axis as Month-Year
  labs(title = 'Hetertrophic prokaryotes', x = "Date", y = 'cells/mL') +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 0, hjust = 0.5))  # Rotate x-axis labels


ggsave("HeterProksYB.tiff", width = 6, height = 5)

_________________________________________________________________________________________________________

Relative abundance data

Eukaryotic relative abundance (Figure 3A)

install.packages("tidyverse")
install.packages('caroline')
install.packages("ggtext")
install.packages("wesanderson")

library(tidyverse)
library(readxl)
library(ggplot2)
library(ggtext)
library(RColorBrewer)
library(wesanderson)
library(caroline)
library(dplyr)

# Import the metadata
metadata <- read.csv('metadata.csv', header=T, stringsAsFactors=F, na="NA") %>%   # included na='NA', to make it stored as an NA not a string
  select(SampleID, Sample.Type, Date) %>% 
  rename(Filter=Sample.Type, sample=SampleID) %>% 
  rename_all(tolower) %>% 
  filter(!date == '23/02/2024') %>% 
  drop_na(filter)

metadata %>% count(filter)
metadata$date <- as.Date(metadata$date, "%d/%m/%Y") 
metadata$date <- as.character(metadata$date)
metadata

datelist <- metadata$date

# ASV counts for each feature
merged <- read.csv('metadata_18S.csv', header=T, stringsAsFactors=F) 

otu_counts <- as_tibble(merged %>% 
                          rename(otu=id) %>% 
                          select(-Sequence,-Confidence,-Taxon) %>% 
                          pivot_longer(-otu,names_to='sample',values_to='count'))

otu_counts

# Taxonomy for each feature
taxonomy <- as_tibble(merged) %>% 
  rename(otu=id) %>% 
  select(otu, Taxon) %>% 
  rename_all(tolower) %>% 
  separate(taxon,
           into=c("domain", "phylum", "class", "order", "family", "genus", "species"),
           sep="; ",
           fill='right') %>%    # NA values on the right if it need to add information
  mutate(domain=str_replace(domain, "d__", ""),
         phylum=str_replace(phylum, "p__", ""),
         class=str_replace(class, "c__", ""),
         order=str_replace(order, "o__", ""),
         family=str_replace(family, "f__", ""),
         genus=str_replace(genus, "g__", ""),
         species=str_replace(species, "s__", "")) %>%
  filter(domain == 'Eukaryota' | domain == 'Unassigned') %>%
  filter(!phylum %in% c('Vertebrata', 'Annelida', 'Arthropoda', 'Bryozoa' , 'Cnidaria', 'Ciliophora',
                        'Mollusca','Nemertea', 'Porifera', 'Echinodermata','Nematozoad','Rotifera',
                        'Tunicata','Gracilipodida','Entoprocta','Florideophycidae', 'Holozoa', 'Incertae_Sedis', 
                        'Labyrinthulomycetes','Bicosoecida'))

taxonomy

# Join the dataframes with inner_join and create a relative abundance column

otu_rel_abund <- as_tibble(inner_join(metadata, otu_counts, by='sample')) %>% #as_tibble(anti_join(metadata, otu_counts))    # to see what is missing from metadata
  inner_join(., taxonomy, by='otu') %>% 
  group_by(date) %>% 
  mutate(rel_abund=count / sum(count)) %>% 
  ungroup()

# Check to see if your relative abundances for each samples add up to 1
otu_rel_abund %>% 
  group_by(sample) %>% 
  dplyr::summarize(total=sum(rel_abund))

# Make the dataframe even tidyer

otu_rel_abund <- as_tibble(inner_join(metadata, otu_counts, by='sample')) %>% #as_tibble(anti_join(metadata, otu_counts))    # to see what is missing from metadata
  inner_join(., taxonomy, by='otu') %>% 
  group_by(date) %>% 
  mutate(rel_abund=count / sum(count)) %>% 
  ungroup() %>% 
  pivot_longer(cols=c("domain", "phylum", "class", "order", "family", "genus", "species"),
               names_to="level",
               values_to="taxon") %>% 
  mutate(filter=factor(filter,
                         levels=c("2", "3", "10")))

taxon_rel_abund <- otu_rel_abund %>% 
  filter(level == 'phylum') %>% 
  group_by(date, taxon) %>% 
  dplyr::summarize(rel_abund=100*sum(rel_abund), .groups='drop') %>%       # grouped phylums within each sample, gives us the rel abund of that phylum in that sample
  mutate(taxon=replace_na(taxon, "Unclassified Eukaryota")) %>% 
  mutate(taxon=str_replace(taxon,
                             "Unclassified (.*)", "Unclassified *\\1*"),
         taxon=str_replace(taxon, "^(\\S*)$", "*\\1*")) 

taxon_rel_abund %>% 
  group_by(taxon) %>% 
  dplyr::summarize(max=max(rel_abund)) %>%    # Group by taxon, and I want to look across the 3 filter sizes, so do summarize, and see what the maximum rel abund of each phylum across the three groups
  arrange(desc(max))

taxon_pool <- taxon_rel_abund %>% 
  group_by(date, taxon) %>% 
  dplyr::summarize(mean=mean(rel_abund), .groups='drop') %>%   # filter, taxon, and mean, 3 filters and 50 phyla
  group_by(taxon) %>%                                       # relative abund of all samples across filters
  dplyr::summarize(pool=max(mean) < 10,                           # If the maximum mean value within each of the filter size is less than one percent, it will be pooled
            mean=mean(mean),  # the average relative abundance across the filter sizes, we make this new column to used in the mutate factor taxon step below
            .groups='drop')    # Now summarize, and pool everything that has a less than 1% max mean realtive abundance across all taxa and filter sizes

test <- inner_join(taxon_rel_abund, taxon_pool, by='taxon') %>% 
  mutate(taxon=if_else(pool, "Other", taxon)) %>%   # if pool was true, we put Other, if it was false we put the taxon name
  group_by(date, taxon) %>%                         # group by taxon, and pool the different phyla, with each filter size
  dplyr::summarize(rel_abund=sum(rel_abund), 
            mean=min(mean),
            .groups='drop') %>% 
  mutate(taxon=factor(taxon),
         taxon=fct_reorder(taxon, mean, .desc=TRUE),
         taxon=fct_shift(taxon, n=1)) # reorder legend, by reordering the factors, go back to pool line

png("euk_phylum_bar_rel_abund_all_dates_12pt.png", res=300, units='in', width=11, height=16)
g <- ggplot(test, aes(x=rev(date), y=rel_abund, fill=taxon)) +
  geom_col(color='black', width=0.7) +
  coord_flip() + 
  labs(x="Date",
       y="Relative Abundance (%)") +
  theme_classic() + 
  scale_fill_manual(name=NULL,
                    breaks=c("*Ascomycota*","*Chlorophyta*","*Cryptophyceae*","*Diatomea*","*Dinoflagellata*", "*Ochrophyta*", 
                               "*Ochrophyta*", "*Peronosporomycetes*", "*Protalveolata*","*Prymnesiophyceae*",
                               "Unclassified *Eukaryota*", "Other"),
                    values=c('#771155', '#AA4488', '#CC99BB', '#114477', '#4477AA', '#77AADD', '#117777', 
                               '#44AAAA', '#77CCCC', '#117744', '#44AA77', 'darkgrey', '#777711', 
                               '#AAAA44', '#D1D1D1','#DDDD77', '#774411', '#AA7744', '#DDAA77', '#771122', 
                               '#AA4455')) +
  scale_x_discrete(breaks=datelist,
                   labels=rev(datelist)) +
  scale_y_continuous(expand=c(0.0,3), ) +
  theme(legend.position="bottom",
        legend.text=element_markdown(size = 23),
        legend.key.size=unit(16, "pt"),
        strip.background=element_blank(), 
        strip.placement='outside',
        axis.text.x=element_text(angle=0, size = 25),
        axis.text.y = element_text(size = 25),
        axis.title.x = element_text(size = 25)) +
  guides(fill = guide_legend(nrow = 5, byrow = TRUE))

print(g)
dev.off()

Bacteria relative abundance (Figure 3B)

# Import the metadata
metadata <- read.csv('metadata.csv', header = T, stringsAsFactors = F, na = "NA") %>%   # included na = 'NA', to make it stored as an NA not a string
  select(SampleID, Sample.Type, Date) %>% 
  rename(Filter = Sample.Type, sample = SampleID) %>% 
  rename_all(tolower) %>% 
  drop_na(filter)

metadata %>% count(filter)
metadata$date <- as.Date(metadata$date, "%d/%m/%Y") 
metadata$date <- as.character(metadata$date)
metadata

datelist <- metadata$date

# ASV counts for each feature
merged <- read.csv('metadata_16S_bac.csv', header = T, stringsAsFactors = F) 

otu_counts <- as_tibble(merged %>% 
                          rename(otu = id) %>% 
                          select(-Sequence,-Confidence,-Taxon) %>% 
                          pivot_longer(-otu,names_to = 'sample',values_to = 'count'))

otu_counts

# Taxonomy for each feature
taxonomy <- as_tibble(merged) %>% 
  rename(otu = id) %>% 
  select(otu, Taxon) %>% 
  rename_all(tolower) %>% 
  separate(taxon,
           into = c("domain", "phylum", "class", "order", "family", "genus", "species"),
           sep = "; ",
           fill = 'right') %>%    # NA values on the right if it need to add information
  mutate(domain = str_replace(domain, "d__", ""),
         phylum = str_replace(phylum, "p__", ""),
         class = str_replace(class, "c__", ""),
         order = str_replace(order, "o__", ""),
         family = str_replace(family, "f__", ""),
         genus = str_replace(genus, "g__", ""),
         species = str_replace(species, "s__", "")) %>% 
  filter(domain == 'Bacteria' | domain == 'Unassigned')

taxonomy

# Join the dataframes with inner_join and create a relative abundance column

otu_rel_abund <- as_tibble(inner_join(metadata, otu_counts, by='sample')) %>% #as_tibble(anti_join(metadata, otu_counts))    # to see what is missing from metadata
  inner_join(., taxonomy, by = 'otu') %>% 
  group_by(date) %>% 
  mutate(rel_abund = count / sum(count)) %>% 
  ungroup()

# Check to see if your relative abundances for each samples add up to 1
otu_rel_abund %>% 
  group_by(date) %>% 
  dplyr::summarize(total = sum(rel_abund))

# Make the dataframe even tidyer

otu_rel_abund <- as_tibble(inner_join(metadata, otu_counts, by='sample')) %>% #as_tibble(anti_join(metadata, otu_counts))    # to see what is missing from metadata
  inner_join(., taxonomy, by = 'otu') %>% 
  group_by(date) %>% 
  mutate(rel_abund = count / sum(count)) %>% 
  ungroup() %>% 
  pivot_longer(cols = c("domain", "phylum", "class", "order", "family", "genus", "species"),
               names_to = "level",
               values_to = "taxon") %>% 
  mutate(filter = factor(filter,
                         levels = c("2", "3", "10")))

taxon_rel_abund <- otu_rel_abund %>% 
  filter(level == 'phylum') %>% 
  group_by(date, taxon) %>% 
  dplyr::summarize(rel_abund = 100*sum(rel_abund), .groups = 'drop') %>%       # grouped phylums within each sample, gives us the rel abund of that phylum in that sample
  mutate(taxon = replace_na(taxon, "Unclassified Bacteria")) %>% 
  mutate(taxon = str_replace(taxon,
                             "Unclassified (.*)", "Unclassified *\\1*"),
         taxon = str_replace(taxon, "^(\\S*)$", "*\\1*")) 

taxon_rel_abund %>% 
  group_by(taxon) %>% 
  dplyr::summarize(max = max(rel_abund)) %>%    # Group by taxon, and I want to look across the 3 filter sizes, so do summarize, and see what the maximum rel abund of each phylum across the three groups
  arrange(desc(max))

taxon_pool <- taxon_rel_abund %>% 
  group_by(date, taxon) %>% 
  dplyr::summarize(mean = mean(rel_abund), .groups = 'drop') %>%   # filter, taxon, and mean, 3 filters and 50 phyla
  group_by(taxon) %>%                                       # relative abund of all samples across filters
  dplyr::summarize(pool = max(mean) < 1,                           # If the maximum mean value within each of the filter size is less than one percent, it will be pooled
            mean = mean(mean),  # the average relative abundance across the filter sizes, we make this new column to used in the mutate factor taxon step below
            .groups = 'drop')    # Now summarize, and pool everything that has a less than 1% max mean realtive abundance across all taxa and filter sizes

test <- inner_join(taxon_rel_abund, taxon_pool, by = 'taxon') %>% 
  mutate(taxon = if_else(pool, "Other", taxon)) %>%   # if pool was true, we put Other, if it was false we put the taxon name
  group_by(date, taxon) %>%                         # group by taxon, and pool the different phyla, with each filter size
  dplyr::summarize(rel_abund = sum(rel_abund), 
            mean = min(mean),
            .groups = 'drop') %>% 
  mutate(taxon = factor(taxon),
         taxon = fct_reorder(taxon, mean, .desc=TRUE),
         taxon = fct_shift(taxon, n = 1)) # reorder legend, by reordering the factors, go back to pool line

png("bacteria_phylum_bar_rel_abund_all_dates_12pt.png", res = 300, units = 'in', width=11, height=16)
g <- ggplot(test, aes(x = rev(date), y = rel_abund, fill = taxon)) +
  geom_col(color = 'black', width = 0.7) +
  coord_flip() + 
  labs(x = "Date",
       y = "Relative Abundance (%)") +
  theme_classic() + 
  scale_fill_manual(name = NULL,
                    breaks = c("*Actinobacteriota*","*Bacteroidota*","*Bdellovibrionota*","*Chloroflexi*",
                               "*Cyanobacteria*","*Desulfobacterota*", "*Firmicutes*", "*Patescibacteria*",
                               "*Planctomycetota*","*Proteobacteria*","*SAR324_clade(Marine_group_B)*","*Verrucomicrobiota*", "Other"),
                    values = c('#771155', '#AA4488',  '#114477', '#4477AA', '#77AADD', '#117777', 
                               '#774411', '#44AAAA', '#77CCCC', '#44AA77', '#AA7744','#88CCAA','darkgray', '#AAAA44','#777711', 
                               '#117744',  '#771122', '#CC99BB', '#DDDD77','#DDAA77', '#AA4455')) +
  scale_x_discrete(breaks = rev(datelist),
                   labels = datelist) +
  scale_y_continuous(expand=c(0.0,3), ) +
theme(legend.position="bottom",
      legend.text=element_markdown(size = 23),
      legend.key.size=unit(16, "pt"),
      strip.background=element_blank(), 
      strip.placement='outside',
      axis.text.x=element_text(angle=0, size = 25),
      axis.text.y = element_text(size = 25),
      axis.title.x = element_text(size = 25)) +
  guides(fill = guide_legend(nrow = 7, byrow = TRUE))

print(g)
dev.off()

Archaeal relative abundance (Supplemental figure 8)

# Import the metadata
metadata <- read.csv('metadata.csv', header = T, stringsAsFactors = F, na = "NA") %>%   # included na = 'NA', to make it stored as an NA not a string
  select(SampleID, Sample.Type, Date) %>% 
  rename(Filter = Sample.Type, sample = SampleID) %>% 
  rename_all(tolower) %>% 
  drop_na(filter)

metadata %>% count(filter)
metadata$date <- as.Date(metadata$date, "%d/%m/%Y") 
metadata$date <- as.character(metadata$date)
metadata

datelist <- metadata$date

# ASV counts for each feature
merged <- read.csv('metadata_16S_arc.csv', header = T, stringsAsFactors = F) 

otu_counts <- as_tibble(merged %>% 
                          rename(otu = id) %>% 
                          select(-Sequence,-Confidence,-Taxon) %>% 
                          pivot_longer(-otu,names_to = 'sample',values_to = 'count'))

otu_counts

# Taxonomy for each feature
taxonomy <- as_tibble(merged) %>% 
  rename(otu = id) %>% 
  select(otu, Taxon) %>% 
  rename_all(tolower) %>% 
  separate(taxon,
           into = c("domain", "phylum", "class", "order", "family", "genus", "species"),
           sep = "; ",
           fill = 'right') %>%    # NA values on the right if it need to add information
  mutate(domain = str_replace(domain, "d__", ""),
         phylum = str_replace(phylum, "p__", ""),
         class = str_replace(class, "c__", ""),
         order = str_replace(order, "o__", ""),
         family = str_replace(family, "f__", ""),
         genus = str_replace(genus, "g__", ""),
         species = str_replace(species, "s__", "")) %>% 
  filter(domain == 'Archaea' | domain == 'Unassigned')

taxonomy

# Join the dataframes with inner_join and create a relative abundance column

otu_rel_abund <- as_tibble(inner_join(metadata, otu_counts, by='sample')) %>% #as_tibble(anti_join(metadata, otu_counts))    # to see what is missing from metadata
  inner_join(., taxonomy, by = 'otu') %>% 
  group_by(date) %>% 
  mutate(rel_abund = count / sum(count)) %>% 
  ungroup()

# Check to see if your relative abundances for each samples add up to 1
otu_rel_abund %>% 
  group_by(date) %>% 
  summarize(total = sum(rel_abund))

# Make the dataframe even tidyer

otu_rel_abund <- as_tibble(inner_join(metadata, otu_counts, by='sample')) %>% #as_tibble(anti_join(metadata, otu_counts))    # to see what is missing from metadata
  inner_join(., taxonomy, by = 'otu') %>% 
  group_by(date) %>% 
  mutate(rel_abund = count / sum(count)) %>% 
  ungroup() %>% 
  pivot_longer(cols = c("domain", "phylum", "class", "order", "family", "genus", "species"),
               names_to = "level",
               values_to = "taxon") %>% 
  mutate(filter = factor(filter,
                         levels = c("2", "3", "10")))

taxon_rel_abund <- otu_rel_abund %>% 
  filter(level == 'genus') %>% 
  group_by(date, taxon) %>% 
  summarize(rel_abund = 100*sum(rel_abund), .groups = 'drop') %>%       # grouped phylums within each sample, gives us the rel abund of that phylum in that sample
  mutate(taxon = replace_na(taxon, "Unclassified Archaea")) %>% 
  mutate(taxon = str_replace(taxon,
                             "Unclassified (.*)", "Unclassified *\\1*"),
         taxon = str_replace(taxon, "^(\\S*)$", "*\\1*")) 

taxon_rel_abund %>% 
  group_by(taxon) %>% 
  summarize(max = max(rel_abund)) %>%    # Group by taxon, and I want to look across the 3 filter sizes, so do summarize, and see what the maximum rel abund of each phylum across the three groups
  arrange(desc(max))

taxon_pool <- taxon_rel_abund %>% 
  group_by(date, taxon) %>% 
  summarize(mean = mean(rel_abund), .groups = 'drop') %>%   # filter, taxon, and mean, 3 filters and 50 phyla
  group_by(taxon) %>%                                       # relative abund of all samples across filters
  summarize(pool = max(mean) < 15,                           # If the maximum mean value within each of the filter size is less than one percent, it will be pooled
            mean = mean(mean),  # the average relative abundance across the filter sizes, we make this new column to used in the mutate factor taxon step below
            .groups = 'drop')    # Now summarize, and pool everything that has a less than 1% max mean realtive abundance across all taxa and filter sizes

test <- inner_join(taxon_rel_abund, taxon_pool, by = 'taxon') %>% 
  mutate(taxon = if_else(pool, "Other", taxon)) %>%   # if pool was true, we put Other, if it was false we put the taxon name
  group_by(date, taxon) %>%                         # group by taxon, and pool the different phyla, with each filter size
  summarize(rel_abund = sum(rel_abund), 
            mean = min(mean),
            .groups = 'drop') %>% 
  mutate(taxon = factor(taxon),
         taxon = fct_reorder(taxon, mean, .desc=TRUE),
         taxon = fct_shift(taxon, n = 1)) # reorder legend, by reordering the factors, go back to pool line

png("archaea_bar_rel_abund_all_dates.png", res = 300, units = 'in', width = 9, height = 8)
g <- ggplot(test, aes(x = rev(date), y = rel_abund, fill = taxon)) +
  geom_col(color = 'black', width = 0.5) +
  coord_flip() + 
  labs(x = "Date",
       y = "Relative Abundance (%)") +
  theme_classic() + 
  scale_fill_manual(name = NULL,
                    breaks = c("*Bathyarchaeia*","*Halococcus*","*Marine_Group_II*","*Methanobrevibacter*",
                               "*Methanosaeta*","*Nitrososphaeraceae*", "Unclassified *Archaea*","*uncultured*", "Other"),
                    values = c('#771155', '#AA4488',  '#114477', '#4477AA', '#77AADD', '#117777', 
                               '#774411', '#44AAAA','darkgray', '#77CCCC', '#44AA77', '#AA7744','#88CCAA', '#AAAA44','#777711', 
                               '#117744',  '#771122', '#CC99BB', '#DDDD77','#DDAA77', '#AA4455')) +
  scale_x_discrete(breaks = datelist,
                   labels = rev(datelist)) +
  scale_y_continuous(expand = c(0,0)) +
  theme(legend.text = element_markdown(size = 12),
        legend.key.size = unit(14, "pt"),
        strip.background = element_blank(), 
        strip.placement = 'outside',
        axis.text.x = element_text(angle = 0, size = 12))

print(g)
dev.off()

dev.off() 
(device = "RStudioGD")

Diatom relative abundance (Supplemental figure 6)

# Import the metadata
metadata <- read.csv('metadata.csv', header=T, stringsAsFactors=F, na="NA") %>%   # included na='NA', to make it stored as an NA not a string
  select(SampleID, Sample.Type, Date) %>% 
  rename(Filter=Sample.Type, sample=SampleID) %>% 
  rename_all(tolower) %>% 
  drop_na(filter)

metadata %>% count(filter)
metadata$date <- as.Date(metadata$date, "%d/%m/%Y") 
metadata$date <- as.character(metadata$date)
metadata

datelist <- as.factor(metadata$date)
datelist <- datelist

# ASV counts for each feature
merged <- read.csv('metadata_18S.csv', header=T, stringsAsFactors=F) 

otu_counts <- as_tibble(merged %>% 
                          rename(otu=id) %>% 
                          select(-Sequence,-Confidence,-Taxon) %>% 
                          pivot_longer(-otu,names_to='sample',values_to='count'))

otu_counts

# Taxonomy for each feature
taxonomy <- as_tibble(merged) %>% 
  rename(otu=id) %>% 
  select(otu, Taxon) %>% 
  rename_all(tolower) %>% 
  separate(taxon,
           into=c("domain", "phylum", "class", "order", "family", "genus", "species"),
           sep="; ",
           fill='right') %>%    # NA values on the right if it need to add information
  mutate(domain=str_replace(domain, "d__", ""),
         phylum=str_replace(phylum, "p__", ""),
         class=str_replace(class, "c__", ""),
         order=str_replace(order, "o__", ""),
         family=str_replace(family, "f__", ""),
         genus=str_replace(genus, "g__", ""),
         species=str_replace(species, "s__", "")) %>% 
  filter(phylum == 'Diatomea')

# Join the dataframes with inner_join and create a relative abundance column

otu_rel_abund <- as_tibble(inner_join(metadata, otu_counts, by='sample')) %>% #as_tibble(anti_join(metadata, otu_counts))    # to see what is missing from metadata
  inner_join(., taxonomy, by='otu') %>% 
  group_by(date) %>% 
  mutate(rel_abund=count / sum(count)) %>% 
  ungroup()

# Check to see if your relative abundances for each samples add up to 1
otu_rel_abund %>% 
  group_by(date) %>% 
  summarize(total=sum(rel_abund))

# Make the dataframe even tidyer

otu_rel_abund <- as_tibble(inner_join(metadata, otu_counts, by='sample')) %>% #as_tibble(anti_join(metadata, otu_counts))    # to see what is missing from metadata
  inner_join(., taxonomy, by='otu') %>% 
  group_by(date) %>% 
  mutate(rel_abund=count / sum(count)) %>% 
  ungroup() %>% 
  pivot_longer(cols=c("domain", "phylum", "class", "order", "family", "genus", "species"),
               names_to="level",
               values_to="taxon") %>% 
  mutate(filter=factor(filter,
                         levels=c("2", "3", "10")))

taxon_rel_abund <- otu_rel_abund %>% 
  filter(level == 'genus') %>% 
  group_by(date, taxon) %>% 
  summarize(rel_abund=100*sum(rel_abund), .groups='drop') %>%       # grouped phylums within each sample, gives us the rel abund of that phylum in that sample
  mutate(taxon=replace_na(taxon, "Unclassified Diatomea")) %>% 
  mutate(taxon=str_replace(taxon,
                             "Unclassified (.*)", "Unclassified *\\1*"),
         taxon=str_replace(taxon, "^(\\S*)$", "*\\1*")) 

taxon_rel_abund %>% 
  group_by(taxon) %>% 
  summarize(max=max(rel_abund)) %>%    # Group by taxon, and I want to look across the 3 filter sizes, so do summarize, and see what the maximum rel abund of each phylum across the three groups
  arrange(desc(max))

taxon_pool <- taxon_rel_abund %>% 
  group_by(date, taxon) %>% 
  summarize(mean=mean(rel_abund), .groups='drop') %>%   # filter, taxon, and mean, 3 filters and 50 phyla
  group_by(taxon) %>%                                       # relative abund of all samples across filters
  summarize(pool=max(mean) < 30,                           # If the maximum mean value within each of the filter size is less than one percent, it will be pooled
            mean=mean(mean),  # the average relative abundance across the filter sizes, we make this new column to used in the mutate factor taxon step below
            .groups='drop')    # Now summarize, and pool everything that has a less than 1% max mean realtive abundance across all taxa and filter sizes

test <- inner_join(taxon_rel_abund, taxon_pool, by='taxon') %>% 
  mutate(taxon=if_else(pool, "Other", taxon)) %>%   # if pool was true, we put Other, if it was false we put the taxon name
  group_by(date, taxon) %>%                         # group by taxon, and pool the different phyla, with each filter size
  summarize(rel_abund=sum(rel_abund), 
            mean=min(mean),
            .groups='drop') %>% 
  mutate(taxon=factor(taxon),
         taxon=fct_reorder(taxon, mean, .desc=TRUE),
         taxon=fct_shift(taxon, n=1)) # reorder legend, by reordering the factors, go back to pool line

png("diatom_phylum_bar_rel_abund_all_dates.png", res=300, units='in', width=9, height=8)
g <- ggplot(test, aes(x=rev(date), y=rel_abund, fill=taxon)) +
  geom_col(color='black', width=0.5) +
  coord_flip() + 
  labs(y="Relative Abundance (%)") +
  theme_classic() + 
  scale_fill_manual(name=NULL,
                    breaks=c("*Arcocellulus*","*Cerataulina*","*Chaetoceros*","*Cyclotella*",
                               "*Cylindrotheca*","*Leptocylindrus*","*Nitzschia*","*Pseudo-nitzschia*",
                               "*Skeletonema*","*Thalassiosira*","Unclassified *Diatomea*","Other"),
                   values=c('#771155', '#AA4488', '#CC99BB', '#114477', '#4477AA', '#77AADD', '#117777', 
                               '#44AAAA', '#77CCCC', '#117744', '#44AA77', '#D1D1D1','#88CCAA', '#AAAA44','#777711',  
                               '#DDDD77','#DDAA77', '#774411', '#AA7744',  '#771122', 
                               '#AA4455')) +
  scale_x_discrete(breaks=datelist,
                   labels=rev(datelist)) +
  scale_y_continuous(expand=c(0,0)) +
  theme(legend.text=element_markdown(size = 12),
        legend.key.size=unit(14, "pt"),
        strip.background=element_blank(), 
        strip.placement='outside',
        axis.text.x=element_text(angle=0, size = 12))
print(g)
dev.off()

Proteobactera relative abundance (Supplemental figure 7A)

# Import the metadata
metadata <- read.csv('metadata.csv', header = T, stringsAsFactors = F, na = "NA") %>%   # included na = 'NA', to make it stored as an NA not a string
  select(SampleID, Sample.Type, Date) %>% 
  rename(Filter = Sample.Type, sample = SampleID) %>% 
  rename_all(tolower) %>% 
  drop_na(filter)

metadata %>% count(filter)
metadata$date <- as.Date(metadata$date, "%d/%m/%Y") 
metadata$date <- as.character(metadata$date)
metadata

datelist <- metadata$date

# ASV counts for each feature
merged <- read.csv('metadata_16S_bac.csv', header = T, stringsAsFactors = F) 

otu_counts <- as_tibble(merged %>% 
                          rename(otu = id) %>% 
                          select(-Sequence,-Confidence,-Taxon) %>% 
                          pivot_longer(-otu,names_to = 'sample',values_to = 'count'))

otu_counts

# Taxonomy for each feature
taxonomy <- as_tibble(merged) %>% 
  rename(otu = id) %>% 
  select(otu, Taxon) %>% 
  rename_all(tolower) %>% 
  separate(taxon,
           into = c("domain", "phylum", "class", "order", "family", "genus", "species"),
           sep = "; ",
           fill = 'right') %>%    # NA values on the right if it need to add information
  mutate(domain = str_replace(domain, "d__", ""),
         phylum = str_replace(phylum, "p__", ""),
         class = str_replace(class, "c__", ""),
         order = str_replace(order, "o__", ""),
         family = str_replace(family, "f__", ""),
         genus = str_replace(genus, "g__", ""),
         species = str_replace(species, "s__", "")) %>% 
  filter(phylum == 'Proteobacteria' | phylum == 'Unassigned')

taxonomy

# Join the dataframes with inner_join and create a relative abundance column

otu_rel_abund <- as_tibble(inner_join(metadata, otu_counts, by='sample')) %>% #as_tibble(anti_join(metadata, otu_counts))    # to see what is missing from metadata
  inner_join(., taxonomy, by = 'otu') %>% 
  group_by(date) %>% 
  mutate(rel_abund = count / sum(count)) %>% 
  ungroup()

# Check to see if your relative abundances for each samples add up to 1
otu_rel_abund %>% 
  group_by(date) %>% 
  summarize(total = sum(rel_abund))

# Make the dataframe even tidyer

otu_rel_abund <- as_tibble(inner_join(metadata, otu_counts, by='sample')) %>% #as_tibble(anti_join(metadata, otu_counts))    # to see what is missing from metadata
  inner_join(., taxonomy, by = 'otu') %>% 
  group_by(date) %>% 
  mutate(rel_abund = count / sum(count)) %>% 
  ungroup() %>% 
  pivot_longer(cols = c("domain", "phylum", "class", "order", "family", "genus", "species"),
               names_to = "level",
               values_to = "taxon") %>% 
  mutate(filter = factor(filter,
                         levels = c("2", "3", "10")))

taxon_rel_abund <- otu_rel_abund %>% 
  filter(level == 'family') %>% 
  group_by(date, taxon) %>% 
  summarize(rel_abund = 100*sum(rel_abund), .groups = 'drop') %>%       # grouped phylums within each sample, gives us the rel abund of that phylum in that sample
  mutate(taxon = replace_na(taxon, "Unclassified Bacteria")) %>% 
  mutate(taxon = str_replace(taxon,
                             "Unclassified (.*)", "Unclassified *\\1*"),
         taxon = str_replace(taxon, "^(\\S*)$", "*\\1*")) 

taxon_rel_abund %>% 
  group_by(taxon) %>% 
  summarize(max = max(rel_abund)) %>%    # Group by taxon, and I want to look across the 3 filter sizes, so do summarize, and see what the maximum rel abund of each phylum across the three groups
  arrange(desc(max))

taxon_pool <- taxon_rel_abund %>% 
  group_by(date, taxon) %>% 
  summarize(mean = mean(rel_abund), .groups = 'drop') %>%   # filter, taxon, and mean, 3 filters and 50 phyla
  group_by(taxon) %>%                                       # relative abund of all samples across filters
  summarize(pool = max(mean) < 10,                           # If the maximum mean value within each of the filter size is less than one percent, it will be pooled
            mean = mean(mean),  # the average relative abundance across the filter sizes, we make this new column to used in the mutate factor taxon step below
            .groups = 'drop')    # Now summarize, and pool everything that has a less than 1% max mean realtive abundance across all taxa and filter sizes


test <- inner_join(taxon_rel_abund, taxon_pool, by = 'taxon') %>% 
  mutate(taxon = if_else(pool, "Other", taxon)) %>%   # if pool was true, we put Other, if it was false we put the taxon name
  group_by(date, taxon) %>%                         # group by taxon, and pool the different phyla, with each filter size
  summarize(rel_abund = sum(rel_abund), 
            mean = min(mean),
            .groups = 'drop') %>% 
  mutate(taxon = factor(taxon),
         taxon = fct_reorder(taxon, mean, .desc=TRUE),
         taxon = fct_shift(taxon, n = 1)) # reorder legend, by reordering the factors, go back to pool line

png("bacteria_Proteobacteria_phylum_bar_rel_abund_all_dates_top12.png", res = 300, units = 'in', width = 9, height = 8)
g <- ggplot(test, aes(x = rev(date), y = rel_abund, fill = taxon)) +
  geom_col(color = 'black', width = 0.5) +
  coord_flip() + 
  labs(x = "Date",
       y = "Relative Abundance (%)") +
  theme_classic() + 
  scale_fill_manual(name = NULL,
                    breaks = c("*AEGEAN-169_marine_group*", "*Alcanivoracaceae1*", "*Algiphilaceae*", "*Alteromonadaceae*",
                               "*Arenicellaceae*","*Burkholderiaceae*", "*Cellvibrionaceae*", "*Nitriliruptoraceae*",
                               "*Clade_I*","*Colwelliaceae*","*Comamonadaceae*","*Halieaceae*",
                               "*Halomonadaceae*","*Hyphomicrobiaceae*","*Kangiellaceae*","*Legionellaceae*",
                               "*Marinobacteraceae*", "*Micavibrionaceae*","*Moraxellaceae*","*Pseudoalteromonadaceae*",
                               "*Pseudomonadaceae*","*Rhizobiaceae*", "*Rhodobacteraceae*", "*SAR116_clade*","*SAR86_clade*",
                               "*Salinisphaeraceae*","*Solimonadaceae*", "*Sphingomonadaceae*","*Spongiibacteraceae*",
                               "*Stappiaceae*", "*Vibrionaceae*","*Unknown_Family*", "Unclassified *Bacteria*", "Other"),
                    # breaks = c("*AEGEAN-169_marine_group*", "*Alcanivoracaceae1*", "*Algiphilaceae*", "*Alteromonadaceae*",
                    #            "*Arenicellaceae*","*Clade_I*","*Halieaceae*","*Halomonadaceae*",
                    #            "*Pseudoalteromonadaceae*", "*Rhodobacteraceae*", "*SAR86_clade*","*Solimonadaceae*", 
                    #            "*Sphingomonadaceae*", "*Stappiaceae*", "*Unknown_Family*","Other"),
                    values = c("#771155", "#AA4488", "#CC99BB", "#114477",
                               "#4477AA", "#77AADD", "#117777", "#44AAAA", 
                               "#77CCCC", "#117744", "#44AA77", "#88CCAA",
                               "#777711", "#AAAA44", "#DDDD77", "#774411", 
                               "#AA7744", "#DDAA77", "#771122", "#AA4455", 
                               "#DD7788", "#005A32", "purple", "green", 
                               "#CAE0AB", "#F7EE55", "#F6C141", "#F1932D", 
                               "#E8601C", "#DC050C","#99000D", "#000000",
                               "yellow")) +
  scale_x_discrete(breaks = datelist,
                   labels = rev(datelist)) +
  scale_y_continuous(expand = c(0,0)) +
  theme(legend.text = element_markdown(size = 12),
        legend.key.size = unit(14, "pt"),
        strip.background = element_blank(), 
        strip.placement = 'outside',
        axis.text.x = element_text(angle = 0, size = 12))

print(g)
dev.off()

dev.off() 
(device = "RStudioGD")

Bacteroidota relative abundance (Supplemental figure 7B)

# Import the metadata
metadata <- read.csv('metadata.csv', header = T, stringsAsFactors = F, na = "NA") %>%   # included na = 'NA', to make it stored as an NA not a string
  select(SampleID, Sample.Type, Date) %>% 
  rename(Filter = Sample.Type, sample = SampleID) %>% 
  rename_all(tolower) %>% 
  drop_na(filter)

metadata %>% count(filter)
metadata$date <- as.Date(metadata$date, "%d/%m/%Y") 
metadata$date <- as.character(metadata$date)
metadata

datelist <- metadata$date

# ASV counts for each feature
merged <- read.csv('metadata_16S_bac.csv', header = T, stringsAsFactors = F) 

otu_counts <- as_tibble(merged %>% 
                          rename(otu = id) %>% 
                          select(-Sequence,-Confidence,-Taxon) %>% 
                          pivot_longer(-otu,names_to = 'sample',values_to = 'count'))

otu_counts

# Taxonomy for each feature
taxonomy <- as_tibble(merged) %>% 
  rename(otu = id) %>% 
  select(otu, Taxon) %>% 
  rename_all(tolower) %>% 
  separate(taxon,
           into = c("domain", "phylum", "class", "order", "family", "genus", "species"),
           sep = "; ",
           fill = 'right') %>%    # NA values on the right if it need to add information
  mutate(domain = str_replace(domain, "d__", ""),
         phylum = str_replace(phylum, "p__", ""),
         class = str_replace(class, "c__", ""),
         order = str_replace(order, "o__", ""),
         family = str_replace(family, "f__", ""),
         genus = str_replace(genus, "g__", ""),
         species = str_replace(species, "s__", "")) %>% 
  filter(phylum == 'Bacteroidota' | domain == 'Unassigned')

taxonomy

# Join the dataframes with inner_join and create a relative abundance column

otu_rel_abund <- as_tibble(inner_join(metadata, otu_counts, by='sample')) %>% #as_tibble(anti_join(metadata, otu_counts))    # to see what is missing from metadata
  inner_join(., taxonomy, by = 'otu') %>% 
  group_by(date) %>% 
  mutate(rel_abund = count / sum(count)) %>% 
  ungroup()

# Check to see if your relative abundances for each samples add up to 1
otu_rel_abund %>% 
  group_by(date) %>% 
  summarize(total = sum(rel_abund))

# Make the dataframe even tidyer

otu_rel_abund <- as_tibble(inner_join(metadata, otu_counts, by='sample')) %>% #as_tibble(anti_join(metadata, otu_counts))    # to see what is missing from metadata
  inner_join(., taxonomy, by = 'otu') %>% 
  group_by(date) %>% 
  mutate(rel_abund = count / sum(count)) %>% 
  ungroup() %>% 
  pivot_longer(cols = c("domain", "phylum", "class", "order", "family", "genus", "species"),
               names_to = "level",
               values_to = "taxon") %>% 
  mutate(filter = factor(filter,
                         levels = c("2", "3", "10")))

taxon_rel_abund <- otu_rel_abund %>% 
  filter(level == 'family') %>% 
  group_by(date, taxon) %>% 
  summarize(rel_abund = 100*sum(rel_abund), .groups = 'drop') %>%       # grouped phylums within each sample, gives us the rel abund of that phylum in that sample
  mutate(taxon = replace_na(taxon, "Unclassified Bacteria")) %>% 
  mutate(taxon = str_replace(taxon,
                             "Unclassified (.*)", "Unclassified *\\1*"),
         taxon = str_replace(taxon, "^(\\S*)$", "*\\1*")) 

taxon_rel_abund %>% 
  group_by(taxon) %>% 
  summarize(max = max(rel_abund)) %>%    # Group by taxon, and I want to look across the 3 filter sizes, so do summarize, and see what the maximum rel abund of each phylum across the three groups
  arrange(desc(max))

taxon_pool <- taxon_rel_abund %>% 
  group_by(date, taxon) %>% 
  summarize(mean = mean(rel_abund), .groups = 'drop') %>%   # filter, taxon, and mean, 3 filters and 50 phyla
  group_by(taxon) %>%                                       # relative abund of all samples across filters
  summarize(pool = max(mean) < 2,                           # If the maximum mean value within each of the filter size is less than one percent, it will be pooled
            mean = mean(mean),  # the average relative abundance across the filter sizes, we make this new column to used in the mutate factor taxon step below
            .groups = 'drop')    # Now summarize, and pool everything that has a less than 1% max mean realtive abundance across all taxa and filter sizes

test <- inner_join(taxon_rel_abund, taxon_pool, by = 'taxon') %>% 
  mutate(taxon = if_else(pool, "Other", taxon)) %>%   # if pool was true, we put Other, if it was false we put the taxon name
  group_by(date, taxon) %>%                         # group by taxon, and pool the different phyla, with each filter size
  summarize(rel_abund = sum(rel_abund), 
            mean = min(mean),
            .groups = 'drop') %>% 
  mutate(taxon = factor(taxon),
         taxon = fct_reorder(taxon, mean, .desc=TRUE),
         taxon = fct_shift(taxon, n = 1)) # reorder legend, by reordering the factors, go back to pool line

png("bacteria_Bacteroidota_phylum_bar_rel_abund_all_dates.png", res = 300, units = 'in', width = 9, height = 8)
g <- ggplot(test, aes(x = rev(date), y = rel_abund, fill = taxon)) +
  geom_col(color = 'black', width = 0.5) +
  coord_flip() + 
  labs(x = "Date",
       y = "Relative Abundance (%)") +
  theme_classic() + 
  scale_fill_manual(name = NULL,
                    breaks = c("*Balneolaceae*","*Crocinitomicaceae*","*Cryomorphaceae*","*Cyclobacteriaceae*",
                               "*Flavobacteriaceae*","*NS11-12_marine_group*", "*NS9_marine_group*", "*Rhodothermaceae*",
                               "*Saprospiraceae**","*Schleiferiaceae**","*Weeksellaceae*","*uncultured*", "Other","Unclassified *Bacteria*"),
                    values = c('#771155', '#AA4488',  '#114477', '#4477AA', '#77AADD', '#117777', 
                               '#774411', '#44AAAA', '#77CCCC', '#44AA77', '#AA7744','#88CCAA','darkgray', '#AAAA44','#777711', 
                               '#117744',  '#771122', '#CC99BB', '#DDDD77','#DDAA77', '#AA4455')) +
  scale_x_discrete(breaks = datelist,
                   labels = rev(datelist)) +
  scale_y_continuous(expand = c(0,0)) +
  theme(legend.text = element_markdown(size = 12),
        legend.key.size = unit(14, "pt"),
        strip.background = element_blank(), 
        strip.placement = 'outside',
        axis.text.x = element_text(angle = 0, size = 12))

print(g)
dev.off()

_________________________________________________________________________________________________________

Community diversity and structure

Eukaryote diversity metrics (Figure 4A, B, C)

remotes::install_github("gavinsimpson/ggvegan")
library(remotes)
library(ggvegan)
library(vegan)
library(ggplot2)
library(tidyverse)

# Eukaryotes

# ASV counts for each feature into a matrix
merged <- read.csv('metadata_18S.csv', header=T, stringsAsFactors=F)

otumat <- merged %>%
  select(-Sequence,-Confidence,-Taxon) %>% 
  column_to_rownames(var="id") 

otumatdf <- as.data.frame(t(otumat))
otumatdf$SampleID <- rownames(otumatdf)

# Taxonomy
taxonomy <- merged %>%
  rename(otu=id) %>%
  select(otu, Taxon) %>%
  rename_all(toupper) %>%
  separate(TAXON,
           into=c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"),
           sep="; ",
           fill='right') %>%    # NA values on the right if it need to add information
  mutate(Domain=str_replace(Domain, "d__", ""),
         Phylum=str_replace(Phylum, "p__", ""),
         Class=str_replace(Class, "c__", ""),
         Order=str_replace(Order, "o__", ""),
         Family=str_replace(Family, "f__", ""),
         Genus=str_replace(Genus, "g__", ""),
         Species=str_replace(Species, "s__", "")) %>%
  mutate(Domain=str_replace(Domain, "Unassigned", "Unknown")) %>% 
  filter(Domain == 'Eukaryota' | Domain == 'Unassigned') %>%
  filter(!Phylum %in% c('Vertebrata', 'Annelida', 'Arthropoda', 'Bryozoa' , 'Cnidaria', 'Ciliophora',
                        'Mollusca','Nemertea', 'Porifera', 'Echinodermata','Nematozoad','Rotifera',
                        'Tunicata','Gracilipodida','Entoprocta','Florideophycidae', 'Holozoa', 
                        'Incertae_Sedis','Bicosoecida','Pavlovophyceae'))

taxonomy
rownames(taxonomy) <- taxonomy$OTU
taxmat <- taxonomy[,-1]
taxmat <- as.matrix(taxmat)
taxmat
class(taxmat)

taxmatrans <- as.matrix(t(taxmat))

# Input sample metadata

envs <- read_csv('Metadata_YB.csv') 

allenvnum <- envs %>% 
  select(c(2:8,10:15)) %>% 
  #filter(Sample.Type == 2) %>% 
  #rename_all(~ gsub("_", " ", .)) %>% 
  rename('NO2+NO3'='No2.No3_umol_L') %>% 
  rename_all(~ gsub("_umol_L", "", .)) %>% 
  rename_all(~ gsub("ODO_mgL", "DO", .))

# Merge the otumat datafram with the environment dataframe

datas <- merge(allenvnum, otumatdf, by='SampleID') %>% 
  na.omit()

rownames(datas) <- datas$SampleID
colnames(datas) <- make.names(colnames(datas))

# make Habitat an ordered variable
allenvnum$Season <- ordered(allenvnum$Season, c("spring", "summer", "autumn", "winter"))

abund.mat <- datas[,14:28118]

# store computed indices in a new data frame called 'indices'
index <- datas[,c("Season","Sample.Type")]

index$Richness <- rowSums(abund.mat > 0)
index$Shannon <- diversity(abund.mat) # shannon is default
index$Rarefied <- c(rarefy(abund.mat[1:28105,], sample=91))

png("euks_diversity_12pt.png", res=300, units='in', width=15, height=8)
par(mar=c(3,5,3,2), mfrow=c(1,2))
colors=c('#ED7D31','#548235','#C00000', '#4472C4')
boxplot(Richness ~ Season, data=index, boxwex=.7, col=colors, 
        cex.axis=2, cex.lab=2, ylab="Richness")
boxplot(Shannon ~ Season, data=index, boxwex=.7, col=colors, 
        cex.axis=2, cex.lab=2, ylab="Shannon diversity")
dev.off()

# try anova with linear model (lm) and aov model (preferred)
mod.richness <- lm(Richness ~ Season, data=index)
mod.shannon <- lm(Shannon ~ Season, data=index)

aov.richness <- aov(Richness ~ Season, data=index)
aov.shannon <- aov(Shannon ~ Season, data=index)

envs <- read_csv('Metadata_YB.csv') 

allenvnum <- envs %>% 
  select(c(2:8,10:15)) %>% 
  #filter(Sample.Type == 2) %>% 
  #rename_all(~ gsub("_", " ", .)) %>% 
  rename('NO2+NO3'='No2.No3_umol_L') %>% 
  rename_all(~ gsub("_umol_L", "", .)) %>% 
  rename_all(~ gsub("ODO_mgL", "DO", .))

# ASV counts for each feature into a matrix
merged <- read.csv('metadata_18S.csv', header=T, stringsAsFactors=F)

otumat <- merged %>%
  select(-Sequence,-Confidence,-Taxon) %>% 
  column_to_rownames(var="id") 

otumatdf <- as.data.frame(t(otumat))
dist <- vegdist(otumatdf,  method = "bray")

metaMDS(dist)



# Copy the very first part of the Randi Griffith tutorial for the NMDS

install.packages("remotes")
remotes::install_github("gavinsimpson/ggvegan")
library(remotes)
library(ggvegan)
library(vegan)
library(ggplot2)
library(tidyverse)

# Eukaryotes

# ASV counts for each feature into a matrix
merged <- read.csv('metadata_18S.csv', header=T, stringsAsFactors=F)

otumat <- merged %>%
  select(-Sequence,-Confidence,-Taxon) %>% 
  column_to_rownames(var="id") 

otumatdf <- as.data.frame(t(otumat))
otumatdf$SampleID <- rownames(otumatdf)

# Taxonomy
taxonomy <- merged %>%
  rename(otu=id) %>%
  select(otu, Taxon) %>%
  rename_all(toupper) %>%
  separate(TAXON,
           into=c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"),
           sep="; ",
           fill='right') %>%    # NA values on the right if it need to add information
  mutate(Domain=str_replace(Domain, "d__", ""),
         Phylum=str_replace(Phylum, "p__", ""),
         Class=str_replace(Class, "c__", ""),
         Order=str_replace(Order, "o__", ""),
         Family=str_replace(Family, "f__", ""),
         Genus=str_replace(Genus, "g__", ""),
         Species=str_replace(Species, "s__", "")) %>%
  mutate(Domain=str_replace(Domain, "Unassigned", "Unknown")) %>% 
  filter(Domain == 'Eukaryota' | Domain == 'Unassigned') %>%
  filter(!Phylum %in% c('Vertebrata', 'Annelida', 'Arthropoda', 'Bryozoa' , 'Cnidaria', 'Ciliophora',
                        'Mollusca','Nemertea', 'Porifera', 'Echinodermata','Nematozoad','Rotifera',
                        'Tunicata','Gracilipodida','Entoprocta','Florideophycidae', 'Holozoa', 
                        'Incertae_Sedis','Bicosoecida','Pavlovophyceae'))

taxonomy
rownames(taxonomy) <- taxonomy$OTU
taxmat <- taxonomy[,-1]
taxmat <- as.matrix(taxmat)
taxmat
class(taxmat)

taxmatrans <- as.matrix(t(taxmat))

# NMDS

envs <- read_csv('Metadata_YB.csv') 

allenvnum <- envs %>% 
  select(c(2:8,10:15)) %>% 
  #filter(Sample.Type == 2) %>% 
  #rename_all(~ gsub("_", " ", .)) %>% 
  rename('NO2+NO3'='No2.No3_umol_L') %>% 
  rename_all(~ gsub("_umol_L", "", .)) %>% 
  rename_all(~ gsub("ODO_mgL", "DO", .))

# Merge the otumat datafram with the environment dataframe

datas <- merge(allenvnum, otumatdf, by='SampleID') %>% 
  na.omit()

rownames(datas) <- datas$SampleID
colnames(datas) <- make.names(colnames(datas))

# The ASVs (rows = sample)
abund.mat <- datas[,14:28118]

# The predictor variables (rows = sample)
fijado <- datas[,6:13] %>% 
  na.omit() %>% 
  select(-'NO2.NO3')

# convert abund.mat to a matrix
m_com = as.matrix(abund.mat)

# Perform the NMDS ordination
set.seed(19870912)
#dist <- avgdist(m_com,  dmethod = "bray", sample = 2500)
dist <- vegdist(m_com,  method = "bray")
nmds = metaMDS(dist)
# nmds <- dist %>%
#   metaMDS(trace = F) %>%
#   ordiplot(type = "t") %>%
#   text("sites")
nmds_stress <- nmds$stress
# stress = 0.17

# Run the envfit function with our environmental data
en = envfit(nmds, fijado, permutations = 999, na.rm = TRUE)
en

# Plot with base R
plot(nmds)
plot(en)

# Extract the data scores for ggplot
data.scores = as.data.frame(scores(nmds))
data.scores$SampleID <- rownames(data.scores)
data.scores <- merge(data.scores, allenvnum, by = 'SampleID')

en_coord_cont = as.data.frame(scores(en, "vectors")) * ordiArrowMul(en)

png("NMDS_Euks.png", res=300, units='in', width=9, height=8)
ggplot(data.scores, aes(x = NMDS1, y = NMDS2)) +
  geom_point(data = data.scores, 
             aes(colour = as.factor(Season), shape = as.factor(Sample.Type)), 
             show.legend = T, size = 3) +
  scale_color_manual(name = "Seasons", 
                     values=colors,
                     labels = c('Autumn','Spring','Summer','Winter')) +
  scale_shape_manual(name = "Size fraction", 
                     values=c(15, 16, 17),
                     breaks = c('2', '3', '10'),
                     labels = c('0.2 um', '3.0 um', '10.0 um')) +
  geom_segment(data = en_coord_cont, 
               aes(x = 0, y = 0, 
                   xend = NMDS1*.6, yend = NMDS2*.6), 
               arrow = arrow(length = unit(.6, 'picas')), colour="red") +
  geom_text(data = en_coord_cont, aes(x = NMDS1*.63, y = NMDS2*.63), colour = "black", 
          fontface = "bold", label = row.names(en_coord_cont)) + 
  # stat_ellipse(data = data.scores,
  #              aes(color=as.factor(Season), show.legend = F)) +
  xlab("NMDS1") + 
  ylab("NMDS2") +
  theme_bw(base_size = 15) +
  theme(axis.text.y = element_text(color="black", size=17, angle=0),
        axis.text.x = element_text(color="black",  size=17, angle=0),
        axis.title.x = element_text(color="black", size=20, angle=0),
        axis.title.y = element_text(color="black", size=20, angle=90),
        legend.text=element_markdown(),
        legend.key.size=unit(15, "pt"),
        strip.background=element_blank(), 
        strip.placement='outside',
        panel.border = element_rect(colour = "black", fill = NA))

dev.off()

Bacterial diversity metrics (Figure 4A, B, C)

remotes::install_github("gavinsimpson/ggvegan")
library(remotes)
library(ggvegan)
library(vegan)
library(ggplot2)
library(tidyverse)

# Bacteria

# ASV counts for each feature into a matrix
merged <- read.csv('metadata_16S_bac.csv', header=T, stringsAsFactors=F)

otumat <- merged %>%
  select(-Sequence,-Confidence,-Taxon) %>% 
  column_to_rownames(var="id") 

otumatdf <- as.data.frame(t(otumat))
otumatdf$SampleID <- rownames(otumatdf)

# Taxonomy
taxonomy <- merged %>% 
  rename(otu = id) %>% 
  select(otu, Taxon) %>% 
  rename_all(tolower) %>% 
  separate(taxon,
           into = c("domain", "phylum", "class", "order", "family", "genus", "species"),
           sep = "; ",
           fill = 'right') %>%    # NA values on the right if it need to add information
  mutate(domain = str_replace(domain, "d__", ""),
         phylum = str_replace(phylum, "p__", ""),
         class = str_replace(class, "c__", ""),
         order = str_replace(order, "o__", ""),
         family = str_replace(family, "f__", ""),
         genus = str_replace(genus, "g__", ""),
         species = str_replace(species, "s__", "")) %>% 
  filter(domain == 'Bacteria' | domain == 'Unassigned')

taxonomy
rownames(taxonomy) <- taxonomy$otu
taxmat <- taxonomy[,-1]
taxmat <- as.matrix(taxmat)
taxmat
class(taxmat)

taxmatrans <- as.matrix(t(taxmat))

# Input sample metadata

envs <- read_csv('Metadata_YB.csv') 

allenvnum <- envs %>% 
  select(c(2:8,10:15)) %>% 
  #filter(Sample.Type == 2) %>% 
  #rename_all(~ gsub("_", " ", .)) %>% 
  rename('NO2+NO3'='No2.No3_umol_L') %>% 
  rename_all(~ gsub("_umol_L", "", .)) %>% 
  rename_all(~ gsub("ODO_mgL", "DO", .))


# Merge the otumat datafram with the environment dataframe

datas <- merge(allenvnum, otumatdf, by='SampleID') %>% 
  na.omit()

rownames(datas) <- datas$SampleID
colnames(datas) <- make.names(colnames(datas))

# make Habitat an ordered variable
allenvnum$Season <- ordered(allenvnum$Season, c("spring", "summer", "autumn", "winter"))

abund.mat <- datas[,14:83966]

# store computed indices in a new data frame called 'indices'
index <- datas[,c("Season","Sample.Type")]

index$Richness <- rowSums(abund.mat > 0)
index$Shannon <- diversity(abund.mat) # shannon is default
#index$Rarefied <- c(rarefy(abund.mat[1:28105,], sample=91))

png("bac_diversity_12pt.png", res=300, units='in', width=15, height=8)
par(mar=c(3,5,3,2), mfrow=c(1,2))
colors = c('#ED7D31','#548235','#C00000', '#4472C4')
boxplot(Richness ~ Season, data=index, boxwex=.7, col=colors, 
        cex.axis=2, cex.lab=2, ylab="Richness")
boxplot(Shannon ~ Season, data=index, boxwex=.7, col=colors, 
        cex.axis=2, cex.lab=2, ylab="Shannon diversity")
dev.off()

# try anova with linear model (lm) and aov model (preferred)
mod.richness <- lm(Richness ~ Season, data=index)
mod.shannon <- lm(Shannon ~ Season, data=index)

aov.richness <- aov(Richness ~ Season, data=index)
aov.shannon <- aov(Shannon ~ Season, data=index)

# NMDS
# Install and load the following packages
install.packages("vegan")
install.packages("ape")
install.packages("dplyr")

library(vegan)
library(ape)
library(dplyr)

# ASV counts for each feature into a matrix
merged <- read.csv('metadata_16S_bac.csv', header=T, stringsAsFactors=F)

otumat <- merged %>%
  select(-Sequence,-Confidence,-Taxon) %>% 
  column_to_rownames(var="id") 

otumatdf <- as.data.frame(t(otumat))
otumatdf$SampleID <- rownames(otumatdf)

# Taxonomy
taxonomy <- merged %>% 
  rename(otu = id) %>% 
  select(otu, Taxon) %>% 
  rename_all(tolower) %>% 
  separate(taxon,
           into = c("domain", "phylum", "class", "order", "family", "genus", "species"),
           sep = "; ",
           fill = 'right') %>%    # NA values on the right if it need to add information
  mutate(domain = str_replace(domain, "d__", ""),
         phylum = str_replace(phylum, "p__", ""),
         class = str_replace(class, "c__", ""),
         order = str_replace(order, "o__", ""),
         family = str_replace(family, "f__", ""),
         genus = str_replace(genus, "g__", ""),
         species = str_replace(species, "s__", "")) %>% 
  filter(domain == 'Bacteria' | domain == 'Unassigned')

taxonomy
rownames(taxonomy) <- taxonomy$otu
taxmat <- taxonomy[,-1]
taxmat <- as.matrix(taxmat)
taxmat
class(taxmat)

taxmatrans <- as.matrix(t(taxmat))

# Input sample metadata

envs <- read_csv('Metadata_YB.csv') 

allenvnum <- envs %>% 
  select(c(2:8,10:15)) %>% 
  #filter(Sample.Type == 2) %>% 
  #rename_all(~ gsub("_", " ", .)) %>% 
  rename('NO2+NO3'='No2.No3_umol_L') %>% 
  rename_all(~ gsub("_umol_L", "", .)) %>% 
  rename_all(~ gsub("ODO_mgL", "DO", .))

# Merge the otumat datafram with the environment dataframe

datas <- merge(allenvnum, otumatdf, by='SampleID') %>% 
  na.omit()

rownames(datas) <- datas$SampleID
colnames(datas) <- make.names(colnames(datas))

# make Habitat an ordered variable
allenvnum$Season <- ordered(allenvnum$Season, c("spring", "summer", "autumn", "winter"))

abund.mat <- datas[,14:83966]

# Merge the otumat datafram with the environment dataframe

datas <- merge(allenvnum, otumatdf, by='SampleID') %>% 
  na.omit()

rownames(datas) <- datas$SampleID
colnames(datas) <- make.names(colnames(datas))

# The ASVs (rows = sample)
abund.mat <- datas[,14:83966]

# The predictor variables (rows = sample)
fijado <- datas[,6:13] %>% 
  na.omit() %>% 
  select(-'NO2.NO3')

# convert abund.mat to a matrix
m_com = as.matrix(abund.mat)

# Perform the NMDS ordination
set.seed(19870912)
#dist <- avgdist(m_com,  dmethod = "bray", sample = 10000)
dist <- vegdist(m_com,  method = "bray")
nmds = metaMDS(dist)
# nmds <- dist %>%
#   metaMDS(trace = F) %>%
#   ordiplot(type = "t") %>%
#   text("sites")
nmds_stress <- nmds$stress

# Run the envfit function with our environmental data
en = envfit(nmds, fijado, permutations = 999, na.rm = TRUE)
en

# Plot with base R
plot(nmds)
plot(en)

# Extract the data scores for ggplot
data.scores = as.data.frame(scores(nmds))
#data.scores = as.data.frame(scores(nmds)$sites)
data.scores$SampleID <- rownames(data.scores)
data.scores <- merge(data.scores, allenvnum, by = 'SampleID')

en_coord_cont = as.data.frame(scores(en, "vectors")) * ordiArrowMul(en)

png("NMDS_Bac.png", res=300, units='in', width=9, height=8)
ggplot(data.scores, aes(x = NMDS1, y = NMDS2)) +
  geom_point(data = data.scores, 
             aes(colour = as.factor(Season), shape = as.factor(Sample.Type)), 
             show.legend = T, size = 3) +
  scale_color_manual(name = "Seasons", 
                     values=colors,
                     labels = c('Autumn','Spring','Summer','Winter')) +
  scale_shape_manual(name = "Size fraction", 
                     values=c(15, 16, 17),
                     breaks = c('2', '3', '10'),
                     labels = c('0.2 um', '3.0 um', '10.0 um')) +
  geom_segment(data = en_coord_cont, 
               aes(x = 0, y = 0, 
                   xend = NMDS1*.6, yend = NMDS2*.6), 
               arrow = arrow(length = unit(.6, 'picas')), colour="red") +
  geom_text(data = en_coord_cont, aes(x = NMDS1*.64, y = NMDS2*.64), colour = "black", 
            fontface = "bold", label = row.names(en_coord_cont)) + 
  xlab("NMDS1") + 
  ylab("NMDS2") +
  theme_bw(base_size = 15) +
  theme(axis.text.y = element_text(color="black", size=17, angle=0),
        axis.text.x = element_text(color="black",  size=17, angle=0),
        axis.title.x = element_text(color="black", size=20, angle=0),
        axis.title.y = element_text(color="black", size=20, angle=90),
        legend.text=element_markdown(),
        legend.key.size=unit(15, "pt"),
        strip.background=element_blank(), 
        strip.placement='outside',
        panel.border = element_rect(colour = "black", fill = NA))

dev.off()

Archaeal diversity metrics (Figure 4A, B, C)

remotes::install_github("gavinsimpson/ggvegan")
library(remotes)
library(ggvegan)
library(vegan)
library(ggplot2)
library(tidyverse)

# Eukaryotes

# ASV counts for each feature into a matrix
merged <- read.csv('metadata_16S_arc.csv', header=T, stringsAsFactors=F)

otumat <- merged %>%
  select(-Sequence,-Confidence,-Taxon) %>% 
  column_to_rownames(var="id") 

otumatdf <- as.data.frame(t(otumat))
otumatdf$SampleID <- rownames(otumatdf)

# Taxonomy
taxonomy <- merged %>% 
  rename(otu = id) %>% 
  select(otu, Taxon) %>% 
  rename_all(tolower) %>% 
  separate(taxon,
           into = c("domain", "phylum", "class", "order", "family", "genus", "species"),
           sep = "; ",
           fill = 'right') %>%    # NA values on the right if it need to add information
  mutate(domain = str_replace(domain, "d__", ""),
         phylum = str_replace(phylum, "p__", ""),
         class = str_replace(class, "c__", ""),
         order = str_replace(order, "o__", ""),
         family = str_replace(family, "f__", ""),
         genus = str_replace(genus, "g__", ""),
         species = str_replace(species, "s__", "")) %>% 
  filter(domain == 'Archaea' | domain == 'Unassigned')

taxonomy
rownames(taxonomy) <- taxonomy$OTU
taxmat <- taxonomy[,-1]
taxmat <- as.matrix(taxmat)
taxmat
class(taxmat)

taxmatrans <- as.matrix(t(taxmat))

# Input sample metadata

envs <- read_csv('Metadata_YB.csv') 

allenvnum <- envs %>% 
  select(c(2:8,10:15)) %>% 
  #filter(Sample.Type == 2) %>% 
  #rename_all(~ gsub("_", " ", .)) %>% 
  rename('NO2+NO3'='No2.No3_umol_L') %>% 
  rename_all(~ gsub("_umol_L", "", .)) %>% 
  rename_all(~ gsub("ODO_mgL", "DO", .))


# Merge the otumat datafram with the environment dataframe

datas <- merge(allenvnum, otumatdf, by='SampleID') %>% 
  na.omit()

rownames(datas) <- datas$SampleID
colnames(datas) <- make.names(colnames(datas))

# make Habitat an ordered variable
allenvnum$Season <- ordered(allenvnum$Season, c("spring", "summer", "autumn", "winter"))

abund.mat <- datas[,14:9586]

# store computed indices in a new data frame called 'indices'
index <- datas[,c("Season","Sample.Type")]

index$Richness <- rowSums(abund.mat > 0)
index$Shannon <- diversity(abund.mat) # shannon is default
index$Rarefied <- c(rarefy(abund.mat[1:28105,], sample=91))

png("arc_diversity.png", res=300, units='in', width=9, height=8)
par(mar=c(3,4,3,2), mfrow=c(2,2))
colors=c('#771155','#114477','#117777', '#AA7744')
boxplot(Richness ~ Season, data=index, boxwex=.7, col=colors, 
        cex.axis=1, ylab="Richness")
boxplot(Shannon ~ Season, data=index, boxwex=.7, col=colors, 
        cex.axis=1, ylab="Shannon diversity")
dev.off()

# try anova with linear model (lm) and aov model (preferred)
mod.richness <- lm(Richness ~ Season, data=index)
mod.shannon <- lm(Shannon ~ Season, data=index)

aov.richness <- aov(Richness ~ Season, data=index)
aov.shannon <- aov(Shannon ~ Season, data=index)

install.packages("vegan")
install.packages("ape")
install.packages("dplyr")

library(vegan)
library(ape)
library(dplyr)
library(tidyverse)


# ASV counts for each feature into a matrix
merged <- read.csv('metadata_16S_arc.csv', header=T, stringsAsFactors=F)

otumat <- merged %>%
  select(-Sequence,-Confidence,-Taxon) %>% 
  column_to_rownames(var="id") 

otumatdf <- as.data.frame(t(otumat))
otumatdf$SampleID <- rownames(otumatdf)

# Taxonomy
taxonomy <- merged %>% 
  rename(otu = id) %>% 
  select(otu, Taxon) %>% 
  rename_all(tolower) %>% 
  separate(taxon,
           into = c("domain", "phylum", "class", "order", "family", "genus", "species"),
           sep = "; ",
           fill = 'right') %>%    # NA values on the right if it need to add information
  mutate(domain = str_replace(domain, "d__", ""),
         phylum = str_replace(phylum, "p__", ""),
         class = str_replace(class, "c__", ""),
         order = str_replace(order, "o__", ""),
         family = str_replace(family, "f__", ""),
         genus = str_replace(genus, "g__", ""),
         species = str_replace(species, "s__", "")) %>% 
  filter(domain == 'Archaea' | domain == 'Unassigned')

taxonomy
rownames(taxonomy) <- taxonomy$OTU
taxmat <- taxonomy[,-1]
taxmat <- as.matrix(taxmat)
taxmat
class(taxmat)

taxmatrans <- as.matrix(t(taxmat))

# Input sample metadata

envs <- read_csv('Metadata_YB.csv') 

allenvnum <- envs %>% 
  select(c(2:8,10:15)) %>% 
  #filter(Sample.Type == 2) %>% 
  #rename_all(~ gsub("_", " ", .)) %>% 
  rename('NO2+NO3'='No2.No3_umol_L') %>% 
  rename_all(~ gsub("_umol_L", "", .)) %>% 
  rename_all(~ gsub("ODO_mgL", "DO", .))


# Merge the otumat datafram with the environment dataframe

datas <- merge(allenvnum, otumatdf, by='SampleID') %>% 
  na.omit()

rownames(datas) <- datas$SampleID
colnames(datas) <- make.names(colnames(datas))

# make Habitat an ordered variable
allenvnum$Season <- ordered(allenvnum$Season, c("spring", "summer", "autumn", "winter"))

abund.mat <- datas[,14:9598]

# Merge the otumat datafram with the environment dataframe

datas <- merge(allenvnum, otumatdf, by='SampleID') %>% 
  na.omit()

rownames(datas) <- datas$SampleID
colnames(datas) <- make.names(colnames(datas))

# make Habitat an ordered variable
allenvnum$Season <- ordered(allenvnum$Season, c("spring", "summer", "autumn", "winter"))

abund.mat <- datas[,14:9598]

# Merge the otumat datafram with the environment dataframe

datas <- merge(allenvnum, otumatdf, by='SampleID') %>% 
  na.omit()

rownames(datas) <- datas$SampleID
colnames(datas) <- make.names(colnames(datas))

# The ASVs (rows = sample)
abund.mat <- datas[,14:9598]

# The predictor variables (rows = sample)
fijado <- datas[,6:13] %>% 
  na.omit() %>% 
  select(-'NO2.NO3')

# convert abund.mat to a matrix
m_com = as.matrix(abund.mat)

# Perform the NMDS ordination
set.seed(19870912)
#dist <- avgdist(m_com,  dmethod = "bray", sample = 10000)
dist <- vegdist(m_com,  method = "bray")
nmds = metaMDS(dist)
nmds_stress <- nmds$stress

# Run the envfit function with our environmental data
en = envfit(nmds, fijado, permutations = 999, na.rm = TRUE)
en

# Plot with base R
plot(nmds)
plot(en)

# Extract the data scores for ggplot
data.scores = as.data.frame(scores(nmds))
#data.scores = as.data.frame(scores(nmds)$sites)
data.scores$SampleID <- rownames(data.scores)
data.scores <- merge(data.scores, allenvnum, by = 'SampleID')

en_coord_cont = as.data.frame(scores(en, "vectors")) * ordiArrowMul(en)

png("NMDS_Arc.png", res=300, units='in', width=9, height=8)
ggplot(data.scores, aes(x = NMDS1, y = NMDS2)) +
  geom_point(data = data.scores, 
             aes(colour = as.factor(Season), shape = as.factor(Sample.Type)), 
             show.legend = T, size = 3) +
  scale_color_manual(name = "Seasons", 
                     values=colors,
                     labels = c('Autumn','Spring','Summer','Winter')) +
  scale_shape_manual(name = "Size fraction", 
                     values=c(15, 16, 17),
                     breaks = c('2', '3', '10'),
                     labels = c('0.2 um', '3.0 um', '10.0 um')) +
  geom_segment(data = en_coord_cont, 
               aes(x = 0, y = 0, 
                   xend = NMDS1*.3, yend = NMDS2*.3), 
               arrow = arrow(length = unit(.6, 'picas')), colour="red") +
  geom_text(data = en_coord_cont, aes(x = NMDS1*.34, y = NMDS2*.34), colour = "black", 
            fontface = "bold", label = row.names(en_coord_cont)) + 
  #stat_ellipse(aes(color=Season), show.legend = F) +
  xlab("NMDS1") + 
  ylab("NMDS2") +
  theme_bw(base_size = 15) +
  theme(axis.text.y = element_text(color="black", size=17, angle=0),
        axis.text.x = element_text(color="black",  size=17, angle=0),
        axis.title.x = element_text(color="black", size=20, angle=0),
        axis.title.y = element_text(color="black", size=20, angle=90),
        legend.text=element_markdown(),
        legend.key.size=unit(15, "pt"),
        strip.background=element_blank(), 
        strip.placement='outside',
        panel.border = element_rect(colour = "black", fill = NA))

dev.off()

_________________________________________________________________________________________________________

Metacyc biological pathway

Diatom relative abundance (Supplemental figure )

# Load the pheatmap package
install.packages('matrixStats')
library(pheatmap)
library(tidyverse)
library(readxl)
library(ggplot2)
library(ggtext)
library(matrixStats)
library(RColorBrewer)
library(caroline)

# Input sample metadata

envs <- read_csv('Metadata_YB.csv') 

allenvnum <- envs %>% 
  select(c(2:8,10:15)) %>% 
  rename('NO2+NO3'='No2.No3_umol_L') %>% 
  rename_all(~ gsub("_umol_L", "", .)) %>% 
  rename_all(~ gsub("ODO_mgL", "DO", .)) %>% 
  filter(!SampleID == 'NA') %>% 
  filter(!SampleID == 'BB10')

# Annotation column: a data frame where the row names amtch the column names of the data_matrix

rownames(allenvnum) <- allenvnum$SampleID

# ASV counts for each feature into a matrix
merged <- read.csv('MetaCycPathways.csv', header=T, stringsAsFactors=F) %>%
  column_to_rownames(var='Description') %>% 
  select(-pathway)

# Check to see if they're relative abundance counts
# count <- rbind(counts, colSums(counts[1:90]))

merge_mat <- as.matrix(merged)

### Log2 normalization and then determine the genes that have the most variance
log_merge_mat <- as.matrix(log2(merged+1))

# Get the top most 50 variable genes
rv <- rowVars(merge_mat)
idx <- order(-rv)[1:40]

rv <- rowVars(log_merge_mat)
idx <- order(-rv)[1:40]

# Create column annotations for the sample designations. 

anncol <- data.frame(Var1 = factor(1:90, labels = c('1')))
rownames(anncol) <- allenvnum$SampleID
anncol$Season <- as.factor(allenvnum$Season)
anncol$Filter <- as.factor(allenvnum$Sample.Type)
anncol <- anncol[,-1]
anncol <- anncol[order(anncol$Season),]
anncol

# Change color of Seasons and filter size
# select my row annotation colors. Name the color vector clas2
my_colour = list(
  anncol,
  Filter = c('2' = 'yellow', '3' = "#FC4E07", '10' = '#696969'),
  Season = c(spring = '#AA4488',autumn = '#4477AA', summer = '#117777', winter = '#AA7744'))


# Reorder columns in merge_mat based on column order of annotation column, and Z-score the values

mat_ordered <- merge_mat[, rownames(anncol)]
mat_orderedz <- scale(mat_ordered)
head(mat_orderedz)

# set the color create color palette
paletteLength <- 100

my_color <- rev(brewer.pal(11, "RdBu"))
my_color = colorRampPalette(my_color)(paletteLength)

myBreaks <- c(seq(min(mat_orderedz), 0, length.out=ceiling(paletteLength/2) + 1), 
              seq(max(mat_orderedz)/paletteLength, max(mat_orderedz), length.out=floor(paletteLength/2)))

# Create the heatmap with annotation

png("metacycPie2log.png", res=300, units='in', width=18, height=14)
pheatmap(mat_orderedz[idx,], 
         annotation_col = anncol,
         annotation_colors = my_colour,
         main = "MetaCyc annotations",
         breaks = myBreaks,
         color = my_color,
         cellwidth = 6, 
         cellheight = 6, 
         fontsize_row = 6, 
         fontsize_col = 6, 
         #boarder_color = 'black', 
         cluster_rows = TRUE, 
         cluster_cols = TRUE,
         cutree_cols = 6)
dev.off()

_________________________________________________________________________________________________________

Network co-occurrence analysis

Cross-domain network analysis (Figure 5A)

# Install packages (only once on each device)

setwd("/Users/carlymoreno/Dropbox/Yas_bay")

# Required packages
install.packages("devtools")
install.packages("BiocManager")

if (!require("BiocManager", quietly = TRUE))
  install.packages("BiocManager")

# download gfortran dev package
# https://cran.r-project.org/bin/macosx/tools/

# Install NetCoMi
devtools::install_github("stefpeschel/NetCoMi", 
                         dependencies = c("Depends", "Imports", "LinkingTo"),
                         repos = c("https://cloud.r-project.org/",
                                   BiocManager::repositories()))

install.packages('ape','limma','tidyverse','phyloseq')

###############################################################################

# If previous installation fails, with warning that packages cannot be built, try first modifying the R Makevars file in the computer terminal


# ~/.R/Makevars - custom configuration to fix SpiecEasi install on macOS ARM64
# Uses Homebrew GCC/gfortran and overrides system R Makeconf which points to old /opt/gfortran

#nano ~/.R/Makevars     # copy and paste the 10 lines below

# Fortran compilers - point to Homebrew-installed gfortran-15
FC = /opt/homebrew/bin/gfortran-15
F77 = /opt/homebrew/bin/gfortran-15

# Fortran libraries - use Homebrew gcc libs only, no /opt/gfortran
FLIBS = -L/opt/homebrew/opt/gcc/lib/gcc/15 -lgfortran -lquadmath -lm

# C and C++ compilers - Homebrew gcc/g++ 15
CC = /opt/homebrew/bin/gcc-15
CXX = /opt/homebrew/bin/g++-15

# Compiler flags for C and C++ (enable OpenMP)
CFLAGS = -g -O2 -fopenmp
CXXFLAGS = -g -O2 -fopenmp

# Linker flags - libomp and gcc lib paths from Homebrew
LDFLAGS = -L/opt/homebrew/opt/libomp/lib -lomp -L/opt/homebrew/opt/gcc/lib/gcc/15
CPPFLAGS = -I/opt/homebrew/opt/libomp/include

# Force R to use the above Fortran libs during shared library linking
SHLIB_FCLD = $(FLIBS)

# close Terminal and R, and then reopen R

# And then these installation steps in R:

# Required packages
install.packages("devtools")
install.packages("BiocManager")

if (!require("BiocManager", quietly = TRUE))
  install.packages("BiocManager")

install.packages("pkgbuild")
pkgbuild::check_build_tools(debug = TRUE)

# Since two of NetCoMi's dependencies are only available on GitHub, 
# it is recommended to install them first:
devtools::install_github("zdk123/SpiecEasi")
devtools::install_github("GraceYoon/SPRING")

# Install NetCoMi
devtools::install_github("stefpeschel/NetCoMi", 
                         repos = c("https://cloud.r-project.org/",
                                   BiocManager::repositories()))


###############################################################################

# Start here

library(devtools)
library('BiocManager')
library(NetCoMi)
library(limma)
library(igraph)
library(ggplot2)
library(phyloseq)
library(tidyverse)
library(ape)

###############################################################################

# Make a phyloseq object

# Eukaryotes

# ASV counts for each feature into a matrix
merged <- read.csv('metadata_18S.csv', header = T, stringsAsFactors = F)

otumat <- merged %>%
  rename(otu = id) %>%
  select(-Sequence,-Confidence,-Taxon) 

rownames(otumat) <- otumat$otu
otumat <- otumat[,-1]
otumat <- as.matrix(otumat)
otumat
class(otumat)
otumatrans <- as.matrix(t(otumat))

# Taxonomy
taxonomy <- merged %>%
  rename(otu = id) %>%
  select(otu, Taxon) %>%
  rename_all(toupper) %>%
  separate(TAXON,
           into = c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"),
           sep = "; ",
           fill = 'right') %>%    # NA values on the right if it need to add information
  mutate(Domain = str_replace(Domain, "d__", ""),
         Phylum = str_replace(Phylum, "p__", ""),
         Class = str_replace(Class, "c__", ""),
         Order = str_replace(Order, "o__", ""),
         Family = str_replace(Family, "f__", ""),
         Genus = str_replace(Genus, "g__", ""),
         Species = str_replace(Species, "s__", "")) %>%
  mutate(Domain = str_replace(Domain, "Unassigned", "Unknown")) %>% 
  filter(Domain == 'Eukaryota' | Domain == 'Unassigned') %>%
  filter(!Phylum %in% c('Vertebrata', 'Annelida', 'Arthropoda', 'Bryozoa' , 'Cnidaria', 'Ciliophora',
                        'Mollusca','Nemertea', 'Porifera', 'Echinodermata','Nematozoad','Rotifera',
                        'Tunicata','Gracilipodida','Entoprocta','Florideophycidae', 'Holozoa', 
                        'Incertae_Sedis','Bicosoecida','Pavlovophyceae'))

taxonomy
rownames(taxonomy) <- taxonomy$OTU
taxmat <- taxonomy[,-1]
taxmat <- as.matrix(taxmat)
taxmat
class(taxmat)

taxmatrans <- as.matrix(t(taxmat))

# Combine into phyloseq object

OTU = otu_table(otumat, taxa_are_rows = TRUE)
OTU

TAX = tax_table(taxmat)
TAX

physeq = phyloseq(OTU, TAX)
physeq

plot_bar(physeq, fill = "Phylum")

# transposed
OTUtrans = otu_table(otumatrans, taxa_are_rows = FALSE)
OTUtrans

TAXtrans = tax_table(taxmatrans)
TAXtrans

physeq1 = phyloseq(OTUtrans, TAX)
physeq1

plot_bar(physeq1, fill = "Phylum")

# Input sample metadata

env <- read.csv('Metadata_YB.csv', header = T, stringsAsFactors = F) 

allenvnum <- env %>% 
  select(c(2:8,10:15)) %>% 
  filter(Sample.Type == 2) %>% 
  rename_all(~ gsub("_", " ", .)) %>% 
  rename('NO2+NO3' = 'No2.No3 umol L') %>% 
  rename_all(~ gsub("umol L", "", .)) %>% 
  rename_all(~ gsub("ODO mgL", "DO", .)) %>% 
  drop_na()

rownames(allenvnum) <- allenvnum$SampleID
allenvnum <- allenvnum[,c(-1,-2)]

sampledata = sample_data(allenvnum)

# Create a phylogentic tree
random_tree = rtree(ntaxa(physeq), rooted=TRUE, tip.label=taxa_names(physeq))
plot(random_tree)

# Merge new data with current phyloseq object:
physeq_euks = merge_phyloseq(physeq, sampledata, random_tree)
physeq_euks

plot_tree(physeq1, color="Season", label.tips="taxa_names", ladderize="left", plot.margin=0.3)

plot_heatmap(physeq_euks)
plot_heatmap(physeq_euks, taxa.label="Phylum")

saveRDS(physeq_euks, "physeq_euks.rds")

# Make a phyloseq object

# Bacteria

# ASV counts for each feature into a matrix
merged <- read.csv('metadata_16S_bac.csv', header = T, stringsAsFactors = F)

otumat <- merged %>%
  rename(otu = id) %>%
  select(-Sequence,-Confidence,-Taxon) 

rownames(otumat) <- otumat$otu
otumat <- otumat[,-1]
otumat <- as.matrix(otumat)
otumat
class(otumat)
otumatrans <- as.matrix(t(otumat))

# Taxonomy
taxonomy <- merged %>% 
  rename(otu = id) %>% 
  select(otu, Taxon) %>% 
  rename_all(toupper) %>% 
  separate(TAXON,
           into = c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"),
           sep = "; ",
           fill = 'right') %>%    # NA values on the right if it need to add information
  mutate(Domain = str_replace(Domain, "d__", ""),
         Phylum = str_replace(Phylum, "p__", ""),
         Class = str_replace(Class, "c__", ""),
         Order = str_replace(Order, "o__", ""),
         Family = str_replace(Family, "f__", ""),
         Genus = str_replace(Genus, "g__", ""),
         Species = str_replace(Species, "s__", "")) %>% 
  filter(Domain == 'Bacteria') %>% 
  filter(!Family %in% c('Chloroplast', 'Mitochondria'))

taxonomy
rownames(taxonomy) <- taxonomy$OTU
taxmat <- taxonomy[,-1]
taxmat <- as.matrix(taxmat)
taxmat
class(taxmat)

taxmatrans <- as.matrix(t(taxmat))

# Combine into phyloseq object

OTU = otu_table(otumat, taxa_are_rows = TRUE)
OTU

TAX = tax_table(taxmat)
TAX

physeq = phyloseq(OTU, TAX)
physeq

plot_bar(physeq, fill = "Phylum")

# transposed
OTUtrans = otu_table(otumatrans, taxa_are_rows = FALSE)
OTUtrans

TAXtrans = tax_table(taxmatrans)
TAXtrans

physeq1 = phyloseq(OTUtrans, TAX)
physeq1

plot_bar(physeq1, fill = "Phylum")

# Input sample metadata

env <- read.csv('Metadata_YB.csv', header = T, stringsAsFactors = F) 

allenvnum <- env %>% 
  select(c(2:8,10:15)) %>% 
  filter(Sample.Type == 2) %>% 
  rename_all(~ gsub("_", " ", .)) %>% 
  rename('NO2+NO3' = 'No2.No3 umol L') %>% 
  rename_all(~ gsub("umol L", "", .)) %>% 
  rename_all(~ gsub("ODO mgL", "DO", .)) %>% 
  drop_na()

rownames(allenvnum) <- allenvnum$SampleID
allenvnum <- allenvnum[,c(-1,-2)]

sampledata = sample_data(allenvnum)

# Create a phylogentic tree
install.packages('ape')
library(ape)

random_tree = rtree(ntaxa(physeq), rooted=TRUE, tip.label=taxa_names(physeq))
plot(random_tree)

# Merge new data with current phyloseq object:
physeq_bac = merge_phyloseq(physeq, sampledata, random_tree)
physeq_bac

plot_tree(physeq_bac, color="Season", label.tips="taxa_names", ladderize="left", plot.margin=0.3)

plot_heatmap(physeq_bac)
plot_heatmap(physeq_bac, taxa.label="Phylum")

saveRDS(physeq_bac, "physeq_bac.rds")

###############################################################################

# Start here to make one network with eukaryotes and bacteria for the entire year

### Eukaryotes
pseuk <- readRDS('physeq_euks.rds')

# zero taxa 제거
ps.f = prune_samples(sample_sums(pseuk) > 0, pseuk) 
ps.f = prune_taxa(rowSums(otu_table(ps.f)) > 0, ps.f)  #  14733 taxa and 22 samples

# Agglomerate to genus level
ps_genus <- tax_glom(ps.f, taxrank = "Class")   # 161 orders and 22 samples

# Taxonomic table
taxtab <- as(tax_table(ps_genus), "matrix")

# necomi 형식으로 변환 및 Genus level 설정 
ps_euks_renamed <- NetCoMi::renameTaxa(ps_genus,
                                        pat = "<name>",
                                        substPat = "<name>_<subst_name>(<subst_R>)",
                                        numDupli = "Class")

sum_euks_mat <- as.matrix(t(otu_table(ps_euks_renamed)@.Data))

sum_euks_df <- as.data.frame(otu_table(ps_euks_renamed)@.Data)
sum_euks_df$OTU <- rownames(sum_euks_df)

sum_euks_df <- merge(sum_euks_df, taxonomy, by = 'OTU') %>% 
  drop_na()

sum_euks_df <- sum_euks_df %>% 
  filter(!Family %in% c('uncultured','Labyrinthulomycetes'))

rownames(sum_euks_df) <- sum_euks_df$Class

sum_euks_df <- as.matrix(t(sum_euks_df[,c(-1, -c(24:30))]))


### Bacteria
psbac <- readRDS("physeq_bac.rds")

# zero taxa 제거
ps.b = prune_samples(sample_sums(psbac) > 0, psbac) 
ps.b = prune_taxa(rowSums(otu_table(ps.b)) > 0, ps.b)  #  21585 taxa and 22 samples

# Agglomerate to genus level
ps_bacgenus <- tax_glom(ps.b, taxrank = "Class")     #  413 taxa and 22 samples

# Taxonomic table
taxtab <- as(tax_table(ps_bacgenus), "matrix")

# necomi 형식으로 변환 및 Genus level 설정 
ps_bac_genus_renamed <- NetCoMi::renameTaxa(ps_bacgenus,
                                            pat = "<name>",
                                            substPat = "<name>_<subst_name>(<subst_R>)",
                                            numDupli = "Class")

sum_bac_mat <- as.matrix(t(otu_table(ps_bac_genus_renamed)@.Data))

sum_bac_df <- as.data.frame(otu_table(ps_bac_genus_renamed)@.Data)
sum_bac_df$OTU <- rownames(sum_bac_df)

sum_bac_df <- merge(sum_bac_df, taxonomy, by = 'OTU') %>% 
  drop_na()

sum_bac_df <- sum_bac_df %>% 
  filter(!Family %in% c('uncultured'))

rownames(sum_bac_df) <- sum_bac_df$Class

sum_bac_df <- as.matrix(t(sum_bac_df[,c(-1, -c(24:30))]))

# Summer (group 1 bac and euks)

set.seed(123456)

spiec <- multi.spiec.easi(list(sum_euks_df, sum_bac_df), 
                                 method='mb', nlambda=40, 
                                 lambda.min.ratio=1e-1, 
                                 pulsar.params = list(thresh = 0.05))

assoMat1 <- SpiecEasi::symBeta(SpiecEasi::getOptBeta(spiec), mode = "ave")

assoMat1 <- as.matrix(assoMat1)

# Get taxa names
taxnames <- c(taxa_names(ps_genus), taxa_names(ps_bacgenus))
taxnames <- c(colnames(sum_euks_df), colnames(sum_bac_df))

colnames(assoMat1) <- rownames(assoMat1) <- taxnames
diag(assoMat1) <- 1

all <- netConstruct(data = assoMat1,
                    dataType = "condDependence", 
                    sparsMethod = "none")

netall <- netAnalyze(all, hubPar = "eigenvector")

nodeCols <- c(rep("darkturquoise", 57), rep("orange", 73))
names(nodeCols) <- taxnames

png("CrossDomain_EuksBac.png", res = 300, units = 'in', width = 10, height = 8)
plot(netall, 
     layoutGroup = "union",
     nodeColor = "colorVec", 
     colorVec = nodeCols,
     nodeSize = "eigen", 
     nodeSizeSpread = 5,
     labelScale = FALSE,
     posCol = "darkgreen", 
     negCol = "red",
     cexNodes = 2,  # size of node
     cexHubs = 2.2,   # Size of the central node among nodes
     cexLabels = .5, # Size of the regular nodes labesl
     cexHubLabels = .7, # Size of the central node among nodes
     cexTitle = 3)

legend(.7, 1, cex = .7, pt.cex = 2, 
       legend = c("Eukaryotes", "Bacteria"), col = c("lightblue", "orange"), 
       bty = "n", pch = 16) 

dev.off()

summary(netall)

Summer and winter eukaryotic network analysis (Figure 5B)

library(devtools)
library('BiocManager')
library(NetCoMi)
library(limma)
library(igraph)
library(ggplot2)
library(phyloseq)
library(tidyverse)
library(ape)

# Make a phyloseq object

# Eukaryotes

# ASV counts for each feature into a matrix
merged <- read.csv('metadata_18S.csv', header = T, stringsAsFactors = F)

otumat <- merged %>%
  rename(otu = id) %>%
  select(-Sequence,-Confidence,-Taxon) 

rownames(otumat) <- otumat$otu
otumat <- otumat[,-1]
otumat <- as.matrix(otumat)
otumat
class(otumat)
otumatrans <- as.matrix(t(otumat))

# Taxonomy
taxonomy <- merged %>%
  rename(otu = id) %>%
  select(otu, Taxon) %>%
  rename_all(toupper) %>%
  separate(TAXON,
           into = c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"),
           sep = "; ",
           fill = 'right') %>%    # NA values on the right if it need to add information
  mutate(Domain = str_replace(Domain, "d__", ""),
         Phylum = str_replace(Phylum, "p__", ""),
         Class = str_replace(Class, "c__", ""),
         Order = str_replace(Order, "o__", ""),
         Family = str_replace(Family, "f__", ""),
         Genus = str_replace(Genus, "g__", ""),
         Species = str_replace(Species, "s__", "")) %>%
  mutate(Domain = str_replace(Domain, "Unassigned", "Unknown")) %>% 
  filter(Domain == 'Eukaryota' | Domain == 'Unassigned') %>%
  filter(!Phylum %in% c('Vertebrata', 'Annelida', 'Arthropoda', 'Bryozoa' , 'Cnidaria', 'Ciliophora',
                        'Mollusca','Nemertea', 'Porifera', 'Echinodermata','Nematozoad','Rotifera',
                        'Tunicata','Gracilipodida','Entoprocta','Florideophycidae', 'Holozoa', 
                        'Incertae_Sedis','Bicosoecida','Pavlovophyceae'))

taxonomy
rownames(taxonomy) <- taxonomy$OTU
taxmat <- taxonomy[,-1]
taxmat <- as.matrix(taxmat)
taxmat
class(taxmat)

taxmatrans <- as.matrix(t(taxmat))

# Combine into phyloseq object

OTU = otu_table(otumat, taxa_are_rows = TRUE)
OTU

TAX = tax_table(taxmat)
TAX

physeq = phyloseq(OTU, TAX)
physeq

plot_bar(physeq, fill = "Phylum")

# transposed
OTUtrans = otu_table(otumatrans, taxa_are_rows = FALSE)
OTUtrans

TAXtrans = tax_table(taxmatrans)
TAXtrans

physeq1 = phyloseq(OTUtrans, TAX)
physeq1

plot_bar(physeq1, fill = "Phylum")

# Input sample metadata

env <- read.csv('Metadata_YB.csv', header = T, stringsAsFactors = F) 

allenvnum <- env %>% 
  select(c(2:8,10:15)) %>% 
  filter(Sample.Type == 2) %>% 
  rename_all(~ gsub("_", " ", .)) %>% 
  rename('NO2+NO3' = 'No2.No3 umol L') %>% 
  rename_all(~ gsub("umol L", "", .)) %>% 
  rename_all(~ gsub("ODO mgL", "DO", .)) %>% 
  drop_na()

rownames(allenvnum) <- allenvnum$SampleID
allenvnum <- allenvnum[,c(-1,-2)]

sampledata = sample_data(allenvnum)

# Create a phylogentic tree
install.packages('ape')
library(ape)

random_tree = rtree(ntaxa(physeq), rooted=TRUE, tip.label=taxa_names(physeq))
plot(random_tree)

# Merge new data with current phyloseq object:
physeq1 = merge_phyloseq(physeq, sampledata, random_tree)
physeq1

plot_tree(physeq1, color="Season", label.tips="taxa_names", ladderize="left", plot.margin=0.3)

plot_heatmap(physeq1)
plot_heatmap(physeq1, taxa.label="phylum")

saveRDS(physeq1, "physeq1.rds")

###############################################################################

# Start the tutorial and incorporate my data

# Summer time 

# 1.import data
ps <- readRDS("physeq1.rds")
summer = subset_samples(ps, Season %in% c("summer")) #Extract only summer microbiome samples

# 2. processing data
Taxa100 = names(sort(taxa_sums(summer), decreasing = TRUE)[1:50]) # Top 100 taxa extract
summer.1 = prune_taxa(Taxa100, summer)

# 3. filtering
summer_genus <- tax_glom(summer.1, taxrank = "Genus") # Agglomerate to genus level
taxtabsummer <- as(tax_table(summer_genus), "matrix")


# 4. Rename non-unique names using rename function
summer_genus_renamed <- NetCoMi::renameTaxa(summer_genus,
                                         pat = "<name>",
                                         substPat = "<name>_<subst_name>(<subst_R>)",
                                         numDupli = "Genus")

# 5. NetConstruct Functionto construct a network using spearman correlation. Write measure 
# as "spearman", normalization as "clr", and zero value as "multiRepl" method.

netsum_genus <- netConstruct(summer_genus_renamed,       # phyloseq
                          taxRank = "Genus",       # 기준 tax level
                          filtTax = "highestFreq",
                          filtTaxPar = list(highestfreq = 50),
                          zeroMethod = "pseudo", 
                          zeroPar = list(pseudocount = 0.5),  # 기준 tax level
                          measure = "spearman",    # spearman correlation 으로 계산해보자
                          normMethod = "clr",      # transformation method to deal with zero, tutorial pairs this with Pearson method
                          sparsMethod = "threshold",
                          thresh = 0.3,
                          verbose = 3,
                          seed = 1234)

sum_props_genus <- netAnalyze(netsum_genus, 
                              clustMethod = "cluster_fast_greedy",
                              weightDeg = TRUE,
                              normDeg = FALSE,
                              gcmHeat = FALSE,
                              hubPar = "eigenvector")

plotHeat(mat = sum_props_genus$graphletLCC$gcm1,
         pmat = sum_props_genus$graphletLCC$pAdjust1,
         type = "mixed",
         title = "GCM", 
         colorLim = c(-1, 1),
         mar = c(2, 0, 2, 0))

# 6. Information of the created network
summary(sum_props_genus, numbNodes = 5L)

# 7. plot layout setting up
graph4 <- igraph::graph_from_adjacency_matrix(netsum_genus$adjaMat1, 
                                              weighted = TRUE)
set.seed(42) # fixing the shape
lay_fr <- igraph::layout_with_fr(graph4)

rownames(lay_fr) <- rownames(netsum_genus$adjaMat1) # Row names of the layout matrix must match the node names

# 8. Drawing a network plot 그리기 
png("euks_genera_50_AI_summer_network.png", res = 300, units = 'in', width = 10, height = 8)
plot(sum_props_genus,              # Network Ingredients
     layout = "spring",          # Network plot 의 layout # "spring" also available
     labelLength = 10,         # label length
     repulsion = 0.84,
     cexNodes = 0.9,  # size of node
     cexHubs = 1.1,   # Size of the central node among nodes
     colorVec =  phylcol,      # edge color 
     posCol = "darkturquoise", # positive correlation edge color
     negCol = "orange",        # negative correlation edge color
     cexLabels = 1.8,            # node label size
     title1 = "Summer Top 50 Eukaryote genera (Spearman)",  # Title
     showTitle = TRUE,         # Show title 
     cexTitle = 1.5)     

# Get phyla names
taxtab <- as(tax_table(summer_genus_renamed), "matrix")
phyla <- as.factor(gsub("p__", "", taxtab[, "Phylum"]))
names(phyla) <- taxtab[, "Phylum"]

# index color 
phylcol <- c("cyan", "blue3", "red", "lawngreen", "yellow", "deeppink")
phylcol_transp <- colToTransp(phylcol, 60)

# Phylum color index
legend(-1.4, 1.7, cex = 1, pt.cex = 1, title = "Phylum:",
       legend=levels(phyla), col = phylcol_transp, bty = "n", pch = 12)

# correlation color index  0.4, 1.1,
legend(.8, 1.2, cex = 1, title = "estimated correlation:",
       legend = c("+","-"), lty = 1, lwd = 3, col = c("darkturquoise","orange"), 
       bty = "n", horiz = FALSE,
       bg = grey)

dev.off()

###############################################################################

# Wintertime

# 1.import data
ps <- readRDS("physeq1.rds")
winter = subset_samples(ps, Season %in% c("winter")) #Extract only winter microbiome samples

# 2. processing data
Taxa100 = names(sort(taxa_sums(winter), decreasing = TRUE)[1:50]) # Top 100 taxa extract
winter.1 = prune_taxa(Taxa100, winter)

# 3. filtering
winter_genus <- tax_glom(winter.1, taxrank = "Genus") # Agglomerate to genus level
taxtabwinter <- as(tax_table(winter_genus), "matrix")


# 4. Rename non-unique names using rename function
winter_genus_renamed <- NetCoMi::renameTaxa(winter_genus,
                                            pat = "<name>",
                                            substPat = "<name>_<subst_name>(<subst_R>)",
                                            numDupli = "Genus")

# 5. NetConstruct Functionto construct a network using spearman correlation. Write measure 
# as "spearman", normalization as "clr", and zero value as "multiRepl" method.

netwinter_genus <- netConstruct(winter_genus_renamed,       # phyloseq
                             taxRank = "Genus", 
                             filtTax = "highestFreq",
                             filtTaxPar = list(highestfreq = 50),
                             zeroMethod = "pseudo", 
                             zeroPar = list(pseudocount = 0.5),  # 기준 tax level
                             measure = "spearman",    # spearman correlation 으로 계산해보자
                             normMethod = "clr",      # transformation
                             sparsMethod = "threshold",
                             thresh = 0.3,
                             verbose = 3,
                             seed = 1234)

# taxRank = "Genus",       # 기준 tax level
# measure = "spearman",    # spearman correlation 으로 계산해보자
# zeroMethod = "multRepl",
# normMethod = "clr",      # transformation
# sparsMethod = "threshold",
# thresh = 0.3,
# verbose = 3,
# seed = 1234)

winter_props_genus <- netAnalyze(netwinter_genus, 
                                 clustMethod = "cluster_fast_greedy",
                                 weightDeg = TRUE,
                                 normDeg = FALSE,
                                 gcmHeat = FALSE,
                                 hubPar = "eigenvector")

plotHeat(mat = winter_props_genus$graphletLCC$gcm1,
         pmat = winter_props_genus$graphletLCC$pAdjust1,
         type = "mixed",
         title = "GCM", 
         colorLim = c(-1, 1),
         mar = c(2, 0, 2, 0))

# 6. Information of the created network

summary(winter_props_genus, numbNodes = 5L)

# 7. plot layout setting up

graph5 <- igraph::graph_from_adjacency_matrix(netwinter_genus$adjaMat1, 
                                              weighted = TRUE)
set.seed(42) # fixing the shape
lay_fr <- igraph::layout_with_fr(graph5)

rownames(lay_fr) <- rownames(netwinter_genus$adjaMat1) # Row names of the layout matrix must match the node names

# 8. Drawing a network plot 그리기 
png("euks_winter_top50_network.png", res = 300, units = 'in', width = 10, height = 8)
plot(winter_props_genus,              # Network Ingredients
     layout = "spring",          # Network plot 의 layout # "spring" also available
     labelLength = 10,         # label length
     cexNodes = 0.9,  # size of node
     cexHubs = 1.1,   # Size of the central node among nodes
     colorVec =  phylcol,      # edge color 
     posCol = "darkturquoise", # positive correlation edge color
     negCol = "orange",        # negative correlation edge color
     cexLabels = 1.8,            # node label size
     title1 = "Winter Eukaryote genera (Spearman)",  # Title
     showTitle = TRUE,         # Show title 
     cexTitle = 1.5,           # Title size
     mar = c(5,5,5,5))      

# Get phyla names
taxtab <- as(tax_table(winter_genus_renamed), "matrix")
phyla <- as.factor(gsub("p__", "", taxtab[, "Phylum"]))
names(phyla) <- taxtab[, "Phylum"]

# index color 
phylcol <- c("cyan", "blue3", "red", "lawngreen", "yellow", "deeppink")
phylcol_transp <- colToTransp(phylcol, 60)

# Phylum color index
legend(-1.1, 1.1, cex = 1, pt.cex = 1, title = "Phylum:", 
        legend=levels(phyla), col = phylcol_transp, bty = "n", pch = 16) 

# correlation color index  0.4, 1.1,
legend(.7, -.3, cex = 1, title = "estimated correlation:",
       legend = c("+","-"), lty = 1, lwd = 3, col = c("darkturquoise","orange"), 
       bty = "n", horiz = FALSE,
       bg = grey)


dev.off()

###############################################################################

# Summer versus winter eukaryotic networks

ps <- readRDS("physeq1.rds")

ps.pt <- ps %>% 
  subset_samples(Season %in% c("summer", "winter")) # 19083 taxa and 12 samples 

# zero taxa 제거
ps.f = prune_samples(sample_sums(ps.pt)>0, ps.pt) 
ps.f = prune_taxa(rowSums(otu_table(ps.f)) > 0, ps.f)  #  3090 taxa and 12 samples

# Agglomerate to genus level
ps_genus <- tax_glom(ps.f, taxrank = "Family")

# Taxonomic table
taxtab <- as(tax_table(ps_genus), "matrix")

# necomi 형식으로 변환 및 Genus level 설정 
ps_genus_renamed <- NetCoMi::renameTaxa(ps_genus,
                                        pat = "<name>",
                                        substPat = "<name>_<subst_name>(<subst_R>)",
                                        numDupli = "Family")

# 각 부위에 대한 샘플 추출 
ps_genus_renamed_summer <- phyloseq::subset_samples(ps_genus_renamed, 
                                                  Season == "summer")
ps_genus_renamed_winter <- phyloseq::subset_samples(ps_genus_renamed, 
                                                    Season == "winter")

ps_genus.sparcc <- netConstruct(
  data = ps_genus_renamed_summer,
  data2 = ps_genus_renamed_winter,
  filtTax = "highestFreq",               # Top을 뽑는 기준
  filtTaxPar = list(highestFreq  = 50), # Top 100
  taxRank = "Family",                     # Genus level 
  measure = "sparcc",                    # sparcc
  measurePar = list(nlambda=20, rep.num=10),
  
  normMethod = "clr",                    # transformation
  zeroMethod = "none",                   # zero 값 보정 
  sparsMethod = "threshold", 
  adjust = "adaptBH",                    # p-value 값 보정 
  thresh = 0.3,
  
  dissFunc = "signed",
  verbose = 2,
  seed = 42)

ps_props_sparcc <- netAnalyze(ps_genus.sparcc, 
                              clustMethod = "cluster_fast_greedy",
                              hubPar = "eigenvector",           # Hub node 판별방법
                              normDeg = FALSE)

summary(ps_props_sparcc, groupNames = c("summer", "winter"))

# Compute layout
ps_graph <- igraph::graph_from_adjacency_matrix(ps_genus.sparcc$adjaMat1,
                                                weighted = TRUE)

set.seed(42)
ps_lay_fr <- igraph::layout_with_fr(ps_graph)

# Row names of the layout matrix must match the node names
rownames(ps_lay_fr) <- rownames(ps_genus.sparcc$adjaMat1)


# Get phyla names
taxtab <- as(tax_table(ps_genus_renamed), "matrix")
phyla <- as.factor(taxtab[, "Phylum"])
names(phyla) <- taxtab[, "Family"]
table(phyla)

# Define phylum colors
library(RColorBrewer)
phylcol <- c(brewer.pal(12, "Paired"), brewer.pal(12, "Set3"))

# Colors used in the legend should be equally transparent as in the plot
phylcol_transp <- colToTransp(phylcol, 60)

png("euks_family_50_summer_vs_network.png", res = 300, units = 'in', width = 10, height = 8)
plot(ps_props_sparcc,
     sameLayout = TRUE, 
     #nodeColor = "cluster",
     #nodeSize = "mclr",
     #labelScale = FALSE,
     nodeSize = "eigenvector",  # "clr",
     nodeColor = "cluster",     #"feature",
     #labelLength = 8,         # label length
     repulsion = 0.85,
     #layoutGroup = "union",
     cexNodes = 1.5,  # size of node
     cexHubs = 1.1,   # Size of the central node among nodes
     featVecCol = phyla, 
     colorVec =  phylcol,
     cexLabels = 2.5,
     posCol = "darkturquoise", 
     negCol = "orange",
     title1 = "Network on genus level with sparcc correlations", 
     showTitle = TRUE,
     groupNames = c("Summer","Winter"))

#legend(-1.2, 10, cex = 1.5, pt.cex = 2.5, title = "Phylum:", 
#       legend=levels(phyla), col = phylcol_transp, bty = "n", pch = 16) 

legend(0.7, 1.3, cex = 2.2, title = "estimated correlation:",
       legend = c("+","-"), lty = 1, lwd = 3, col = c("darkturquoise","orange"), 
       bty = "n", horiz = TRUE)
dev.off()

comp_season <- netCompare(ps_props_sparcc, 
                          permTest = FALSE, 
                          verbose = FALSE,
                          seed = 123456)

summary(comp_season, 
        groupNames = c("summer", "winter"),
        showCentr = c("degree", "between", "closeness"), 
        numbNodes = 5)

plot(ps_props_sparcc, 
     sameLayout = TRUE, 
     layoutGroup = "union", 
     #colorVec = col,
     borderCol = "gray40", 
     nodeSize = "degree", 
     cexNodes = 0.9, 
     nodeSizeSpread = 3, 
     edgeTranspLow = 80, 
     edgeTranspHigh = 50,
     groupNames = c("summer", "winter"), 
     showTitle = TRUE, 
     cexTitle = 2.8,
     mar = c(1,1,3,1), 
     repulsion = 0.9, 
     labels = FALSE, 
     rmSingles = "inboth",
     nodeFilter = "clustMin", 
     nodeFilterPar = 10, 
     nodeTransp = 50, 
     hubTransp = 30)

Summer and winter bacterial network analysis (Supplemental figure 10)

library(devtools)
library('BiocManager')
library(NetCoMi)
library(limma)
library(igraph)
library(ggplot2)
library(phyloseq)
library(tidyverse)
library(ape)

###############################################################################

# Make a phyloseq object

# Bacteria

# ASV counts for each feature into a matrix
merged <- read.csv('metadata_16S_bac.csv', header = T, stringsAsFactors = F)

otumat <- merged %>%
  rename(otu = id) %>%
  select(-Sequence,-Confidence,-Taxon) 

rownames(otumat) <- otumat$otu
otumat <- otumat[,-1]
otumat <- as.matrix(otumat)
otumat
class(otumat)
otumatrans <- as.matrix(t(otumat))

# Taxonomy
taxonomy <- merged %>% 
  rename(otu = id) %>% 
  select(otu, Taxon) %>% 
  rename_all(toupper) %>% 
  separate(TAXON,
           into = c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"),
           sep = "; ",
           fill = 'right') %>%    # NA values on the right if it need to add information
  mutate(Domain = str_replace(Domain, "d__", ""),
         Phylum = str_replace(Phylum, "p__", ""),
         Class = str_replace(Class, "c__", ""),
         Order = str_replace(Order, "o__", ""),
         Family = str_replace(Family, "f__", ""),
         Genus = str_replace(Genus, "g__", ""),
         Species = str_replace(Species, "s__", "")) %>% 
  filter(Domain == 'Bacteria') %>% 
  filter(!Family %in% c('Chloroplast', 'Mitochondria'))

taxonomy
rownames(taxonomy) <- taxonomy$OTU
taxmat <- taxonomy[,-1]
taxmat <- as.matrix(taxmat)
taxmat
class(taxmat)

taxmatrans <- as.matrix(t(taxmat))

# Combine into phyloseq object

OTU = otu_table(otumat, taxa_are_rows = TRUE)
OTU

TAX = tax_table(taxmat)
TAX

physeq = phyloseq(OTU, TAX)
physeq

plot_bar(physeq, fill = "phylum")

# transposed
OTUtrans = otu_table(otumatrans, taxa_are_rows = FALSE)
OTUtrans

TAXtrans = tax_table(taxmatrans)
TAXtrans

physeq1 = phyloseq(OTUtrans, TAX)
physeq1

plot_bar(physeq1, fill = "Phylum")

# Input sample metadata

env <- read.csv('Metadata_YB.csv', header = T, stringsAsFactors = F) 

allenvnum <- env %>% 
  select(c(2:8,10:15)) %>% 
  filter(Sample.Type == 2) %>% 
  rename_all(~ gsub("_", " ", .)) %>% 
  rename('NO2+NO3' = 'No2.No3 umol L') %>% 
  rename_all(~ gsub("umol L", "", .)) %>% 
  rename_all(~ gsub("ODO mgL", "DO", .)) %>% 
  drop_na()

rownames(allenvnum) <- allenvnum$SampleID
allenvnum <- allenvnum[,c(-1,-2)]

sampledata = sample_data(allenvnum)

# Create a phylogentic tree
install.packages('ape')
library(ape)

random_tree = rtree(ntaxa(physeq), rooted=TRUE, tip.label=taxa_names(physeq))
plot(random_tree)

# Merge new data with current phyloseq object:
physeq1 = merge_phyloseq(physeq, sampledata, random_tree)
physeq1

plot_tree(physeq1, color="Season", label.tips="taxa_names", ladderize="left", plot.margin=0.3)

plot_heatmap(physeq1)
plot_heatmap(physeq1, taxa.label="phylum")

saveRDS(physeq1, "physeq1.rds")

###############################################################################

 # Start the tutorial and incorporate my data

# Summer versus winter eukaryotic networks

ps <- readRDS("physeq1.rds")

ps.pt <- ps %>% 
  subset_samples(Season %in% c("summer", "winter")) # 19083 taxa and 12 samples 

# zero taxa 제거
ps.f = prune_samples(sample_sums(ps.pt)>0, ps.pt) 
ps.f = prune_taxa(rowSums(otu_table(ps.f)) > 0, ps.f)  #  3090 taxa and 12 samples

# Agglomerate to genus level
ps_genus <- tax_glom(ps.f, taxrank = "Family")

# Taxonomic table
taxtab <- as(tax_table(ps_genus), "matrix")

# necomi 형식으로 변환 및 Genus level 설정 
ps_genus_renamed <- NetCoMi::renameTaxa(ps_genus,
                                        pat = "<name>",
                                        substPat = "<name>_<subst_name>(<subst_R>)",
                                        numDupli = "Family")

# 각 부위에 대한 샘플 추출 
ps_genus_renamed_summer <- phyloseq::subset_samples(ps_genus_renamed, 
                                                    Season == "summer")
ps_genus_renamed_winter <- phyloseq::subset_samples(ps_genus_renamed, 
                                                    Season == "winter")

ps_genus.sparcc <- netConstruct(
  data = ps_genus_renamed_summer,
  data2 = ps_genus_renamed_winter,
  filtTax = "highestFreq",               # Top을 뽑는 기준
  filtTaxPar = list(highestFreq  = 50), # Top 100
  taxRank = "Family",                     # Genus level 
  measure = "sparcc",                    # sparcc
  measurePar = list(nlambda=20, rep.num=10),
  
  normMethod = "clr",                    # transformation
  zeroMethod = "none",                   # zero 값 보정 
  sparsMethod = "threshold", 
  adjust = "adaptBH",                    # p-value 값 보정 
  thresh = 0.3,
  
  dissFunc = "signed",
  verbose = 2,
  seed = 42)

ps_props_sparcc <- netAnalyze(ps_genus.sparcc, 
                              clustMethod = "cluster_fast_greedy",
                              hubPar = "eigenvector",           # Hub node 판별방법
                              normDeg = FALSE)

summary(ps_props_sparcc, groupNames = c("summer", "winter"))

# Compute layout
ps_graph <- igraph::graph_from_adjacency_matrix(ps_genus.sparcc$adjaMat1,
                                                weighted = TRUE)

set.seed(42)
ps_lay_fr <- igraph::layout_with_fr(ps_graph)

# Row names of the layout matrix must match the node names
rownames(ps_lay_fr) <- rownames(ps_genus.sparcc$adjaMat1)

# Get phyla names
taxtab <- as(tax_table(ps_genus_renamed), "matrix")
phyla <- as.factor(taxtab[, "Phylum"])
names(phyla) <- taxtab[, "Family"]
table(phyla)

# Define phylum colors
library(RColorBrewer)
phylcol <- c(brewer.pal(12, "Paired"), brewer.pal(12, "Set3"))

# Colors used in the legend should be equally transparent as in the plot
phylcol_transp <- colToTransp(phylcol, 60)
png("bac_family_50_summer_vs_network.png", res = 300, units = 'in', width = 10, height = 8)
plot(ps_props_sparcc,
     sameLayout = TRUE, 
     #nodeColor = "cluster",
     #nodeSize = "mclr",
     #labelScale = FALSE,
     nodeSize = "eigenvector",  # "clr",
     nodeColor = "cluster",     #"feature",
     #labelLength = 8,         # label length
     repulsion = 0.84,
     cexNodes = 1.5,  # size of node
     cexHubs = 1.1,   # Size of the central node among nodes
     featVecCol = phyla, 
     colorVec =  phylcol,
     cexLabels = 2.5,
     posCol = "darkturquoise", 
     negCol = "orange",
     title1 = "Network on genus level with sparcc correlations", 
     showTitle = TRUE,
     groupNames = c("Summer","Winter"))

#legend(-1.2, 10, cex = 1.5, pt.cex = 2.5, title = "Phylum:", 
#       legend=levels(phyla), col = phylcol_transp, bty = "n", pch = 16) 

legend(0.7, 1.3, cex = 2.2, title = "estimated correlation:",
       legend = c("+","-"), lty = 1, lwd = 3, col = c("darkturquoise","orange"), 
       bty = "n", horiz = TRUE)
dev.off()

comp_season <- netCompare(ps_props_sparcc, 
                          permTest = FALSE, 
                          verbose = FALSE,
                          seed = 123456)

summary(comp_season, 
        groupNames = c("summer", "winter"),
        showCentr = c("degree", "between", "closeness"), 
        numbNodes = 5)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Yas Bay code

Environmental parameters

Correlation plot (Figure 1C)

Environmental PCA-biplot (Figure 1D)

Cell counts

Diatom cell counts (Figure 2A)

Synechoccus and heterotrophic bacteria (Supplemental figure 5A and B)

Relative abundance data

Eukaryotic relative abundance (Figure 3A)

Bacteria relative abundance (Figure 3B)

Archaeal relative abundance (Supplemental figure 8)

Diatom relative abundance (Supplemental figure 6)

Proteobactera relative abundance (Supplemental figure 7A)

Bacteroidota relative abundance (Supplemental figure 7B)

Community diversity and structure

Eukaryote diversity metrics (Figure 4A, B, C)

Bacterial diversity metrics (Figure 4A, B, C)

Archaeal diversity metrics (Figure 4A, B, C)

Metacyc biological pathway

Diatom relative abundance (Supplemental figure )

Network co-occurrence analysis

Cross-domain network analysis (Figure 5A)

Summer and winter eukaryotic network analysis (Figure 5B)

Summer and winter bacterial network analysis (Supplemental figure 10)

FilesExpand file tree

FinalYBcode.md

Latest commit

History

FinalYBcode.md

File metadata and controls

Yas Bay code

Environmental parameters

Correlation plot (Figure 1C)

Environmental PCA-biplot (Figure 1D)

Cell counts

Diatom cell counts (Figure 2A)

Synechoccus and heterotrophic bacteria (Supplemental figure 5A and B)

Relative abundance data

Eukaryotic relative abundance (Figure 3A)

Bacteria relative abundance (Figure 3B)

Archaeal relative abundance (Supplemental figure 8)

Diatom relative abundance (Supplemental figure 6)

Proteobactera relative abundance (Supplemental figure 7A)

Bacteroidota relative abundance (Supplemental figure 7B)

Community diversity and structure

Eukaryote diversity metrics (Figure 4A, B, C)

Bacterial diversity metrics (Figure 4A, B, C)

Archaeal diversity metrics (Figure 4A, B, C)

Metacyc biological pathway

Diatom relative abundance (Supplemental figure )

Network co-occurrence analysis

Cross-domain network analysis (Figure 5A)

Summer and winter eukaryotic network analysis (Figure 5B)

Summer and winter bacterial network analysis (Supplemental figure 10)