Skip to content

Commit

Permalink
Revise code for importing and cleaning EMP and AEU data.
Browse files Browse the repository at this point in the history
  • Loading branch information
tedflynn committed Mar 19, 2024
1 parent dc6cefa commit 28faf0f
Showing 1 changed file with 298 additions and 10 deletions.
308 changes: 298 additions & 10 deletions phyto_code/phyto-data-processing-FASTR.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -26,22 +26,310 @@ rm(list=ls())
```

## R Markdown
## Import EMP Data

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see <http://rmarkdown.rstudio.com>.
Import phytoplankton data collected by the Environmental Monitoring Program.

When you click the **Knit** button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
```{r import EMP, echo=FALSE}
# Import EMP data files.
# The pattern is anchored with "$" so that only names ending in ".csv" are
# picked up (an unanchored "\\.csv" would also match e.g. "x.csv.bak").
phyto_files_EMP <- list.files(
  path = here("phyto-final", "data", "EMP", "csv"),
  pattern = "\\.csv$",
  full.names = TRUE
)

# Read every file and stack the results into a single data frame
df_phyto_EMP <- phyto_files_EMP %>%
  map(~ read_csv(.x, show_col_types = FALSE)) %>%
  list_rbind()
# A handful of EMP files use non-standard headers, so they are read on their
# own (2021 files first, then 2013) and stacked by year.
df_phyto2021 <- bind_rows(
  read_csv(here("phyto-final","data","EMP","oddballs","December 2021.csv")),
  read_csv(here("phyto-final","data","EMP","oddballs","November 2021.csv"))
)
df_phyto2013 <- bind_rows(
  read_csv(here("phyto-final","data","EMP","oddballs","September 2013.csv")),
  read_csv(here("phyto-final","data","EMP","oddballs","November 2013.csv"))
)

# Bring the oddball headers in line with the standard BSA headers. The oddball
# files actually carry the "correct" total-cells name; the standard files use
# the incorrect "Number of cells per unit", which is what we match here.
df_phyto2013 <- rename(
  df_phyto2013,
  "Number of cells per unit" = "Total Cells Counted"
)
df_phyto2021 <- rename(
  df_phyto2021,
  "Number of cells per unit" = "Total Number of Cells",
  "Unit Abundance" = "Unit Abundance (# of Natural Units)"
)

# Append the oddball records to the main EMP data frame (2013, then 2021)
df_phyto_EMP <- bind_rows(df_phyto_EMP, df_phyto2013, df_phyto2021)

# The intermediate data frames are no longer needed
rm(df_phyto2013, df_phyto2021)
# Remove rows in which every column is NA
df_phyto_EMP <- df_phyto_EMP %>%
  filter(if_any(everything(), ~ !is.na(.x)))

# GALD is imported into two separate columns (GALD and GALD 1).
# Count the NAs in each column as a check that values are 'either/or'.
sum(is.na(df_phyto_EMP$GALD)) # Total is 5880
sum(is.na(df_phyto_EMP$`GALD 1`)) # Total is 8072
# The author recorded that the NA counts sum to 13952, the number of rows in
# the df, which is consistent with each row holding a value in only one of
# the two columns.

# Combine both GALD columns into GALD.Tot. Each NA is treated as 0 and the two
# columns are added, which gives the same result as a per-row
# sum(..., na.rm = TRUE) but is vectorised and does not leave the data frame
# in a rowwise state for the steps that follow.
# Then drop the two source columns and give the combined column the GALD name.
df_phyto_EMP <- df_phyto_EMP %>%
  mutate(GALD.Tot = coalesce(GALD, 0) + coalesce(`GALD 1`, 0)) %>%
  select(!c(GALD, `GALD 1`)) %>%
  rename("GALD" = "GALD.Tot")
# Tidy the EMP columns in a single pipeline:
#   1. convert column names to BigCamelCase, restoring the GALD acronym that
#      clean_names() turns into "Gald"
#   2. drop columns that are entirely NA
#   3. keep SampleDate through Biovolume10 plus GALD, which removes the
#      columns that just hold the method code "Phyto" and the pre-calculated
#      organisms per mL
#   4. add a Study column to record which study the rows came from
df_phyto_EMP <- df_phyto_EMP %>%
  clean_names(case = "big_camel") %>%
  rename("GALD" = "Gald") %>%
  select(where(~ !all(is.na(.x)))) %>%
  select(SampleDate:Biovolume10, GALD) %>%
  mutate(Study = "EMP", .after = StationCode)
```{r cars}
summary(cars)
```

## Including Plots
## Import phyto data collected for FASTR project

```{r import FASTR}
# Import AEU data files (comment out when finished).
# The pattern is anchored with "$" so that only names ending in ".csv" are
# picked up (an unanchored "\\.csv" would also match e.g. "x.csv.bak").
phyto_files_AEU <- list.files(
  path = here("phyto-final", "data", "csv"),
  pattern = "\\.csv$",
  full.names = TRUE
)

# Read every file and stack the results into a single data frame
df_phyto_FASTR <- phyto_files_AEU %>%
  map(~ read_csv(.x, show_col_types = FALSE)) %>%
  list_rbind()
# Read in station names with flags; used below to filter out samples from
# other projects.
df_keepers <- read_csv(here("phyto-final","CSVs", "station_names_flagged.csv"))

df_phyto_FASTR <- df_phyto_FASTR %>%
  # Remove rows in which every column is NA
  filter(if_any(everything(), ~ !is.na(.x))) %>%
  # Remove weird row with only a single zero in biovolume (it has no
  # MethodCode)
  drop_na(MethodCode) %>%
  # Clean up column names
  clean_names(case = "big_camel") %>%
  # Attach the station flags. No `by` is given, so the join uses every column
  # name the two tables share.
  # NOTE(review): presumably that is only StationCode - confirm against the
  # CSV and consider stating `by` explicitly.
  left_join(df_keepers) %>%
  # Remove data unrelated to project, then drop the flag column
  filter(Flag == "Keep") %>%
  select(!(Flag))

rm(df_keepers)

# Confirm station IDs
unique(df_phyto_FASTR$StationCode) # Shows only 10 FASTR stations
table(df_phyto_FASTR$StationCode)

df_phyto_FASTR <- df_phyto_FASTR %>%
  # Remove columns that are entirely NA
  select(where(~ !all(is.na(.x)))) %>%
  # Remove individual measurement columns as they are only present in a small
  # number of samples.
  select(!(Dimension:DepthM)) %>%
  # Remove secondary and tertiary GALD measurements as they don't vary much
  # and aren't present in EMP data
  select(!(Gald2:Gald3))
# Correct GALD, which is imported into two separate columns (Gald and Gald1).
# Count the NAs in each column as a check that values are 'either/or'.
sum(is.na(df_phyto_FASTR$Gald)) # Total is 1791
sum(is.na(df_phyto_FASTR$Gald1)) # Total is 4535
# The author recorded that the NA counts sum to 6326, the number of rows in
# the df, which is consistent with each row holding a value in only one of
# the two columns.

# Optional check for rows where both GALD columns are NA:
# test <- df_phyto_FASTR %>%
#   mutate(GALD.Value = case_when(is.na(Gald) & is.na(Gald1) ~ "Fix",
#                                 TRUE ~ "Okay"))

df_phyto_FASTR <- df_phyto_FASTR %>%
  # Combine both GALD columns. Each NA is treated as 0 and the two columns are
  # added, which gives the same result as a per-row sum(..., na.rm = TRUE) but
  # is vectorised and does not leave the data frame in a rowwise state.
  mutate(GALD.Tot = coalesce(Gald, 0) + coalesce(Gald1, 0)) %>%
  # Remove old GALD columns and rename GALD.Tot
  select(!c(Gald, Gald1)) %>%
  rename("GALD" = "GALD.Tot") %>%
  # Remove MethodCode column that just says "Phyto"
  select(!(MethodCode)) %>%
  # Add column to indicate what study it came from
  mutate(Study = "FASTR", .after = StationCode)
```

## Combine EMP and AEU data

```{r combine data}
# Stack the EMP and FASTR records into one data frame
df_phyto <- bind_rows(df_phyto_EMP, df_phyto_FASTR)

# Average all 10 biovolume measurements for each taxon, then remove the
# individual biovolume columns. ungroup() clears the rowwise state so that the
# steps below operate on whole columns again.
df_phyto <- df_phyto %>%
  rowwise() %>%
  mutate(BV.Avg = mean(c_across(Biovolume1:Biovolume10), na.rm = TRUE)) %>%
  ungroup() %>%
  select(!(Biovolume1:Biovolume10))

# Remove unneeded columns
df_phyto <- df_phyto %>%
  select(!c("BsaTin","DiatomSoftBody")) %>%
  select(!(ColonyFilamentIndividualGroupCode:Shape)) %>%
  select(!(VolumeReceivedML:NumberOfFieldsCounted))

# Fix samples with missing times
# Just replace with 12:00:00 b/c aren't doing any time-based analyses
df_phyto <- df_phyto %>% replace_na(list(SampleTime = "12:00:00"))

# Get dates in the right format. Some are 1/1/14 and others 1/1/2014.
df_phyto$SampleDate <- mdy(df_phyto$SampleDate)

# Combine date and time into a single DateTime column and parse it
df_phyto <- df_phyto %>%
  unite(DateTime, c("SampleDate","SampleTime"), sep = " ")
df_phyto$DateTime <- as_datetime(df_phyto$DateTime,
                                 tz = "US/Pacific",
                                 format = c("%Y-%m-%d %H:%M:%OS"))

# Check for missing dates
df_phyto %>% filter(is.na(DateTime)) # No missing dates

# Correct BSA header
df_phyto <- df_phyto %>% rename("TotalCells" = "NumberOfCellsPerUnit")

# Calculate Unit Density & Biovolume Density
df_phyto <- df_phyto %>%
  mutate(Units.per.mL = UnitAbundance * Factor) %>%
  mutate(BV.um3.per.mL = TotalCells * BV.Avg * Factor)

# Add column for year and month for highlighting data. The DateTime column is
# referenced directly rather than through df_phyto$DateTime so that the
# expression is evaluated inside the data frame being mutated.
df_phyto <- df_phyto %>%
  mutate(Year = year(DateTime)) %>%
  mutate(Month = month(DateTime, label = TRUE))

# Order month in calendar order rather than (default) alphabetical
df_phyto$Month <- factor(df_phyto$Month, levels = month.abb)

# Reorder date/time columns (result: DateTime, Month, Year)
df_phyto <- df_phyto %>%
  relocate(Year, .after = DateTime) %>%
  relocate(Month, .after = DateTime)

# Convert years to factors for plotting
df_phyto$Year <- as.factor(df_phyto$Year)
```

## Clean Data

```{r data cleaning, echo = FALSE}
# Station-code corrections, applied in the order listed. Each name is a
# pattern handed to gsub() and each value is its replacement.
station_fixes <- c(
  # EMP site names
  "EZ6 SAC" = "EZ6",
  "EZ6SAC" = "EZ6",
  "EZ6SJR" = "EZ6-SJR",
  "EZ2SAC" = "EZ2",
  "EZ2 SAC" = "EZ2",
  "EZ2SJR" = "EZ2-SJR",
  "EZ2 SJR" = "EZ2-SJR",
  "EZ6 SJR" = "EZ6-SJR",
  "D16-Twitchell" = "D16",
  "D16-Twitchel" = "D16",
  "D16 - Twitchell" = "D16",
  "D16 Twitchell" = "D16",
  "NZ328" = "NZ325", # Typo in August 2019
  "C3A-HOOD" = "C3A",
  "C3A Hood" = "C3A",
  "C3A- Hood" = "C3A",
  "C3A-Hood" = "C3A",
  "NZ542" = "NZS42",
  "E26" = "EZ6",
  "E22" = "EZ2", # Typo in May 2018
  # AEU site names
  "SHER" = "SHR",
  "BL-5" = "BL5"
)
for (old_code in names(station_fixes)) {
  df_phyto$StationCode <- gsub(old_code,
                               station_fixes[[old_code]],
                               df_phyto$StationCode)
}
rm(station_fixes, old_code)

# Remove extra Microcystis tows at D19
df_phyto <- filter(df_phyto, StationCode != "D19 MC Tow")

# In Fall 2016, taxonomists began classifying the species
# Chroococcus microscopicus as Eucapsis microscopica. This is one of the most
# dominant species in these samples, so all taxa previously classified as
# C. microscopicus are renamed E. microscopica. Genus is set after Taxon so
# that it is updated for every record carrying the new name.
df_phyto <- df_phyto %>%
  mutate(
    Taxon = case_when(Taxon == "Chroococcus microscopicus" ~ "Eucapsis microscopica",
                      TRUE ~ Taxon),
    Genus = case_when(Taxon == "Eucapsis microscopica" ~ "Eucapsis",
                      TRUE ~ Genus)
  )

# The taxon Plagioselmis lacustris is inconsistently named, appearing sometimes
# as Rhodomonas lacustris. Use Rhodomonas lacustris throughout to avoid
# confusion, and set the genus to match.
df_phyto <- df_phyto %>%
  mutate(
    Taxon = case_when(Taxon == "Plagioselmis lacustris" ~ "Rhodomonas lacustris",
                      TRUE ~ Taxon),
    Genus = case_when(Taxon == "Rhodomonas lacustris" ~ "Rhodomonas",
                      TRUE ~ Genus)
  )

# Correct the genus label for a Chlorella entry
df_phyto$Genus <- gsub("cf Chlorella", "Chlorella", df_phyto$Genus)

# Add Taxonomy Data-------------------------------------------------------------
# Read in CSV with manually-added WoRMS classification and merge it onto the
# phyto data (joined on whichever column names the two tables share)
df_taxa <- read_csv(here("phyto-final","CSVs","phyto_group_classification.csv"))
df_phyto <- df_phyto %>% left_join(df_taxa)

# Check if any groups are missing
sum(is.na(df_phyto$Group)) # none missing

rm(df_taxa)
You can also embed plots, for example:
# Save df to use for making plots
# save(df_phyto, file = "RData/df_phyto.RData")
```{r pressure, echo=FALSE}
plot(pressure)
```

Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.

0 comments on commit 28faf0f

Please sign in to comment.