Aquatics_null_model_workflow.Rmd

---
title: "Aquatics Null Forecast"
author: "R. Quinn Thomas"
date: "9/14/2020"
output: html_document
---

```{r setup, include=FALSE}
# Default chunk option for this document: echo each chunk's source code into
# the knitted output.
knitr::opts_chunk$set(echo = TRUE)
```

## Step 0: Configuration

```{r}
library(tidyverse)
library(lubridate)
library(neonstore)
library(rjags)
library(tidybayes)
library(modelr)

# Keep the local NEON file store inside the project directory. The location is
# published through the NEONSTORE_HOME environment variable (the variable the
# neonstore package documents for its store location).
neon_store_dir <- "neon_store/"
Sys.setenv("NEONSTORE_HOME" = neon_store_dir)

# Create the store directory on first use. The previous condition was
# inverted: dir.create() was only reached when the directory already existed,
# so a fresh checkout never got its store directory.
if (!dir.exists(neon_store_dir)) {
  dir.create(neon_store_dir, recursive = TRUE)
}

# Fixed seed for R's random number generator.
set.seed(329)

```


## Step 1: Acquire data

The downloading of data from NEON is handled by the `neonstore` package. With
this package we download only new data and do not have to specify which time
periods to download: it compares the files in the local NEON store to the files
at NEON and downloads the files that don't exist locally.

All the data download will be handled by a centralized script that acquires data 
for all themes.

```{r}
# Sites used throughout this workflow.
focal_sites <- c("BARC", "POSE")

# NEON data products to fetch, in download order. The labels are the ones used
# in the original comments of this workflow.
neon_products <- c(
  "DP1.20288.001", # Water Quality
  "DP1.20264.001", # Water Temperature
  "DP1.20053.001"  # Surface Water Temperature
)

# Download the "basic" package of every product for the focal sites.
for (product in neon_products) {
  neonstore::neon_download(product, site = focal_sites, type = "basic")
}

```

## Step 2: Generate Targets

Load data

```{r}
# Read the two NEON tables used below for the focal sites:
# instantaneous water quality (source of dissolved oxygen) ...
oxy <- neonstore::neon_read(
  table = "waq_instantaneous",
  site = focal_sites
)
# ... and the 30-minute TSD table (source of water temperature).
temp <- neonstore::neon_read(
  table = "TSD_30_min",
  site = focal_sites
)
```


Process oxygen data to hourly.

```{r}
# Hourly dissolved-oxygen series per site.
#
# Only records whose final quality flag is 0 and whose sensor depth is positive
# are kept. Readings are then averaged within each (site, calendar date, hour)
# cell, and the cell is stamped with the top of that hour in UTC.
oxy_cleaned <- oxy %>%
  dplyr::select(
    siteID, startDateTime, sensorDepth,
    dissolvedOxygen, dissolvedOxygenExpUncert, dissolvedOxygenFinalQF
  ) %>%
  dplyr::filter(dissolvedOxygenFinalQF == 0, sensorDepth > 0) %>%
  # mutate() evaluates its arguments in order, so `date` and `hour` are derived
  # from the converted timestamp.
  dplyr::mutate(
    startDateTime = as_datetime(startDateTime),
    date = as_date(startDateTime),
    hour = hour(startDateTime)
  ) %>%
  dplyr::group_by(siteID, date, hour) %>%
  dplyr::summarize(
    sensorDepth = mean(sensorDepth, na.rm = TRUE),
    dissolvedOxygen = mean(dissolvedOxygen, na.rm = TRUE),
    dissolvedOxygenExpUncert = mean(dissolvedOxygenExpUncert, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  # Rebuild the hourly timestamp from the grouping keys.
  dplyr::mutate(
    startDateTime = make_datetime(
      year = year(date), month = month(date), day = day(date),
      hour = hour, min = 0, tz = "UTC"
    )
  ) %>%
  dplyr::select(
    siteID, startDateTime, sensorDepth,
    dissolvedOxygen, dissolvedOxygenExpUncert
  )
```

Visualize data

```{r}
# Scatter plot of the hourly oxygen series, one panel per site.
ggplot(oxy_cleaned, aes(x = startDateTime, y = dissolvedOxygen)) +
  geom_point() +
  facet_wrap(vars(siteID)) +
  labs(x = "Date")
```

```{r}
# Hourly water-temperature series per site, restricted to each site's
# shallowest thermistor.
#
# Readings are averaged within each (date, site, hour, thermistor depth) cell
# and the cell is stamped with the top of that hour.
#
# Changes from the previous version:
# * The shallowest-depth filter is now applied per site. It used to compare
#   against the minimum depth over all sites, which discarded every row of any
#   site whose shallowest thermistor was deeper than another site's.
# * min() ignores missing depths, so one NA depth no longer empties the table.
# * The second summarise() was removed: it regrouped by keys that map
#   one-to-one onto the first grouping, so it only re-sorted the rows (now done
#   with arrange()) and it averaged the uncertainty without na.rm.
temp_cleaned <- temp %>%
  dplyr::select(startDateTime, siteID, tsdWaterTempMean, thermistorDepth,
                tsdWaterTempExpUncert) %>%
  dplyr::mutate(date = as_date(startDateTime),
                hour = hour(startDateTime)) %>%
  dplyr::group_by(date, siteID, hour, thermistorDepth) %>%
  dplyr::summarize(
    tsdWaterTempMean = mean(tsdWaterTempMean, na.rm = TRUE),
    tsdWaterTempExpUncert = mean(tsdWaterTempExpUncert, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  dplyr::mutate(
    startDateTime = make_datetime(year = year(date), month = month(date),
                                  day = day(date), hour = hour, min = 0,
                                  tz = "UTC")
  ) %>%
  dplyr::select(startDateTime, siteID, thermistorDepth,
                tsdWaterTempMean, tsdWaterTempExpUncert) %>%
  dplyr::arrange(startDateTime, siteID, thermistorDepth) %>%
  # Keep only the shallowest thermistor of each site.
  dplyr::group_by(siteID) %>%
  dplyr::filter(thermistorDepth == min(thermistorDepth, na.rm = TRUE)) %>%
  dplyr::ungroup()
```

```{r}
# Scatter plot of the hourly water-temperature series, one panel per site.
ggplot(temp_cleaned, aes(x = startDateTime, y = tsdWaterTempMean)) +
  geom_point() +
  facet_wrap(vars(siteID)) +
  labs(x = "Date")
```

Clean up to create targets file

```{r}
# Replace NaN (the mean of an empty set of values) by NA; other values pass
# through unchanged.
nan_to_na <- function(v) ifelse(is.nan(v), NA, v)

# Rename the NEON columns to the names used in the targets files.
temp_targets <- rename(
  temp_cleaned,
  time = startDateTime,
  water_temperature = tsdWaterTempMean,
  water_temperature_sd = tsdWaterTempExpUncert,
  depth = thermistorDepth
)

oxygen_targets <- rename(
  oxy_cleaned,
  time = startDateTime,
  dissolved_oxygen = dissolvedOxygen,
  dissolved_oxygen_sd = dissolvedOxygenExpUncert,
  depth = sensorDepth
)

# Temperature targets: join onto a gap-free hourly sequence spanning the
# observed period, so hours without any record appear as rows of NA.
time <- tibble(
  time = seq(min(temp_targets$time), max(temp_targets$time), by = "1 hour")
)

temp_targets <- time %>%
  left_join(temp_targets, by = "time") %>%
  select(time, siteID, water_temperature, water_temperature_sd, depth) %>%
  mutate(
    water_temperature = nan_to_na(water_temperature),
    water_temperature_sd = nan_to_na(water_temperature_sd),
    depth = nan_to_na(depth)
  )

# Oxygen targets: same treatment on the oxygen observation period.
time <- tibble(
  time = seq(min(oxygen_targets$time), max(oxygen_targets$time), by = "1 hour")
)

oxygen_targets <- time %>%
  left_join(oxygen_targets, by = "time") %>%
  select(time, siteID, dissolved_oxygen, dissolved_oxygen_sd, depth) %>%
  mutate(
    dissolved_oxygen = nan_to_na(dissolved_oxygen),
    dissolved_oxygen_sd = nan_to_na(dissolved_oxygen_sd),
    depth = nan_to_na(depth)
  )

# Write both targets tables as gzip-compressed CSV files.
write_csv(temp_targets, "aquatic-temperature-targets.csv.gz")
write_csv(oxygen_targets, "aquatic-oxygen-targets.csv.gz")
```

## Step 3: Generate Null Model

Read in targets and create time series with gaps

```{r}

# Read the oxygen targets. guess_max is raised so that column types are
# guessed from more rows than readr's default.
oxygen_targets <- read_csv("aquatic-oxygen-targets.csv.gz", guess_max = 10000)

# One value per day: the 12:00 hourly value at BARC after 2020-01-01.
# The cutoff is built with as_datetime() so that the date-time column `time`
# is compared with a date-time. The previous code compared it with a Date,
# which mixes two classes with different underlying units (seconds vs. days)
# and is not a reliable date-aware comparison.
oxygen <- oxygen_targets %>%
  filter(siteID == "BARC",
         time > as_datetime("2020-01-01"),
         hour(time) == 12)

if (nrow(oxygen) == 0) {
  stop("No BARC oxygen targets at 12:00 after 2020-01-01; cannot build the forecast input.",
       call. = FALSE)
}

# Hold out the last month of data. %m-% rolls back to the last valid day of
# the month, whereas `- months(1)` returns NA when the resulting date does not
# exist (e.g. one month before March 31).
max_time <- max(oxygen$time) %m-% months(1)
#max_time <- max(oxygen$time) + days(1)

#This is for testing
oxygen <- oxygen %>%
  filter(time < max_time)

start_forecast <- max_time

# This is key here - 16 days are added after the last retained observation for
# the forecast period. Those days have no observations after the join below.
full_time <- tibble(
  time = seq(min(oxygen$time), max(oxygen$time) + days(16), by = "1 day")
)

# Daily series with gaps: explicit join key instead of an implicit natural join.
oxygen <- left_join(full_time, oxygen, by = "time")
```

Get the mean observation uncertainty

```{r}
# Average of the reported observation uncertainties, ignoring missing values.
oxygen_sd <- oxygen %>%
  pull(dissolved_oxygen_sd) %>%
  mean(na.rm = TRUE)
```

Create jags model

```{r}
# JAGS model definition, kept as a string and handed to jags.model() later.
#
# Structure of the model as written below:
# * Priors: the first state x[1] is normal with mean x_ic and precision tau_ic;
#   the process standard deviation sd_add is uniform on
#   [lower_add, upper_add] and is converted to the precision tau_add.
# * Process model: for t = 2..n, x[t] is normal around x[t-1] with precision
#   tau_add (a random walk). x_obs[t] is drawn around x[t] with precision
#   tau_obs; it is only defined for t >= 2.
# * Data model: each of the nobs observations y[t] is normal around the state
#   at position y_index[t] with precision tau_obs.
#
# x_ic, tau_ic, tau_obs, lower_add, upper_add, n, nobs, y and y_index are not
# defined inside the model and must be supplied as data.
RandomWalk = "
model{

  #### Priors
  x[1] ~ dnorm(x_ic,tau_ic)
  tau_add <- 1 / pow(sd_add,2)
  sd_add ~ dunif(lower_add,upper_add)


  #### Process Model
  for(t in 2:n){
    x[t]~dnorm(x[t-1],tau_add)
    x_obs[t] ~ dnorm(x[t],tau_obs)
  }

  #### Data Model
  for(t in 1:nobs){
    y[t] ~ dnorm(x[y_index[t]],tau_obs)
  }

}
"
```

Set up jags model

```{r}
# Full daily time series with gaps (NA where there is no observation,
# including the appended forecast days).
y <- c(oxygen$dissolved_oxygen)
time <- c(oxygen$time)

# Positions in the full series that hold an observation, and the observations
# themselves. y_index lets the data model refer back to the gappy series.
has_obs <- !is.na(y)
y_index <- which(has_obs)
y_gaps <- y[has_obs]

# Initial values for the latent state: linear interpolation between
# observations; rule = 2 extends the first/last observed value to the ends.
init_x <- approx(x = time[has_obs], y = y_gaps, xout = time, rule = 2)$y

# Data passed to the JAGS model. tau_* are precisions (1 / sd^2).
data <- list(y = y_gaps,
             y_index = y_index,
             nobs = length(y_index),
             n = length(y),
             x_ic = 8.620833,
             tau_ic = 51,
             tau_obs = 1 / (oxygen_sd^2),
             lower_add = 0.0001,
             upper_add = 1000)

# One set of initial values per chain, each with its own RNG seed.
nchain <- 3
chain_seeds <- c(200, 800, 1400)
stopifnot(length(chain_seeds) >= nchain)

init <- lapply(seq_len(nchain), function(i) {
  list(sd_add = sd(diff(y_gaps)),
       .RNG.name = "base::Wichmann-Hill",
       .RNG.seed = chain_seeds[i],
       x = init_x)
})

# The number of chains comes from nchain so it always matches the number of
# init lists (it was hard-coded to 3 before).
j.model <- jags.model(file = textConnection(RandomWalk),
                      data = data,
                      inits = init,
                      n.chains = nchain)
```

Sample jags model

```{r}
# First run: 10000 iterations monitoring only sd_add. The result is stored in
# jags.out, which is not referenced again in the visible part of this file.
jags.out <- coda.samples(
  model = j.model,
  variable.names = "sd_add",
  n.iter = 10000
)

# Second run: 10000 further iterations, keeping every 5th draw of the latent
# state x, the process sd and the predicted observations x_obs.
m <- coda.samples(
  model = j.model,
  variable.names = c("x", "sd_add", "x_obs"),
  n.iter = 10000,
  thin = 5
)
```

Use the package tidybayes to clean up the JAGS output

```{r}
# Long table of predicted observations: one row per (day, draw).
# Only draws from chain 1 are kept; the iteration number within that chain
# serves as the ensemble-member index, and the day index is mapped back to its
# timestamp in full_time.
model_output <- m %>%
  spread_draws(x_obs[day]) %>%
  ungroup() %>%
  filter(.chain == 1) %>%
  mutate(time = full_time$time[day]) %>%
  select(time, oxygen = x_obs, ensemble = .iteration)
```

Does the output look reasonable?

```{r}
# Observations on the full daily time axis (NA on days without data).
obs <- tibble(time = full_time$time, obs = y)

# Per-day ensemble mean and central 95% interval.
forecast_summary <- model_output %>%
  group_by(time) %>%
  summarise(
    mean = mean(oxygen),
    upper = quantile(oxygen, 0.975),
    lower = quantile(oxygen, 0.025),
    .groups = "drop"
  )

# Mean line, interval ribbon, and the observations as red points.
ggplot(forecast_summary, aes(x = time, y = mean)) +
  geom_line() +
  geom_ribbon(
    aes(ymin = lower, ymax = upper),
    alpha = 0.2, color = "lightblue", fill = "lightblue"
  ) +
  geom_point(data = obs, aes(x = time, y = obs), color = "red") +
  labs(x = "Date", y = "Oxygen concentration (mg/L)")
```

Save forecast (only future days)

```{r}
# Keep only the days after the forecast start and attach the identifying
# columns written with every forecast row.
forecast_saved <- model_output %>%
  filter(time > start_forecast) %>%
  mutate(
    data_assimilation = 0,
    forecast_iteration_id = start_forecast,
    forecast_project_id = "EFInull"
  )

# The output file is named after the forecast start date.
forecast_file_name <- paste0(
  "aquatics-oxygen-EFInull-", as_date(start_forecast), ".csv"
)
write_csv(forecast_saved, forecast_file_name)
```

## Step 3b: Generate metadata

```{r eval = FALSE}
library(EML)
library(EFIstandards)

#Note that the units of oxygen is wrong (need to get working)

# Name of the forecast file written in Step 3. The metadata previously
# described "aquatics-EFInull-<date>.csv", a file this workflow never writes.
forecast_csv <- paste0("aquatics-oxygen-EFInull-", as_date(start_forecast), ".csv")

# forecast_saved has no forecast_issue_time column, so the issue date is
# defined here instead of being read from the table.
# NOTE(review): the date on which this chunk runs is used -- confirm that this
# is the intended issue time.
forecast_issue_time <- Sys.Date()

# One row per column of the saved forecast file, in the file's column order
# (time, oxygen, ensemble, data_assimilation, forecast_iteration_id,
# forecast_project_id). The previous table listed a forecast_issue_time column
# that is not in the file, had ensemble and oxygen swapped, and carried the
# placeholder text "numberType" as the number type of `time`.
attributes <- tibble::tribble(
  ~attributeName, ~attributeDefinition, ~unit, ~formatString, ~numberType, ~definition,
  "time",          "time",                       "year",     "YYYY-MM-DD", NA, NA,
  "oxygen",     "oxygen concentration", "dimensionless", NA,  "real", NA,
  "ensemble",      "index of ensemble member",   "dimensionless",    NA,         "integer", NA,
  "data_assimilation",     "Flag whether time step included data assimilation", "dimensionless", NA, "integer", NA,
  "forecast_iteration_id",     "ID for specific forecast cycle", NA, NA,  NA, "forecast id",
  "forecast_project_id",     "ID for forecasting project", NA, NA,  NA, "project id"
)
# col_classes has one entry per attribute row, in the same order.
attrList <- set_attributes(attributes,
                           col_classes = c("Date", "numeric", "numeric",
                                           "numeric", "character", "character"))
physical <- set_physical(forecast_csv)

dataTable <- eml$dataTable(
  entityName = forecast_csv,
  entityDescription = "Forecast of oxygen using a null model",
  physical = physical,
  attributeList = attrList)

me <- list(individualName = list(givenName = "Quinn",
                                 surName = "Thomas"),
           electronicMailAddress = "rqthomas@vt.edu",
           id = "https://orcid.org/0000-0003-1282-7825"
           )

# Temporal and spatial coverage. Arguments are spelled out in full rather than
# relying on partial argument matching.
# NOTE(review): the bounding box and altitude values look like placeholder
# example values -- confirm them against the actual location of the site.
coverage <- set_coverage(
  beginDate = as_date(min(forecast_saved$time)),
  endDate = as_date(max(forecast_saved$time)),
  geographicDescription = "Barco Lake",
  westBoundingCoordinate = -122.44, eastBoundingCoordinate = -117.15,
  northBoundingCoordinate = 37.38, southBoundingCoordinate = 30.00,
  altitudeMinimum = 160, altitudeMaximum = 330,
  altitudeUnits = "meter"
  )

keywordSet <- list(
  list(
    keywordThesaurus = "EFI controlled vocabulary",
    keyword = list("forecast",
                   "oxygen",
                   "timeseries")
  )
)

# Number of ensemble members in the saved forecast; reported for every
# uncertainty source that is propagated as an ensemble.
ensemble_size <- length(unique(forecast_saved$ensemble))

initial_conditions <- list(
  # Possible values: no, contains, data_driven, propagates, assimilates
  uncertainty = "assimilates",
  # Number of parameters / dimensionality
  complexity = 1,
  propagation = list(
    type = "ensemble",
    size = ensemble_size
  ),
  assimilation = list(
    type = "State-space MCMC",
    reference = "none",
    complexity = 1
  )
)

parameters <- list(
  uncertainty = "assimilates",
  complexity = 1,
  propagation = list(
    type = "ensemble",
    size = ensemble_size
  ),
  assimilation = list(
    type = "State-space MCMC",
    reference = "none",
    complexity = 1
  )
)

random_effects <- list(
  uncertainty = "no"
)

process_error <- list(
  uncertainty = "assimilates",
  propagation = list(
    type = "ensemble",
    size = ensemble_size
  ),
  assimilation = list(
    type = "State-space MCMC",
    reference = "none",
    complexity = 1
  ),
  complexity = 1,
  covariance = FALSE
)

drivers <- list(
  uncertainty = "no"
)

model_description <- list(
  name = "Persistence null state-space model",
  type =  "empirical",
  repository = "none")

additionalMetadata <- eml$additionalMetadata(
  #  describes="forecast",  ## not sure how to find the correct ID for this to be valid
  metadata = list(
    forecast = list(
      timestep = "1 day", ## should be udunits parsable; already in coverage -> temporalCoverage?
      forecast_horizon = "16 days",
      initial_conditions = initial_conditions,
      parameters = parameters,
      random_effects = random_effects,
      process_error = process_error,
      drivers = drivers,
      forecast_issue_time = forecast_issue_time,
      forecast_iteration_id = forecast_saved$forecast_iteration_id[1],
      forecast_project_id = forecast_saved$forecast_project_id[1],
      metadata_standard_version = "0.2",
      model_description = model_description
    ) # forecast
  ) # metadata
) # eml$additionalMetadata

# The stray trailing comma after the last argument was removed.
dataset <- eml$dataset(
  title = "Aquatic null model",
  creator = me,
  contact = list(references = me$id),
  pubDate = forecast_issue_time,
  intellectualRights = "MIT",
  abstract =  "Aquatic null model",
  dataTable = dataTable,
  keywordSet = keywordSet,
  coverage = coverage
)

my_eml <- eml$eml(dataset = dataset,
                  additionalMetadata = additionalMetadata,
                  packageId = forecast_saved$forecast_project_id[1],  #Is this the ForecastProject_ID?
                  system = "uuid"
)

# Validate against the EML schema, then against the EFI forecast standard.
eml_validate(my_eml)

EFIstandards::forecast_validator(my_eml)

write_eml(my_eml, paste0("aquatics-EFInull-",as_date(start_forecast),"-eml.xml"))
```


## Step 4: Score forecast

Read in forecast and convert to the matrix format that is needed by the crps_sample function

```{r}
# Re-read the saved forecast from disk.
forecast <- read_csv(forecast_file_name)

# Reshape to a matrix with one row per forecast time and one column per
# ensemble member; the time column itself is dropped.
dat <- forecast %>%
  select(time, oxygen, ensemble) %>%
  pivot_wider(names_from = ensemble, values_from = oxygen) %>%
  select(-time) %>%
  as.matrix()

```

Read in observations

```{r}
# Distinct forecast timestamps, in order of first appearance in the file.
forecast_time <- unique(forecast$time)

# Read the targets with the same guess_max used when this file is read in
# Step 3, so both reads guess column types from the same number of rows.
aquatic_targets <- read_csv("aquatic-oxygen-targets.csv.gz", guess_max = 10000)

# BARC observations that fall exactly on a forecast timestamp.
oxygen_evaluate <- aquatic_targets %>%
  filter(siteID == "BARC",
         time %in% forecast_time)
```

```{r}

# One row per forecast timestamp; observations are NA where none is available.
# forecast_time is reused instead of recomputing unique(forecast$time), and the
# join key is stated explicitly rather than left to an implicit natural join.
forecast_time1 <- tibble(time = forecast_time)
obs <- left_join(forecast_time1, oxygen_evaluate, by = "time")

# Ensemble-mean forecast next to the observations, in long format for plotting.
# rowMeans(dat) relies on the rows of dat being in the order of forecast_time.
d <- tibble(time = forecast_time,
            forecast = rowMeans(dat),
            observations = obs$dissolved_oxygen) %>%
  pivot_longer(-time, names_to = "variable", values_to = "values")

ggplot(d, aes(x = time, y = values, color = variable)) +
  geom_point() +
  geom_line() +
  labs(x = "date", y = "Oxygen concentrations (mg/L)")
```

Use the CRPS to score the forecast at each forecast horizon

```{r}
#crps_sample can't handle NAs so only use the forecast rows that have
#corresponding observations

# Observations that can actually be scored: rows whose value is missing are
# dropped here, since a row can be present in the targets with an NA value.
obs_scored <- oxygen_evaluate %>%
  filter(!is.na(dissolved_oxygen))

# Pick the forecast row of every scored observation by timestamp, so rows of
# dat and elements of y are aligned explicitly. drop = FALSE keeps dat a
# matrix even when only a single day can be scored.
dat <- dat[match(obs_scored$time, forecast_time), , drop = FALSE]
crps <- scoringRules::crps_sample(y = obs_scored$dissolved_oxygen, dat = dat)

# Score per timestamp; horizon is the time since the first forecast timestamp.
scored <- tibble(time = obs_scored$time,
                 horizon = difftime(obs_scored$time, min(forecast_time), units = "days"),
                 crps = crps)
```

```{r}
# CRPS as a function of forecast horizon.
scored %>%
  ggplot(aes(x = horizon, y = crps)) +
  geom_point() +
  geom_line() +
  labs(x = "Forecast horizon (days)", y = "CRPS")
```

Write score

```{r}
# The score file is named after the date of the first forecast timestamp.
score_file_name <- paste0(
  "aquatics-oxygen-", as_date(min(forecast_time)), ".csv"
)
write_csv(scored, score_file_name)
```