Debias Work.Rmd

---
title: "Debias work"
author: "Adam White"
date: "`r Sys.Date()`"
output: html_document
---

# DATA COLLECTION SECTION 1
```{r}

library(tidyverse) # if not installed run install.packages('tidyverse')
library(arrow) # install.packages('arrow')
library(lubridate) # install.packages('lubridate')
# ---- Observed weather ----
# Read the observed meteorology table straight from the FLARE endpoint.
obs_met <- read_csv(
  "https://s3.flare-forecast.org/targets/fcre_v2/fcre/observed-met_fcre.csv"
)

# ---- Forecasted weather ----
# Handle on the S3 bucket holding the NOAA GEFS stage-2 parquet files.
forecast_dir <- arrow::s3_bucket(
  bucket = "drivers/noaa/gefs-v12/stage2/parquet/0",
  endpoint_override = "s3.flare-forecast.org",
  anonymous = TRUE
)
forecast_dates <- lubridate::as_date("2021-01-20")

# Open the dataset lazily; printing it lists the available column names.
forecast_ds <- arrow::open_dataset(forecast_dir)
forecast_ds

# The dataset is VERY large: filter first (other columns can be used in the
# filter as well) and only then collect() the result into local memory.
forecasted_met <- forecast_ds |>
  dplyr::filter(
    site_id == "fcre", # Falling Creek Reservoir (site_id code)
    reference_datetime %in% forecast_dates
  ) |>
  dplyr::collect()

# ---- Join forecast and observations ----
# Columns identifying one forecast row, and the variables carried forward.
# ('site_id' and 'height' are intentionally not kept.)
id_vars <- c("horizon", "parameter", "reference_datetime", "datetime")
met_vars <- c(
  "air_temperature",
  "air_pressure",
  "relative_humidity",
  "surface_downwelling_longwave_flux_in_air",
  "surface_downwelling_shortwave_flux_in_air",
  "precipitation_flux",
  "wind_speed"
)

# One column per forecast variable so wind speed can be derived row-wise.
forecast_wide <- tidyr::pivot_wider(
  forecasted_met,
  id_cols = c(horizon, parameter, reference_datetime, datetime),
  names_from = variable,
  values_from = prediction
)

# Wind speed is the magnitude of the eastward/northward wind components.
forecast_wide <- dplyr::mutate(
  forecast_wide,
  wind_speed = sqrt(eastward_wind^2 + northward_wind^2)
)
forecast_wide <- dplyr::select(
  forecast_wide,
  dplyr::all_of(c(id_vars, met_vars))
)

# Back to long format (variable / prediction) to match the observed table.
forecasted_met <- tidyr::pivot_longer(
  forecast_wide,
  cols = dplyr::all_of(met_vars),
  names_to = "variable",
  values_to = "prediction"
)

# Keep only forecast rows with an observation at the same datetime/variable.
met_joined <- dplyr::inner_join(
  forecasted_met,
  obs_met,
  by = c("datetime", "variable")
)
```


## DATA MANIPULATION
```{r}
# Build, for each covariate of interest, an (hours x ensemble members) matrix
# of predictions from `met_joined`.
# NOTE(review): the object names say "june", but the forecast date selected in
# the data-collection chunk is 2021-01-20 -- confirm which period is intended.

# Position, among the ensemble-member columns, of the final member; it is
# removed because it contains NA values.
na_member_col <- 31

# Rows of `joined` belonging to one variable, ordered by ensemble member
# (`parameter`).
ensemble_rows <- function(joined, var_name) {
  arrange(joined[which(joined$variable == var_name), ], parameter)
}

# Wide table with one row per (horizon, datetime) and one column per ensemble
# member, filled from the column named by `values_col`.
widen_members <- function(long_tbl, values_col) {
  pivot_wider(
    data = long_tbl,
    id_cols = c(horizon, datetime),
    names_from = parameter,
    values_from = all_of(values_col)
  )
}

# Wide prediction table for one variable, rows ordered by datetime (not by
# horizon).
prediction_table <- function(joined, var_name) {
  arrange(
    widen_members(ensemble_rows(joined, var_name), "prediction"),
    datetime
  )
}

# Drop the two id columns (horizon, datetime) and then the NA-bearing final
# ensemble member, leaving only usable member columns.
member_columns <- function(wide_tbl) {
  wide_tbl[-1:-2][-na_member_col]
}

# Air temperature also supplies the observation table and the horizon column.
air_temp_long <- ensemble_rows(met_joined, "air_temperature")
obs_june_temp <- widen_members(air_temp_long, "observation")
air_temp_wide <- arrange(widen_members(air_temp_long, "prediction"), datetime)
horizon <- air_temp_wide[1]
air_temp_june <- member_columns(air_temp_wide)

# Remaining covariates follow the same recipe.
humid_june <- member_columns(
  prediction_table(met_joined, "relative_humidity")
)
wind_june <- member_columns(
  prediction_table(met_joined, "wind_speed")
)
shortwave_june <- member_columns(
  prediction_table(met_joined, "surface_downwelling_shortwave_flux_in_air")
)
air_pres_june <- member_columns(
  prediction_table(met_joined, "air_pressure")
)
longwave_june <- member_columns(
  prediction_table(met_joined, "surface_downwelling_longwave_flux_in_air")
)

# Convert all the data frames to matrices. Plain `<-` is used: these are
# ordinary top-level assignments, so `<<-` is unnecessary.
temp_june <- as.matrix(air_temp_june)
rh_june <- as.matrix(humid_june)
wind_sp_june <- as.matrix(wind_june)
short_rad_june <- as.matrix(shortwave_june)
long_rad_june <- as.matrix(longwave_june)
air_pres_june <- as.matrix(air_pres_june)

# Missing radiation values are present -- set them to 0.0
short_rad_june[is.na(short_rad_june)] <- 0.0
long_rad_june[is.na(long_rad_june)] <- 0.0
```


## observed data collection
```{r}
# Put the observations in datetime order, matching the row order of the
# forecast matrices.
obs_june_temp <- arrange(obs_june_temp, datetime)
# Optional daily subsampling, currently disabled:
# obs_june_temp <- obs_june_temp[which(obs_june_temp$horizon %% 24 == 0), ]

# Take the observation series from the column for ensemble member "1".
# Presumably every member column holds the same observations, since the join
# attaches them by datetime/variable -- verify against obs_met.
obs_june_temp <- obs_june_temp[["1"]]

# Plain `<-`: this is an ordinary top-level assignment, `<<-` is not needed.
obs <- as.numeric(obs_june_temp)
```


```{r}

#' Objective function for a linear ensemble correction
#'
#' Applies `b[1] * X + b[2]` to the ensemble and scores the result against
#' the observations with `scoringRules::logs_sample()`.
#'
#' @param b Numeric vector; `b[1]` is the slope and `b[2]` the intercept.
#' @param X Ensemble predictions, passed (after adjustment) as `dat`.
#' @param Y Observations, passed as `y`.
#' @return The sum of the scores. Infinite scores are replaced by ten times
#'   the mean of the non-infinite scores before summing, so the total stays
#'   usable by an optimiser.
debias <- function(b, X, Y) {
  slope <- b[1]
  intercept <- b[2]
  log_scores <- scoringRules::logs_sample(y = Y, dat = slope * X + intercept)

  # Swap infinite scores for a large penalty derived from the other scores.
  unbounded <- is.infinite(log_scores)
  log_scores[unbounded] <- 10 * mean(log_scores[!unbounded])

  sum(log_scores)
}

# Starting values: slope 1 and intercept 0, i.e. the unadjusted ensemble.
b <- c(1, 0)

# Minimise the summed score over (slope, intercept) with BFGS.
# The result keeps the name `optim` used by the rest of this document; the
# function is called as stats::optim() so the masking is unambiguous.
optim <- stats::optim(
  par = b, fn = debias, X = temp_june, Y = obs, method = "BFGS"
)

# optim() reports failure only through its `convergence` code (0 = success);
# surface it instead of silently using an unconverged fit.
if (optim$convergence != 0) {
  warning(
    "optim() did not converge (code ", optim$convergence, "); ",
    "debiasing coefficients may be unreliable.",
    call. = FALSE
  )
}

b2 <- optim$par

# Ensemble after applying the fitted linear correction.
adj_ens <- b2[1] * temp_june + b2[2]

# Temperature conversion used for plotting; assumes the inputs are in Kelvin.
kelvin_to_fahrenheit <- function(kelvin) {
  (kelvin - 273.15) * 9 / 5 + 32
}

adj_f <- kelvin_to_fahrenheit(adj_ens)

temp_dec <- kelvin_to_fahrenheit(temp_june)

obs_f <- kelvin_to_fahrenheit(obs)
```

```{r}
library(ggmatplot)
# Ensemble members in the leading columns, observations in the last column.
orig <- cbind(temp_dec, obs_f)
adj <- cbind(adj_f, obs_f)

# Line plot of the first (up to) 200 rows of `mat`: every ensemble member in
# blue and the final column (the observations) in black. The row and member
# counts are taken from `mat`, so other matrix sizes work too.
plot_ensemble <- function(mat, title, max_rows = 200) {
  shown_rows <- seq_len(min(max_rows, nrow(mat)))
  n_members <- ncol(mat) - 1
  ggmatplot(
    x = shown_rows,
    y = mat[shown_rows, , drop = FALSE],
    plot_type = "line",
    color = c(rep("blue", n_members), "black"),
    linetype = rep("solid", n_members + 1)
  ) +
    xlab("Hours Ahead") +
    ylab("Air Temperature (F)") +
    ggtitle(title)
}

plt <- plot_ensemble(orig, "Biased Ensembles and Observed Temperature")

plt3 <- plot_ensemble(adj, "Debiased Ensembles and Observed Temperature")

# par(mfrow = ...) only affects base graphics and has no effect on ggplot
# objects, so it is not used; printing the two plots in turn places them one
# after the other in the rendered document.
plt
plt3
```


```{r}


# Spread modification scalar as a function of forecast horizon (days 1-35):
# 0.6 + 2.5 * exp(-0.15 * day).
spread_curve <- data.frame(day = 1:35)
spread_curve$scalar <- 0.6 + 2.5 * exp(-0.15 * spread_curve$day)

# The colour is set as a constant outside aes(). Inside aes() the string
# 'black' would be mapped as a data value, giving a default-palette line and
# a legend rather than a black line.
p4 <- ggplot(spread_curve, aes(x = day, y = scalar)) +
  geom_line(color = "black") +
  xlab("Horizon (Days)") +
  ylab("Spread Modification Scalar") +
  ggtitle("Spread Modification Factor versus Predicted Days Ahead") +
  theme(legend.position = "none")
p4


```