-
Notifications
You must be signed in to change notification settings - Fork 0
/
Capstone Functions.R
362 lines (263 loc) · 13.1 KB
/
Capstone Functions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
# This function doesnt work just yet,
# their is a problem with too many infinite values so the hessian can not be
# estimated.
ensemble_parameters <- function(Ens_Obs_Joined){
# DATA MANIPULATION ---------------------------------------------------------
# extracting both predicted and observed air temp from the long format data
air_temp = Ens_Obs_Joined[which(Ens_Obs_Joined$variable == 'air_temperature'),]
# arrange them all by parameter (ensemble num) and ordering by ensemble
air_temp = arrange(air_temp, parameter)
# creating wide tables for both observed data and predicted
obs_temp = pivot_wider(data = air_temp, id_cols = c(horizon, datetime),
names_from = parameter,
values_from = 'observation')
air_temp_ens = pivot_wider(data = air_temp, id_cols = c(horizon, datetime,
reference_datetime),
names_from = parameter,
values_from = 'prediction')
# orderign by date time not horizon
air_temp_ens = arrange(air_temp_ens, datetime)
# removing the horizon, datetime, and final column (NA present)
horizon = air_temp_ens$horizon #saving for later usage
ref_datetime <- air_temp_ens[-2]
date_time <- air_temp_ens[-3]
air_temp_ens = air_temp_ens[-1:-3]
air_temp_ens = air_temp_ens[-31]
# convert all the dataframes to matrices
pred_air_temp <<- as.matrix(air_temp_ens)
# selecting just one column of observed data
obs_temp = arrange(obs_temp, datetime)
obs_temp = obs_temp$`1`
obs <<- as.numeric(obs_temp)
# OBJECTIVE FUNCTION --------------------------------------------------------
# This is the objective function that is used in the optimization function
#
# INPUT:
# b is the initial set of parameters of length 5
# X is the predicted air temperature ensembles
# Y is the observed air temperature at the specific location
# days is the horizon divided by 24
#
# OUTPUT:
# returns the sum of the log score to be optimized
#
bias_spread_param_optim <- function(b, X, Y, days){
# debiases the ensembles
debiased_ens <- b[1] * X + b[2]
# stores the row means for later use
rowM <- rowMeans(debiased_ens)
# this is the exponential decay function that is used to adjust the spread
# of each horizon in the next step
inflate = b[1] + b[2]*exp(-b[3] * days)
# adjusts each row by the corresponding horizon value found in previous step
debias_inflated_ens <- inflate * (debiased_ens - rowM) + rowM
# sum of log score
score = scoringRules::logs_sample(y = Y, dat = debias_inflated_ens)
score[is.infinite(score)] <- mean(
score[!is.infinite(scoringRules::logs_sample(y = obs, dat = debias_inflated_ens))])
return(sum(score))
}
# PARAMETER OPTIMIZATION ----------------------------------------------------
# initial vector of parameters
b <- c(1,0,0.6, 2.5, 0.15)
# setting upper and lower bounds so optim values are finite
# optimization call to find the best parameters for the above objective
# function
bias_horizon_optim <- optim(par = b, fn = bias_spread_param_optim,
X = pred_air_temp, Y = obs,
days = (horizon/24), method = "BFGS"
)
return(bias_horizon_optim$par)
}
# -----------------------------------------------------------------------------
# This function optimizes both the debiasing values and the horizon spread
# values in separate objective functions. It takes an input of the inner joined
# ensemble data and observations in long table format.
#
# INPUT:
# Ens_Obs_Joined is a inner joined long format table that includes both
# NOAA GEFS air temperature predictions and the temperature recordings for
# the specific location the user would like to adjust the ensembles for.
#
# OUTPUT:
# Parameters is a vector of length 5, with the first two entries being the
# slope and intercept that debias the ensemble. The next 3 entries are used
# in the exponential decay function. These parameters can be passed into
# the ensemble_converter() function to adjust the ensemble.
ensemble_parameters_seperate <- function(Ens_Obs_Joined){
# DATA MANIPULATION ---------------------------------------------------------
# extracting both predicted and observed air temp from the long format data
air_temp = Ens_Obs_Joined[which(Ens_Obs_Joined$variable == 'air_temperature'),]
# arrange them all by parameter (ensemble num) and ordering by ensemble
air_temp = arrange(air_temp, parameter)
# creating wide tables for both observed data and predicted
obs_temp = pivot_wider(data = air_temp, id_cols = c(horizon, datetime),
names_from = parameter,
values_from = 'observation')
air_temp_ens = pivot_wider(data = air_temp, id_cols = c(horizon, datetime,
reference_datetime),
names_from = parameter,
values_from = 'prediction')
# orderign by date time not horizon
air_temp_ens = arrange(air_temp_ens, datetime)
# removing the horizon, datetime, and final column (NA present)
horizon = air_temp_ens$horizon #saving for later usage
ref_datetime <- air_temp_ens[-2]
date_time <- air_temp_ens[-3]
air_temp_ens = air_temp_ens[-1:-3]
air_temp_ens = air_temp_ens[-31]
# convert all the dataframes to matrices
pred_air_temp <<- as.matrix(air_temp_ens)
# selecting just one column of observed data
obs_temp = arrange(obs_temp, datetime)
obs_temp = obs_temp$`1`
obs <<- as.numeric(obs_temp)
# OBJECTIVE FUNCTION --------------------------------------------------------
# This is the objective function that is used in the optimization function
#
# INPUT:
# b is the initial set of parameters of length 5
# X is the predicted air temperature ensembles
# Y is the observed air temperature at the specific location
# days is the horizon divided by 24
#
# OUTPUT:
#
# returns the sum of the log score to be optimized
#
spread_param_optim <- function(b, X, Y, days){
# stores the row means for later use
rowM <- rowMeans(X)
# this is the exponential decay function that is used to adjust the spread
# of each horizon in the next step
inflate = b[1] + b[2]*exp(-b[3] * days)
# adjusts each row by the corresponding horizon value found in previous step
debias_inflated_ens <- inflate * (X - rowM) + rowM
# sum of log score
return(sum(scoringRules::logs_sample(y = Y, dat = debias_inflated_ens)))
}
# This function attempts to debias the ensembles to better fit the recorded
# temperatures. It is an objective function that is used
bias_param_optim <- function(b, X, Y){
# debias function
X1 <- b[1] * X + b[2]
# this removes INF values and sets them to the mean to the optim can keep
# looking for the minimum
score = scoringRules::logs_sample(y = Y, dat = X1)
score[is.infinite(score)] <- mean(
score[!is.infinite(scoringRules::logs_sample(y = obs, dat = X1))])
return(sum(score))
}
# PARAMETER OPTIMIZATION ----------------------------------------------------
# initial vector of parameters
b1 <- c(1,0)
b2 <- c(0.6, 2.5, 0.15)
# optimization call to find the best parameters for the above objective
# function
bias_optim <- optim(par = b1, fn = bias_param_optim, X = pred_air_temp,
Y = obs, method = "BFGS")
# debiases the ensembles
adj_ensemble <- bias_optim$par[1] * pred_air_temp + bias_optim$par[2]
# optimization function to find the optimal horizon spread values
horizon_optim <- optim(par = b2, fn = spread_param_optim,
X = adj_ensemble, Y = obs,
days = (horizon/24), method = "L-BFGS")
# joining the set of optimized parameters
b <- c(bias_optim$par, horizon_optim$par, horizon_optim$value)
return(b)
}
# ------------------------------------------------------------------------------
# This function accepts two parameters, one in the same form as above, a long
# format table with the NOAA GEFS predicted air temperature, and a length 5
# 5 vector found in the previous function. This function adjusts the predicted
# temperature ensembles, and then returns that data frame, with the corresponding
# horizon, datetime, and reference datetime binded to the front of the data.
#
# INPUT:
# Ens_Obs_Joined is a inner joined long format table that includes NOAA
# GEFS air temperature predictions. It does not need to include the observed
# air temperature.
# Parameters is a length 5 vector that adjusts the ensembles. This is most
# likeley found in the previous function.
#
# TODO: switch to ensemble and observed data, then inner join them
# Output:
# Returns an adjusted air temperature ensemble, with horizon, datetime,
# and reference datetime binded to the front of the data.
ensemble_converter <- function(Ens_Obs_Joined, Parameters){
# DATA MANIPULATION ---------------------------------------------------------
# extracting both predicted and observed air temp from the long format data
air_temp = Ens_Obs_Joined[which(Ens_Obs_Joined$variable == 'air_temperature'),]
# arrange them all by parameter (ensemble num) and ordering by ensemble
air_temp = arrange(air_temp, parameter)
# creating wide tables for both observed data and predicted
obs_temp = pivot_wider(data = air_temp, id_cols = c(horizon, datetime),
names_from = parameter,
values_from = 'observation')
air_temp_ens = pivot_wider(data = air_temp, id_cols = c(horizon, datetime,
reference_datetime),
names_from = parameter,
values_from = 'prediction')
# orderign by date time not horizon
air_temp_ens = arrange(air_temp_ens, datetime)
# removing the horizon, datetime, and final column (NA present)
horizon = air_temp_ens$horizon #saving for later usage
first3_col <- air_temp_ens[1:3]
#ref_datetime <- as.Date.POSIXct(ref_datetime_v, format = '%Y-%m-%d', tz = 'UTC')
#date_time <- as.Date.POSIXct(date_time_v, format = '%Y-%m-%d %H', tz = 'UTC')
air_temp_ens = air_temp_ens[-1:-3]
air_temp_ens = air_temp_ens[-31]
# convert all the dataframes to matrices
pred_air_temp <<- as.matrix(air_temp_ens)
# ENSEMBLE CONVETING --------------------------------------------------------
# changes days to a numeric value to multiply
days = as.numeric(horizon/24)
# debiases the ensembles
Adj_Ensemble <- Parameters[1] * pred_air_temp + Parameters[2]
# calculates the spread factor for each horizon, following the
# exponential decay function
inflation <- Parameters[3] + Parameters[4] * exp(-Parameters[5]*days)
# saves the row means to use in the next step
M <- rowMeans(Adj_Ensemble)
# adjusts the rows of the ensemble by the corresponding horizon spread
# factor
final_ens <- inflation * (Adj_Ensemble - M) + M
# this binds the horizon, datetime, and reference datetime back onto the
# ensembles, for usage later
date_final_ens <- as.data.frame(cbind(first3_col, final_ens))
# returns the adjusted ensembles
# TODO switch to long format
return(date_final_ens)
}
# ------------------------------------------------------------------------------
# This function utilizes the ggmatplot package to plot the columns of the
# the ensemble and the observed values over each other
ensemble_plot <- function(Ensemble, observed, Ens_Obs_Joined, horizon_interval = c(1:840),
ref_datetime){
air_temp = Ens_Obs_Joined[which(Ens_Obs_Joined$variable == 'air_temperature'),]
# arrange them all by parameter (ensemble num) and ordering by ensemble
air_temp = arrange(air_temp, parameter)
air_temp_ens = pivot_wider(data = air_temp, id_cols = c(horizon, datetime,
reference_datetime),
names_from = parameter,
values_from = 'prediction')
air_temp_ens = arrange(air_temp_ens, datetime)
if (is.Date(ref_datetime)){
library(ggmatplot)
# gets the specific indices of the reference date
indices = which(air_temp_ens$reference_datetime == ref_datetime)
adj_obs <- as.data.frame(cbind(Ensemble, observed))
adj_obs <- (adj_obs - 273.15) * 9/5 + 32
one_ref_adj_obs <- adj_obs[indices,]
plt <- ggmatplot(horizon_interval, y = one_ref_adj_obs[horizon_interval,],
plot_type = "line",
linetype = c(rep("solid", 31)),
color = c(rep('black', 30), 'red'))
plt = plt + xlab("Horizon") + ylab("Air Temperature (F)") +
ggtitle("Adjusted Ensemble Versus Observed Temp")
return(plt)
}
else {
print("Please enter a valid reference date time")
}
}