diff --git a/.gitignore b/.gitignore index 9d23429..f68f46b 100644 --- a/.gitignore +++ b/.gitignore @@ -59,3 +59,9 @@ vignettes/*.pdf /data/*.shp.xml /data/*.cpg /data/*.sbn +/data/*.nc +/analysis/temp3d/data +/analysis/temp3d/output +/analysis/temp3d/R/archive +/analysis/temp3d/plots +/analysis/temp3d/paper diff --git a/DESCRIPTION b/DESCRIPTION index 6257b56..21b8c06 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: coldpool Type: Package Title: AFSC/RACE Groundfish Assessment Program EBS and NBS temperature products -Version: 3.2-2 +Version: 3.3-1 Authors@R: c(person("Sean", "Rohan", email = "sean.rohan@noaa.gov", role = c("aut", "cre")), person("Lewis", "Barnett", email = "lewis.barnett@noaa.gov", role = c("aut", "ctb")), person("Kelly", "Kearney", role = c("ctb")), @@ -29,5 +29,6 @@ RoxygenNote: 7.2.3 Imports: colorspace, viridis, raster, - here + here, + tidyr Suggests: knitr, rmarkdown, RODBC, getPass, readr, testthat, cowplot, metR diff --git a/NAMESPACE b/NAMESPACE index f89b951..df18b4f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,10 +8,13 @@ export(cpa_pre2021) export(ebs_bottom_temperature) export(ebs_proj_crs) export(ebs_surface_temperature) +export(estimate_anisotropy) +export(estimate_z_expansion) export(get_connected) export(get_data) export(interpolate_variable) export(interpolation_wrapper) +export(kriging_cv) export(legend_discrete_cbar) export(loocv_gear_temp) export(make_raster_file) diff --git a/NEWS b/NEWS new file mode 100644 index 0000000..0f3292e --- /dev/null +++ b/NEWS @@ -0,0 +1,12 @@ +coldpool 3.3-1 (November 13, 2023) +---------------------------------------------------------------- + +NEW FEATURE + +- Added kriging_cv() function to compare kriging methods + using cross-validation for ordinary kriging,co-kriging, and + regression kriging. +- Added estimate_z_expansion() function to estimate the vertical + expansion factors for 3D kriging based on cross-validation. +- Added estimate_anisotropy() function for estimating 2D + anisotropy in variograms using cross-validation. \ No newline at end of file diff --git a/R/estimate_anisotropy.R b/R/estimate_anisotropy.R new file mode 100644 index 0000000..4895306 --- /dev/null +++ b/R/estimate_anisotropy.R @@ -0,0 +1,46 @@ +#' Estimate 2D variogram anisotropy parameters for kriging using 10-fold cross-validation +#' +#' Uses the coldpool::kriging_loocv() interface. +#' +#' @param x A data.frame containing variables to be interpolated and latitude and longitude coordinates. +#' @param variable_name 1L character vector indicating which varible to interpolate (e.g. 'TEMPERATURE') +#' @param latitude_name 1L character vector indicating the name of the latitude column (e.g., 'LATITUDE') +#' @param longitude_name 1L character vector indicating the name of the longitude column (e.g., 'LONGITUDE') +#' @param input_crs Character vector indicating the coordinate reference system for x (default = "WGS84") +#' @param interpolation_crs Coordinate reference system to use for interpolation. +#' @param kriging_formula Formula to use for kriging. Default (kriging_formula = variable_name ~ 1) is ordinary kriging without covariates. Formula for the gstat interface (see ?gstat::gstat). Covariate example: kriging_formula = variable_name ~ BOTTOM_DEPTH +#' @param starting_values Optional. Starting value of anisotropy parameters to use for optimization. See: ?gstat::vgm. +#' @param folds 1L numeric vector indicating how many folds to use for cross-validation. +#' @param nm Maximum number of nearest neighbor observations to use for interpolation. +#' @param vgm_width Optional. Width of variogram breaks. +#' @param variogram_model Character vector indicating which variogram model to use for interpolation. Valid options are exponential (exp), circular (cir), gaussian (gau), Bessel (bes), Matern (mat), or Stein's Matern (ste). +#' @param seed RNG seed (set.seed(seed)) to use for anisotropy estimation based on cross-validation. +#' @param only_return_anisotropy For internal use. Default = FALSE +#' @export + +estimate_anisotropy <- function(x, + kriging_formula = variable_name ~ 1, + folds = 10, + variogram_model, + starting_values = NULL, + vgm_width = NULL, + nm = Inf, + seed = 19673) { + + stopifnot("estimate_2d_anisotropy: Length of interpolation method must be 1." = length(variogram_model) == 1) + stopifnot("estimate_2d_anisotropy: variogram_model must be one of 'Exp', 'Cir', 'Gau', 'Sph', 'Mat', 'Bes', 'Ste'" = tolower(variogram_model) %in% c('exp', 'cir', 'gau', 'sph', 'mat', 'bes', 'ste')) + + pars <- coldpool::kriging_loocv(x = x, + anisotropy_parameters = starting_values, + estimate_anisotropy = TRUE, + nm = nm, + vgm_width = vgm_width, + kriging_formula = kriging_formula, + location_formula = location_formula, + interpolation_methods = variogram_model, + seed = 19673, + only_return_anisotropy = TRUE) + + return(pars) + +} diff --git a/R/estimate_z_expansion.R b/R/estimate_z_expansion.R new file mode 100644 index 0000000..aa96fa1 --- /dev/null +++ b/R/estimate_z_expansion.R @@ -0,0 +1,230 @@ +#' Estimate vertical expansion factor for 3D kriging +#' +#' Uses L-BFGS algorithm implemented in optim(). +#' +#' @param z_start Starting value for vertical expansion estimation +#' @param x data.frame containing +#' @param cv_index Index of folds for cross validation +#' @param location_formula Formula to use for location argument to gstat. See gstat documentation for description of the formula interface (see ?gstat::gstat). +#' @param kriging_formula Formula to use for kriging. See gstat documentation for description of the formula interface (see ?gstat::gstat). +#' @param anisotopy_parameters Anisotropy parameters to use for ordinary kriging as a 5L vector. See: ?gstat::vgm. If NULL and estimate_anisotropy, anisotropy is estimated. +#' @param vgm_width Optional. +#' @param variogram_model Character vector indicating which variogram model to use for interpolation. Valid options are exponential (exp), circular (cir), gaussian (gau), Bessel (bes), Matern (mat), or Stein's Matern (ste). +#' @param use_for_mse Logical vector indicating whether to use an observation to calculate MSE for cross-validation. +#' @param z_limits Upper and lower bounds for z_expansion, for optimization. +#' @param nm Maximum number of nearest neighbor observations to use for interpolation. +#' @export + +estimate_z_expansion <- function(x, + location_formula, + kriging_formula, + model, + z_start, + cv_index, + anisotropy_parameters = c(0,0,0,1,1), + vgm_width, + nm = Inf, + maxdist = Inf, + use_for_mse = rep(TRUE, length(cv_index)), + z_limits = c(10, 500000)) { + + + + obj_fn <- function(log_z_mult, + dat, + cv_index, + anisotropy_parameters, + location_formula, + kriging_formula, + vgm_width, + model, + use_for_mse) { + + dat[all.vars(location_formula)[3]] <- dat[all.vars(location_formula)[3]] * exp(log_z_mult) + + mod <- gstat::gstat(formula = kriging_formula, + locations = location_formula, + data = dat, + nmax = nm, + maxdist = maxdist) + + vario_both <- gstat::variogram(mod, + width = vgm_width, + alpha = c(0, 45, 90, 135)) + + vario_nugget <- gstat::fit.variogram(gstat::variogram(mod, + width = vgm_width), + gstat::vgm(model)) + + vario_fit <- gstat::fit.variogram(vario_both, + gstat::vgm(model = model, + anis = anisotropy_parameters, + nugget = vario_nugget$psill[1])) + + mod <- gstat::gstat(formula = kriging_formula, + locations = location_formula, + data = dat, + nmax = nm, + model = vario_fit, + maxdist = maxdist) + + cv_results <- gstat::gstat.cv(object = mod, + nfold = cv_index, + verbose = TRUE, + debug.level = 2) + + mse <- mean(cv_results$residual[use_for_mse]^2) + + return(mse) + + } + + results <- optim(par = log(z_start), + fn = obj_fn, + method = "L-BFGS-B", + dat = x, + control = list(trace = 5), + kriging_formula = kriging_formula, + location_formula = location_formula, + anisotropy_parameters = anisotropy_parameters, + lower = log(z_limits[1]), + upper = log(z_limits[2]), + cv_index = cv_index, + vgm_width = vgm_width, + model = model, + use_for_mse = use_for_mse + ) + + results$par <- exp(results$par) + + return(results) + +} + + + +# estimate_z_expansion <- function(x, +# location_formula, +# kriging_formula, +# model, +# z_start, +# cv_index, +# anisotropy_parameters = c(0,0,0,1,1), +# vgm_parameters = NULL, +# vgm_width, +# nm = Inf, +# maxdist = Inf, +# use_for_mse = rep(TRUE, length(cv_index)), +# z_limits = c(10, 500000)) { +# +# obj_fn <- function(log_z_mult, +# dat, +# cv_index, +# anisotropy_parameters, +# location_formula, +# kriging_formula, +# vgm_parameters = NULL, +# vgm_width, +# model, +# use_for_mse) { +# +# dat[all.vars(location_formula)[3]] <- dat[all.vars(location_formula)[3]] * exp(log_z_mult) +# +# if(is.null(vgm_parameters)) { +# mod <- gstat::gstat(formula = kriging_formula, +# locations = location_formula, +# data = dat, +# nmax = nm, +# maxdist = maxdist) +# +# vario_both <- gstat::variogram(mod, +# width = vgm_width, +# alpha = c(0, 45, 90, 135)) +# +# vario_nugget <- gstat::fit.variogram(gstat::variogram(mod, +# width = vgm_width), +# gstat::vgm(model)) +# +# vario_fit <- gstat::fit.variogram(vario_both, +# gstat::vgm(model = model, +# anis = anisotropy_parameters, +# nugget = vario_nugget$psill[1])) +# +# mod <- gstat::gstat(formula = kriging_formula, +# locations = location_formula, +# data = dat, +# nmax = nm, +# maxdist = maxdist, +# model = vario_fit) +# +# cv_results <- gstat::gstat.cv(object = mod, +# nfold = cv_index, +# verbose = TRUE, +# debug.level = 2) +# +# mse <- mean(cv_results$residual[use_for_mse]^2) +# +# } else { +# +# stopifnot("estimate_z_expansion: vgm_parameters must be a list or named vector containing numeric values named nugget, psill, and range variables." = all(c("nugget", "psill", "range") %in% names(vgm_parameters))) +# +# vario_fixed <- gstat::vgm(model = model, +# psill = vgm_parameters[['psill']], +# range = vgm_parameters[['range']], +# anis = anisotropy_parameters, +# nugget = vgm_parameters[['nugget']]) +# +# unique_index <- unique(cv_index) +# +# resid_error <- numeric() +# +# for(ii in 1:length(unique_index)) { +# +# training <- dat[cv_index != unique_index[ii], ] +# +# mod <- gstat::gstat(formula = kriging_formula, +# locations = location_formula, +# data = training, +# nmax = nm, +# model = vario_fixed, +# maxdist = maxdist) +# +# validation <- dat[cv_index == unique_index[ii] & use_for_mse, ] +# +# fit <- predict(mod, newdata = validation) +# +# resid_error <- fit$var1.pred - validation[[all.vars(kriging_formula)[1]]] +# +# +# } +# +# mse <- mean(resid_error^2) +# +# } +# +# return(mse) +# +# } +# +# results <- optim(par = log(z_start), +# fn = obj_fn, +# method = "L-BFGS-B", +# dat = x, +# control = list(trace = 5), +# kriging_formula = kriging_formula, +# location_formula = location_formula, +# anisotropy_parameters = anisotropy_parameters, +# variogram_parameters = variogram_parameters, +# lower = log(z_limits[1]), +# upper = log(z_limits[2]), +# cv_index = cv_index, +# vgm_width = vgm_width, +# model = model, +# use_for_mse = use_for_mse +# ) +# +# results$par <- exp(results$par) +# +# return(results) +# +# } \ No newline at end of file diff --git a/R/kriging_cv.R b/R/kriging_cv.R new file mode 100644 index 0000000..0f73f69 --- /dev/null +++ b/R/kriging_cv.R @@ -0,0 +1,271 @@ +#' Compare interpolation methods using cross-validation +#' +#' Conducts cross validation of a single predictor using kriging, nearest-neighbor, and inverse-distance weighting interpolation method using gstat. +#' +#' @param x A data.frame containing variables to be interpolated and latitude and longitude coordinates. +#' @param fold Vector of numeric values indicating which observations are assigned to which fold. Default NULL performs leave-one-out cross validation. (see 'nfold' argument in ?gstat::gstat). +#' @param location_formula Formula to use for location argument to gstat. See gstat documentation for description of the formula interface (see ?gstat::gstat). +#' @param kriging_formula Formula to use for kriging. See gstat documentation for description of the formula interface (see ?gstat::gstat). +#' @param anisotopy_parameters Optional. Anisotropy parameters to use for ordinary kriging as a 2L vector for 2D kriging or 5L vector for 3D kriging. See: ?gstat::vgm. If NULL and estimate_anisotropy, anisotropy is estimated. +#' @param estimate_anisotropy Logical indicating whether anisotropy should be estimated for kriging methods based on k-fold cross validation. Default = FALSE +#' @param anisotropy_kfold 1L numeric vector indicating how many folds to use for cross-validation if estimate_anisotropy = TRUE. +#' @param nm Maximum number of nearest neighbor observations to use for interpolation. +#' @param vgm_width Optional. Width of variogram breaks. +#' @param interplation_methods Interpolation methods to use. Valid options are nearest-neighbor, inverse distance weighting, inverse distance weighting using the closest nm stations (idw_nmax), and kriging with and exponential (exp), circular (cir), gaussian (gau), Bessel (bes), Matern (mat), or Stein's Matern (ste) variogram model. +#' @param seed RNG seed (set.seed(seed)) to use for anisotropy estimation based on cross-validation. +#' @export + +kriging_cv <- function(x, + fold = NULL, + kriging_formula, + location_formula, + anisotropy_parameters = NULL, + nm = Inf, + maxdist = Inf, + interpolation_methods = c("nn", "idw", "exp", "cir", "gau", "sph", "mat", "bes", "ste"), + vgm_width = NULL, + estimate_anisotropy = FALSE, + only_return_anisotropy = FALSE, + anisotropy_kfold = 10, + + seed = 19673) { + + stopifnot("kriging_loocv: x must be a data.frame" = "data.frame" %in% class(x)) + + interpolation_methods <- tolower(interpolation_methods) + stopifnot("kriging_loocv: One or more interpolation_methods invalid." = all(interpolation_methods %in% c("nn", "idw", "exp", "cir", "gau", "sph", "mat", "bes", "ste"))) + + interpolation_fits <- vector("list", + length = length(interpolation_methods)) + names(interpolation_fits) <- interpolation_methods + + kriging_methods <- c("Exp", "Cir", "Gau", "Sph", "Mat", "Bes", "Ste")[match(interpolation_methods, c("exp", "cir", "gau", "sph", "mat", "bes", "ste"))] + kriging_methods <- kriging_methods[!is.na(kriging_methods)] + kriging_methods_lowercase <- tolower(kriging_methods) + + # Setup cross validation + if(is.null(fold)) { + fold <- 1:nrow(x) + } + + + if(is.null(vgm_width)) { + warning("kriging_cv: vgm_width not provided. Calculating width of breaks for sample variogram as the 1% quantile of Euclidean distances among points.") + vgm_width <- x |> + sf::st_as_sf(coords = all.vars(location_formula)) |> + sf::st_distance() |> + quantile(probs = 0.01) |> + as.numeric() + } + + mod_idw <- gstat::gstat(formula = kriging_formula, + locations = location_formula, + data = x, + set = list(idp = 2), + nmax = Inf, + maxdist = maxdist) + + if("nn" %in% interpolation_methods) { + mod <- gstat::gstat(formula = kriging_formula, + locations = location_formula, + data = x, + set = list(idp = 0), + nmax = nm, + maxdist = maxdist) + + mod_fit <- gstat::gstat.cv(object = mod, + nfold = fold, + verbose = FALSE, + debug.level = 0, + remove.all = TRUE) + interpolation_fits[['nn']] <- mod_fit$var1.pred + + } + + if("idw" %in% interpolation_methods) { + + mod_fit <- gstat::gstat.cv(object = mod_idw, + nfold = fold, + verbose = FALSE, + debug.level = 0, + remove.all = TRUE) + interpolation_fits[['idw']] <- mod_fit$var1.pred + } + + if(length(kriging_methods) < 1) { + return(cbind(x, + as.data.frame(interpolation_fits))) + } + + + for(ii in 1:length(kriging_methods)) { + + if(estimate_anisotropy) { + + if(is.null(anisotropy_parameters)) { + + message("kriging_cv: Conducting grid search to choose starting parameters for anisotropy estimation.") + anis_ssq <- expand.grid(angle = seq(0,180,15), + ratio = seq(0.05, 1, 0.1), + ssq = as.numeric(NA)) + + for(kk in 1:nrow(anis_ssq)) { + anis_ssq$ssq[kk] <- coldpool:::fit_anisotropy_cv(anis_pars = c(anis_ssq$angle[kk], anis_ssq$ratio[kk]), + x_mod = mod_idw, + dat = x, + kriging_method = kriging_methods[ii], + kriging_formula = kriging_formula, + location_formula = location_formula, + nm = nm, + vgm_width = vgm_width, + vgm_directions = c(0, 45, 90, 135), + seed = seed, + fold = fold) + } + + anis_pars <- as.numeric(anis_ssq[which.min(anis_ssq$ssq), 1:2]) + + message("kriging_cv: Anisotropy starting parameters: ", paste(anis_pars, collapse = ", ")) + + } else { + anis_pars <- anisotropy_parameters + } + + anis_fit <- optim(par = anis_pars, + fn = coldpool:::fit_anisotropy_cv, + method = "L-BFGS-B", + lower = c(0, 1e-5), + upper = c(180, 1), + control = list(parscale = c(1,0.1)), + x_mod = mod_idw, + dat = x, + vgm_width = vgm_width, + kriging_formula = kriging_formula, + location_formula = location_formula, + nm = nm, + kriging_method = kriging_methods[ii], + vgm_directions = c(0, 45, 90, 135), + fold = fold, + seed = seed) + + anis_pars <- anis_fit$par + + # Internal use, for estimating anisotropy + if(only_return_anisotropy) { + return(anisotropy_parameters) + } + + if(class(anisotropy_parameters) == "try-error") { + warning("kriging_loocv: Anisotropy estimation error. Assuming no anisotropy.") + anisotropy_parameters <- NULL + } + + } else { + + if(is.null(anisotropy_parameters)) { + + if(length(all.vars(~ location_formula)) == 3) { + + anis_pars <- c(0, 0, 0, 1, 1) + + } + + } else { + + anis_pars <- anisotropy_parameters + + } + } + + vgm_mod <- gstat::fit.variogram(gstat::variogram(mod_idw, + width = vgm_width), + gstat::vgm(kriging_methods[ii])) + + vgm_mod <- gstat::fit.variogram(gstat::variogram(mod_idw, + width = vgm_width, + alpha = c(0, 45, 90, 135)), + gstat::vgm(model = kriging_methods[ii], + anis = anis_pars, + nugget = vgm_mod$psill[1])) + + mod <- gstat::gstat(formula = kriging_formula, + locations = location_formula, + data = x, + model = vgm_mod, + nmax = nm, + maxdist = maxdist) + + mod_fit <- gstat::gstat.cv(object = mod, + nfold = fold, + verbose = FALSE, + debug.level = 0) + interpolation_fits[[kriging_methods_lowercase[ii]]] <- mod_fit$var1.pred + + } + + return(cbind(x, + as.data.frame(interpolation_fits))) + +} + + + +#' Estimate anisotropy RMSE from k-fold cross validation +#' +#' Internal function for estimating anisotropy based on cross-validation; wrapper around gstat functions. +#' +#' @param anis_pars Anisotropy parameters as a 2L numeric vector indicating the angle and anisotropy ratio, e.g., c(90, 0.5) +#' @param x_mod A gstat object from which a variogram can be computed. +#' @param kriging_method Kriging method for gstat variogram. +#' @param kriging_formula Formula to use for kriging (formula argument in gstat::gstat). +#' @param location_formula Formula to use for location (location argument in gstat::gstat). +#' @param nm nmax for gstat +#' @param vgm_directions Directions for variogram (default = c(0, 45, 90, 135)) +#' @param kfold Number of folds to use for cross-validation (default = 10) +#' @param vgm_width Width of variogram breaks. +#' @param seed Random number seed to use. +#' @noRd + +fit_anisotropy_cv <- function(anis_pars, + x_mod, + kriging_formula, + location_formula, + kriging_method, + nm, + vgm_width, + vgm_directions = c(0, 45, 90, 135), + dat, + fold = NULL, + seed = 19673) { + + if(is.null(fold)) { + fold <- 1:nrow(dat) + } + + vgm_mod <- gstat::fit.variogram(gstat::variogram(x_mod, + width = vgm_width), + gstat::vgm(kriging_method)) + + vgm_mod <- gstat::fit.variogram(gstat::variogram(x_mod, + width = vgm_width, + alpha = vgm_directions), + gstat::vgm(model = kriging_method, + anis = anis_pars, + nugget = vgm_mod$psill[1]) + ) + + mod <- gstat::gstat(formula = kriging_formula, + locations = location_formula, + data = dat, + model = vgm_mod, + nmax = nm) + + set.seed(seed) + ssq <- sum(gstat::gstat.cv(object = mod, + verbose = FALSE, + debug.level = 0, + nfold = fold)$residual^2) + + return(ssq) + +} \ No newline at end of file diff --git a/R/utils.R b/R/utils.R index f65309b..69a726e 100644 --- a/R/utils.R +++ b/R/utils.R @@ -378,3 +378,5 @@ theme_multi_map_blue_strip <- function() { convert_ddm_to_dd <- function(x) { return(floor(x/100) + x%%100/60) } + + diff --git a/analysis/temp3d/R/00_analysis.R b/analysis/temp3d/R/00_analysis.R new file mode 100644 index 0000000..160ad07 --- /dev/null +++ b/analysis/temp3d/R/00_analysis.R @@ -0,0 +1,19 @@ +library(coldpool) +library(gapctd) +library(navmaps) +library(lubridate) + +# Retrieve temperature/depth profiles, low-pass filter temperature data, and calculate average temperature for 1-m depth bins. +# Outputs: data/[region]/profile_data_[region].rds +source(here::here("R", "01_get_data.R")) + +# Thin profile data to 5-m depth intervals, exclude surface measurements (<5 m), and assign each haul to a cross-validation fold. EBS, NBS, and slope data are combined into a single region (EBS) +# Outputs: data/[region]/cv_data_[region].rds +source(here::here("R", "02_setup_crossvalidation_data.R")) + +# Conduct 10-fold cross-validation on temperature interpolation using ordinary kriging, regression kriging, nearest neighbor, and inverse distance weighting interpolation. +# Outputs: output/[region]/2d_interp_cv_temperature_[region].rds +source(here::here("R", "03_interpolate_temperature_2d.R")) + +# Make tables and plots of 2D cross validation results +source(here::here("R", "04_plot_2d_cross_validation_results.R")) diff --git a/analysis/temp3d/R/01_get_data.R b/analysis/temp3d/R/01_get_data.R new file mode 100644 index 0000000..ec3f135 --- /dev/null +++ b/analysis/temp3d/R/01_get_data.R @@ -0,0 +1,205 @@ +library(RODBC) +library(akgfmaps) +library(gapctd) +library(navmaps) + + +get_data_temp3d <- function() { + + channel <- gapctd::get_connected(schema = "AFSC") + + survey_definition_ids <- c(47, 52, 98, 143, 78) + region_id <- c("GOA", "AI", "EBS", "NBS", "SLOPE") + + event_names <- data.frame(EVENT_TYPE_ID = c(3, 4, 6, 8, 9, 15, 16), + EVENT_NAME = c("on_bottom_time", "eq_time", "haulback_time", "doors_up_time", "doors_out_time", "start_haul_time", "end_haul_time")) + + for(ii in 1:length(survey_definition_ids)) { + + dir.create(here::here("data", region_id[ii])) + + hauls <- RODBC::sqlQuery( + channel = channel, + query = paste0( + "select rbh.cruise, rbh.vessel, rbh.haul, rbh.gear_temperature, rbh.bottom_depth, + rbh.surface_temperature, h.gear_depth, h.haul_id, rbh.start_time, rbh.start_latitude, + rbh.start_longitude, rbh.end_latitude, rbh.end_longitude, rbh.performance, e.event_type_id, + e.date_time + from race_data.hauls h, race_data.cruises c, race_data.surveys s, racebase.haul rbh, + race_data.events e + where h.cruise_id = c.cruise_id + and s.survey_id = c.survey_id + and h.haul = rbh.haul + and c.vessel_id = rbh.vessel + and c.cruise = rbh.cruise + and rbh.performance >= 0 + and rbh.abundance_haul = 'Y' + and e.haul_id = h.haul_id + and e.event_type_id in (3, 4, 6, 8, 9, 15, 16) + and s.survey_definition_id = ", + survey_definition_ids[ii] + ) + ) |> + dplyr::mutate(DATE_TIME = lubridate::with_tz(lubridate::force_tz(DATE_TIME, tzone = "UTC"), tzone = "America/Anchorage")) |> + dplyr::inner_join(event_names, by = "EVENT_TYPE_ID") |> + dplyr::select(-EVENT_TYPE_ID) |> + tidyr::pivot_wider(values_from = "DATE_TIME", names_from = "EVENT_NAME") + + hauls$end_haul_time[is.na(hauls$end_haul_time)] <- hauls$doors_up_time[is.na(hauls$end_haul_time)] + 180 + hauls$start_haul_time[is.na(hauls$start_haul_time)] <- hauls$doors_up_time[is.na(hauls$start_haul_time)] - 180 + + + haul_midpoints <- navmaps::start_end_to_midpoint(start_latitude = hauls$START_LATITUDE, + start_longitude = hauls$START_LONGITUDE, + end_latitude = hauls$END_LATITUDE, + end_longitude = hauls$END_LONGITUDE) |> + as.data.frame() + + names(haul_midpoints) <- toupper(names(haul_midpoints)) + + cast_locations <- hauls |> + dplyr::bind_cols(haul_midpoints) |> + tidyr::pivot_longer(cols = c(START_LATITUDE, START_LONGITUDE, END_LATITUDE, END_LONGITUDE, MID_LATITUDE, MID_LONGITUDE)) |> + dplyr::inner_join( + data.frame(name = c("START_LATITUDE", "START_LONGITUDE", "END_LATITUDE", "END_LONGITUDE", "MID_LATITUDE", "MID_LONGITUDE"), + CAST = c("downcast", "downcast", "upcast", "upcast", "bottom", "bottom"), + coord = c("LATITUDE", "LONGITUDE", "LATITUDE", "LONGITUDE", "LATITUDE", "LONGITUDE")) + ) |> + dplyr::select(HAUL_ID, START_TIME, BOTTOM_DEPTH, value, CAST, coord) |> + tidyr::pivot_wider(names_from = c("coord"), values_from = value) + + temp_depth <- RODBC::sqlQuery( + channel = channel, + query = paste0( + "select bt.temperature, bt.depth, bt.date_time, h.haul_id + from race_data.bathythermics bt, race_data.bathythermic_headers bth, race_data.hauls h, + race_data.cruises c, race_data.surveys s + where bth.haul_id = h.haul_id + and bt.bathythermic_header_id = bth.bathythermic_header_id + and h.cruise_id = c.cruise_id + and s.survey_id = c.survey_id + and bt.depth > 0.1 + and bt.temperature > -2.0 + and bt.temperature < 20 + and s.survey_definition_id = ", + survey_definition_ids[ii] + ) + ) |> dplyr::mutate(DATE_TIME = lubridate::with_tz(lubridate::force_tz(DATE_TIME, tzone = "UTC"), tzone = "America/Anchorage")) + + temp_depth <- dplyr::filter(temp_depth, HAUL_ID %in% unique(hauls$HAUL_ID)) + hauls <- dplyr::filter(hauls, HAUL_ID %in% unique(temp_depth$HAUL_ID)) + + unique_haul_id <- sort(unique(temp_depth$HAUL_ID)) + + profile_dat <- data.frame() + + for(jj in 1:length(unique_haul_id)) { + + if(jj%%100 == 0) { + message(paste0(jj, "/", length(unique_haul_id))) + } + + sel_haul_temp <- dplyr::filter(temp_depth, HAUL_ID == unique_haul_id[jj]) + temp_depth <- dplyr::filter(temp_depth, HAUL_ID != unique_haul_id[jj]) + + sel_haul_events <- dplyr::filter(hauls, HAUL_ID == unique_haul_id[jj]) + + if(nrow(sel_haul_temp) < 10) { + next + } + + haul_temp_oce <- oce::as.oce( + data.frame(datetime = sel_haul_temp$DATE_TIME, + temperature = sel_haul_temp$TEMPERATURE, + depth = sel_haul_temp$DEPTH) |> + dplyr::arrange(datetime) |> + dplyr::mutate(timeS = as.numeric(difftime(datetime, min(datetime), units = "secs"))) + ) |> + gapctd::lowpass_filter(variables = "temperature") # Low-pass filter temperature + + haul_temp_oce@metadata$startTime <- min(haul_temp_oce@data$datetime) + + if(survey_definition_ids[ii] %in% c(47, 52)) { + + dc_start <- sel_haul_events$start_haul_time + dc_end <- sel_haul_events$eq_time + uc_start <- sel_haul_events$haulback_time + uc_end <- sel_haul_events$end_haul_time + + } + + if(survey_definition_ids[ii] %in% c(78, 98, 143)) { + + dc_start <- sel_haul_events$start_haul_time + dc_end <- sel_haul_events$on_bottom_time + uc_start <- sel_haul_events$haulback_time + uc_end <- sel_haul_events$end_haul_time + + } + + dc_dat <- NULL + uc_dat <- NULL + + dc_dat <- haul_temp_oce |> + gapctd::section_oce(by = "datetime", start = dc_start, end = dc_end, cast_direction = "downcast") |> + gapctd::bin_average(by = "depth", interpolate_missing = FALSE, exclude_bad_flag = FALSE) + + uc_dat <- haul_temp_oce |> + gapctd::section_oce(by = "datetime", start = uc_start, end = uc_end, cast_direction = "upcast") |> + gapctd::bin_average(by = "depth", interpolate_missing = FALSE, exclude_bad_flag = FALSE) + + + + bottom_dat <- haul_temp_oce |> + gapctd::section_oce(by = "datetime", start = dc_end, end = uc_start, cast_direction = "bottom") + + if(!is.null(dc_dat)) { + dc_dat <- as.data.frame(dc_dat@data) |> + dplyr::select(depth, temperature) + + dc_dat$cast <- "downcast" + } + + if(!is.null(uc_dat)) { + uc_dat <- as.data.frame(uc_dat@data) |> + dplyr::select(depth, temperature) + + uc_dat$cast <- "upcast" + } + + if(!is.null(bottom_dat)) { + bottom_dat <- data.frame(temperature = mean(bottom_dat@data$temperature, na.rm = TRUE), + depth = mean(bottom_dat@data$depth, na.rm = TRUE)) + + bottom_dat$cast <- "bottom" + } + + if(any(c(!is.null(dc_dat), !is.null(uc_dat), !is.null(bottom_dat)))) { + + haul_dat <- dplyr::bind_rows(dc_dat, uc_dat, bottom_dat) + + names(haul_dat) <- toupper(names(haul_dat)) + + haul_dat$HAUL_ID <- unique_haul_id[jj] + + profile_dat <- dplyr::bind_rows(profile_dat, haul_dat) + + } + + + + } + + profile_dat <- dplyr::left_join(profile_dat, cast_locations) + + saveRDS(list(profile = profile_dat, + haul = hauls), + file = here::here("data", + region_id[ii], + paste0("profile_data_", region_id[ii], ".rds"))) + } + + +} + +get_data_temp3d() \ No newline at end of file diff --git a/analysis/temp3d/R/02_setup_crossvalidation_data.R b/analysis/temp3d/R/02_setup_crossvalidation_data.R new file mode 100644 index 0000000..4aa712c --- /dev/null +++ b/analysis/temp3d/R/02_setup_crossvalidation_data.R @@ -0,0 +1,97 @@ +library(coldpool) +library(gapctd) +library(navmaps) +library(lubridate) + + +setup_crossvalidation_data <- function(region, depth_interval, min_depth, nfold, seed = 19673) { + + # UTM zones based on the most frequent zone among survey samples. + crs_by_region <- data.frame(region = c("AI", "GOA", "EBS"), + utmcrs = c("EPSG:32660", "EPSG:32605", "EPSG:32602"), + aeacrs = "EPSG:3338") + + utm_crs <- crs_by_region$utmcrs[crs_by_region$region == region] + + profile_dat <- readRDS(here::here("data", region, paste0("profile_data_", region, ".rds"))) + + if(region == "EBS") { + slope_dat <- readRDS(here::here("data", "slope", paste0("profile_data_slope.rds"))) + nbs_dat <- readRDS(here::here("data", "nbs", paste0("profile_data_nbs.rds"))) + + profile_dat$profile <- dplyr::bind_rows(profile_dat$profile, slope_dat$profile, nbs_dat$profile) + profile_dat$haul <- dplyr::bind_rows(profile_dat$haul, slope_dat$haul, nbs_dat$haul) + } + + profile_dat$haul$YEAR <- lubridate::year(profile_dat$haul$START_TIME) + + unique_years <- sort(unique(profile_dat$haul$YEAR)) + + cross_validation_data <- data.frame() + + for(ii in 1:length(unique_years)) { + + sel_haul <- dplyr::filter(profile_dat$haul, + YEAR == unique_years[ii], + !is.na(GEAR_DEPTH)) + + sel_profile <- dplyr::filter(profile_dat$profile, + HAUL_ID %in% unique(sel_haul$HAUL_ID), + !is.na(LONGITUDE), + !is.na(LATITUDE)) |> + dplyr::inner_join(dplyr::select(sel_haul, HAUL_ID, BOTTOM_DEPTH, GEAR_DEPTH, YEAR), + by = c("HAUL_ID", "BOTTOM_DEPTH")) |> + dplyr::mutate(index = as.numeric(factor(HAUL_ID))) + + sel_profile <- dplyr::filter(sel_profile, CAST %in% c("downcast", "upcast")) |> + dplyr::inner_join(expand.grid(CAST = c("upcast", "downcast"), + DEPTH = seq(min_depth, max(sel_profile$DEPTH), depth_interval)), + by = c("DEPTH", "CAST")) |> + dplyr::bind_rows(dplyr::filter(sel_profile, CAST == "bottom")) |> + dplyr::select(-START_TIME) + + sel_profile <- sf::st_as_sf(sel_profile, + coords = c("LONGITUDE", "LATITUDE"), + crs = "WGS84") |> + sf::st_transform(crs = utm_crs) + + sel_profile <- dplyr::bind_cols(sel_profile, + as.data.frame(sf::st_coordinates(sel_profile)) |> + dplyr::rename(LONGITUDE = X, + LATITUDE = Y)) |> + sf::st_drop_geometry() + + set.seed(seed) + sel_profile$fold <- dplyr::inner_join(sel_profile, + data.frame(HAUL_ID = unique(sel_profile$HAUL_ID), + fold = sample(rep(1:nfold, ceiling(length(unique(sel_profile$HAUL_ID))/nfold)), + size = length(unique(sel_profile$HAUL_ID)), + replace = FALSE)), + by = "HAUL_ID")$fold + + cross_validation_data <- dplyr::bind_rows(cross_validation_data, sel_profile) + + } + + saveRDS(object = cross_validation_data, + file = here::here("data", region, paste0("cv_data_", region, ".rds"))) + +} + +setup_crossvalidation_data(region = "AI", + depth_interval = 5, + min_depth = 5, + nfold = 10, + seed = 19673) + +setup_crossvalidation_data(region = "EBS", + depth_interval = 5, + min_depth = 5, + nfold = 10, + seed = 19673) + +setup_crossvalidation_data(region = "GOA", + depth_interval = 5, + min_depth = 5, + nfold = 10, + seed = 19673) diff --git a/analysis/temp3d/R/03_interpolate_temperature_2d.R b/analysis/temp3d/R/03_interpolate_temperature_2d.R new file mode 100644 index 0000000..c249241 --- /dev/null +++ b/analysis/temp3d/R/03_interpolate_temperature_2d.R @@ -0,0 +1,144 @@ +library(coldpool) +library(gapctd) +library(navmaps) +library(lubridate) + +temperature_2d_cv <- function(region) { + + kriging_methods <- c("exp", "cir", "gau", "sph", "mat", "bes", "ste") + + # UTM zones based on the most frequent zone among survey samples. + crs_by_region <- data.frame(region = c("AI", "GOA", "EBS"), + utmcrs = c("EPSG:32660", "EPSG:32605", "EPSG:32602"), + aeacrs = "EPSG:3338") + + profile_dat <- readRDS(here::here("data", region, paste0("cv_data_", region, ".rds"))) + + + dir.create(here::here("output", region), showWarnings = FALSE) + + unique_years <- sort(unique(profile_dat$YEAR)) + + cv_fits <- data.frame() + + for(ii in 1:length(unique_years)) { + + sel_profile <- dplyr::filter(profile_dat, + YEAR == unique_years[ii]) |> + dplyr::filter(CAST == "bottom") + + message("Inverse distance weighting") + nn_fit <- coldpool::kriging_cv( + x = sel_profile, + kriging_formula = TEMPERATURE ~ 1, + location_formula = ~ LONGITUDE + LATITUDE, + fold = sel_profile$fold, + anisotropy_parameters = NULL, + estimate_anisotropy = FALSE, + nm = Inf, + vgm_width = NULL, + interpolation_methods = c("nn") + ) |> + dplyr::mutate(MODEL = "NN") + + message("Nearest-neighbor") + idw_fit <- coldpool::kriging_cv( + x = sel_profile, + kriging_formula = TEMPERATURE ~ 1, + location_formula = ~ LONGITUDE + LATITUDE, + fold = sel_profile$fold, + anisotropy_parameters = NULL, + estimate_anisotropy = FALSE, + nm = Inf, + vgm_width = NULL, + interpolation_methods = c("idw") + ) |> + dplyr::mutate(MODEL = "IDW") + + message("Kriging base") + krige_base <- coldpool::kriging_cv( + x = sel_profile, + kriging_formula = TEMPERATURE ~ 1, + location_formula = ~ LONGITUDE + LATITUDE, + fold = sel_profile$fold, + anisotropy_parameters = NULL, + estimate_anisotropy = TRUE, + nm = Inf, + vgm_width = NULL, + interpolation_methods = kriging_methods + ) |> + dplyr::mutate(MODEL = "OK") + + message("Kriging linear depth") + krige_with_depth_linear <- coldpool::kriging_cv( + x = sel_profile, + kriging_formula = TEMPERATURE ~ GEAR_DEPTH, + location_formula = ~ LONGITUDE + LATITUDE, + fold = sel_profile$fold, + anisotropy_parameters = NULL, + estimate_anisotropy = TRUE, + nm = Inf, + vgm_width = NULL, + interpolation_methods = kriging_methods + ) |> + dplyr::mutate(MODEL = "RK (Linear depth)") + + message("Kriging quadratic") + krige_with_depth_quadratic <- coldpool::kriging_cv( + x = sel_profile, + kriging_formula = TEMPERATURE ~ GEAR_DEPTH + I(GEAR_DEPTH^2), + location_formula = ~ LONGITUDE + LATITUDE, + fold = sel_profile$fold, + anisotropy_parameters = NULL, + estimate_anisotropy = TRUE, + nm = Inf, + vgm_width = NULL, + interpolation_methods = kriging_methods + ) |> + dplyr::mutate(MODEL = "RK (Quadratic depth)") + + message("Kriging log depth") + krige_with_log_depth <- coldpool::kriging_cv( + x = sel_profile, + kriging_formula = TEMPERATURE ~ I(log(GEAR_DEPTH)), + location_formula = ~ LONGITUDE + LATITUDE, + fold = sel_profile$fold, + anisotropy_parameters = NULL, + estimate_anisotropy = TRUE, + nm = Inf, + vgm_width = NULL, + interpolation_methods = kriging_methods + ) |> + dplyr::mutate(MODEL = "RK (Log depth)") + + + saveRDS(object = dplyr::bind_rows(nn_fit, + idw_fit, + krige_base, + krige_with_depth_linear, + krige_with_depth_quadratic, + krige_with_log_depth), + file = here::here("output", region, paste0("2d_interp_cv_temperature_", region, "_", unique_years[ii], ".rds"))) + + cv_fits <- dplyr::bind_rows(cv_fits, + nn_fit, + idw_fit, + krige_base, + krige_with_depth_linear, + krige_with_depth_quadratic, + krige_with_log_depth) + + } + + + saveRDS(object = cv_fits, + file = here::here("output", region, paste0("2d_interp_cv_temperature_", region, ".rds")) + ) + + return(cv_fits) + +} + +ai_fits <- temperature_2d_cv(region = "AI") +ebs_fits <- temperature_2d_cv(region = "EBS") +goa_fits <- temperature_2d_cv(region = "GOA") diff --git a/analysis/temp3d/R/03_interpolate_temperature_2d_spmodel.R b/analysis/temp3d/R/03_interpolate_temperature_2d_spmodel.R new file mode 100644 index 0000000..021b111 --- /dev/null +++ b/analysis/temp3d/R/03_interpolate_temperature_2d_spmodel.R @@ -0,0 +1,84 @@ +library(coldpool) +library(gapctd) +library(navmaps) +library(lubridate) +library(spmodel) + +fit_2d_spmod <- function(data, + spcov_type, + model_formula, + anisotropy, + ...) { + + model_structure <- expand.grid(spcov_type = spcov_type, + formula = model_formula, + anisotropy = anisotropy) + model_structure$spcov_type <- as.character(model_structure$spcov_type) + model_structure$convergence <- as.numeric(NA) + model_structure$AICc <- as.numeric(NA) + model_structure$mspe <- as.numeric(NA) + model_structure$anis_rotate <- 0 + model_structure$anis_scale <- 1 + model_structure$df <- as.numeric(NA) + + for(jj in 1:nrow(model_structure)) { + + mod_fit <- spmodel::splm(formula = model_structure$formula[[jj]], + data = data, + spcov_type = model_structure$spcov_type[jj], + anisotropy = model_structure$anisotropy[jj], + ...) + + model_structure$anis_rotate[jj] <- mod_fit$coefficients$spcov[['rotate']] + model_structure$anis_scale[jj] <- mod_fit$coefficients$spcov[['scale']] + + model_structure$AICc[jj] <- spmodel::AICc(mod_fit) + model_structure$convergence[jj] <- mod_fit$optim$convergence + model_structure$df[jj] <- mod_fit$npar + model_structure$mspe[jj] <- spmodel::loocv(mod_fit) + + } + + return(model_structure) + +} + + +# Analysis +crs_by_region <- data.frame(region = c("AI", "GOA", "EBS"), + utmcrs = c("EPSG:32660", "EPSG:32605", "EPSG:32602"), + aeacrs = "EPSG:3338") + +for(ii in 1:nrow(crs_by_region)) { + + aicc_table <- data.frame() + + bottom_dat <- readRDS(here::here("data", crs_by_region$region[ii], paste0("cv_data_", crs_by_region$region[ii], ".rds"))) |> + sf::st_as_sf(coords = c("LONGITUDE", "LATITUDE"), + crs = crs_by_region$utmcrs[ii]) |> + dplyr::filter(CAST == "bottom") + + unique_years <- sort(unique(bottom_dat$YEAR)) + + for(jj in 1:length(unique_years)) { + + message("Region: ", crs_by_region$region[ii], ", Year: ", unique_years[jj]) + + aicc_table <- fit_2d_spmod(data = dplyr::filter(bottom_dat, YEAR == unique_years[jj]), + spcov_type = c("exponential", "circular", "gaussian", "spherical", "matern"), + model_formula = c(TEMPERATURE ~ 1, + TEMPERATURE ~ GEAR_DEPTH, + TEMPERATURE ~ GEAR_DEPTH + I(GEAR_DEPTH^2), + TEMPERATURE ~ I(log(GEAR_DEPTH))), + anisotropy = c(TRUE, FALSE), + estmethod = "ml") |> + dplyr::mutate(YEAR = unique_years[jj]) + dplyr::bind_rows(aicc_table) + + saveRDS(object = aicc_table, + file = here::here("output", crs_by_region$region[ii], paste0("2d_interp_aicc_", crs_by_region$region[ii], "_", unique_years[jj], ".rds"))) + + } + +} + diff --git a/analysis/temp3d/R/04_plot_2d_cross_validation_results.R b/analysis/temp3d/R/04_plot_2d_cross_validation_results.R new file mode 100644 index 0000000..bb1091e --- /dev/null +++ b/analysis/temp3d/R/04_plot_2d_cross_validation_results.R @@ -0,0 +1,73 @@ +library(coldpool) +library(gapctd) +library(navmaps) +library(lubridate) + +# Calculate root-mean square error, mean absolute error, and bias +cv_2d_table_plot <- function(region, + model_order = c("NN", + "IDW", + "OK", + "RK (Linear depth)", + "RK (Quadratic depth)", + "RK (Log depth)")) { + + dat <- readRDS(file = list.files(here::here("output", region), + full.names = TRUE, pattern = "2d_interp_cv_temperature")) |> + dplyr::select(-DEPTH) |> + tidyr::pivot_longer(cols = c(ste, idw, exp, cir, gau, sph, mat, bes, nn, idw)) |> + dplyr::filter(!is.na(value)) + + bias_offset <- 0 + if(min(c(dat$value, dat$TEMPERATURE)) <= 0) { + bias_offset <- abs(min(c(dat$value, dat$TEMPERATURE))) + 1e-5 + } + + output <- dat |> + dplyr::group_by(MODEL, name) |> + dplyr::summarise(RMSE = sqrt(mean((value-TEMPERATURE)^2)), + MAE = mean(abs(value-TEMPERATURE)), + BIAS = 10^mean(abs(log10(value+bias_offset)-log10(TEMPERATURE+bias_offset)))) |> + as.data.frame() |> + dplyr::arrange(RMSE) + + dat <- dat |> + dplyr::mutate(MODEL = factor(MODEL, levels = model_order)) + + rmse_plot <- ggplot() + + geom_violin(data = dat, + mapping = aes(x = MODEL, + y = sqrt((TEMPERATURE-value)^2))) + + geom_violin(data = dat, + mapping = aes(x = MODEL, + y = sqrt((TEMPERATURE-value)^2)), + draw_quantiles = c(0.25, 0.75), linetype = 2, fill = NA) + + geom_violin(data = dat, + mapping = aes(x = MODEL, + y = sqrt((TEMPERATURE-value)^2)), + draw_quantiles = c(0.5), fill = NA) + + geom_point(data = dat |> + dplyr::mutate(residual = (TEMPERATURE-value)^2) |> + dplyr::group_by(MODEL) |> + dplyr::summarise(mean_residual = sqrt(mean(residual))), + mapping = aes(x = MODEL, + y = mean_residual)) + + scale_y_continuous(name = "MSE") + + scale_x_discrete(name = "Model") + + theme_bw() + + ragg::agg_png(filename = here::here("plots", "2d_cv_rmse_violin_", region, ".png"), + width = 120, height = 120, units = "mm", res = 300) + print(rmse_plot) + dev.off() + + write.csv(output, file = here::here("plots", paste0("2d_cv_rmse_table_", region, ".csv")), row.names = FALSE) + + return(list(error_table = output, + rmse_plot = rmse_plot)) + +} + +ebs_dat <- cv_2d_table_plot(region = "EBS") +ai_dat <- cv_2d_table_plot(region = "AI") +goa_dat <- cv_2d_table_plot(region = "GOA") diff --git a/analysis/temp3d/R/05_estimate_vertical_expansion.R b/analysis/temp3d/R/05_estimate_vertical_expansion.R new file mode 100644 index 0000000..0388a33 --- /dev/null +++ b/analysis/temp3d/R/05_estimate_vertical_expansion.R @@ -0,0 +1,66 @@ +library(akgfmaps) +library(coldpool) +library(doParallel) +library(foreach) + + +est_vertical_expansion <- function(region, vgm_width) { + + profile_dat <- readRDS(list.files(here::here("data", region), full.names = TRUE, pattern = "cv_data")) |> + dplyr::mutate(cv_index = CAST == "bottom") + + file_paths_aicc <- list.files(here::here("output", region), full.names = TRUE, pattern = "2d_interp_aicc") + + # UTM zones based on the most frequent zone among survey samples. + crs_by_region <- data.frame(region = c("AI", "GOA", "EBS"), + utmcrs = c("EPSG:32660", "EPSG:32605", "EPSG:32602"), + aeacrs = "EPSG:3338") + + utm_crs <- crs_by_region$utmcrs[crs_by_region$region == region] + + + # Setup four clusters and folds for each matchups + doParallel::registerDoParallel(parallel::makeCluster(4)) + + foreach::foreach(ii = 1:length(file_paths_aicc)) %dopar% { + best_model <- readRDS(file_paths_aicc[ii]) + + while(any(best_model$AICc[which.min(best_model$AICc)] - best_model$AICc[-which.min(best_model$AICc)] < -2)) { + best_model <- best_model[-which.max(best_model$AICc), ] + } + + while(!all(best_model$df == max(best_model$df))) { + best_model <- best_model[-which.max(best_model$df), ] + } + + best_model <- best_model[which.min(best_model$AICc), ] + + sel_profile <- dplyr::filter(profile_dat, YEAR == best_model$YEAR) + + z_expansion <- coldpool::estimate_z_expansion(x = sel_profile, + location_formula = ~ LONGITUDE + LATITUDE + DEPTH, + kriging_formula = best_model$formula[[1]], + model = paste0(toupper(substr(best_model$spcov_type, 1, 1)), + tolower(substr(best_model$spcov_type, 2, 3))), + z_start = 5000, + cv_index = sel_profile$fold, + anisotropy_parameters = c(best_model$anis_rotate,0,0,best_model$anis_scale,1), + vgm_width = vgm_width, + nm = Inf, + maxdist = Inf, + use_for_mse = sel_profile$cv_index, + z_limits = c(10, 500000)) + + best_model$z_expansion <- z_expansion$par + + saveRDS(object = best_model, file = here::here("output", region, paste0("3d_vert_expansion_est_", best_model$YEAR, ".rds"))) + + return(best_model) + + } +} + + +est_vertical_expansion(region = "AI", vgm_width = 11000) +est_vertical_expansion(region = "GOA", vgm_width = 11000) +est_vertical_expansion(region = "EBS", vgm_width = 38000) diff --git a/analysis/temp3d/R/fit_anisotropy_new.R b/analysis/temp3d/R/fit_anisotropy_new.R new file mode 100644 index 0000000..48ca56c --- /dev/null +++ b/analysis/temp3d/R/fit_anisotropy_new.R @@ -0,0 +1,43 @@ +fit_anisotropy_new <- function(anis_pars, + x_mod, + kriging_method, + kriging_formula, + location_formula, + nm, + vgm_width, + vgm_directions = c(0, 45, 90, 135), + dat, + kfold = 10, + seed = 19673) { + + if(length(anis_pars) == 5) { + anis_pars <- c(anis_pars[1], 0, 0, anis_pars[2], 0) + } + + vgm_mod <- gstat::fit.variogram(gstat::variogram(x_mod, + width = vgm_width), + gstat::vgm(kriging_method)) + + vgm_mod_2 <- gstat::fit.variogram(gstat::variogram(x_mod, + width = vgm_width, + alpha = vgm_directions), + gstat::vgm(model = kriging_method, + anis = anis_pars, + nugget = vgm_mod$psill[1])) + + + mod <- gstat::gstat(formula = kriging_formula, + locations = location_formula, + data = dat, + model = vgm_mod_2, + nmax = nm) + + set.seed(seed) + ssq <- sum(gstat::gstat.cv(object = mod, + verbose = FALSE, + debug.level = 0, + nfold = kfold)$residual^2) + + return(ssq) + +} diff --git a/analysis/temp3d/R/test_3d.R b/analysis/temp3d/R/test_3d.R new file mode 100644 index 0000000..b1638d6 --- /dev/null +++ b/analysis/temp3d/R/test_3d.R @@ -0,0 +1,257 @@ +library(coldpool) +library(gapctd) +library(navmaps) +library(lubridate) + +# UTM zones based on the most frequent zone among survey samples. +crs_by_region <- data.frame(region = c("AI", "GOA", "EBS"), + utmcrs = c("EPSG:32660", "EPSG:32605", "EPSG:32602"), + aeacrs = "EPSG:3338") + +region = "AI" + +profile_dat <- readRDS(here::here("data", region, paste0("profile_data_", region, ".rds"))) + +if(region == "EBS") { + slope_dat <- readRDS(here::here("data", "slope", paste0("profile_data_slope.rds"))) + nbs_dat <- readRDS(here::here("data", "nbs", paste0("profile_data_nbs.rds"))) + + profile_dat$profile <- dplyr::bind_rows(profile_dat$profile, slope_dat$profile, nbs_dat$profile) + profile_dat$haul <- dplyr::bind_rows(profile_dat$haul, slope_dat$haul, nbs_dat$haul) +} + +profile_dat$haul$YEAR <- lubridate::year(profile_dat$haul$START_TIME) + +unique_years <- sort(unique(profile_dat$haul$YEAR)) + +loocv_fits <- data.frame() + +ii <- 1 + +sel_haul <- dplyr::filter(profile_dat$haul, + YEAR == unique_years[ii], + !is.na(GEAR_DEPTH)) + +sel_profile <- dplyr::filter(profile_dat$profile, + HAUL_ID %in% unique(sel_haul$HAUL_ID), + !is.na(LONGITUDE), + !is.na(LATITUDE)) |> + dplyr::inner_join(dplyr::select(sel_haul, HAUL_ID, BOTTOM_DEPTH, GEAR_DEPTH, YEAR)) |> + dplyr::mutate(LOG_GEAR_DEPTH = log(GEAR_DEPTH)) |> + # dplyr::filter(CAST == "bottom") |> + # dplyr::mutate(index = dplyr::row_number()) + dplyr::mutate(index = as.numeric(factor(HAUL_ID))) + +sel_profile <- dplyr::filter(sel_profile, CAST %in% c("downcast", "upcast")) |> + dplyr::inner_join(expand.grid(CAST = c("upcast", "downcast"), + DEPTH = seq(5, max(sel_profile$DEPTH), 5))) |> + dplyr::bind_rows(dplyr::filter(sel_profile, CAST == "bottom")) + + + +test <- dplyr::bind_rows(dplyr::filter(sel_profile, CAST == "bottom")) + + +# +# unique_casts <- dplyr::filter(sel_profile, +# CAST != "bottom") |> +# dplyr::select(CAST, index) |> +# unique() +# +# semivariance <- data.frame() +# +# for(ii in 1:nrow(unique_casts)) { +# +# sel_cast <- dplyr::inner_join(sel_profile, unique_casts[ii,]) |> +# dplyr::arrange(DEPTH) |> +# dplyr::mutate(dummy_dim = 0) +# +# mod <- gstat::gstat(formula = TEMPERATURE ~ 1, +# locations = ~ DEPTH + dummy_dim, +# data = sel_cast) +# +# vario <- gstat::variogram(mod, width = 1) +# +# +# semivariance <- dplyr::bind_rows(semivariance, +# dplyr::bind_cols( +# data.frame(dist = vario$dist, +# gamma = vario$gamma), +# unique_casts[ii,]) +# ) +# +# } +# +# ggplot(data = semivariance, +# mapping = aes(x = dist, +# y = gamma)) + +# geom_point() + +# geom_smooth() +# +# test <- dplyr::filter(sel_profile, +# index == 1, +# CAST == "upcast") |> +# dplyr::mutate(y_dim = 0) +# +# test_variogram <- gstat::variogram(test_mod, width = 1) +# +# +# test_vgm_fit <- fit.variogram(object = test_variogram, +# gstat::vgm("Exp", range = max(test$DEPTH)) +# ) +# +# plot(test_vgm_fit, cutoff = 25) + +max_depth <- dplyr::group_by(sel_profile, HAUL_ID) |> + dplyr::filter(CAST != "bottom") |> + dplyr::summarise(MAX_DEPTH=max(DEPTH)) +sel_profile <- dplyr::inner_join(sel_profile, max_depth) |> + dplyr::filter((MAX_DEPTH-DEPTH) < 7) + +krige_base <- coldpool::kriging_loocv( + x = dplyr::filter(sel_profile, CAST == "bottom"), + variable_name = "TEMPERATURE", + latitude_name = "LATITUDE", + longitude_name = "LONGITUDE", + elevation_name = NULL, + input_crs = "WGS84", + interpolation_crs = crs_by_region$utmcrs[crs_by_region$region == region], + anisotropy_parameters = NULL, + estimate_anisotropy = TRUE, + nm = Inf, + vgm_width = 20000, + kriging_formula = variable_name ~ log(GEAR_DEPTH), + interpolation_methods = c("exp") +) |> + dplyr::mutate(MODEL = "Base") + +krige_log_depth <- coldpool::kriging_loocv( + x = dplyr::filter(sel_profile, CAST == "bottom"), + variable_name = "TEMPERATURE", + latitude_name = "LATITUDE", + longitude_name = "LONGITUDE", + elevation_name = NULL, + input_crs = "WGS84", + interpolation_crs = crs_by_region$utmcrs[crs_by_region$region == region], + anisotropy_parameters = NULL, + estimate_anisotropy = TRUE, + nm = Inf, + vgm_width = 20000, + kriging_formula = variable_name ~ LOG_GEAR_DEPTH, + interpolation_methods = c("exp") +) |> + dplyr::mutate(MODEL = "log_depth") + +# krige_test_index <- coldpool::kriging_loocv( +# x = sel_profile, +# variable_name = "TEMPERATURE", +# latitude_name = "LATITUDE", +# longitude_name = "LONGITUDE", +# cv_index_name = "index", +# elevation_name = NULL, +# input_crs = "WGS84", +# interpolation_crs = crs_by_region$utmcrs[crs_by_region$region == region], +# anisotropy_parameters = NULL, +# estimate_anisotropy = TRUE, +# nm = Inf, +# vgm_width = NULL, +# kriging_formula = variable_name ~ 1, +# interpolation_methods = c("nn", "idw", "exp") +# ) |> +# dplyr::mutate(MODEL = "test_index") +# +# krige_base$exp == krige_test_index$exp +# krige_base$idw == krige_test_index$idw +# krige_base$nn == krige_test_index$nn +# +# krige_test_vert <- coldpool::kriging_loocv( +# x = dplyr::mutate(sel_profile, DEPTH = DEPTH * 1000), +# variable_name = "TEMPERATURE", +# latitude_name = "LATITUDE", +# longitude_name = "LONGITUDE", +# cv_index_name = "index", +# elevation_name = "DEPTH", +# input_crs = "WGS84", +# interpolation_crs = crs_by_region$utmcrs[crs_by_region$region == region], +# anisotropy_parameters = NULL, +# estimate_anisotropy = FALSE, +# nm = Inf, +# vgm_width = NULL, +# kriging_formula = variable_name ~ 1, +# interpolation_methods = c("nn", "idw", "exp") +# ) |> +# dplyr::mutate(MODEL = "test_vert") +# +# +# krige_test_vert_no_index <- coldpool::kriging_loocv( +# x = dplyr::mutate(sel_profile, DEPTH = DEPTH * 1000), +# variable_name = "TEMPERATURE", +# latitude_name = "LATITUDE", +# longitude_name = "LONGITUDE", +# cv_index_name = NULL, +# elevation_name = "DEPTH", +# input_crs = "WGS84", +# interpolation_crs = crs_by_region$utmcrs[crs_by_region$region == region], +# anisotropy_parameters = NULL, +# estimate_anisotropy = FALSE, +# nm = Inf, +# vgm_width = NULL, +# kriging_formula = variable_name ~ 1, +# interpolation_methods = c("nn", "idw", "exp") +# ) |> +# dplyr::mutate(MODEL = "test_vert_no_index") + +start_time <- Sys.time() +krige_test_vert_anisotropy <- coldpool::kriging_loocv( + x = dplyr::mutate(sel_profile, DEPTH = DEPTH * 1000), + variable_name = "TEMPERATURE", + latitude_name = "LATITUDE", + longitude_name = "LONGITUDE", + cv_index_name = "index", + elevation_name = "DEPTH", + input_crs = "WGS84", + interpolation_crs = crs_by_region$utmcrs[crs_by_region$region == region], + anisotropy_parameters = c(63.433424, 0, 0, 0.565689, 1), #anis = c(p,s) is equivalent to anis = c(p,0,0,s,1) + estimate_anisotropy = FALSE, + nm = Inf, + vgm_width = 20000, + kriging_formula = variable_name ~ 1, + interpolation_methods = c("exp") +) |> + dplyr::mutate(MODEL = "test_3d") +end_time <- Sys.time() +end_time-start_time + +start_time <- Sys.time() +krige_test_vert_anisotropy_gear_depth <- coldpool::kriging_loocv( + x = dplyr::mutate(sel_profile, DEPTH = DEPTH * 10000), + variable_name = "TEMPERATURE", + latitude_name = "LATITUDE", + longitude_name = "LONGITUDE", + cv_index_name = "index", + elevation_name = "DEPTH", + input_crs = "WGS84", + interpolation_crs = crs_by_region$utmcrs[crs_by_region$region == region], + anisotropy_parameters = c(63.433424, 0, 0, 0.565689, 1), #anis = c(p,s) is equivalent to anis = c(p,0,0,s,1) + estimate_anisotropy = FALSE, + nm = Inf, + vgm_width = 20000, + kriging_formula = variable_name ~ LOG_GEAR_DEPTH, + interpolation_methods = c("exp") +) |> + dplyr::mutate(MODEL = "test_3d_gear_depth") +end_time <- Sys.time() +end_time-start_time + + +dplyr::bind_rows(krige_test_vert_anisotropy, + krige_base, + krige_log_depth, + krige_test_vert_anisotropy_gear_depth) |> + dplyr::filter(CAST == "bottom") |> + dplyr::group_by(MODEL) |> + dplyr::summarise(mse = sqrt(mean((TEMPERATURE-exp)^2))) + + + + diff --git a/analysis/temp3d/R/test_aniso.R b/analysis/temp3d/R/test_aniso.R new file mode 100644 index 0000000..9e9c345 --- /dev/null +++ b/analysis/temp3d/R/test_aniso.R @@ -0,0 +1,260 @@ +#' Compare interpolation methods using cross-validation +#' +#' Conducts cross validation of a single predictor using kriging, nearest-neighbor, and inverse-distance weighting interpolation method using gstat. +#' +#' @param x A data.frame containing variables to be interpolated and latitude and longitude coordinates. +#' @param fold Vector of numeric values indicating which observations are assigned to which fold. Default NULL performs leave-one-out cross validation. (see 'nfold' argument in ?gstat::gstat). +#' @param location_formula Formula to use for location argument to gstat. See gstat documentation for description of the formula interface (see ?gstat::gstat). +#' @param kriging_formula Formula to use for kriging. See gstat documentation for description of the formula interface (see ?gstat::gstat). +#' @param anisotopy_parameters Optional. Anisotropy parameters to use for ordinary kriging as a 2L vector for 2D kriging or 5L vector for 3D kriging. See: ?gstat::vgm. If NULL and estimate_anisotropy, anisotropy is estimated. +#' @param estimate_anisotropy Logical indicating whether anisotropy should be estimated for kriging methods based on k-fold cross validation. Default = FALSE +#' @param anisotropy_kfold 1L numeric vector indicating how many folds to use for cross-validation if estimate_anisotropy = TRUE. +#' @param nm Maximum number of nearest neighbor observations to use for interpolation. +#' @param vgm_width Optional. Width of variogram breaks. +#' @param interplation_methods Interpolation methods to use. Valid options are nearest-neighbor, inverse distance weighting, inverse distance weighting using the closest nm stations (idw_nmax), and kriging with and exponential (exp), circular (cir), gaussian (gau), Bessel (bes), Matern (mat), or Stein's Matern (ste) variogram model. +#' @param seed RNG seed (set.seed(seed)) to use for anisotropy estimation based on cross-validation. +#' @export + +kriging_cv <- function(x, + fold = NULL, + kriging_formula, + location_formula, + anisotropy_parameters = NULL, + nm = Inf, + maxdist = Inf, + interpolation_methods = c("nn", "idw", "exp", "cir", "gau", "sph", "mat", "bes", "ste"), + vgm_width = NULL, + estimate_anisotropy = FALSE, + only_return_anisotropy = FALSE, + seed = 19673) { + + + #### + + library(coldpool) + library(gapctd) + library(navmaps) + library(lubridate) + + kriging_methods <- c("exp", "cir", "gau", "sph", "mat", "bes", "ste") + region <- "AI" + # UTM zones based on the most frequent zone among survey samples. + crs_by_region <- data.frame(region = c("AI", "GOA", "EBS"), + utmcrs = c("EPSG:32660", "EPSG:32605", "EPSG:32602"), + aeacrs = "EPSG:3338") + + profile_dat <- readRDS(here::here("data", region, paste0("cv_data_", region, ".rds"))) + + unique_years <- sort(unique(profile_dat$YEAR)) + + cv_fits <- data.frame() + + ii <- 1 + + sel_profile <- dplyr::filter(profile_dat, + YEAR == unique_years[ii]) |> + dplyr::filter(CAST == "bottom") + + x = sel_profile + kriging_formula = TEMPERATURE ~ 1 + location_formula = ~ LONGITUDE + LATITUDE + fold = sel_profile$fold + anisotropy_parameters = NULL + estimate_anisotropy = TRUE + nm = Inf + maxdist = Inf + vgm_width = NULL + interpolation_methods = "gau" + seed = 19673 + #### + + stopifnot("kriging_loocv: x must be a data.frame" = "data.frame" %in% class(x)) + + interpolation_methods <- tolower(interpolation_methods) + stopifnot("kriging_loocv: One or more interpolation_methods invalid." = all(interpolation_methods %in% c("nn", "idw", "exp", "cir", "gau", "sph", "mat", "bes", "ste"))) + + interpolation_fits <- vector("list", + length = length(interpolation_methods)) + names(interpolation_fits) <- interpolation_methods + + kriging_methods <- c("Exp", "Cir", "Gau", "Sph", "Mat", "Bes", "Ste")[match(interpolation_methods, c("exp", "cir", "gau", "sph", "mat", "bes", "ste"))] + kriging_methods <- kriging_methods[!is.na(kriging_methods)] + kriging_methods_lowercase <- tolower(kriging_methods) + + # Setup cross validation + if(is.null(fold)) { + fold <- 1:nrow(x) + } + + + if(is.null(vgm_width)) { + warning("kriging_cv: vgm_width not provided. Calculating width of breaks for sample variogram as the 1% quantile of Euclidean distances among points.") + vgm_width <- x |> + sf::st_as_sf(coords = all.vars(location_formula)) |> + sf::st_distance() |> + quantile(probs = 0.01) |> + as.numeric() + } + + mod_idw <- gstat::gstat(formula = kriging_formula, + locations = location_formula, + data = x, + set = list(idp = 2), + nmax = Inf, + maxdist = maxdist) + + if("nn" %in% interpolation_methods) { + mod <- gstat::gstat(formula = kriging_formula, + locations = location_formula, + data = x, + set = list(idp = 0), + nmax = nm, + maxdist = maxdist) + + mod_fit <- gstat::gstat.cv(object = mod, + nfold = fold, + verbose = FALSE, + debug.level = 0, + remove.all = TRUE) + interpolation_fits[['nn']] <- mod_fit$var1.pred + + } + + if("idw" %in% interpolation_methods) { + + mod_fit <- gstat::gstat.cv(object = mod_idw, + nfold = fold, + verbose = FALSE, + debug.level = 0, + remove.all = TRUE) + interpolation_fits[['idw']] <- mod_fit$var1.pred + } + + if(length(kriging_methods) < 1) { + return(cbind(x, + as.data.frame(interpolation_fits))) + } + + for(ii in 1:length(kriging_methods)) { + + if(estimate_anisotropy) { + + if(is.null(anisotropy_parameters)) { + + message("kriging_cv: Conducting grid search to choose starting parameters for anisotropy estimation.") + anis_ssq <- expand.grid(angle = seq(0,180,15), + ratio = seq(0.05, 1, 0.1), + ssq = as.numeric(NA)) + + for(kk in 1:nrow(anis_ssq)) { + anis_ssq$ssq[kk] <- coldpool:::fit_anisotropy_cv(anis_pars = c(anis_ssq$angle[kk], anis_ssq$ratio[kk]), + x_mod = mod_idw, + dat = x, + kriging_method = kriging_methods[ii], + kriging_formula = kriging_formula, + location_formula = location_formula, + nm = nm, + vgm_width = vgm_width, + vgm_directions = c(0, 45, 90, 135), + seed = seed, + fold = fold) + } + + anis_pars <- as.numeric(anis_ssq[which.min(anis_ssq$ssq), 1:2]) + + dplyr::arrange(anis_ssq, ssq) + + message("kriging_cv: Anisotropy starting parameters: ", anis_pars) + + } else{ + anis_pars <- anisotropy_parameters + } + + anis_fit <- optim(par = anis_pars, + fn = coldpool:::fit_anisotropy_cv, + method = "L-BFGS-B", + lower = c(0, 1e-5), + upper = c(180, 1), + control = list(parscale = c(1,0.1), maxit = 500), + x_mod = mod_idw, + dat = x, + vgm_width = vgm_width, + kriging_formula = kriging_formula, + location_formula = location_formula, + nm = nm, + kriging_method = kriging_methods[ii], + vgm_directions = c(0, 45, 90, 135), + fold = fold, + seed = seed) + + vgm_mod <- gstat::fit.variogram(gstat::variogram(mod_idw, + width = vgm_width), + gstat::vgm(kriging_method)) + + vgm_mod <- gstat::fit.variogram(gstat::variogram(mod_idw, + width = vgm_width, + alpha = vgm_directions), + gstat::vgm(model = kriging_method, + anis = anis_pars, + nugget = vgm_mod$psill[1])) + + + mod <- gstat::gstat(formula = kriging_formula, + locations = location_formula, + data = x, + model = vgm_mod, + nmax = nm, + maxdist = maxdist) + + + # Internal use, for estimating anisotropy + if(only_return_anisotropy) { + return(anisotropy_parameters) + } + + if(class(anisotropy_parameters) == "try-error") { + warning("kriging_loocv: Anisotropy estimation error. Assuming no anisotropy.") + anisotropy_parameters <- NULL + } + + } + + vgm_mod <- gstat::fit.variogram(gstat::variogram(mod_idw, + width = vgm_width), + gstat::vgm(kriging_methods[ii])) + + if(!is.null(anisotropy_parameters)) { + vgm_mod <- gstat::fit.variogram(gstat::variogram(mod_idw, + width = vgm_width, + alpha = c(0, 45, 90, 135)), + gstat::vgm(model = kriging_methods[ii], + anis = anisotropy_parameters, + nugget = vgm_mod$psill[1])) + } else { + vgm_mod <- gstat::fit.variogram(gstat::variogram(mod_idw, + width = vgm_width), + gstat::vgm(model = kriging_methods[ii], + nugget = vgm_mod$psill[1])) + } + + mod <- gstat::gstat(formula = kriging_formula, + locations = location_formula, + data = x, + model = vgm_mod, + nmax = nm, + maxdist = maxdist) + + mod_fit <- gstat::gstat.cv(object = mod, + nfold = fold, + verbose = FALSE, + debug.level = 0) + interpolation_fits[[kriging_methods_lowercase[ii]]] <- mod_fit$var1.pred + + } + + return(cbind(x, + as.data.frame(interpolation_fits))) + +} + + diff --git a/analysis/temp3d/R/test_anisotropy_zexpansion.R b/analysis/temp3d/R/test_anisotropy_zexpansion.R new file mode 100644 index 0000000..bf7cfad --- /dev/null +++ b/analysis/temp3d/R/test_anisotropy_zexpansion.R @@ -0,0 +1,70 @@ + + +z_expansion <- numeric() +rmse <- numeric() +kriging_model <- c("nn", "idw", "exp", "cir", "gau", "sph", "mat", "bes", "ste") + +start_time <- Sys.time() +est_expansion <- estimate_z_expansion_new(z_start = 1e4, + x = x, + cv_index = x$fold, + location_formula = ~ LONGITUDE + LATITUDE + DEPTH, + kriging_formula = TEMPERATURE ~ 1, + vgm_width = 15000, + nm = Inf, + model = "Exp", + use_for_rmse = x$CAST == "bottom") +end_time <- Sys.time() +print(end_time-start_time) + +z_expansion <- c(z_expansion, est_expansion$par) +rmse <- c(rmse, est_expansion$value) + + +best_variogram_model <- dat$error_table$name[which.min(dat$error_table$RMSE)] + +region <- "AI" + +# UTM zones based on the most frequent zone among survey samples. +crs_by_region <- data.frame(region = c("AI", "GOA", "EBS"), + utmcrs = c("EPSG:32660", "EPSG:32605", "EPSG:32602"), + aeacrs = "EPSG:3338") + +profile_dat <- readRDS(here::here("data", region, paste0("profile_data_", region, ".rds"))) + +if(region == "EBS") { + slope_dat <- readRDS(here::here("data", "slope", paste0("profile_data_slope.rds"))) + nbs_dat <- readRDS(here::here("data", "nbs", paste0("profile_data_nbs.rds"))) + + profile_dat$profile <- dplyr::bind_rows(profile_dat$profile, slope_dat$profile, nbs_dat$profile) + profile_dat$haul <- dplyr::bind_rows(profile_dat$haul, slope_dat$haul, nbs_dat$haul) +} + +profile_dat$haul$YEAR <- lubridate::year(profile_dat$haul$START_TIME) + +unique_years <- sort(unique(profile_dat$haul$YEAR)) + +for(ii in 1:length(unique_years)) { + + sel_haul <- dplyr::filter(profile_dat$haul, + YEAR == unique_years[ii], + !is.na(GEAR_DEPTH)) + sel_profile <- dplyr::filter(profile_dat$profile, + HAUL_ID %in% unique(sel_haul$HAUL_ID), + !is.na(LONGITUDE), + !is.na(LATITUDE)) |> + dplyr::inner_join(dplyr::select(sel_haul, HAUL_ID, BOTTOM_DEPTH, GEAR_DEPTH, YEAR)) |> + dplyr::mutate(LOG_GEAR_DEPTH = log(GEAR_DEPTH)) + + anis <- coldpool::estimate_anisotropy(x = dplyr::filter(sel_profile, CAST == "bottom"), + variable_name = "TEMPERATURE", + latitude_name = "LATITUDE", + longitude_name = "LONGITUDE", + input_crs = "WGS84", + interpolation_crs = crs_by_region$utmcrs[crs_by_region$region == region], + nm = Inf, + vgm_width = NULL, + variogram_model = best_variogram_model, + kriging_formula = variable_name ~ 1) + +} \ No newline at end of file diff --git a/analysis/temp3d/R/test_expansion_estimation.R b/analysis/temp3d/R/test_expansion_estimation.R new file mode 100644 index 0000000..33aa8f4 --- /dev/null +++ b/analysis/temp3d/R/test_expansion_estimation.R @@ -0,0 +1,340 @@ +library(coldpool) +library(gapctd) +library(navmaps) +library(lubridate) + +region = "AI" + +kriging_methods <- c("Exp", "Cir", "Gau", "Sph", "Mat", "Bes", "Ste") + +# UTM zones based on the most frequent zone among survey samples. +crs_by_region <- data.frame(region = c("AI", "GOA", "EBS"), + utmcrs = c("EPSG:32660", "EPSG:32605", "EPSG:32602"), + aeacrs = "EPSG:3338") + +profile_dat <- readRDS(here::here("data", region, paste0("cv_data_", region, ".rds"))) + + +dir.create(here::here("output", region), showWarnings = FALSE) + +unique_years <- sort(unique(profile_dat$YEAR)) + +cv_fits <- data.frame() + +ii <- 1 + + sel_profile <- dplyr::filter(profile_dat, + YEAR == unique_years[ii]) #|> + # dplyr::filter(CAST == "bottom") + + x = sel_profile + kriging_formula = TEMPERATURE ~ 1 + location_formula = ~ LONGITUDE + LATITUDE + DEPTH + nm = Inf + maxdist = Inf + +# mod_idw <- gstat::gstat(formula = kriging_formula, +# locations = ~ LONGITUDE + LATITUDE, +# data = dplyr::filter(x, CAST == "bottom"), +# set = list(idp = 2), +# nmax = Inf) +# +# vgm_mod <- gstat::variogram(mod_idw) + +mod_vertical <- gstat::gstat(formula = kriging_formula, + locations = location_formula, + data = x, + nmax = Inf, + maxdist = ceiling(max(x$DEPTH))) + +mod_horizontal <- gstat::gstat(formula = kriging_formula, + locations = location_formula, + data = x, + nmax = Inf, + maxdist = maxdist) + +(vgm_horizontal <- gstat::variogram(mod_horizontal, + beta = 0, + tol.ver = 0)) + +(vgm_vertical <- gstat::variogram(mod_vertical, + beta = 90, + cutoff = 500, + tol.hor = 0)) + +fit_vg_horizontal <- gstat::fit.variogram(vgm_vertical, + gstat::vgm(kriging_methods[ii])) + +fit_vg_vertical <- gstat::fit.variogram(vgm_horizontal, + gstat::vgm(kriging_methods[ii])) + + +mod <- gstat::gstat(formula = kriging_formula, + locations = location_formula, + data = x, + nmax = Inf, + maxdist = ceiling(max(x$DEPTH)), + model = fit_vg_horizontal) |> + gstat::gstat(formula = kriging_formula, + locations = location_formula, + data = x, + nmax = Inf, + maxdist = maxdist, + model = fit_vg_vertical) + + +optim_exp_z <- function(z_start, kriging_formula, location_formula, anis = c(0,0,0,1,1), nmax, maxdist, data, model = "Exp") { + + vgm_fn <- function(range, psill, nugget, dist, model) { + + exp_vgm <- function(nugget = 0, psill, range, dist, a = 1) { + a <- range * a + return(nugget + psill *(1-exp(-1*dist/a))) + } + + sph_vgm <- function(nugget = 0, psill, range, dist, a = 1) { + a <- range*a + val <- numeric(length = length(dist)) + val[dist > a] <- nugget + psill + val[dist <= a] <- nugget + psill * (1.5*dist[dist <= a]/a - 0.5 * (dist[dist <= a]/a)^3) + return(val) + + } + + cir_vgm <- function(nugget = 0, psill, range, dist, a = 1) { + a <- range*a + val <- numeric(length = length(dist)) + val[dist > a] <- nugget + psill + val[dist <= a] <- nugget + psill * ((2*dist[dist <= a])/(pi*a) * sqrt(1-(dist[dist <= a]/a)^2)+(2/pi)*asin(dist[dist <= a]/a)) + return(val) + } + + + gau_vgm <- function(nugget = 0, psill, range, dist, a = 1) { + a <- range*a + return(nugget + psill*(1-exp(-(dist/a)^2))) + } + + mat_vgm <- function(nugget = 0, psill, dist, range, kappa = 0.5) { + + dist <- dist * 1/range + # call some special cases for half fractions of nu + if( kappa == .5){ + return(nugget+psill*exp(-dist)) + } + if( kappa == 1.5){ + return(nugget+psill*(1+dist)*exp(-dist)) + } + if( kappa == 2.5){ + return(nugget+psill*(1+dist+dist^2/3)*exp(-dist)) + } + + con <- 1/(2^(kappa - 1)) * gamma(kappa) + + dist[dist == 0] <- 1e-10 + + return(nugget + psill * con * (dist^kappa) * besselK(dist, kappa)) + } + + bes_vgm <- function(nugget = 0, psill, range, dist, a = 1) { + + dist[dist == 0] <- 1e-10 + val <- nugget + psill*(1-dist/a*besselK(dist/a, nu = 1)) + + return() + + } + + bes_vgm(nugget = 0.1, psill = 1, range = 2, dist = 0:3) + + sel_fn <- switch(model, + "Exp" = exp_vgm, + "Sph" = sph_vgm, + "Cir" = cir_vgm, + "Gau" = gau_vgm, + ) + + return(out) + } + + + z_fn <- function(zz, + kriging_formula, + location_formula, + data, + nmax, + maxdist, + model) { + + zz <- exp(zz) + + print(zz) + vert_max_dist <- ceiling(max(data[all.vars(location_formula)[[3]]])) + + mod_vertical <- gstat::gstat(formula = kriging_formula, + locations = location_formula, + data = data, + nmax = nmax, + maxdist = vert_max_dist) + + vgm_vertical <- gstat::variogram(mod_vertical, + beta = 90, # Vertical + cutoff = vert_max_dist, + tol.hor = 0) + + mod_horizontal <- gstat::gstat(formula = kriging_formula, + locations = location_formula, + data = data, + nmax = nmax, + maxdist = maxdist) + + vgm_horizontal <- gstat::variogram(mod_horizontal, + beta = 0, # + tol.ver = 0) + + fit_vg_vertical <- gstat::fit.variogram(vgm_vertical, + gstat::vgm("Mat")) + + fit_vg_horizontal <- gstat::fit.variogram(vgm_horizontal, + gstat::vgm(model)) + + plot(vgm_horizontal, fit_vg_horizontal) + plot(seq(min(vgm_horizontal$dist), max(vgm_horizontal$dist), 12000), h1, col = "red", type = 'l') + + h1 <- exp_vgm(range = fit_vg_horizontal$range[2], + psill = fit_vg_horizontal$psill[2], + nugget = fit_vg_horizontal$psill[1], + dist = seq(min(vgm_horizontal$dist), max(vgm_horizontal$dist), 12000))#, + model = model) + + h2 <- vgm_fn(range = fit_vg_vertical$range[2], + psill = fit_vg_vertical$psill[2], + nugget = fit_vg_vertical$psill[1], + dist = vgm_horizontal$dist/zz, + model = model) + + return(sum((h2-h1)^2)) + + } + + + best_z <- optim(par = log(z_start), + fn = z_fn, + kriging_formula = kriging_formula, + location_formula = location_formula, + data = data, + nmax = nmax, + maxdist = maxdist, + model = model, + method = "Brent", + lower = c(log(1e-5)), + upper = c(log(5e5)), + control = list(trace = 1)) + + best_z$par <- exp(best_z$par) + + return(best_z) + +} + + + + +z_start = 10 +kriging_formula = TEMPERATURE ~ 1 +location_formula = ~ LONGITUDE + LATITUDE + DEPTH +nmax = Inf +maxdist = Inf +data = x +model = "Exp" + + + + + + + z_fn_3d <- function(zz, + kriging_formula, + location_formula, + data, + nmax, + maxdist, + model) { + + mod_3d <- gstat::gstat(formula = kriging_formula, + locations = location_formula, + data = dplyr::mutate(data, DEPTH = DEPTH * zz), + nmax = nmax, + maxdist = nmax) + + (vgm_3d <- gstat::variogram(mod_3d)) + + vgm_fit_3d <- gstat::fit.variogram(vgm_3d, + gstat::vgm(model)) + + return(sum(vgm_fit_3d$psill)) + + } + + + best_z <- optim(par = log(zz), + fn = z_fn_3d, + kriging_formula = kriging_formula, + location_formula = location_formula, + data = data, + nmax = nmax, + maxdist = nmax, + model = model, + method = "Brent", + lower = c(1e-5), + upper = c(5e5), + control = list(trace = 6)) + + + mod_3d <- gstat::gstat(formula = kriging_formula, + locations = location_formula, + data = dplyr::mutate(data, DEPTH = DEPTH * best_z$par), + nmax = nmax, + maxdist = nmax) + + (vgm_3d <- gstat::variogram(mod_3d)) + + vgm_fit_3d <- gstat::fit.variogram(vgm_3d, + gstat::vgm(model)) + + plot(vgm_3d, vgm_fit_3d) + +gstat::fit.variogram.reml(formula = kriging_formula, + location = location_formula, + data = x, + model = vgm_fit_3d) + + + + +vgm_mod <- gstat::fit.variogram(gstat::variogram(mod_idw, + width = vgm_width), + gstat::vgm(kriging_methods[ii])) + +test <- gstat::fit.variogram.reml(formula = I(TEMPERATURE) ~1, + location = ~ LONGITUDE + LATITUDE + DEPTH, + data = x, + model = vgm_mod) + +mod <- gstat::gstat(formula = kriging_formula, + locations = location_formula, + data = x, + set = list(idp = 2), + nmax = Inf, + model = test, + maxdist = maxdist) + +vgm_mod <- gstat::fit.variogram(gstat::variogram(mod_idw, + width = vgm_width, + alpha = c(0, 45, 90, 135)), + gstat::vgm(model = kriging_methods[ii], + anis = anis_pars, + nugget = vgm_mod$psill[1])) + + + diff --git a/man/estimate_anisotropy.Rd b/man/estimate_anisotropy.Rd new file mode 100644 index 0000000..1ff057f --- /dev/null +++ b/man/estimate_anisotropy.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/estimate_anisotropy.R +\name{estimate_anisotropy} +\alias{estimate_anisotropy} +\title{Estimate 2D variogram anisotropy parameters for kriging using 10-fold cross-validation} +\usage{ +estimate_anisotropy( + x, + kriging_formula = variable_name ~ 1, + folds = 10, + variogram_model, + starting_values = NULL, + vgm_width = NULL, + nm = Inf, + seed = 19673 +) +} +\arguments{ +\item{x}{A data.frame containing variables to be interpolated and latitude and longitude coordinates.} + +\item{kriging_formula}{Formula to use for kriging. Default (kriging_formula = variable_name ~ 1) is ordinary kriging without covariates. Formula for the gstat interface (see ?gstat::gstat). Covariate example: kriging_formula = variable_name ~ BOTTOM_DEPTH} + +\item{folds}{1L numeric vector indicating how many folds to use for cross-validation.} + +\item{variogram_model}{Character vector indicating which variogram model to use for interpolation. Valid options are exponential (exp), circular (cir), gaussian (gau), Bessel (bes), Matern (mat), or Stein's Matern (ste).} + +\item{starting_values}{Optional. Starting value of anisotropy parameters to use for optimization. See: ?gstat::vgm.} + +\item{vgm_width}{Optional. Width of variogram breaks.} + +\item{nm}{Maximum number of nearest neighbor observations to use for interpolation.} + +\item{seed}{RNG seed (set.seed(seed)) to use for anisotropy estimation based on cross-validation.} + +\item{variable_name}{1L character vector indicating which varible to interpolate (e.g. 'TEMPERATURE')} + +\item{latitude_name}{1L character vector indicating the name of the latitude column (e.g., 'LATITUDE')} + +\item{longitude_name}{1L character vector indicating the name of the longitude column (e.g., 'LONGITUDE')} + +\item{input_crs}{Character vector indicating the coordinate reference system for x (default = "WGS84")} + +\item{interpolation_crs}{Coordinate reference system to use for interpolation.} + +\item{only_return_anisotropy}{For internal use. Default = FALSE} +} +\description{ +Uses the coldpool::kriging_loocv() interface. +} diff --git a/man/estimate_z_expansion.Rd b/man/estimate_z_expansion.Rd new file mode 100644 index 0000000..75c82e5 --- /dev/null +++ b/man/estimate_z_expansion.Rd @@ -0,0 +1,47 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/estimate_z_expansion.R +\name{estimate_z_expansion} +\alias{estimate_z_expansion} +\title{Estimate vertical expansion factor for 3D kriging} +\usage{ +estimate_z_expansion( + x, + location_formula, + kriging_formula, + model, + z_start, + cv_index, + anisotropy_parameters = c(0, 0, 0, 1, 1), + vgm_width, + nm = Inf, + maxdist = Inf, + use_for_mse = rep(TRUE, length(cv_index)), + z_limits = c(10, 5e+05) +) +} +\arguments{ +\item{x}{data.frame containing} + +\item{location_formula}{Formula to use for location argument to gstat. See gstat documentation for description of the formula interface (see ?gstat::gstat).} + +\item{kriging_formula}{Formula to use for kriging. See gstat documentation for description of the formula interface (see ?gstat::gstat).} + +\item{z_start}{Starting value for vertical expansion estimation} + +\item{cv_index}{Index of folds for cross validation} + +\item{vgm_width}{Optional.} + +\item{nm}{Maximum number of nearest neighbor observations to use for interpolation.} + +\item{use_for_mse}{Logical vector indicating whether to use an observation to calculate MSE for cross-validation.} + +\item{z_limits}{Upper and lower bounds for z_expansion, for optimization.} + +\item{anisotopy_parameters}{Anisotropy parameters to use for ordinary kriging as a 5L vector. See: ?gstat::vgm. If NULL and estimate_anisotropy, anisotropy is estimated.} + +\item{variogram_model}{Character vector indicating which variogram model to use for interpolation. Valid options are exponential (exp), circular (cir), gaussian (gau), Bessel (bes), Matern (mat), or Stein's Matern (ste).} +} +\description{ +Uses L-BFGS algorithm implemented in optim(). +} diff --git a/man/kriging_cv.Rd b/man/kriging_cv.Rd new file mode 100644 index 0000000..42d69d9 --- /dev/null +++ b/man/kriging_cv.Rd @@ -0,0 +1,48 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/kriging_cv.R +\name{kriging_cv} +\alias{kriging_cv} +\title{Compare interpolation methods using cross-validation} +\usage{ +kriging_cv( + x, + fold = NULL, + kriging_formula, + location_formula, + anisotropy_parameters = NULL, + nm = Inf, + maxdist = Inf, + interpolation_methods = c("nn", "idw", "exp", "cir", "gau", "sph", "mat", "bes", "ste"), + vgm_width = NULL, + estimate_anisotropy = FALSE, + only_return_anisotropy = FALSE, + anisotropy_kfold = 10, + seed = 19673 +) +} +\arguments{ +\item{x}{A data.frame containing variables to be interpolated and latitude and longitude coordinates.} + +\item{fold}{Vector of numeric values indicating which observations are assigned to which fold. Default NULL performs leave-one-out cross validation. (see 'nfold' argument in ?gstat::gstat).} + +\item{kriging_formula}{Formula to use for kriging. See gstat documentation for description of the formula interface (see ?gstat::gstat).} + +\item{location_formula}{Formula to use for location argument to gstat. See gstat documentation for description of the formula interface (see ?gstat::gstat).} + +\item{nm}{Maximum number of nearest neighbor observations to use for interpolation.} + +\item{vgm_width}{Optional. Width of variogram breaks.} + +\item{estimate_anisotropy}{Logical indicating whether anisotropy should be estimated for kriging methods based on k-fold cross validation. Default = FALSE} + +\item{anisotropy_kfold}{1L numeric vector indicating how many folds to use for cross-validation if estimate_anisotropy = TRUE.} + +\item{seed}{RNG seed (set.seed(seed)) to use for anisotropy estimation based on cross-validation.} + +\item{anisotopy_parameters}{Optional. Anisotropy parameters to use for ordinary kriging as a 2L vector for 2D kriging or 5L vector for 3D kriging. See: ?gstat::vgm. If NULL and estimate_anisotropy, anisotropy is estimated.} + +\item{interplation_methods}{Interpolation methods to use. Valid options are nearest-neighbor, inverse distance weighting, inverse distance weighting using the closest nm stations (idw_nmax), and kriging with and exponential (exp), circular (cir), gaussian (gau), Bessel (bes), Matern (mat), or Stein's Matern (ste) variogram model.} +} +\description{ +Conducts cross validation of a single predictor using kriging, nearest-neighbor, and inverse-distance weighting interpolation method using gstat. +}