From 640defbabf40f6a2d5b2a023e5e654491fae312d Mon Sep 17 00:00:00 2001 From: Philina English Date: Tue, 15 Oct 2024 16:09:05 -0700 Subject: [PATCH] Add the get_all_* functions (#23) * return species code with length types to allow for use with multi spp calls * new get_all* functions and all supporting code * document new functions * untested draft light-weight comparison functions and example script * updated with load_all() and subfolders * updated with missing package:: * get all vignette * fix bug in comparison functions * vignette edits * fixed precautionary object resets * make remove duplicates the default for get_all_survey_sets * move the dropping of all zeros and make it optional * update default args * vignette formatting * draft version flexible to argument changes * working for any argument and using function defaults when not provided * add grouping_area_km2 to event data returned for both functions * functionize area swept calc * minor clean up/style * vignette edits from LR and add design-based section * fix error in compare samples function; make both compare functions robust to Null results * more vignette edits * better way of checking for gear type differences * more vignette edits * minor vignette and comparison edits * more minor edits * more vignette edits * fix problem with missing sublevel data for ssid 14--now only gets sublevel data for events with gear differences * update vignette to match change in when skate level data is returned * add warning in case situations arise where skate-level counts are inconsistent with counts for events with gear differences * minor fixes to last change * for samples, add checks to make sure counts are coming in correctly when at the set level * more vignette edits * fix for edge case of missing sub level id for some gear comparison specimens * tiny edit * Improve messages to include the filtering options used. 
* document and fixes for passing devtools::check * better message placement; filter ssid 0 when a duplication of data from other ssid * minor vignette edits * more minor edits * fixing vignette check issues * precompile option for get-all vignette * update ignore and precompiled version * test commit * implement precompile option for orginal vignette * document * buildignore knitr-cache --------- Co-authored-by: Rogers --- .Rbuildignore | 2 + DESCRIPTION | 2 +- NAMESPACE | 6 + R/correct-ssids.R | 35 + R/get-all-survey-samples.R | 687 +++++++++++++++++++ R/get-all-survey-sets.R | 676 +++++++++++++++++++ R/gfdata-package.R | 69 +- R/parent-level-counts.R | 69 ++ R/skate-level-counts.R | 107 +++ R/utils.R | 38 +- inst/compare/compare-survey-samples.R | 235 +++++++ inst/compare/compare-survey-sets.R | 278 ++++++++ inst/compare/example-comparisons.R | 60 ++ inst/sql/get-activity-code.sql | 6 + inst/sql/get-all-survey-samples.sql | 90 +++ inst/sql/get-all-survey-sets.sql | 37 ++ inst/sql/get-event-data.sql | 87 +++ inst/sql/get-ll-hook-data-generalized.sql | 62 ++ inst/sql/get-ll-sub-level-hook-data.sql | 66 ++ inst/sql/get-spp-sample-length-type.sql | 6 +- inst/sql/get-sub-level-catch.sql | 18 + man/assign_areas.Rd | 6 +- man/correct_ssids.Rd | 17 + man/get_all.Rd | 141 ++++ man/get_parent_level_counts.Rd | 18 + man/get_skate_level_counts.Rd | 16 + man/survey_blocks.Rd | 1 + vignettes/.gitignore | 1 - vignettes/gfdata-vignette-get-all.Rmd | 724 +++++++++++++++++++++ vignettes/gfdata-vignette-get-all.Rmd.orig | 345 ++++++++++ vignettes/gfdata-vignette.Rmd | 393 ++++++++++- vignettes/gfdata-vignette.Rmd.orig | 191 ++++++ vignettes/precompile.R | 5 + 33 files changed, 4448 insertions(+), 46 deletions(-) create mode 100644 R/correct-ssids.R create mode 100644 R/get-all-survey-samples.R create mode 100644 R/get-all-survey-sets.R create mode 100644 R/parent-level-counts.R create mode 100644 R/skate-level-counts.R create mode 100644 inst/compare/compare-survey-samples.R create 
mode 100644 inst/compare/compare-survey-sets.R create mode 100644 inst/compare/example-comparisons.R create mode 100644 inst/sql/get-activity-code.sql create mode 100644 inst/sql/get-all-survey-samples.sql create mode 100644 inst/sql/get-all-survey-sets.sql create mode 100644 inst/sql/get-event-data.sql create mode 100644 inst/sql/get-ll-hook-data-generalized.sql create mode 100644 inst/sql/get-ll-sub-level-hook-data.sql create mode 100644 inst/sql/get-sub-level-catch.sql create mode 100644 man/correct_ssids.Rd create mode 100644 man/get_all.Rd create mode 100644 man/get_parent_level_counts.Rd create mode 100644 man/get_skate_level_counts.Rd create mode 100644 vignettes/gfdata-vignette-get-all.Rmd create mode 100644 vignettes/gfdata-vignette-get-all.Rmd.orig create mode 100644 vignettes/gfdata-vignette.Rmd.orig create mode 100644 vignettes/precompile.R diff --git a/.Rbuildignore b/.Rbuildignore index 98f594a..56d7ba0 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -4,3 +4,5 @@ ^NEWS\.md$ ^\.github$ ^data-raw$ +^vignettes/get-all-cache +^vignettes/knitr-cache diff --git a/DESCRIPTION b/DESCRIPTION index ada9b7c..1615a1e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -24,7 +24,7 @@ Imports: tibble, tidyr, rlang -RoxygenNote: 7.3.1 +RoxygenNote: 7.3.2 Suggests: usethis, testthat, diff --git a/NAMESPACE b/NAMESPACE index 17f8ea8..fe4cecd 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,6 +6,8 @@ export(get_active_survey_blocks) export(get_age_methods) export(get_age_precision) export(get_all_stomachs) +export(get_all_survey_samples) +export(get_all_survey_sets) export(get_catch) export(get_catch_spatial) export(get_comm_gear_types) @@ -58,6 +60,7 @@ importFrom(dplyr,as_tibble) importFrom(dplyr,bind_rows) importFrom(dplyr,case_when) importFrom(dplyr,contains) +importFrom(dplyr,distinct) importFrom(dplyr,filter) importFrom(dplyr,full_join) importFrom(dplyr,group_by) @@ -66,6 +69,7 @@ importFrom(dplyr,left_join) importFrom(dplyr,mutate) importFrom(dplyr,n) 
importFrom(dplyr,pull) +importFrom(dplyr,relocate) importFrom(dplyr,rename) importFrom(dplyr,right_join) importFrom(dplyr,select) @@ -73,5 +77,7 @@ importFrom(dplyr,semi_join) importFrom(dplyr,summarise) importFrom(dplyr,tibble) importFrom(dplyr,ungroup) +importFrom(dplyr,where) importFrom(magrittr,"%>%") importFrom(rlang,.data) +importFrom(stats,na.omit) diff --git a/R/correct-ssids.R b/R/correct-ssids.R new file mode 100644 index 0000000..f538419 --- /dev/null +++ b/R/correct-ssids.R @@ -0,0 +1,35 @@ +#' Custom fixes for problem surveys with shared trip ids resulting in assignment to wrong ssid +#' +#' @param dat df containing these columns: fishing_event_ids, survey_series_id, survey_id, +#' major_stat_area_code, minor_stat_area_code +#' @param specimens Defaults to FALSE where checks for duplication of fishing_event_ids +#' +correct_ssids <- function(dat, specimens = FALSE) { + try(dat[dat$survey_series_id %in% c(6, 7), ]$survey_id <- NA, silent = TRUE) + try(dat[((dat$survey_series_id == 6 & dat$major_stat_area_code %in% c("03", "04"))), ]$survey_series_id <- 7, silent = TRUE) + try(dat[((dat$survey_series_id == 7 & dat$major_stat_area_code %in% c("05", "06"))), ]$survey_series_id <- 6, silent = TRUE) + try(dat[((dat$survey_series_og == 6 & dat$major_stat_area_code %in% c("03", "04"))), ]$survey_series_og <- 7, silent = TRUE) + try(dat[((dat$survey_series_og == 7 & dat$major_stat_area_code %in% c("05", "06"))), ]$survey_series_og <- 6, silent = TRUE) + + # SABLE doesn't work with SSIDs, use reason_desc and or grouping codes instead? 
+ try(dat[dat$survey_series_id %in% c(35, 41, 42, 43), ]$survey_id <- NA, silent = TRUE) # this throws a warning when others don't + try(dat[dat$survey_series_id %in% c(35, 41, 42, 43), ]$survey_series_id <- 35, silent = TRUE) + try(dat[dat$survey_series_og %in% c(35, 41, 42, 43), ]$survey_series_og <- 35, silent = TRUE) + # try(dat[ ((dat$survey_series_id %in% c(35, 41, 42, 43) & dat$reason_desc == "EXPLORATORY")), ]$survey_series_id <- 35, silent = TRUE) + # try(dat[ ((dat$survey_series_id %in% c(35, 41, 42, 43) & dat$reason_desc == "SABLEFISH STANDARDIZED OFFSHORE SURVEY")), ]$survey_series_id <- 42, silent = TRUE) + + # Jig surveys are split into too many separate survey series, so we'll assume all were assigned correctly and drop everything that doesn't match + dat <- dat[(!(dat$survey_series_id == 82 & !(dat$minor_stat_area_code %in% c("12")))), ] + dat <- dat[(!(dat$survey_series_id == 83 & !(dat$minor_stat_area_code %in% c("13")))), ] + dat <- dat[(!(dat$survey_series_id == 84 & !(dat$minor_stat_area_code %in% c("15")))), ] + dat <- dat[(!(dat$survey_series_id == 85 & !(dat$minor_stat_area_code %in% c("16")))), ] + dat <- dat[(!(dat$survey_series_id == 86 & !(dat$minor_stat_area_code %in% c("18")))), ] + dat <- dat[(!(dat$survey_series_id == 87 & !(dat$minor_stat_area_code %in% c("19")))), ] + + # for IPHC station specific ssids, drop everything that doesn't match + dat <- dat[(!(dat$survey_series_id == 17 & !(dat$minor_stat_area_code %in% c("3")))), ] # 3CD + dat <- dat[(!(dat$survey_series_id == 18 & !(dat$minor_stat_area_code %in% c("6")))), ] # 5AB + dat <- dat[(!(dat$survey_series_id == 19 & !(dat$minor_stat_area_code %in% c("7")))), ] # 5CD + + dat |> dplyr::distinct() +} diff --git a/R/get-all-survey-samples.R b/R/get-all-survey-samples.R new file mode 100644 index 0000000..6347296 --- /dev/null +++ b/R/get-all-survey-samples.R @@ -0,0 +1,687 @@ +#' +#' @param species One or more species common names (e.g. 
"pacific ocean +#' perch") or one or more species codes (e.g. `396`). Species codes can be +#' specified as numeric vectors `c(396, 442`) or characters `c("396", "442")`. +#' Numeric values shorter than 3 digits will be expanded to 3 digits and +#' converted to character objects (`1` turns into `"001"`). Species common +#' names and species codes should not be mixed. If any element is missing a +#' species code, then all elements will be assumed to be species common +#' names. Does not work with non-numeric species codes, so in those cases the +#' common name will be needed. +#' @param ssid A numeric vector of survey series IDs. Run [get_ssids()] for a +#' look-up table of available survey series IDs with surveys series +#' descriptions. Default is to return all data from all surveys. Some of the +#' most useful ids include: contemporary trawl (1, 3, 4, 16), historic trawl +#' (2), IPHC (14), sablefish (35), and HBLL (22, 36, 39, 40). +#' @param major Character string (or vector) of major stat area code(s) to +#' include (characters). Use get_major_areas() to lookup area codes with +#' descriptions. Default is NULL. +#' @param usability A vector of usability codes to include. Defaults to NULL, +#' but typical set for trawl is`c(0, 1, 2, 6)`. IPHC codes may be different to +#' other surveys and the modern Sablefish survey doesn't seem to assign +#' usabilities. +#' @param unsorted_only Defaults to FALSE, which will return all specimens +#' collected on research trips. TRUE returns only unsorted (`1`) and `NA` +#' specimens for both `species_category_code` and `sample_source_code`. +#' @param random_only Defaults to FALSE, which will return all specimens +#' collected on research trips. TRUE returns only randomly sampled +#' specimens (`sample_type_code` = `1, 2, 6, 7, or 8`). +#' @param grouping_only Defaults to FALSE, which will return all specimens or sets +#' collected on research trips. 
TRUE returns only sets or specimens from fishing events +#' with grouping codes that match that expected for a survey. Can also be +#' achieved by filtering for specimens where `!is.na(grouping_code)`. +#' @param include_event_info Logical for whether to append all relevant fishing +#' event info (location, timing, effort, catch, etc.). Defaults to TRUE. +#' @param include_activity_matches TRUE gets all records collected with activity +#' codes that match chosen ssids. Default is FALSE. +#' @param remove_bad_data Remove known bad data, such as unrealistic +#' length or weight values and duplications due to trips that include multiple +#' surveys. Default is TRUE. +#' @param remove_duplicates Remove duplicated specimen records due to overlapping +#' survey stratifications when original_ind = 'N', or from known issues with +#' MSSM trips including both survey areas. +#' @param return_dna_info Should DNA container ids and sample type be returned? +#' This can create duplication of specimen ids for some species. Defaults to +#' FALSE. +#' @param quiet_option Default option, `"message"`, suppresses messages from +#' sections of code with lots of `join_by` messages. Any other string will allow +#' messages. +#' @param drop_na_columns Logical for removing all columns that only contain NAs. +#' Defaults to TRUE. 
+#' +#' @export +#' +#' @rdname get_all +get_all_survey_samples <- function(species, ssid = NULL, + major = NULL, + usability = NULL, + unsorted_only = FALSE, + random_only = FALSE, + grouping_only = FALSE, + include_event_info = FALSE, + include_activity_matches = FALSE, + remove_bad_data = TRUE, + remove_duplicates = TRUE, + return_dna_info = FALSE, + drop_na_columns = TRUE, + quiet_option = "message") { + .q <- read_sql("get-all-survey-samples.sql") + .q <- inject_filter("AND SP.SPECIES_CODE IN", species, sql_code = .q) + + if (!is.null(ssid)) { + if (any(ssid %in% c(35, 41, 42, 43))) { + ssid <- unique(c(ssid, 35, 41, 42, 43)) + } + + if (any(ssid %in% c(6, 7, 67))) { + ssid_original <- ssid + ssid <- unique(c(ssid, 6, 7, 67)) + } + + if (include_activity_matches) { + ## draft approach that gets all samples collected using the same activities as the ssid(s) of interest + .a <- read_sql("get-activity-code.sql") + .a <- run_sql("GFBioSQL", .a) + + .a <- filter(.a, SURVEY_SERIES_ID %in% ssid) |> distinct() + + activities <- unique(.a$ACTIVITY_CODE) + .q <- inject_filter("AND TA.ACTIVITY_CODE IN", activities, + sql_code = .q, + search_flag = "-- insert ssid here", conversion_func = I + ) + } else { + .q <- inject_filter("AND S.SURVEY_SERIES_ID IN", ssid, + sql_code = .q, + search_flag = "-- insert ssid here", conversion_func = I + ) + } + } + + if (!is.null(major)) { + .q <- inject_filter("AND SM.MAJOR_STAT_AREA_CODE IN", major, .q, + search_flag = "-- insert major here", conversion_func = I + ) + } + + search_flag <- "-- insert lengths here" + i <- grep(search_flag, .q) + + .q[i] <- paste0("CAST(ROUND(Fork_Length/ 10, 1) AS DECIMAL(8,1)) AS Fork_Length, + CAST(ROUND(Standard_Length/ 10, 1) AS DECIMAL(8,1)) AS Standard_Length, + CAST(ROUND(Total_Length/ 10, 1) AS DECIMAL(8,1)) AS Total_Length, + CAST(ROUND(Second_Dorsal_Length/ 10, 1) AS DECIMAL(8,1)) AS Second_Dorsal_Length, + ") + + .d <- run_sql("GFBioSQL", .q) + + names(.d) <- tolower(names(.d)) + + if 
(nrow(.d) < 1) { + if (is.null(ssid) & is.null(major)) { + stop(paste0("No survey specimens for ", toString(species), ".")) + } else { + if (!is.null(ssid) & is.null(major)) { + stop(paste0("No survey specimens for ", toString(species), " from ssid(s) ", toString(ssid), ".")) + } + if (is.null(ssid) & !is.null(major)) { + stop(paste0("No survey specimens for ", toString(species), " from major area(s) ", toString(major), ".")) + } + if (!is.null(ssid) & !is.null(major)) { + stop(paste0("No survey specimens for ", toString(species), " from ssid(s) ", toString(ssid), " in major area(s) ", toString(major), ".")) + } + } + } + + ## dna_container_id and dna_sample_type can cause duplication for some species with multiple samples collected per individual + ## Could do something about record duplication with multiple DNA samples like combining or not returning them? + if (!return_dna_info) { + .d <- .d |> + select(-dna_container_id, -dna_sample_type) |> + distinct() + } + + ## populate length and length_type variables with most common type for each species ---- + + .d$length <- NA + .d$length_type <- NA + + species_code <- list() + + for (i in seq_along(species)) { + length_type <- get_spp_sample_length_type(species[i]) + length_type <- tolower(length_type) + + species_code[i] <- common2codes(tolower(species[i])) + + .d[.d$species_code == tolower(species_code[i]), ]$length <- .d[.d$species_code == species_code[i], length_type] + .d[.d$species_code == tolower(species_code[i]), ]$length_type <- length_type + } + + ## ---- + + # if using include_activity_matches = TRU`then remove_duplicates = TRUE + if (include_activity_matches & !is.null(ssid)) { + remove_duplicates <- TRUE + } + + suppressMessages( + if (remove_bad_data) { + .d <- correct_ssids(.d, specimens = TRUE) + }, + classes = quiet_option + ) + + + if(!is.null(usability)|unsorted_only|random_only|grouping_only) { + print( + paste0("Looking for samples that are", + ifelse(!is.null(usability), paste0( " usable (", 
toString(usability), ")"), ""), + ifelse(unsorted_only, " unsorted", ""), + ifelse(random_only, " random", ""), + ifelse(grouping_only, " with originally specified grouping codes.", ".") + ) + ) + } + + + if (!is.null(ssid) & !include_activity_matches) { + .d <- .d |> + group_by(specimen_id, survey_series_id) |> + mutate( + grouping_desc_updated = ifelse(is.logical(na.omit(grouping_desc_updated)), NA, na.omit(grouping_desc_updated)), + grouping_code_updated = mean(grouping_code_updated, na.rm = TRUE), + grouping_code_updated = ifelse(is.nan(grouping_code_updated), NA, grouping_code_updated) + ) |> + dplyr::distinct() |> + ungroup() + + + if (any(ssid %in% c(6, 7, 67)) & !include_activity_matches) { + ssid <- ssid_original + } + + .d <- filter(.d, survey_series_id %in% ssid) + + if (is.null(major)) { + print( + paste0("Returning all ", toString(species), " specimens from survey series ", toString(ssid), ".") + ) + } + if (!is.null(major)) { + print( + paste0("Returning all ", toString(species), " specimens from within major area(s) ", toString(major), " and belonging to survey series ", toString(ssid), ".") + ) + } + } else { + .d <- .d |> + group_by(fishing_event_id) |> + mutate( + # make sure updated codes are from the original survey design and purge others + grouping_desc_updated = ifelse(grouping_code_updated == grouping_code_original, grouping_desc_updated, NA), + grouping_desc_updated = ifelse(is.logical(na.omit(grouping_desc_updated)), NA, na.omit(grouping_desc_updated)), + grouping_code_updated = ifelse(grouping_code_updated == grouping_code_original, grouping_code_updated, NA), + grouping_code_updated = mean(grouping_code_updated, na.rm = TRUE), + grouping_code_updated = ifelse(is.nan(grouping_code_updated), NA, grouping_code_updated) + ) |> + dplyr::distinct() |> + ungroup() + + + if (is.null(major)) { + print( + paste0("Returning all ", toString(species), " specimens from all survey series.") + ) + } + if (!is.null(major)) { + print( + paste0("Returning 
all ", toString(species), " specimens from major area(s) ", toString(major), " from any survey series.") + ) + } + } + + if (unsorted_only) { + # # .d <- filter(.d, sampling_desc == "UNSORTED") + # # replaces SQL code + # # (SPECIES_CATEGORY_CODE IS NULL OR SPECIES_CATEGORY_CODE IN (1, 3, 5, 6, 7)) AND + # # (SAMPLE_SOURCE_CODE IS NULL OR SAMPLE_SOURCE_CODE IN(1, 2)) AND + # .d <- filter(.d, is.na(species_category_code) | species_category_code %in% c(1, 6)) + # # # only 1 = unsorted makes sense! 3 = keepers, 5 = remains, = 6 head only, 7 doesn't exist? + # .d <- filter(.d, is.na(sample_source_code) | sample_source_code %in% c(1)) + # # # only 1 = unsorted makes sense! 2 = keepers, 3 = discards + + # TODO: try instead + .d <- filter( + .d, species_category_code %in% c(1) | # unsorted + sample_source_code %in% c(1) | # unsorted + sample_type_code %in% c(1) # total catch + ) + + if (nrow(.d) < 1) { + if (is.null(ssid) & is.null(major)) { + stop(paste0("No unsorted survey samples for ", toString(species), ".")) + } else { + if (!is.null(ssid) & is.null(major)) { + stop(paste0("No unsorted survey samples for ", toString(species), " from ssid(s) ", toString(ssid), ".")) + } + if (is.null(ssid) & !is.null(major)) { + stop(paste0("No unsorted survey samples for ", toString(species), " from major area(s) ", toString(major), ".")) + } + if (!is.null(ssid) & !is.null(major)) { + stop(paste0("No unsorted survey samples for ", toString(species), " from ssid(s) ", toString(ssid), " in major area(s) ", toString(major), ".")) + } + } + } + } else { + .ss <- get_table("Sample_Source") |> select(-ROW_VERSION) + names(.ss) <- tolower(names(.ss)) + .d <- left_join(.d, .ss, by = "sample_source_code") + } + + if (!is.null(usability)) { + .d <- filter(.d, usability_code %in% usability) + + if (nrow(.d) < 1) { + if (is.null(ssid) & is.null(major)) { + stop(paste0("No 'usable' survey samples for ", toString(species), ".")) + } else { + if (!is.null(ssid) & is.null(major)) { + 
stop(paste0("No 'usable' survey samples for ", toString(species), " from ssid(s) ", toString(ssid), ".")) + } + if (is.null(ssid) & !is.null(major)) { + stop(paste0("No 'usable' survey samples for ", toString(species), " from major area(s) ", toString(major), ".")) + } + if (!is.null(ssid) & !is.null(major)) { + stop(paste0("No 'usable' survey samples for ", toString(species), " from ssid(s) ", toString(ssid), " in major area(s) ", toString(major), ".")) + } + } + } + } + + .u <- get_table("Usability") |> select(-ROW_VERSION) + names(.u) <- tolower(names(.u)) + .d <- left_join(.d, .u, by = "usability_code") + + if (random_only) { + # replaces SQL code + # SM.SAMPLE_TYPE_CODE IN (1, 2, 6, 7, 8) AND + .d <- filter(.d, sample_type_code %in% c(1, 2, 6, 7, 8)) # 8 = random from set requested by vessel master + + if (nrow(.d) < 1) { + if (is.null(ssid) & is.null(major)) { + stop(paste0("No random survey samples for ", toString(species), ".")) + } else { + if (!is.null(ssid) & is.null(major)) { + stop(paste0("No random survey samples for ", toString(species), " from ssid(s) ", toString(ssid), ".")) + } + if (is.null(ssid) & !is.null(major)) { + stop(paste0("No random survey samples for ", toString(species), " from major area(s) ", toString(major), ".")) + } + if (!is.null(ssid) & !is.null(major)) { + stop(paste0("No random survey samples for ", toString(species), " from ssid(s) ", toString(ssid), " in major area(s) ", toString(major), ".")) + } + } + } + } + + # remove ages from unaccepted ageing methods: + file <- system.file("extdata", "ageing_methods.csv", package = "gfdata") + + ageing_methods <- readr::read_csv(file, + col_types = readr::cols( + species_code = readr::col_character() + ) + ) + + .d <- left_join(.d, + select(ageing_methods, species_code, species_ageing_group), + by = "species_code" + ) + + .d <- .d %>% + mutate( + age = case_when( + species_ageing_group == "rockfish_flatfish_hake" & ageing_method_code %in% c(1, 3, 16, 17) ~ .d$age, + 
species_ageing_group == "sharks_skates" & ageing_method_code %in% c(12) ~ .d$age, + species_ageing_group == "dogfish" & ageing_method_code %in% c(11) ~ .d$age, + species_ageing_group == "pcod_lingcod" & ageing_method_code %in% c(6) ~ .d$age, + species_ageing_group == "pollock" & ageing_method_code %in% c(7) ~ .d$age, + species_ageing_group == "shortraker_thornyheads" & ageing_method_code %in% c(1, 3, 4, 16, 17) ~ .d$age, + is.na(species_ageing_group) ~ NA_real_ + ) + ) + + # removes known data problems + # if FALSE, specimens could be reported and corrected in the database + if (remove_bad_data) { + .d <- .d[!(.d$length > 600 & + .d$species_common_name == "north pacific spiny dogfish"), ] + .d <- .d[!(.d$length > 600 & .d$species_common_name == "big skate"), ] + .d <- .d[!(.d$length > 600 & .d$species_common_name == "longnose skate"), ] + .d <- .d[!(.d$length > 60 & .d$species_common_name == "pacific tomcod"), ] + .d <- .d[!(.d$length > 50 & + .d$species_common_name == "quillback-rockfish"), ] + .d <- .d[!(.d$length < 10 & .d$weight / 1000 > 1.0 & + .d$species_common_name == "pacific flatnose"), ] + } + + # dogfish were producing a whole bunch of NAs for some reason + .d <- .d %>% filter(!is.na(specimen_id)) + + if (include_event_info) { + print("Specimens found. 
Fetching additional event info.") + } + + suppressMessages( + if (include_event_info) { + options(scipen = 999) + # needed for big skate because of a MSA set with an id that was getting converted + + .f <- .d %>% filter(!is.na(fishing_event_id)) + fe_vector <- unique(na.omit(.f$fishing_event_id)) + + .q2 <- read_sql("get-all-survey-sets.sql") + .q2 <- inject_filter("AND SP.SPECIES_CODE IN", species, sql_code = .q2) + .q2 <- inject_filter("AND FE.FISHING_EVENT_ID IN", fe_vector, + sql_code = .q2, + search_flag = "-- insert fe_vector here", conversion_func = I + ) + + if (!is.null(major)) { + .q2 <- inject_filter("AND FE.MAJOR_STAT_AREA_CODE IN", major, .q2, + search_flag = "-- insert major here", conversion_func = I + ) + } + + .c <- run_sql("GFBioSQL", .q2) + + names(.c) <- tolower(names(.c)) + .d <- left_join( + .d, + unique(select( + .c, + fishing_event_id, + species_code, + catch_weight, + catch_count + )) + ) + + # get all fishing event info + .fe <- read_sql("get-event-data.sql") + + ssid_with_samples <- unique(na.omit(.d$survey_series_id)) + + .fe <- inject_filter("AND S.SURVEY_SERIES_ID IN", ssid_with_samples, + sql_code = .fe, + search_flag = "-- insert ssid here", conversion_func = I + ) + + ## this didn't work if we want all the sub level and minor level ids + # .fe <- inject_filter("AND FE.FE_PARENT_EVENT_ID IS NULL OR IN", fe_vector, + # sql_code = .fe, + # search_flag = "-- insert fe_vector here", conversion_func = I + # ) + + # so reduce data size using trip ids instead? 
+ trip_vector <- unique(na.omit(.d$trip_id)) + + .fe <- inject_filter("AND FE.TRIP_ID IN", trip_vector, + sql_code = .fe, + search_flag = "-- insert fe_vector here", conversion_func = I + ) + + + if (!is.null(major)) { + .fe <- inject_filter("AND FE.MAJOR_STAT_AREA_CODE IN", major, .fe, + search_flag = "-- insert major here", conversion_func = I + ) + } + + fe <- run_sql("GFBioSQL", .fe) + + if (any(!is.na(fe$FE_SUB_LEVEL_ID))) { + if (any(na.omit(fe$FE_SUB_LEVEL_ID) > 1)) { + # get both parent and skate level counts + fe1 <- get_parent_level_counts(fe) + fe2 <- get_skate_level_counts(fe) + } else { + fe1 <- get_parent_level_counts(fe) + } + } else { + fe1 <- get_parent_level_counts(fe) + } + + fe1 <- fe1 |> + select( + -SURVEY_SERIES_ID, + -SURVEY_SERIES_OG, + -SURVEY_ID, + -MINOR_STAT_AREA_CODE, # some sub level events had NAs here + -REASON_DESC, -USABILITY_CODE, + -GROUPING_CODE_ORIGINAL, -GROUPING_DESC_ORIGINAL, + -GROUPING_CODE_UPDATED, -GROUPING_DESC_UPDATED, -ORIGINAL_IND + ) |> + distinct() # avoid clashing with values for samples + + names(fe1) <- tolower(names(fe1)) + + if (any(na.omit(fe$FE_SUB_LEVEL_ID) > 1)) { + fe2 <- fe2 |> + select( + -SURVEY_SERIES_ID, + -SURVEY_SERIES_OG, + -SURVEY_ID, + -MINOR_STAT_AREA_CODE, + -REASON_DESC, -USABILITY_CODE, + -GROUPING_CODE_ORIGINAL, -GROUPING_DESC_ORIGINAL, + -GROUPING_CODE_UPDATED, -GROUPING_DESC_UPDATED, -ORIGINAL_IND + ) |> + distinct() # avoid clashing with values for samples + + count_gear_types <- fe2 |> group_by(fishing_event_id) |> + summarise(max = max( + ### can add any more gear variables needed here + sum(!is.na(unique(HOOK_CODE))), + sum(!is.na(unique(HOOKSIZE_DESC))) + )) + + names(fe2) <- tolower(names(fe2)) + + # do any of those sub levels differ in gear type? 
+ ## TODO: other possible variables and tests for difference could be added + if (max(count_gear_types$max, na.rm = TRUE) < 2) { + # NO, then use only the parent level values + .d <- left_join(.d, fe1) + } else { + # YES, then get skate level catch for each species + slc_list <- list() + spp_codes <- unique(.d$species_code) + for (i in seq_along(spp_codes)) { + .slc <- read_sql("get-sub-level-catch.sql") + .slc <- inject_filter("", spp_codes[i], sql_code = .slc) + .slc <- inject_filter("AND C.SPECIES_CODE IN", spp_codes[i], + sql_code = .slc, + search_flag = "-- insert species again here" + ) + slc_list[[i]] <- run_sql("GFBioSQL", .slc) + } + slc <- do.call(rbind, slc_list) |> distinct() + names(slc) <- tolower(names(slc)) + + .d2 <- .d |> + left_join(count_gear_types) |> + # need to include both tests because rarely fe_sub_level_ids can be missing + filter(max > 1 & !is.na(fe_sub_level_id)) |> + left_join(fe2) |> + # select(-survey_series_id) |> + select(-max) |> + distinct() + + # but only replace event level when multiple gear types are present + .d2 <- .d2 %>% + # select(-catch_count) |> + rename(event_level_count = catch_count) |> # used as a temporary check + left_join(slc, by = c( + "trip_id" = "trip_id", + "fishing_event_id" = "fe_parent_event_id", + "fe_major_level_id" = "fe_major_level_id", + "fe_sub_level_id" = "fe_sub_level_id", + "species_code" = "species_code" + )) + + check_counts <- select(.d2, fishing_event_id, skate_id, event_level_count, catch_count) |> + distinct() |> + group_by(fishing_event_id) |> + mutate(missing_skates = event_level_count - catch_count) |> + ungroup() + + # check if any skate-level catch_counts exceed event_level_counts or are missing + if(min(check_counts$missing_skates)<0|any(is.na(.d2$catch_count))) { + warning("Some skate-level counts are inconsistent with counts for events with gear differences.") + } else { + # .d2 <- .d2 |> select(-event_level_count) + } + + .d1 <- .d |> + left_join(count_gear_types) |> + # need to 
include both tests because rarely fe_sub_level_ids can be missing + filter(max < 2 | is.na(fe_sub_level_id)) |> + left_join(fe1) |> + # select(-survey_series_id) |> + select(-max) |> + distinct() + + # check for missing data + if(nrow(.d1) + nrow(.d2) == nrow(.d)) { + .d <- bind_rows(.d1, .d2) + } else { + warning("Event data appears to be missing for some specimens. Check output carefully.") + .d3 <- bind_rows(.d1, .d2) + .d <- left_join(.d, .d3) + } + + } + } else { + .d <- left_join(.d, fe1) + } + + .d <- .d |> + group_by(specimen_id) |> + mutate( + doorspread_m = ifelse(is.logical(na.omit(doorspread_m)), NA, na.omit(doorspread_m)), + speed_mpm = ifelse(is.logical(na.omit(speed_mpm)), NA, na.omit(speed_mpm)) + ) |> + dplyr::distinct() |> + ungroup() + + + # in trawl data, catch_count is only recorded for small catches + # so 0 in the catch_count column when catch_weight > 0 seems misleading + # note: there are also a few occasions for trawl where count > 0 and catch_weight is 0/NA + # these lines replace false 0s with NA, but additional checks might be needed + + .d$catch_count <- ifelse(.d$catch_weight > 0 & .d$catch_count == 0, NA, .d$catch_count) + .d$catch_weight <- ifelse(.d$catch_count > 0 & .d$catch_weight == 0, NA, .d$catch_weight) + + .d <- trawl_area_swept(.d) + + .d <- hook_area_swept(.d) + + }, + classes = quiet_option + ) + + + .d <- .d |> + relocate(species_common_name, survey_series_id, sex, length, weight, age) |> + arrange(species_common_name, survey_series_id, -fishing_event_id) + + if (grouping_only) { + .d <- filter(.d, !is.na(grouping_code_original)) + } + + if (nrow(.d) < 1) { + if (is.null(ssid) & is.null(major)) { + stop(paste0("No survey samples with expected grouping codes for ", toString(species), ".")) + } else { + if (!is.null(ssid) & is.null(major)) { + stop(paste0("No survey samples with expected grouping codes for ", toString(species), " from ssid(s) ", toString(ssid), ".")) + } + if (is.null(ssid) & !is.null(major)) { + 
stop(paste0("No survey samples with expected grouping codes for ", toString(species), " from major area(s) ", toString(major), ".")) + } + if (!is.null(ssid) & !is.null(major)) { + stop(paste0("No survey samples with expected grouping codes for ", toString(species), " from ssid(s) ", toString(ssid), " in major area(s) ", toString(major), ".")) + } + } + } + + if (drop_na_columns) { + .d <- .d %>% select(where(~ !all(is.na(.x)))) + } + + .d <- unique(.d) + + # check if there are duplicate specimen ids + if (length(.d$specimen_id) > length(unique(.d$specimen_id))) { + if (remove_duplicates) { + # if so, separate original_ind from not + .dy <- filter(.d, original_ind == "Y") + .dn <- filter(.d, original_ind != "Y" | is.na(original_ind)) + + # and only keep those not original_ind = Y when the specimen id was missing + .d <- bind_rows(.dy, filter(.dn, !(specimen_id %in% c(unique(.dy$specimen_id))))) + .d <- filter(.d, !(survey_series_id == 0 & (specimen_id %in% c(unique(.dy[duplicated(.dy$specimen_id),]$specimen_id))))) + # check if there are still duplicated specimen ids + if (length(.d$specimen_id) > length(unique(.d$specimen_id))) { + warning( + "Duplicate specimen IDs are still present despite `remove_duplicates = TRUE`. ", + "This is potentially because of overlapping survey stratifications, or multiple ", + "DNA samples from the same specimen. If working with the data yourself, ", + "you should filter them after selecting specific survey stratifications. ", + "For example, `dat <- dat[!duplicated(dat$specimen_id), ]`. ", + "Tidying and plotting functions within gfplot will do this for you." + ) + } + } else { + # check if there are duplicated specimen ids (this often true for SABLE and MSSM surveys) + if (length(.d$specimen_id) > length(unique(.d$specimen_id))) { + warning( + "Duplicate specimen IDs are present. 
This is usually because of multiple ", + "DNA samples from the same specimen, overlapping survey stratifications, ", + "or trips that include more than one type of survey. Some cases of the ", + "latter two case can be resolved by setting 'remove_duplicates = TRUE'. ", + "If working with the data yourself, filter them after selecting specific ", + "surveys. For example, `dat <- dat[!duplicated(dat$specimen_id), ]`. ", + "The tidying and plotting functions within gfplot will do this for you." + ) + } + } + } + + + surveys <- get_ssids() + + names(surveys) <- tolower(names(surveys)) + + .d <- inner_join(.d, + unique(select( + surveys, + survey_series_id, + survey_series_desc, + survey_abbrev + )), + by = "survey_series_id" + ) + + # TODO: add a check to see if ssid and ssog are identical and drop ssog if so? + + .d$species_common_name <- tolower(.d$species_common_name) + .d$species_science_name <- tolower(.d$species_science_name) + + # we will use grouping_code_original as the primary grouping_code returned + .d <- dplyr::rename(.d, grouping_code = grouping_code_original, grouping_desc = grouping_desc_original) + + add_version(as_tibble(.d)) +} diff --git a/R/get-all-survey-sets.R b/R/get-all-survey-sets.R new file mode 100644 index 0000000..047957d --- /dev/null +++ b/R/get-all-survey-sets.R @@ -0,0 +1,676 @@ +#' Get all data +#' +#' These functions get all survey set or sample data for a set of species by +#' major area, activity, or specific surveys. The main functions in this package +#' focus on retrieving the more commonly used typs of data and are often limited +#' to sets and samples that conform to current design-based standards and survey +#' grids. These functions will retrieve everything and therefore require careful +#' consideration of what data types are reasonable to include depending on the +#' purpose. For this reason these function return a lot of columns, although the +#' exact number depends on which types of surveys are being returned. 
+#' +#' @param ssid A numeric vector of survey series IDs. Run [get_ssids()] for a +#' look-up table of available survey series IDs with surveys series +#' descriptions. Default is to return all data from all surveys. Some of the +#' most useful ids include: contemporary trawl (1, 3, 4, 16), historic trawl +#' (2), IPHC (14), sablefish (35), and HBLL (22, 36, 39, 40). +#' @param years Default is NULL, which returns all years. +#' @param major Character string (or vector) of major stat area code(s) to +#' include (characters). Use get_major_areas() to lookup area codes with +#' descriptions. Default is NULL. +#' @param join_sample_ids This option was problematic, so now reverts to FALSE. +#' @param remove_false_zeros Default of `TRUE` will make sure weights > 0 don't have +#' associated counts of 0 and vice versa. Mostly useful for trawl data where +#' counts are only taken for small catches. +#' @param remove_bad_data Remove known bad data, such as unrealistic +#' length or weight values and duplications due to trips that include multiple +#' surveys. Default is TRUE. +#' @param remove_duplicates Logical for whether to remove duplicated event +#' records due to overlapping survey stratifications when original_ind = 'N'. +#' Default is FALSE. This option only remains possible when ssids are supplied +#' and activity matches aren't included. Otherwise turns on automatically. +#' @param include_activity_matches Get all surveys with activity codes that +#' match chosen ssids. +#' @param usability A vector of usability codes to include. Defaults to NULL, +#' but typical set for a design-based trawl survey index is `c(0, 1, 2, 6)`. +#' IPHC codes may be different to other surveys and the modern Sablefish survey +#' doesn't seem to assign usabilities. +#' @param grouping_only Defaults to FALSE, which will return all specimens or sets +#' collected on research trips. 
TRUE returns only sets or specimens from fishing +#' events with grouping codes that match that expected for a survey. Can also be +#' achieved by filtering for specimens where `!is.na(grouping_code)`. +#' @param quiet_option Default option, `"message"`, suppresses messages from +#' sections of code with lots of `join_by` messages. Any other string will allow +#' messages. +#' @param drop_na_columns Logical for removing all columns that only contain NAs. +#' Defaults to TRUE. +#' +#' @export +#' @rdname get_all +#' @examples +#' \dontrun{ +#' ## Import survey catch density and location data by tow or set for plotting +#' ## Specify single or multiple species by common name or species code and +#' ## single or multiple survey series id(s). +#' ## Notes: +#' ## `area_km` is the stratum area used in design-based index calculation. +#' ## `area_swept` is in m^2 and is used to calculate density for trawl surveys +#' ## It is based on `area_swept1` (`doorspread_m` x `tow_length_m`) except +#' ## when `tow_length_m` is missing, and then we use `area_swept2` +#' ## (`doorspread` x `duration_min` x `speed_mpm`). +#' ## `duration_min` is derived in the SQL procedure "proc_catmat_2011" and +#' ## differs slightly from the difference between `time_deployed` and +#' ## `time_retrieved`. 
+#' } +#' +get_all_survey_sets <- function(species, + ssid = NULL, + major = NULL, + years = NULL, + join_sample_ids = FALSE, + remove_false_zeros = TRUE, + remove_bad_data = TRUE, + remove_duplicates = TRUE, + include_activity_matches = FALSE, + usability = NULL, + grouping_only = FALSE, + drop_na_columns = TRUE, + quiet_option = "message") { + .q <- read_sql("get-all-survey-sets.sql") + + if (!is.null(species)) { + .q <- inject_filter("AND SP.SPECIES_CODE IN", species, sql_code = .q) + } + + if (!is.null(ssid)) { + ssid_original <- ssid + + if (any(ssid %in% c(35, 41, 42, 43))) { + ssid <- unique(c(ssid, 35, 41, 42, 43)) + } + + if (any(ssid %in% c(6, 7, 67))) { + ssid <- unique(c(ssid, 6, 7, 67)) + } + + if (include_activity_matches) { + ## draft approach that gets all samples collected using the same activities as the ssid(s) of interest + .a <- read_sql("get-activity-code.sql") + .a <- run_sql("GFBioSQL", .a) + + .a <- filter(.a, SURVEY_SERIES_ID %in% ssid) |> distinct() + + activities <- unique(.a$ACTIVITY_CODE) + .q <- inject_filter("AND TA.ACTIVITY_CODE IN", activities, + sql_code = .q, + search_flag = "-- insert ssid here", conversion_func = I + ) + } else { + .q <- inject_filter("AND S.SURVEY_SERIES_ID IN", ssid, + sql_code = .q, + search_flag = "-- insert ssid here", conversion_func = I + ) + } + } else { + remove_duplicates <- TRUE + } + + if (!is.null(major)) { + .q <- inject_filter("AND FE.MAJOR_STAT_AREA_CODE IN", major, .q, + search_flag = "-- insert major here", conversion_func = I + ) + } + + .d <- run_sql("GFBioSQL", .q) + + species_codes <- common2codes(species) + missing_species <- setdiff(species_codes, .d$SPECIES_CODE) + + if (length(missing_species) > 0) { + warning( + "The following species codes are not supported or do not have survey set data in GFBio: ", + paste(missing_species, collapse = ", ") + ) + } + + if (!is.null(years)) { + .d <- filter(.d, YEAR %in% years) + } + + if (join_sample_ids) { + # give us each sample_id associated with 
each fishing_event_id and species: + # sample_trip_ids <- get_sample_trips() + # areas <- get_strata_areas() # this now done in get-event-data.sql + # + # .d <- left_join(.d, sample_trip_ids, + # by = c("SPECIES_CODE", "FISHING_EVENT_ID") + # ) %>% + # left_join(areas, by = c("SURVEY_ID", "GROUPING_CODE")) + + warning( + "The join_sample_ids option has been removed. To bind with ", + "sample data, it is safer to use include_event_info = TRUE ", + "in get_all_survey_samples() instead." + ) + } + + # Just to pull out up to date list of ssids associated with trawl/ll gear type. + Sys.sleep(0.05) # might be useful if server has difficulty + + trawl <- run_sql("GFBioSQL", "SELECT + S.SURVEY_SERIES_ID + FROM SURVEY_SERIES SS + LEFT JOIN SURVEY S ON S.SURVEY_SERIES_ID = SS.SURVEY_SERIES_ID + LEFT JOIN TRIP_SURVEY TS ON TS.SURVEY_ID = S.SURVEY_ID + LEFT JOIN FISHING_EVENT FE ON FE.TRIP_ID = TS.TRIP_ID + WHERE GEAR_CODE IN(1, 6, 8, 11, 14, 16) AND + S.SURVEY_SERIES_ID <> 0 + GROUP BY S.SURVEY_SERIES_ID, [SURVEY_SERIES_DESC] + ,[SURVEY_SERIES_TYPE_CODE] + ,[SURVEY_SERIES_ALT_DESC], + TRAWL_IND, GEAR_CODE + ORDER BY S.SURVEY_SERIES_ID") + trawl <- unique(trawl$SURVEY_SERIES_ID) + + Sys.sleep(0.05) + + ll <- run_sql("GFBioSQL", "SELECT + S.SURVEY_SERIES_ID + FROM SURVEY_SERIES SS + LEFT JOIN SURVEY S ON S.SURVEY_SERIES_ID = SS.SURVEY_SERIES_ID + LEFT JOIN TRIP_SURVEY TS ON TS.SURVEY_ID = S.SURVEY_ID + LEFT JOIN FISHING_EVENT FE ON FE.TRIP_ID = TS.TRIP_ID + WHERE GEAR_CODE IN(4,5,7,10,12) AND + S.SURVEY_SERIES_ID <> 0 + GROUP BY S.SURVEY_SERIES_ID, [SURVEY_SERIES_DESC] + ,[SURVEY_SERIES_TYPE_CODE] + ,[SURVEY_SERIES_ALT_DESC], + TRAWL_IND, GEAR_CODE + ORDER BY S.SURVEY_SERIES_ID") + ll <- unique(ll$SURVEY_SERIES_ID) + + Sys.sleep(0.05) + + # if(!is.null(ssid)){ + # bad_ssid <- c(46, 81) + # if(any(ssid %in% bad_ssid)){ + # warning("SSID(s) ", ssid[ssid %in% bad_ssid], " is/are not currently supported. 
", + # "See the function `get_ssids()` for help identifying ", + # "survey series IDs." + # ) + # } + # } + + if (nrow(.d) < 1) { + if (is.null(ssid) & is.null(major)) { + stop(paste0("No survey set data for ", toString(species), ".")) + } else { + if (!is.null(ssid) & is.null(major)) { + stop(paste0("No survey set data for ", toString(species), " from ssid(s) ", toString(ssid), ".")) + } + if (is.null(ssid) & !is.null(major)) { + stop(paste0("No survey set data for ", toString(species), " from major area(s) ", toString(major), ".")) + } + if (!is.null(ssid) & !is.null(major)) { + stop(paste0("No survey set data for ", toString(species), " from ssid(s) ", toString(ssid), " in major area(s) ", toString(major), ".")) + } + } + } + + names(.d) <- tolower(names(.d)) + + # whenever ssid is included, but not activity matches + # we need to drop duplicated records from trips that include multiple surveys + if (!include_activity_matches & !is.null(ssid)) { + .d <- filter(.d, (survey_series_id %in% c(ssid))) + } + + # if using include_activity_matches = TRUE then remove_duplicates = TRUE + if (include_activity_matches & !is.null(ssid)) { + remove_duplicates <- TRUE + } + + # get all fishing event info + .fe <- read_sql("get-event-data.sql") + + # get only events from surveys that have recorded any of the species selected + .d <- filter(.d, catch_count > 0 | catch_weight > 0) # shouldn't be needed but there were some + ssid_with_catch <- unique(.d$survey_series_id) + + # browser() + + # d1 <- .d #<- select(.d, -survey_series_id) + + .fe <- inject_filter("AND S.SURVEY_SERIES_ID IN", ssid_with_catch, + sql_code = .fe, + search_flag = "-- insert ssid here", conversion_func = I + ) + + if (!is.null(major)) { + .fe <- inject_filter("AND FE.MAJOR_STAT_AREA_CODE IN", major, .fe, + search_flag = "-- insert major here", conversion_func = I + ) + } + + fe <- run_sql("GFBioSQL", .fe) + + + + fe <- fe |> distinct() |> # not sure why, but seems to be some complete duplication + 
filter(FE_MAJOR_LEVEL_ID < 700 | is.na(FE_MAJOR_LEVEL_ID)) # removes CTD drops + + if (!is.null(years)) { + fe <- filter(fe, YEAR %in% years) + } + + # if (is.null(ssid)) { + # fe <- filter(fe, SURVEY_SERIES_ID > 0) + # } + + suppressMessages( + if (all(ssid_with_catch %in% trawl)) { + # uses raw fe dataframe to save time because sub event counts not need for trawl + names(fe) <- tolower(names(fe)) + + .d <- expand.grid( + fishing_event_id = unique(fe$fishing_event_id), + species_code = unique(.d$species_code) + ) |> + left_join(dplyr::distinct(select( + fe, + #-survey_id, + #-survey_series_id, + -fe_parent_event_id, + # -fe_major_level_id, + -fe_minor_level_id, + -fe_sub_level_id, + -hook_code, + -lglsp_hook_count, + -hook_desc, + -hooksize_desc + ))) %>% + left_join(.d) + } else { + # for other survey types, further wrangling is required + # TODO: might be improved by making trap surveys a special case but for now this works ok + # TODO: could split by survey to see when skate level is needed rather than applying to all + # start by checking the skate level counts and gear details + # sk <- get_skate_level_counts(fe) + # names(sk) <- tolower(names(sk)) + + # get catch for sub levels if skate counts > 1 and gear differs between skates + # sks <- sk %>% filter(skate_count > 1) + # fe_vector <- unique(sks$fishing_event_id) + + spp_codes <- unique(.d$species_code) + + fe1 <- get_skate_level_counts(fe) + + count_gear_types <- fe1 |> group_by(fishing_event_id) |> + summarise(max = max( + ### can add any more gear variables needed here + sum(!is.na(unique(HOOK_CODE))), + sum(!is.na(unique(HOOKSIZE_DESC))) + )) + + if (max(count_gear_types$max, na.rm = TRUE) > 1) { + + .h <- read_sql("get-ll-sub-level-hook-data.sql") + + .h <- inject_filter("AND S.SURVEY_SERIES_ID IN", ssid_with_catch, + sql_code = .h, + search_flag = "-- insert ssid here", conversion_func = I + ) + + .hd <- run_sql("GFBioSQL", .h) + .hd <- dplyr::distinct(.hd) # %>% select(-FE.FISHING_EVENT_ID) + 
names(.hd) <- tolower(names(.hd)) + names(fe1) <- tolower(names(fe1)) + + fe2 <- filter(count_gear_types, max > 1) |> + left_join(fe1) |> + left_join(.hd) |> + # select(-survey_series_id) |> + select(-max) |> + distinct() + + .d2 <- expand.grid( + fishing_event_id = unique(fe2$fishing_event_id), + species_code = unique(.d$species_code) + ) |> + left_join(fe2) |> + left_join(.d) + + slc_list <- list() + spp_codes <- unique(.d$species_code) + for (i in seq_along(spp_codes)) { + .slc <- read_sql("get-sub-level-catch.sql") + .slc <- inject_filter("", spp_codes[i], sql_code = .slc) + # TODO: this filter is currently not doing anything... don't know why! + .slc <- inject_filter("AND C.SPECIES_CODE IN", spp_codes[i], + sql_code = .slc, + search_flag = "-- insert species again here" + ) + ## this didn't work, not sure why; isn't working for get-all-survey-sets.sql either + # .slc <- inject_filter("AND FE.FE_PARENT_EVENT_ID IN", fe_vector, + # sql_code = .slc, + # search_flag = "-- insert fe_vector here", conversion_func = I + # ) + slc_list[[i]] <- run_sql("GFBioSQL", .slc) + } + slc <- do.call(rbind, slc_list) |> distinct() + names(slc) <- tolower(names(slc)) + + .d2 <- .d2 %>% + # select(-catch_count) |> + rename(event_level_count = catch_count) |> # used as a temporary check + left_join(slc, by = c( + "trip_id" = "trip_id", + "fishing_event_id" = "fe_parent_event_id", + "fe_major_level_id" = "fe_major_level_id", + "fe_sub_level_id" = "fe_sub_level_id", + "species_code" = "species_code" + )) |> group_by(fishing_event_id) |> + mutate(counts_diff = event_level_count - sum(catch_count, na.rm = TRUE)) |> + ungroup() + + if(sum(.d2$counts_diff, na.rm = TRUE) != 0) { + warning("Some skate-level counts are inconsistent with counts for events with gear differences.") + } else { + .d2 <- .d2 |> select(-counts_diff, -event_level_count) + } + } + + ## when hooks do not differ between skates, get hook data and catch for whole event + .h <- 
read_sql("get-ll-hook-data-generalized.sql") + + .h <- inject_filter("AND S.SURVEY_SERIES_ID IN", ssid_with_catch, + sql_code = .h, + search_flag = "-- insert ssid here", conversion_func = I + ) + + .hd <- run_sql("GFBioSQL", .h) + names(.hd) <- tolower(names(.hd)) + + .hd <- dplyr::distinct(.hd) + + fe3 <- get_parent_level_counts(fe) + names(fe3) <- tolower(names(fe3)) + + + fe3 <- filter(count_gear_types, max < 2) |> + left_join(fe3) |> + left_join(.hd) |> + # select(-survey_series_id) |> + select(-max) |> + dplyr::distinct() + + .d1 <- expand.grid( + fishing_event_id = unique(fe3$fishing_event_id), + species_code = unique(.d$species_code) + ) |> + left_join(fe3) |> + left_join(.d) + + .d <- bind_rows(.d1, .d2) + + }, + classes = quiet_option + ) + + + suppressMessages( + if (remove_bad_data) { + .d <- correct_ssids(.d) + }, + classes = quiet_option + ) + + if (!is.null(ssid)) { + # deal with NAs somehow causing duplicated rows of data + .d <- .d |> + group_by(fishing_event_id) |> + mutate( + doorspread_m = ifelse(is.logical(na.omit(doorspread_m)), NA, na.omit(doorspread_m)), + speed_mpm = ifelse(is.logical(na.omit(speed_mpm)), NA, na.omit(speed_mpm)) + ) |> + group_by(fishing_event_id, survey_series_id) |> + mutate( + grouping_desc_updated = ifelse(is.logical(na.omit(grouping_desc_updated)), NA, na.omit(grouping_desc_updated)), + grouping_code_updated = mean(grouping_code_updated, na.rm = TRUE), + grouping_code_updated = ifelse(is.nan(grouping_code_updated), NA, grouping_code_updated) + ) |> + dplyr::distinct() |> + ungroup() + + if (any(ssid %in% c(6, 7, 67)) & !include_activity_matches) { + ssid <- ssid_original + } + + .d <- filter(.d, survey_series_id %in% ssid) + + if (is.null(major)) { + print(paste0(toString(species), " have been recorded by survey series ", toString(ssid), " at least once. 
")) + print("Returning all relevant sets/events/skates including those with no catch.") + } + if (!is.null(major)) { + print(paste0( + toString(species), " have been recorded by survey series ", toString(ssid), + "within major area(s) ", toString(major), " at least once. " + )) + print("Returning all relevant sets/events/skates including those with no catch.") + } + } else { + # when not specifying ssid + # deal with NAs somehow causing duplicated rows of data + .d <- .d |> + group_by(fishing_event_id) |> + mutate( + speed_mpm = ifelse(is.logical(na.omit(speed_mpm)), NA, na.omit(speed_mpm)), + doorspread_m = ifelse(is.logical(na.omit(doorspread_m)), NA, na.omit(doorspread_m)), + # make sure updated codes are from the original survey design and purge others + grouping_desc_updated = ifelse(grouping_code_updated == grouping_code_original, grouping_desc_updated, NA), + grouping_desc_updated = ifelse(is.logical(na.omit(grouping_desc_updated)), NA, na.omit(grouping_desc_updated)), + grouping_code_updated = ifelse(grouping_code_updated == grouping_code_original, grouping_code_updated, NA), + grouping_code_updated = mean(grouping_code_updated, na.rm = TRUE), + grouping_code_updated = ifelse(is.nan(grouping_code_updated), NA, grouping_code_updated) + ) |> + dplyr::distinct() |> + ungroup() + + if (is.null(major)) { + print( + paste0( + "Returning all sets/events/skates (including those with no catch) from all survey series that recorded ", + toString(species), " at least once." + ) + ) + } + if (!is.null(major)) { + print( + paste0( + "Returning all sets/events/skates (including those with no catch) from all survey series that recorded ", + toString(species), " within major area(s) ", toString(major), " at least once." 
+ ) + ) + } + } + + # check if there are duplicate fishing_event ids + if (length(.d$fishing_event_id) > length(unique(.d$fishing_event_id))) { + if (remove_duplicates) { + # if so, separate original_ind from not + .dy <- filter(.d, original_ind == "Y") + .dn <- filter(.d, original_ind != "Y" | is.na(original_ind)) + + # and only keep those not original_ind = Y when the fishing_event id was missing + .d <- bind_rows(.dy, filter(.dn, !(fishing_event_id %in% c(unique(.dy$fishing_event_id))))) + + # check if there are still duplicated fishing_event ids + if (length(.d$fishing_event_id) > length(unique(.d$fishing_event_id))) { + warning( + "Duplicate fishing_event_ids are still present despite ", + "`remove_duplicates = TRUE`. This may be because of overlapping ", + "survey stratifications or multiple skates per event ", + "(specifically when at least one survey included used skates with ", + "differences in gear type), but could also be due to trips participating ", + "in more than one type of survey. If the latter, location, gear, or `reason_desc` ", + "columns should be used to choose which events to keep. ", + "After selecting specific survey stratifications and determining that ", + "all relevant variables are accurate, the remaining duplications ", + "can be filtered using `dat <- dat[!duplicated(dat$fishing_event_id), ]`. " + ) + } + } else { + # check if there are duplicated fishing_event ids (this often true for SABLE and MSSM surveys) + if (length(.d$fishing_event_id) > length(unique(.d$fishing_event_id))) { + warning( + "Duplicate fishing_event_ids are present. This is usually because of ", + "overlapping survey stratifications, multiple skates per event ", + "(specifically when at least one survey included used skates with ", + "differences in gear type), or trips that include more than one type of ", + "survey. Some cases of the former can be resolved by setting ", + "'remove_duplicates = TRUE'. 
If the latter, location, gear, or `reason_desc` ", + "columns should be used to choose which events to keep. ", + "After selecting specific survey stratifications and determining that ", + "all relevant variables are accurate, the remaining duplications ", + "can be filtered using `dat <- dat[!duplicated(dat$fishing_event_id), ]`. " + ) + } + } + } + + if (nrow(.d[.d$survey_series_id %in% c(35, 41, 42, 43), ]) > 0) { + warning( + "All sablefish research related sets are returned as survey_series_id 35. ", + "To separate types of sets, use reason_desc and grouping_code variables." + ) + } + + surveys <- get_ssids() + names(surveys) <- tolower(names(surveys)) + + .d <- inner_join(.d, + dplyr::distinct(select( + surveys, + survey_series_id, + survey_series_desc, + survey_abbrev + )), + by = "survey_series_id" + ) + + species_df <- run_sql("GFBioSQL", "SELECT * FROM SPECIES") %>% + select( + SPECIES_CODE, + SPECIES_COMMON_NAME, + SPECIES_SCIENCE_NAME, + SPECIES_DESC + ) + names(species_df) <- tolower(names(species_df)) + + .d <- inner_join(.d, + dplyr::distinct(species_df), + by = "species_code" + ) + + # create zeros + .d$catch_count <- ifelse(is.na(.d$catch_count), 0, .d$catch_count) + .d$catch_weight <- ifelse(is.na(.d$catch_weight), 0, .d$catch_weight) + + # in trawl data, catch_count is only recorded for small catches + # so 0 in the catch_count column when catch_weight > 0 seems misleading + # note: there are also a few occasions for trawl where count > 0 and catch_weight is 0/NA + # these lines replace false 0s with NA, but additional checks might be needed + if (remove_false_zeros) { + .d$catch_count <- ifelse(.d$catch_weight > 0 & .d$catch_count == 0, NA, .d$catch_count) + .d$catch_weight <- ifelse(.d$catch_count > 0 & .d$catch_weight == 0, NA, .d$catch_weight) + } + + if (!is.null(usability)) { + .d <- filter(.d, usability_code %in% usability) + } else { + u <- get_table("usability") + + names(u) <- tolower(names(u)) + .d <- left_join( + .d, + 
dplyr::distinct(select( + u, + usability_code, + usability_desc + )), + by = "usability_code" + ) + } + + if (any(ssid_with_catch %in% trawl)) { + # calculate area_swept for trawl exactly as it has been done for the density values in this dataframe + # note: is NA if doorspread_m is missing and duration_min may be time in water (not just bottom time) + .d <- trawl_area_swept(.d) + # # won't do this here because there may be ways of using mean(.d$doorspread_m) to fill in some NAs + # # .d <- dplyr::filter(.d, !is.na(area_swept)) + # # instead use this to make sure false 0 aren't included + .d$density_kgpm2 <- .d$catch_weight / .d$area_swept + .d$density_kgpm2 <- ifelse(!is.na(.d$area_swept), .d$density_kgpm2, NA) # don't think this is doing anything + .d$density_pcpm2 <- .d$catch_count / .d$area_swept2 # using area_swept2 is how it's done in "poc_catmat_2011" + .d$density_pcpm2 <- ifelse(!is.na(.d$area_swept2), .d$density_pcpm2, NA) # don't think this is doing anything + } + + if (any(ssid_with_catch %in% ll)) { + .d <- hook_area_swept(.d) + + .d$density_ppkm2 <- .d$catch_count / (.d$hook_area_swept_km2) + # .d$density_pppm2 <- .d$catch_count/(.d$hook_area_swept_km2*1000000) + } + + .d <- mutate(.d, + species_science_name = tolower(species_science_name), + species_desc = tolower(species_desc), + species_common_name = tolower(species_common_name) + ) + + if (grouping_only) { + .d <- filter(.d, !is.na(grouping_code_original)) + + if (nrow(.d) < 1) { + if (is.null(ssid) & is.null(major)) { + stop(paste0("No survey set data with expected grouping codes.")) + } else { + if (!is.null(ssid) & is.null(major)) { + stop(paste0("No survey set data with expected grouping codes from ssid(s) ", toString(ssid), ".")) + } + if (is.null(ssid) & !is.null(major)) { + stop(paste0("No survey set data with expected grouping codes from major area(s) ", toString(major), ".")) + } + if (!is.null(ssid) & !is.null(major)) { + stop(paste0("No survey set data with expected grouping codes from 
ssid(s) ", toString(ssid), " in major area(s) ", toString(major), ".")) + } + } + } + } + + .d <- .d |> + relocate(species_common_name, catch_count, catch_weight, survey_series_id, survey_abbrev, year, fishing_event_id) |> + arrange(species_common_name, survey_series_id, -year, -fishing_event_id) + + # not sure where things are getting duplicated, but this will get rid of any complete duplication + .d <- dplyr::distinct(.d) + + # we will use grouping_code_original as the primary grouping_code returned + .d <- dplyr::rename(.d, grouping_code = grouping_code_original, grouping_desc = grouping_desc_original) + + # return only events from surveys that have recorded any of the species selected + # rechecking this after SSID corrections + .dpos <- filter(.d, catch_count > 0 | catch_weight > 0) + ssid_with_catch <- unique(.dpos$survey_series_id) + .d <- filter(.d, survey_series_id %in% ssid_with_catch) + + # this drops any columns entirely populated with NAs + if (drop_na_columns) { + .d <- .d %>% select(where(~ !all(is.na(.x)))) + } + + # TODO: could add a check to see if ssid and ssog are identical and drop ssog if so? But might be useful to keep... 
+ add_version(as_tibble(.d)) +} diff --git a/R/gfdata-package.R b/R/gfdata-package.R index 2d34ed8..ffcf4d3 100644 --- a/R/gfdata-package.R +++ b/R/gfdata-package.R @@ -7,10 +7,11 @@ NULL #' @importFrom dplyr filter mutate summarise select group_by n arrange ungroup #' @importFrom dplyr inner_join left_join right_join anti_join full_join -#' @importFrom dplyr semi_join +#' @importFrom dplyr semi_join relocate where distinct #' @importFrom dplyr bind_rows case_when pull contains tibble rename as_tibble #' @importFrom magrittr %>% #' @importFrom rlang .data +#' @importFrom stats na.omit #' NULL @@ -67,7 +68,71 @@ if (getRversion() >= "2.15.1") { "DEPTH_M", "LATITUDE", "LONGITUDE", - "VESSEL_ID" + "VESSEL_ID", + "FE_MAJOR_LEVEL_ID", + "FE_MINOR_LEVEL_ID", + "FE_PARENT_EVENT_ID", + "FE_SUB_LEVEL_ID", + "GROUPING_CODE", + "GROUPING_DESC", + "GROUPING_CODE_ORIGINAL", + "GROUPING_DESC_ORIGINAL", + "GROUPING_CODE_UPDATED", + "GROUPING_DESC_UPDATED", + "HOOKSIZE_DESC", + "HOOK_CODE", + "HOOK_DESC", + "LGLSP_HOOK_COUNT", + "MINOR_ID_COUNT", + "MINOR_ID_MAX", + "MINOR_STAT_AREA_CODE", + "ORIGINAL_IND", + "REASON_DESC", + "ROW_VERSION", + "SKATE_COUNT", + "SURVEY_ID", + "SURVEY_SERIES_OG", + "TRIP_ID", + "USABILITY_CODE", + "YEAR", + "age", + "catch_count", + "catch_weight", + "dna_container_id", + "dna_sample_type", + "doorspread_m", + "fe_minor_level_id", + "fe_parent_event_id", + "fe_sub_level_id", + "fishing_event_id", + "grouping_code", + "grouping_desc", + "grouping_code_original", + "grouping_desc_original", + "grouping_code_updated", + "grouping_desc_updated", + "hook_code", + "hook_desc", + "hooksize_desc", + "lglsp_hook_count", + "minor_id_count", + "minor_id_max", + "original_ind", + "sample_source_code", + "sample_type_code", + "sex", + "skate_count", + "skate_id", + "species_category_code", + "specimen_id", + "speed_mpm", + "survey_abbrev", + "survey_series_desc", + "survey_series_id", + "usability_desc", + "weight", + "counts_diff", + "event_level_count" ) ) } 
diff --git a/R/parent-level-counts.R b/R/parent-level-counts.R new file mode 100644 index 0000000..1effe54 --- /dev/null +++ b/R/parent-level-counts.R @@ -0,0 +1,69 @@ +#' summarize sub/minor level counts from fishing event data at the parent event level +#' includes correction for a typo in dataframe +#' retrieves missing fishing_event_ids for sablefish surveys using major_level_ids +#' +#' @param fe df retrieved with get-event-data.sql +#' +get_parent_level_counts <- function(fe) { + # just actual parent-level events + fe_A_no_parent <- filter(fe, is.na(FE_PARENT_EVENT_ID), is.na(FE_MINOR_LEVEL_ID), is.na(FE_SUB_LEVEL_ID)) + + # get sub events (known as skates) + fe_B_no_minor <- filter(fe, !is.na(FE_PARENT_EVENT_ID), is.na(FE_MINOR_LEVEL_ID)) %>% + select(FE_PARENT_EVENT_ID, FISHING_EVENT_ID, FE_MAJOR_LEVEL_ID, YEAR, TRIP_ID) %>% + dplyr::distinct() %>% + group_by(FE_PARENT_EVENT_ID, FE_MAJOR_LEVEL_ID, YEAR, TRIP_ID) %>% + mutate(SKATE_COUNT = n()) %>% + rename(skate_id = FISHING_EVENT_ID, fishing_event_id = FE_PARENT_EVENT_ID) %>% + ungroup() + + # get sub-sub events (usually hooks) + fe_C_w_minor <- filter(fe, !is.na(FE_PARENT_EVENT_ID), !is.na(FE_MINOR_LEVEL_ID)) + + fe_C <- fe_C_w_minor %>% + select(FE_PARENT_EVENT_ID, FE_MAJOR_LEVEL_ID, FE_MINOR_LEVEL_ID, YEAR, TRIP_ID) %>% + dplyr::distinct() %>% + group_by(FE_PARENT_EVENT_ID, FE_MAJOR_LEVEL_ID, YEAR, TRIP_ID) %>% + mutate( + MINOR_ID_COUNT = n(), + MINOR_ID_MAX = ifelse(all(is.na(FE_MINOR_LEVEL_ID)), NA, max(FE_MINOR_LEVEL_ID, na.rm = TRUE)) + ) %>% + select(-FE_MINOR_LEVEL_ID) |> + dplyr::distinct() %>% + mutate(skate_id = FE_PARENT_EVENT_ID) %>% + ungroup() + + sub_event_counts <- full_join( + fe_B_no_minor, + fe_C + ) + + ## up to 220 skates, all sablefish 39, 41, or 43, are missing parent event ids + missing_event_ids <- filter(sub_event_counts, is.na(fishing_event_id)) %>% + select(-fishing_event_id) %>% + left_join(select(fe_A_no_parent, + fishing_event_id = FISHING_EVENT_ID, FE_MAJOR_LEVEL_ID, 
TRIP_ID, + YEAR + )) + + final_event_counts <- sub_event_counts %>% + filter(!is.na(fishing_event_id)) %>% + bind_rows(missing_event_ids) %>% + group_by(fishing_event_id, FE_MAJOR_LEVEL_ID, YEAR, TRIP_ID) %>% + dplyr::distinct() %>% + dplyr::summarise( + skate_count = mean(SKATE_COUNT, na.rm = T), + mean_per_skate = mean(MINOR_ID_COUNT, na.rm = T), + minor_id_count = sum(MINOR_ID_COUNT, na.rm = T), + minor_id_max = ifelse(all(is.na(MINOR_ID_MAX)), NA, max(MINOR_ID_MAX, na.rm = TRUE)) + ) %>% + mutate(diff = ifelse(minor_id_max > 0, minor_id_max - minor_id_count, NA)) %>% + dplyr::distinct() + + fe2 <- fe_A_no_parent %>% + rename(fishing_event_id = FISHING_EVENT_ID) %>% + left_join(final_event_counts) %>% + select(-FE_PARENT_EVENT_ID, -FE_MINOR_LEVEL_ID, -FE_SUB_LEVEL_ID) + + fe2 %>% dplyr::distinct() +} diff --git a/R/skate-level-counts.R b/R/skate-level-counts.R new file mode 100644 index 0000000..edfc9c4 --- /dev/null +++ b/R/skate-level-counts.R @@ -0,0 +1,107 @@ +#' summarize fishing event data at the skate level +#' retrieves missing fishing_event_ids for sablefish surveys using major_level_ids +#' +#' @param fe df retrieved with get-event-data.sql +#' + +get_skate_level_counts <- function(fe) { + fe <- fe |> distinct() + + # just actual parent-level events + fe_A_no_parent <- filter(fe, is.na(FE_PARENT_EVENT_ID), is.na(FE_MINOR_LEVEL_ID), is.na(FE_SUB_LEVEL_ID)) |> + select(-FE_PARENT_EVENT_ID, -FE_SUB_LEVEL_ID, -FE_MINOR_LEVEL_ID) |> + rename(fishing_event_id = FISHING_EVENT_ID) + + # get sub events (known as skates) + # when present hook data is stored at this level, while other event info tends to be stored at the parent event level + fe_B_no_minor <- filter(fe, !is.na(FE_PARENT_EVENT_ID), is.na(FE_MINOR_LEVEL_ID)) %>% + select( + FE_PARENT_EVENT_ID, FISHING_EVENT_ID, FE_MAJOR_LEVEL_ID, FE_SUB_LEVEL_ID, + SURVEY_SERIES_ID, + YEAR, TRIP_ID, HOOK_CODE, LGLSP_HOOK_COUNT, HOOK_DESC, HOOKSIZE_DESC + ) %>% + dplyr::distinct() %>% + group_by(YEAR, TRIP_ID, 
FE_PARENT_EVENT_ID, FE_MAJOR_LEVEL_ID) %>% + mutate(SKATE_COUNT = n()) %>% + rename(skate_id = FISHING_EVENT_ID, fishing_event_id = FE_PARENT_EVENT_ID) %>% + ungroup() + + # select all actual events that lack sub levels + fe_without_B <- fe_A_no_parent |> + anti_join(fe_B_no_minor, by = "fishing_event_id") + + ## new version + # sublevel missing hook info, needs all parent event level covariates + # there also seems to be some disagreement between levels and the SSID assigned, + # so using parent level when hook code unknown at skate level + # fe_with_B_no_hook <- fe_B_no_minor[which(fe_B_no_minor$HOOK_CODE %in% 0 | is.na(fe_B_no_minor$HOOK_CODE)),] + fe_with_B_no_hook <- fe_B_no_minor |> + filter(is.na(HOOK_CODE) | HOOK_CODE == 0) |> + select(-HOOK_CODE, -LGLSP_HOOK_COUNT, -HOOK_DESC, -HOOKSIZE_DESC, -SURVEY_SERIES_ID) |> + left_join(fe_A_no_parent) + + # sublevel w hook info, needs all parent event covariates except the hook ones and SSID + # using sub level SSID when hook code IS known at skate level + fe_A_data_no_hook <- fe_A_no_parent |> + select(-HOOK_CODE, -LGLSP_HOOK_COUNT, -HOOK_DESC, -HOOKSIZE_DESC, -SURVEY_SERIES_ID) |> + distinct() + + # fe_with_B_and_hook <- fe_B_no_minor[which(fe_B_no_minor$HOOK_CODE %in% c(1,3)),] + fe_with_B_and_hook <- fe_B_no_minor |> + filter(HOOK_CODE != 0) |> + left_join(fe_A_data_no_hook) + + # # this sometimes adds up to more than what we started with because of survey or grouping code duplications + fe_by_event_or_skate <- bind_rows(fe_without_B, fe_with_B_and_hook) |> + bind_rows(fe_with_B_no_hook) + + # # get sub-sub events (usually hooks) + fe_C_w_minor <- filter(fe, !is.na(FE_PARENT_EVENT_ID), !is.na(FE_MINOR_LEVEL_ID)) + + fe_C <- fe_C_w_minor %>% + select(FE_MINOR_LEVEL_ID, FE_SUB_LEVEL_ID, FE_PARENT_EVENT_ID, FE_MAJOR_LEVEL_ID, YEAR, TRIP_ID) %>% + dplyr::distinct() %>% + group_by(FE_SUB_LEVEL_ID, FE_PARENT_EVENT_ID, FE_MAJOR_LEVEL_ID, YEAR, TRIP_ID) %>% + mutate( + MINOR_ID_COUNT = n(), + MINOR_ID_MAX = 
ifelse(all(is.na(FE_MINOR_LEVEL_ID)), NA, max(FE_MINOR_LEVEL_ID, na.rm = TRUE)) + ) %>% + select(-FE_MINOR_LEVEL_ID) %>% + dplyr::distinct() %>% + mutate(skate_id = FE_PARENT_EVENT_ID) %>% + ungroup() + + sub_event_counts <- full_join( + fe_B_no_minor, + fe_C + ) + + ## up to 220 skates, all sablefish 39, 41, or 43, are missing parent event ids + missing_event_ids <- filter(sub_event_counts, is.na(fishing_event_id)) %>% + select(-fishing_event_id) %>% + left_join(select( + fe_A_no_parent, + fishing_event_id, FE_MAJOR_LEVEL_ID, TRIP_ID, + YEAR + )) + + final_event_counts <- sub_event_counts %>% + filter(!is.na(fishing_event_id)) %>% + bind_rows(missing_event_ids) %>% + dplyr::distinct() %>% + group_by(skate_id, FE_SUB_LEVEL_ID, fishing_event_id, FE_MAJOR_LEVEL_ID, YEAR, TRIP_ID) %>% + dplyr::summarise( + # skate_count = mean(SKATE_COUNT, na.rm = T), + mean_per_skate = mean(MINOR_ID_COUNT, na.rm = T), + minor_id_count = sum(MINOR_ID_COUNT, na.rm = T), + minor_id_max = ifelse(all(is.na(MINOR_ID_MAX)), NA, max(MINOR_ID_MAX, na.rm = TRUE)) + ) %>% + dplyr::distinct() %>% + mutate(diff = ifelse(minor_id_max > 0, minor_id_max - minor_id_count, NA)) + + fe2 <- fe_by_event_or_skate |> + left_join(final_event_counts) |> + dplyr::distinct() + + fe2 +} diff --git a/R/utils.R b/R/utils.R index e6e6e0c..f6e874c 100644 --- a/R/utils.R +++ b/R/utils.R @@ -78,12 +78,16 @@ inject_filter <- function(sql_precode, species, sql_code, } first_cap <- function(s, strict = FALSE) { - cap <- function(s) paste(toupper(substring(s, 1, 1)), { - s <- substring(s, 2) - if (strict) tolower(s) else s - }, - sep = "", collapse = " " + cap <- function(s) { + paste(toupper(substring(s, 1, 1)), + { + s <- substring(s, 2) + if (strict) tolower(s) else s + }, + sep = "", + collapse = " " ) + } sapply(strsplit(s, split = " "), cap, USE.NAMES = !is.null(names(s))) } @@ -156,8 +160,10 @@ codes2common <- function(spp_code) { #' @param area_regex A vector of regular expressions describing the areas group. 
#' @export #' @examples -#' x <- c("5D: NORTHERN HECATE STRAIT", "3C: S.W. VANCOUVER ISLAND", -#' "3D: N.W. VANCOUVER ISLAND") +#' x <- c( +#' "5D: NORTHERN HECATE STRAIT", "3C: S.W. VANCOUVER ISLAND", +#' "3D: N.W. VANCOUVER ISLAND" +#' ) #' assign_areas(x) assign_areas <- function(major_stat_area_description, area_regex = c("3[CD]+", "5[AB]+", "5[CDE]+")) { @@ -188,7 +194,7 @@ get_spp_sample_length_type <- function(species) { value = "count" ) .d <- .d %>% dplyr::filter(count == max(count)) - if (nrow(.d) > 1L) .d <- .d[1L, ,drop = FALSE] # happens if all 0! pick any + if (nrow(.d) > 1L) .d <- .d[1L, , drop = FALSE] # happens if all 0! pick any .d$length_type } @@ -197,3 +203,19 @@ add_version <- function(x) { attr(x, "date") <- Sys.time() x } + +trawl_area_swept <- function(.d) { + .d$area_swept1 <- .d$doorspread_m * .d$tow_length_m + .d$area_swept2 <- .d$doorspread_m * (.d$speed_mpm * .d$duration_min) + .d$area_swept <- ifelse(!is.na(.d$area_swept1), .d$area_swept1, .d$area_swept2) + .d$area_swept_km2 <- .d$area_swept / 1000000 + .d +} + +hook_area_swept <- function(.d) { + .d$hook_area_swept_km2 <- ifelse(.d$survey_series_id == 14, + 0.0054864 * 0.009144 * .d$minor_id_count, + 0.0024384 * 0.009144 * .d$minor_id_count + ) + .d +} diff --git a/inst/compare/compare-survey-samples.R b/inst/compare/compare-survey-samples.R new file mode 100644 index 0000000..1b79359 --- /dev/null +++ b/inst/compare/compare-survey-samples.R @@ -0,0 +1,235 @@ +# Compare survey samples +compare_survey_samples <- function ( + # Shared args + spp, + ssids, + ... + ) { + + # Store '...' args as list of named values + arg_list <- list(...) 
+ # Prepare argument list for each function + get_args <- arg_list[ names(arg_list) %in% names(formals(get_survey_samples)) ] + get_all_args <- arg_list[ names(arg_list) %in% names(formals(get_all_survey_samples)) ] + + if(length(get_args)==0){ + get_args <- list() + } + if(length(get_all_args)==0){ + get_all_args <- list() + } + + # Initialize default tibble + init <- tibble::tibble( + fn = numeric(0), + species = character(0), + ssid = numeric(0) + ) + + # Initialize tibbles + x <- init # Extra specimens + u <- init # Unlike values + s <- init # Summary of returned + a <- init # All specimens when any unlike (x1, x2, a12) + + # Iterate over cases + for (i in seq_along(spp)) { + # Print current species + cat(paste0("spp = ", spp[i], "\n")) + # Reset tibbles + d1_all <- NULL + d2_all <- NULL + d1e <- NULL + d2e <- NULL + d1_safe <- NULL + d2_safe <- NULL + + # Safely + safe_get_survey_samples <- purrr::safely( + gfdata::get_survey_samples + ) + safe_get_all_survey_samples <- purrr::safely( + gfdata::get_all_survey_samples + ) + + get_args$species <- get_all_args$species <- spp[i] + get_args$ssid <- get_all_args$ssid <- ssids + + # Get survey samples + d1_safe <- do.call(safe_get_survey_samples, get_args) + + # Extract result and (first) error message + d1_all <- d1_safe$result + d1e <- d1_safe$error[[1]][1] # Extract first list element + + # Let server have a rest + Sys.sleep(0.05) + + # Get all survey samples + d2_safe <- do.call(safe_get_all_survey_samples, get_all_args) + # Extract result and (first) error message + d2_all <- d2_safe$result + d2e <- d2_safe$error[[1]][1] # Extract first list element + + if(is.null(d1_all)){ + d1_all <- tibble::tibble( + survey_series_id = numeric(0), + survey_series_desc = character(0) + ) + } + if(is.null(d2_all)){ + d2_all <- tibble::tibble( + survey_series_id = numeric(0), + survey_series_desc = character(0) + ) + } + + ssids_found <- dplyr::bind_rows( + dplyr::select(d1_all, survey_series_id, survey_series_desc), + 
dplyr::select(d2_all, survey_series_id, survey_series_desc)) |> + dplyr::distinct() + + ssid <- unique(ssids_found$survey_series_id) + + for (j in seq_along(ssid)) { + # Reset tibbles + a12 <- NULL # + d1 <- NULL + d2 <- NULL + dd <- NULL # + s1 <- NULL # + s2 <- NULL # + x1 <- NULL # + x2 <- NULL # + # Reset vectors + b <- NULL # + n1 <- NULL # + n2 <- NULL # + r1 <- NULL # + r2 <- NULL # + r12 <- NULL # + + # Drop NA specimen_id + if ("specimen_id" %in% colnames(d1_all)) { + d1 <- d1_all |> + dplyr::filter(survey_series_id == ssid[j]) |> + tidyr::drop_na(specimen_id) + } + if ("specimen_id" %in% colnames(d2_all)) { + d2 <- d2_all |> + dplyr::filter(survey_series_id == ssid[j]) |> + tidyr::drop_na(specimen_id) + } + # Identify extra specimen_id + n1 <- setdiff(d1$specimen_id, d2$specimen_id) + n2 <- setdiff(d2$specimen_id, d1$specimen_id) + # Identify shared specimen_id + b <- dplyr::intersect(d1$specimen_id, d2$specimen_id) + # New summary rows + s1 <- tibble::tibble( + fn = 1L, + species = spp[i], + ssid = ssid[j], + survey = ssids_found$survey_series_desc[j], + count_ids = ifelse(is.null(d1), NA, length(unique(d1$specimen_id))), + extra_ids = ifelse(is.null(d1), NA, length(n1)), + shared_ids = ifelse(is.null(d1), NA, length(b)), + error = ifelse(is.null(d1e), "No", "Yes"), + message = ifelse(is.null(d1e), "NULL", d1e) + ) + s2 <- tibble::tibble( + fn = 2L, + species = spp[i], + ssid = ssid[j], + survey = ssids_found$survey_series_desc[j], + count_ids = ifelse(is.null(d2), NA, length(unique(d2$specimen_id))), + extra_ids = ifelse(is.null(d2), NA, length(n2)), + shared_ids = ifelse(is.null(d2), NA, length(b)), + error = ifelse(is.null(d2e), "No", "Yes"), + message = ifelse(is.null(d2e), "NULL", d2e) + ) + # Augment summary + s <- dplyr::bind_rows(s, s1, s2) + + # New extra specimen rows + if (length(n1) > 0) { + # Extra specimen rows numbers + r1 <- which(d1$specimen_id %in% n1) + # Extra specimen tibbles + x1 <- tibble::tibble( + fn = 1L, + species = spp[i], + 
ssid = ssid[j], + d1[r1, ] + ) + } + + if (length(n2) > 0) { + # Extra specimen rows numbers + r2 <- which(d2$specimen_id %in% n2) + x2 <- tibble::tibble( + fn = 2L, + species = spp[i], + ssid = ssid[j], + d2[r2, ] + ) + } + # Augment extra specimens + x <- dplyr::bind_rows(x, x1, x2) + + # If either function had extra, include all shared as fn = 12 + if ((length(n1) + length(n2)) > 0 & length(unique(d2$specimen_id)) > 0) { + # Shared specimen rows numbers + r12 <- which(!(d2$specimen_id %in% n2)) + a12 <- tibble::tibble( + fn = 12L, + species = spp[i], + ssid = ssid[j], + d2[r12, ] + ) + } + # Augment all specimens only when functions differed + a <- dplyr::bind_rows(a, x1, x2, a12) + + # Only compares non-null output for both functions + if (!is.null(d1) & !is.null(d2)) { + # Only compare shared colnames + cn <- intersect(colnames(d1), colnames(d2)) + # Shared columns + d1 <- d1 |> + dplyr::select(tidyselect::all_of(cn)) |> + dplyr::mutate(fn = 1L, .before = 1) + d2 <- d2 |> + dplyr::select(tidyselect::all_of(cn)) |> + dplyr::mutate(fn = 2L, .before = 1) + # Bind rows + dd <- dplyr::bind_rows(d1, d2) |> + # Augment + dplyr::mutate(species = spp[i], .before = 2L) |> + dplyr::mutate(ssid = ssid[j], .before = 3L) |> + # Drop dogfish NAs + tidyr::drop_na(specimen_id) |> + # Arrange so same specimen_ids are in sequential rows + dplyr::arrange(fishing_event_id, sample_id, specimen_id, fn) |> + # Hack to fix different value case + dplyr::mutate(length_type = tolower(length_type)) |> + # Collapse to one row if equal (except in fn column) + dplyr::distinct(dplyr::across(-fn), .keep_all = TRUE) |> + dplyr::group_by(specimen_id) |> + # Keep only groups with more than one row (the inconsistent groups) + dplyr::filter(dplyr::n() > 1) |> + dplyr::ungroup() + } # End if + # Bind rows + u <- dplyr::bind_rows(u, dd) + } + } + + # Arrange + x <- dplyr::arrange(x, fn, species, ssid) + s <- dplyr::arrange(s, species, ssid, fn) + a <- dplyr::arrange(a, fn, species, ssid) + + # Return 
+ list(x = x, u = u, s = s, a = a) +} diff --git a/inst/compare/compare-survey-sets.R b/inst/compare/compare-survey-sets.R new file mode 100644 index 0000000..9400f2a --- /dev/null +++ b/inst/compare/compare-survey-sets.R @@ -0,0 +1,278 @@ +# Compare survey sets +compare_survey_sets <- function (spp, + ssids = NULL, + drop_if_all_zero = FALSE, + ... + ) { + + # Store '...' args as list of named values + arg_list <- list(...) + # Prepare argument list for each function + get_args <- arg_list[ names(arg_list) %in% names(formals(get_survey_sets)) ] + get_all_args <- arg_list[ names(arg_list) %in% names(formals(get_all_survey_sets)) ] + + if(is.null(ssids)) { + ssid <- c(1, 3, 4, 16, 2, 14, 22, 36, 39, 40) + ssid2 <- NULL + } else { + ssid <- ssids + ssid2 <- ssids + } + + + # Initialize default tibble + init <- tibble::tibble( + fn = numeric(0), + species = character(0), + ssid = numeric(0) + ) + + # Initialize tibbles + x <- init # Extra sets + u <- init # Unlike values + s <- init # Summary of returned + a <- init # All sets when any unlike (x1, x2, a12) + + # Iterate over cases + for (i in seq_along(spp)) { + # Print current species + cat(paste0("spp = ", spp[i], "\n")) + # Reset tibbles + d1_all <- NULL + d2_all <- NULL + d1e <- NULL + d2e <- NULL + d1_safe <- NULL + d2_safe <- NULL + # Safely + safe_get_survey_sets <- purrr::safely(gfdata::get_survey_sets) + safe_get_all_survey_sets <- purrr::safely(gfdata::get_all_survey_sets) + + if(length(get_args)==0){ + get_args <- list() + } + if(length(get_all_args)==0){ + get_all_args <- list() + } + + get_args$species <- get_all_args$species <- spp[i] + get_args$ssid <- ssid + get_all_args$ssid <- ssid2 + + + # Get survey sets + d1_safe <- do.call(safe_get_survey_sets, get_args) + + # Extract result and (first) error message + d1_all <- d1_safe$result + d1e <- d1_safe$error[[1]][1] # Extract first list element + + # Let server have a rest + Sys.sleep(0.05) + # Get all survey sets + d2_safe <- 
do.call(safe_get_all_survey_sets, get_all_args) + + # Extract result and (first) error message + d2_all <- d2_safe$result + d2e <- d2_safe$error[[1]][1] # Extract first list element + + if(is.null(d1_all)){ + d1_all <- tibble::tibble( + survey_series_id = numeric(0), + survey_series_desc = character(0) + ) + } + if(is.null(d2_all)){ + d2_all <- tibble::tibble( + survey_series_id = numeric(0), + survey_series_desc = character(0) + ) + } + ssids_found <- dplyr::bind_rows( + dplyr::select(d1_all, survey_series_id, survey_series_desc), + dplyr::select(d2_all, survey_series_id, survey_series_desc)) |> + dplyr::distinct() + + ssid <- unique(ssids_found$survey_series_id) + + for (j in seq_along(ssid)) { + # Reset tibbles + a12 <- NULL # + d1 <- NULL + d2 <- NULL + dd <- NULL # + s1 <- NULL # + s2 <- NULL # + x1 <- NULL # + x2 <- NULL # + # Reset vectors + b <- NULL # + n1 <- NULL # + n2 <- NULL # + r1 <- NULL # + r2 <- NULL # + r12 <- NULL # + + # Create comparison columns + # - Robust to d1 <- NULL: Condition evaluates FALSE + # - Robust to ncol(d1) == 0: Condition evaluates FALSE + # - Robust to nrow(d1) == 0: Assigned value has nrow == 0 + # + if (all(c("species_code", "fishing_event_id") %in% colnames(d1_all))) { + d1 <- d1_all |> + dplyr::filter(survey_series_id == ssid[j]) |> + tidyr::drop_na(species_code, fishing_event_id) |> + dplyr::mutate( + comparison_id = paste0(species_code, fishing_event_id), + .before = 1 + ) + + if(drop_if_all_zero){ + # Drop all rows if all counts and weights each either zero or NA + if (all(c(d1$catch_count, d1$catch_weight) %in% c(0, NA))) { + d1 <- d1[0, ] # Drop all rows and keep columns + } + } + + } + # Create comparison columns + # - Robust to d2 <- NULL: Condition evaluates FALSE + # - Robust to ncol(d2) == 0: Condition evaluates FALSE + # - Robust to nrow(d2) == 0: Assigned value has nrow == 0 + # + if (all(c("species_code", "fishing_event_id") %in% colnames(d2_all))) { + d2 <- d2_all |> + dplyr::filter(survey_series_id == 
ssid[j]) |> + tidyr::drop_na(species_code, fishing_event_id) |> + dplyr::mutate( + comparison_id = paste0(species_code, fishing_event_id), + .before = 1 + ) + } + # Identify extra comparison_id + n1 <- setdiff(d1$comparison_id, d2$comparison_id) + n2 <- setdiff(d2$comparison_id, d1$comparison_id) + # Identify shared comparison_id + b <- dplyr::intersect(d1$comparison_id, d2$comparison_id) + # New summary rows + s1 <- tibble::tibble( + fn = 1L, + species = spp[i], + ssid = ssid[j], + survey = ssids_found$survey_series_desc[j], + count_ids = ifelse(is.null(d1), NA, length(unique(d1$comparison_id))), + extra_ids = ifelse(is.null(d1), NA, length(n1)), + shared_ids = ifelse(is.null(d1), NA, length(b)), + error = ifelse(is.null(d1e), "No", "Yes"), + message = ifelse(is.null(d1e), "NULL", d1e) + ) + s2 <- tibble::tibble( + fn = 2L, + species = spp[i], + ssid = ssid[j], + survey = ssids_found$survey_series_desc[j], + count_ids = ifelse(is.null(d2), NA, length(unique(d2$comparison_id))), + extra_ids = ifelse(is.null(d2), NA, length(n2)), + shared_ids = ifelse(is.null(d2), NA, length(b)), + error = ifelse(is.null(d2e), "No", "Yes"), + message = ifelse(is.null(d2e), "NULL", d2e) + + ) + # Augment summary + s <- dplyr::bind_rows(s, s1, s2) + + # New extra set rows + if (length(n1) > 0) { + # Extra set rows numbers + r1 <- which(d1$comparison_id %in% n1) + # Extra set tibble + x1 <- tibble::tibble( + fn = 1L, + species = spp[i], + ssid = ssid[j], + d1[r1, ] + ) + } + if (length(n2) > 0) { + # Extra set rows numbers + r2 <- which(d2$comparison_id %in% n2) + # Extra set tibble + x2 <- tibble::tibble( + fn = 2L, + species = spp[i], + ssid = ssid[j], + d2[r2, ] + ) + } + # Augment extra sets + x <- dplyr::bind_rows(x, x1, x2) + + # If either function had extra, include all shared as fn = 12 + any_d1 <- (length(unique(d1$comparison_id)) > 0) + any_d2 <- (length(unique(d2$comparison_id)) > 0) + any_extra <- ((length(n1) + length(n2)) > 0) + if (any_d1 & any_d2 & any_extra) { + # 
Shared event row numbers + r12 <- which(!(d2$comparison_id %in% n2)) + a12 <- tibble::tibble( + fn = 12L, + species = spp[i], + ssid = ssid[j], + d2[r12, ] + ) + } + # Augment return all fishing events only when functions differed + a <- dplyr::bind_rows(a, x1, x2, a12) + + # Only compares non-null output for both functions + if (!is.null(d1) & !is.null(d2)) { + # Only compare shared colnames + cn <- intersect(colnames(d1), colnames(d2)) + # Shared columns + d1 <- d1 |> + dplyr::select(tidyselect::all_of(cn)) |> + dplyr::mutate(fn = 1L, .before = 1) + d2 <- d2 |> + dplyr::select(tidyselect::all_of(cn)) |> + dplyr::mutate(fn = 2L, .before = 1) + # Bind rows + dd <- dplyr::bind_rows(d1, d2) |> + # Augment + dplyr::mutate(species = spp[i], .before = 2L) |> + dplyr::mutate(ssid = ssid[j], .before = 3L) |> + # Round + dplyr::mutate( + dplyr::across( + tidyselect::starts_with("density"), ~ round(.x, digits = 10) + ), + dplyr::across( + tidyselect::starts_with("speed"), ~ round(.x, digits = 10) + ), + dplyr::across( + tidyselect::starts_with("area"), ~ round(.x, digits = 8) + ) + ) |> + # Drop NAs + tidyr::drop_na(species_code, fishing_event_id) |> + # Arrange so same fishing_event_id are in sequential rows + dplyr::arrange(species_code, fishing_event_id, fn) |> + # Collapse to one row if equal (except in fn column) + dplyr::distinct(dplyr::across(-fn), .keep_all = TRUE) |> + dplyr::group_by(comparison_id) |> + # Keep only groups with more than one row (the inconsistent groups) + dplyr::filter(dplyr::n() > 1) |> + dplyr::ungroup() + } # End if + # Bind rows + u <- dplyr::bind_rows(u, dd) + } + } + + # Arrange + x <- dplyr::arrange(x, fn, species, ssid) + s <- dplyr::arrange(s, species, ssid, fn) + a <- dplyr::arrange(a, fn, species, ssid) + + # Return + list(x = x, u = u, s = s, a = a) +} diff --git a/inst/compare/example-comparisons.R b/inst/compare/example-comparisons.R new file mode 100644 index 0000000..9194968 --- /dev/null +++ b/inst/compare/example-comparisons.R @@ 
-0,0 +1,60 @@ +# Load +library(gfdata) +library(tidyverse) + +# Source functions +source(here::here("inst", "compare", "compare-survey-sets.R")) +source(here::here("inst", "compare", "compare-survey-samples.R")) + +# Define species and ssids +spp <- c("Yelloweye Rockfish", "Eulachon") + +# ssids <- NULL +ssids <- c(1,2,7,14,22) # shortlist spanning common types of surveys +# ssids <- c(1,2,3,4,6,7,14,16,22,36,39,40) # full list of gfsynopsis surveys +# ssids <- c(35, 68, 76, 82:87) # surveys excluded from gfsynopsis sablefish, hake, dogfish and jig + +# Compare survey sets +de <- compare_survey_sets(spp = spp, + ssids = ssids, # NULL here (default) returns default ssids for og fn + ## settings for get_all function that match more closely behaviour of og fn + usability = c(0, 1, 2, 6), # not default + grouping_only = TRUE, # not default + remove_false_zeros = FALSE # not default + ) + +# Extra events for a given species-survey combination +de$x + +# Unlike values +de$u + +# Summary of returns +de$s + +# All events when any differed +de$a + + +# Compare survey samples +ds <- compare_survey_samples(spp = spp, + ssids = ssids, # default is NULL for both + # major = NULL, # default is NULL for both + usability = c(0, 1, 2, 6), # default is NULL for both + unsorted_only = TRUE, # not default + random_only = TRUE, # not default + grouping_only = TRUE, + drop_na_columns = FALSE # not default + ) + +# Extra samples for a given species-survey combination +ds$x + +# Unlike values +ds$u + +# Summary of returns +ds$s + +# All samples when any differed +ds$a diff --git a/inst/sql/get-activity-code.sql b/inst/sql/get-activity-code.sql new file mode 100644 index 0000000..758e71c --- /dev/null +++ b/inst/sql/get-activity-code.sql @@ -0,0 +1,6 @@ +SELECT +SURVEY_SERIES_ID, +ACTIVITY_CODE +FROM GFBioSQL.dbo.SURVEY S + INNER JOIN GFBioSQL.dbo.TRIP_SURVEY TS ON TS.SURVEY_ID = S.SURVEY_ID + INNER JOIN GFBioSQL.dbo.TRIP_ACTIVITY TA ON TA.TRIP_ID = TS.TRIP_ID diff --git 
a/inst/sql/get-all-survey-samples.sql b/inst/sql/get-all-survey-samples.sql new file mode 100644 index 0000000..bf47daf --- /dev/null +++ b/inst/sql/get-all-survey-samples.sql @@ -0,0 +1,90 @@ +SELECT + ISNULL(S1.SURVEY_SERIES_ID, S.SURVEY_SERIES_ID) AS SURVEY_SERIES_ID, + ISNULL(G.SURVEY_SERIES_ID, S.SURVEY_SERIES_ID) AS SURVEY_SERIES_OG, + A.ACTIVITY_DESC, + TA.ACTIVITY_CODE, + ISNULL(SM.FE_PARENT_EVENT_ID, SM.FISHING_EVENT_ID) AS FISHING_EVENT_ID, + YEAR(TRIP_START_DATE) AS TRIP_YEAR, + SM.SAMPLE_DATE, + SM.SPECIES_CODE, + SPP.SPECIES_COMMON_NAME, + SPP.SPECIES_SCIENCE_NAME, + SP.SPECIMEN_ID, + SM.SAMPLE_ID, + SP.SPECIMEN_SEX_CODE AS SEX, + -- insert lengths here + ROUND_WEIGHT AS WEIGHT, + SPECIMEN_AGE AS AGE, + AGEING_METHOD_CODE, + SP.MATURITY_CODE, + MD.MATURITY_NAME, + MD.MATURITY_DESC, + SM.MATURITY_CONVENTION_CODE, + MC.MATURITY_CONVENTION_DESC, + MC.MATURITY_CONVENTION_MAXVALUE, + SM.MAJOR_STAT_AREA_CODE, + MSA.MAJOR_STAT_AREA_NAME, + SM.MINOR_STAT_AREA_CODE, + FE.GEAR_CODE AS GEAR, + R.REASON_DESC, + S.SURVEY_ID, + SM.TRIP_ID, + TRIP_SUB_TYPE_CODE, + SM.FE_PARENT_EVENT_ID, + SM.FE_MAJOR_LEVEL_ID, + SM.FE_SUB_LEVEL_ID, + SM.SAMPLE_TYPE_CODE, + ISNULL(SM.SAMPLE_TYPE_COMMENT, ST.SAMPLE_TYPE_DESC) AS SAMPLE_TYPE_COMMENT, + SM.SPECIES_CATEGORY_CODE, + SM.SAMPLE_SOURCE_CODE, + CASE WHEN SC.SPECIMEN_COLLECTED_IND = 'Y' OR SC.SPECIMEN_COLLECTED_IND = 'y' THEN 1 ELSE 0 END AS AGE_SPECIMEN_COLLECTED, + DNA.STORAGE_CONTAINER_TYPE_CODE AS DNA_SAMPLE_TYPE, + DNA.STORAGE_CONTAINER_SUB_ID AS DNA_CONTAINER_ID, + CASE WHEN SM.GEAR_CODE IN (1, 6, 8, 11, 14, 16) THEN ISNULL(TRSP.USABILITY_CODE, 0) + WHEN SM.GEAR_CODE IN (2) THEN ISNULL(TPSP.USABILITY_CODE, 0) + WHEN SM.GEAR_CODE IN (5) THEN ISNULL(LLSP.USABILITY_CODE, 0) + WHEN SM.GEAR_CODE IN (4) THEN ISNULL(HLSP.USABILITY_CODE, 0) + ELSE 0 END AS USABILITY_CODE, + SM.GROUPING_CODE AS GROUPING_CODE_ORIGINAL, + G.GROUPING_DESC AS GROUPING_DESC_ORIGINAL, + FEG.GROUPING_CODE AS GROUPING_CODE_UPDATED, -- for updated survey 
definitions + G2.GROUPING_DESC AS GROUPING_DESC_UPDATED, + S.ORIGINAL_IND + FROM GFBioSQL.dbo.B21_Samples SM + INNER JOIN GFBioSQL.dbo.B22_Specimens SP ON SM.SAMPLE_ID = SP.SAMPLE_ID + INNER JOIN GFBioSQL.dbo.SPECIES SPP ON SPP.SPECIES_CODE = SM.SPECIES_CODE + INNER JOIN GFBioSQL.dbo.FISHING_EVENT FE ON SM.FISHING_EVENT_ID = FE.FISHING_EVENT_ID + LEFT JOIN GROUPING G ON G.GROUPING_CODE = SM.GROUPING_CODE + LEFT JOIN TRIP_SURVEY TRS ON TRS.TRIP_ID = SM.TRIP_ID + LEFT JOIN SURVEY S1 ON S1.SURVEY_ID = TRS.SURVEY_ID --AND S1.SURVEY_SERIES_ID = G.SURVEY_SERIES_ID + LEFT JOIN SURVEY_GROUPING SG ON S1.SURVEY_ID = SG.SURVEY_ID AND SG.GROUPING_CODE = FE.GROUPING_CODE + LEFT JOIN FISHING_EVENT_GROUPING FEG ON SM.FISHING_EVENT_ID = FEG.FISHING_EVENT_ID AND SG.GROUPING_CODE = FEG.GROUPING_CODE + LEFT JOIN GROUPING G2 ON G2.GROUPING_CODE = FEG.GROUPING_CODE + LEFT JOIN SURVEY S ON S.SURVEY_ID = TRS.SURVEY_ID + LEFT JOIN GFBioSQL.dbo.REASON R ON FE.REASON_CODE = R.REASON_CODE + LEFT JOIN GFBioSQL.dbo.TRIP_ACTIVITY TA ON TA.TRIP_ID = SM.TRIP_ID + LEFT JOIN GFBioSQL.dbo.ACTIVITY A ON A.ACTIVITY_CODE = TA.ACTIVITY_CODE + LEFT JOIN GFBioSQL.dbo.Maturity_Convention MC ON SM.MATURITY_CONVENTION_CODE = MC.MATURITY_CONVENTION_CODE + LEFT JOIN GFBioSQL.dbo.MAJOR_STAT_AREA MSA ON SM.MAJOR_STAT_AREA_CODE = MSA.MAJOR_STAT_AREA_CODE + LEFT JOIN GFBioSQL.dbo.MATURITY_DESCRIPTION MD ON SM.MATURITY_CONVENTION_CODE = MD.MATURITY_CONVENTION_CODE AND SP.MATURITY_CODE = MD.MATURITY_CODE AND SP.SPECIMEN_SEX_CODE = MD.SPECIMEN_SEX_CODE + LEFT JOIN GFBioSQL.dbo.TRAWL_SPECS TRSP ON TRSP.FISHING_EVENT_ID = SM.FISHING_EVENT_ID + LEFT JOIN GFBioSQL.dbo.TRAP_SPECS TPSP ON TPSP.FISHING_EVENT_ID = SM.FISHING_EVENT_ID + LEFT JOIN GFBioSQL.dbo.LONGLINE_SPECS LLSP ON LLSP.FISHING_EVENT_ID = SM.FISHING_EVENT_ID + LEFT JOIN GFBioSQL.dbo.HANDLINE_SPECS HLSP ON HLSP.FISHING_EVENT_ID = SM.FISHING_EVENT_ID + LEFT JOIN HOOK H ON H.HOOK_CODE = LLSP.HOOK_CODE + LEFT JOIN HOOKSIZE HSZ ON HSZ.HOOKSIZE_CODE = 
LLSP.HOOKSIZE_CODE + LEFT JOIN SAMPLE_TYPE ST ON ST.SAMPLE_TYPE_CODE = SM.SAMPLE_TYPE_CODE + LEFT JOIN (SELECT SAMPLE_ID, MIN(SPECIMEN_ID) AS SPECIMEN_ID, COLLECTED_ATTRIBUTE_CODE, STORAGE_CONTAINER_TYPE_CODE, STORAGE_CONTAINER_SUB_ID + FROM GFBioSQL.dbo.SPECIMEN_COLLECTED + WHERE COLLECTED_ATTRIBUTE_CODE BETWEEN 3.5 AND 4.5 + GROUP BY SAMPLE_ID, SPECIMEN_ID, COLLECTED_ATTRIBUTE_CODE, STORAGE_CONTAINER_TYPE_CODE, STORAGE_CONTAINER_SUB_ID) DNA ON SP.SPECIMEN_ID = DNA.SPECIMEN_ID AND SP.SAMPLE_ID = DNA.SAMPLE_ID + LEFT JOIN (SELECT SAMPLE_ID, MIN(SPECIMEN_ID) AS SPECIMEN_ID, SPECIMEN_COLLECTED_IND + FROM GFBioSQL.dbo.SPECIMEN_COLLECTED + WHERE COLLECTED_ATTRIBUTE_CODE BETWEEN 20 AND 25 + GROUP BY SAMPLE_ID, SPECIMEN_ID, SPECIMEN_COLLECTED_IND) SC ON SP.SPECIMEN_ID = SC.SPECIMEN_ID AND SP.SAMPLE_ID = SC.SAMPLE_ID +WHERE TRIP_SUB_TYPE_CODE IN (2, 3) AND + ISNULL(G.SURVEY_SERIES_ID, S.SURVEY_SERIES_ID) <> 0 +-- insert species here +-- insert ssid here +-- insert major here +ORDER BY SM.SPECIES_CODE, YEAR(TRIP_START_DATE), TA.ACTIVITY_CODE, S.SURVEY_SERIES_ID diff --git a/inst/sql/get-all-survey-sets.sql b/inst/sql/get-all-survey-sets.sql new file mode 100644 index 0000000..cf41b8c --- /dev/null +++ b/inst/sql/get-all-survey-sets.sql @@ -0,0 +1,37 @@ + SELECT + C.SPECIES_CODE, + S.SURVEY_SERIES_ID, + T.TRIP_ID, + FE.FISHING_EVENT_ID, + SUM(ISNULL(CATCH_WEIGHT,0)) AS CATCH_WEIGHT, + SUM(ISNULL(CATCH_COUNT,0)) AS CATCH_COUNT + FROM SURVEY S + INNER JOIN SURVEY_SERIES ss ON + S.SURVEY_SERIES_ID = SS.SURVEY_SERIES_ID + INNER JOIN TRIP_SURVEY TRS ON + S.SURVEY_ID = TRS.SURVEY_ID + INNER JOIN TRIP T ON + TRS.TRIP_ID = T.TRIP_ID + INNER JOIN FISHING_EVENT FE ON + T.TRIP_ID = FE.TRIP_ID + INNER JOIN FISHING_EVENT_CATCH FEC ON + FEC.FISHING_EVENT_ID = FE.FISHING_EVENT_ID + INNER JOIN CATCH C ON + FEC.CATCH_ID = C.CATCH_ID + INNER JOIN SPECIES SP ON + SP.SPECIES_CODE = C.SPECIES_CODE + LEFT JOIN TRIP_ACTIVITY TA ON + TA.TRIP_ID = FE.TRIP_ID + LEFT JOIN ACTIVITY A ON + 
A.ACTIVITY_CODE = TA.ACTIVITY_CODE + WHERE S.SURVEY_SERIES_ID <> 0 + AND FE.FE_MAJOR_LEVEL_ID < 700 + AND FE.FE_PARENT_EVENT_ID IS NULL + -- insert species here + -- insert ssid here + -- insert fe_vector here + -- insert major here + GROUP BY C.SPECIES_CODE, A.ACTIVITY_DESC, TA.ACTIVITY_CODE, + S.SURVEY_SERIES_ID, S.SURVEY_ID, T.TRIP_ID, FE.FISHING_EVENT_ID, + FE.FE_MAJOR_LEVEL_ID, + T.TRIP_START_DATE, S.ORIGINAL_IND diff --git a/inst/sql/get-event-data.sql b/inst/sql/get-event-data.sql new file mode 100644 index 0000000..710a404 --- /dev/null +++ b/inst/sql/get-event-data.sql @@ -0,0 +1,87 @@ +SELECT + FE.FISHING_EVENT_ID, + FE.FE_PARENT_EVENT_ID, + FE.FE_MAJOR_LEVEL_ID, + FE.FE_SUB_LEVEL_ID, + FE.FE_MINOR_LEVEL_ID, + T.TRIP_ID, + ISNULL(S1.SURVEY_SERIES_ID, S.SURVEY_SERIES_ID) AS SURVEY_SERIES_ID, + ISNULL(G.SURVEY_SERIES_ID, S.SURVEY_SERIES_ID) AS SURVEY_SERIES_OG, + ISNULL(S1.SURVEY_ID, S.SURVEY_ID) AS SURVEY_ID, + A.ACTIVITY_DESC, + TA.ACTIVITY_CODE, + R.REASON_DESC, + YEAR(T.TRIP_START_DATE) AS TRIP_YEAR, + YEAR(COALESCE (FE_BEGIN_BOTTOM_CONTACT_TIME, FE_END_BOTTOM_CONTACT_TIME, FE_END_DEPLOYMENT_TIME, + FE_BEGIN_RETRIEVAL_TIME, FE_BEGIN_DEPLOYMENT_TIME, FE_END_RETRIEVAL_TIME, T.TRIP_START_DATE)) AS YEAR, + MONTH(COALESCE (FE_BEGIN_BOTTOM_CONTACT_TIME, FE_END_BOTTOM_CONTACT_TIME, FE_END_DEPLOYMENT_TIME, + FE_BEGIN_RETRIEVAL_TIME, FE_BEGIN_DEPLOYMENT_TIME, FE_END_RETRIEVAL_TIME)) AS MONTH, + DAY(COALESCE (FE_BEGIN_BOTTOM_CONTACT_TIME, FE_END_BOTTOM_CONTACT_TIME, FE_END_DEPLOYMENT_TIME, + FE_BEGIN_RETRIEVAL_TIME, FE_BEGIN_DEPLOYMENT_TIME, FE_END_RETRIEVAL_TIME)) AS DAY, + ISNULL(FE_BEGIN_BOTTOM_CONTACT_TIME, FE_END_DEPLOYMENT_TIME) AS TIME_DEPLOYED, + ISNULL(FE_END_BOTTOM_CONTACT_TIME, FE_BEGIN_RETRIEVAL_TIME) AS TIME_RETRIEVED, + FE_END_DEPLOYMENT_TIME AS TIME_END_DEPLOYMENT, + FE_BEGIN_RETRIEVAL_TIME AS TIME_BEGIN_RETRIEVAL, + FE_START_LATTITUDE_DEGREE + FE_START_LATTITUDE_MINUTE / 60 AS LATITUDE, + -(FE_START_LONGITUDE_DEGREE + FE_START_LONGITUDE_MINUTE / 
60) AS LONGITUDE, + FE_END_LATTITUDE_DEGREE + FE_END_LATTITUDE_MINUTE / 60 AS LATITUDE_END, + -(FE_END_LONGITUDE_DEGREE + FE_END_LONGITUDE_MINUTE / 60) AS LONGITUDE_END, + FE.MAJOR_STAT_AREA_CODE, + FE.MINOR_STAT_AREA_CODE, + COALESCE(FE.FE_MODAL_BOTTOM_DEPTH, FE.FE_BEGINNING_BOTTOM_DEPTH, FE.FE_END_BOTTOM_DEPTH, + FE.FE_MIN_BOTTOM_DEPTH, FE.FE_MAX_BOTTOM_DEPTH) AS DEPTH_M, + FE.FE_BEGINNING_BOTTOM_DEPTH AS DEPTH_BEGIN, + FE_END_BOTTOM_DEPTH AS DEPTH_END, + --- FE.FE_BOTTOM_WATER_TEMPERATURE, --- rarely available + T.VESSEL_ID AS VESSEL_ID, + T.CAPTAIN_ID AS CAPTAIN_ID, + ISNULL(DATEDIFF(MI,FE_BEGIN_BOTTOM_CONTACT_TIME, + FE_END_BOTTOM_CONTACT_TIME), + DATEDIFF(MI,FE_END_DEPLOYMENT_TIME, + FE_BEGIN_RETRIEVAL_TIME)) AS DURATION_MIN, + NULLIF(FE_DISTANCE_TRAVELLED,0) * 1000.0 AS TOW_LENGTH_M, + NULLIF(TRSP.TRLSP_MOUTH_OPENING_WIDTH,0) AS MOUTH_WIDTH_M, + ISNULL(NULLIF(TRSP.TRLSP_DOORSPREAD,0), BD.DOORSPREAD) AS DOORSPREAD_M, + ISNULL(NULLIF(TRSP.TRLSP_SPEED,0), BD.SPEED)* 16.66667 AS SPEED_MPM, + CASE WHEN FE.GEAR_CODE IN (5) THEN LLSP.HOOK_CODE + WHEN FE.GEAR_CODE IN (4) THEN HLSP.HOOK_CODE + ELSE 0 END AS HOOK_CODE, + LGLSP_HOOK_COUNT, + H.HOOK_DESC, + HSZ.HOOKSIZE_DESC, + CASE WHEN FE.GEAR_CODE IN (1, 6, 8, 11, 14, 16) THEN ISNULL(TRSP.USABILITY_CODE, 0) + WHEN FE.GEAR_CODE IN (2) THEN ISNULL(TPSP.USABILITY_CODE, 0) + WHEN FE.GEAR_CODE IN (5) THEN ISNULL(LLSP.USABILITY_CODE, 0) + WHEN FE.GEAR_CODE IN (4) THEN ISNULL(HLSP.USABILITY_CODE, 0) + ELSE 0 END AS USABILITY_CODE, + FE.GROUPING_CODE AS GROUPING_CODE_ORIGINAL, + G.GROUPING_DESC AS GROUPING_DESC_ORIGINAL, + FEG.GROUPING_CODE AS GROUPING_CODE_UPDATED, -- for updated survey definitions + G2.GROUPING_DESC AS GROUPING_DESC_UPDATED, + G.GROUPING_DEPTH_ID, + G.AREA_KM2 AS GROUPING_AREA_KM2, + S.ORIGINAL_IND + FROM FISHING_EVENT FE + LEFT JOIN TRAWL_SPECS TRSP ON TRSP.FISHING_EVENT_ID = FE.FISHING_EVENT_ID + LEFT JOIN LONGLINE_SPECS LLSP ON LLSP.FISHING_EVENT_ID = FE.FISHING_EVENT_ID + LEFT JOIN TRAP_SPECS TPSP ON 
TPSP.FISHING_EVENT_ID = FE.FISHING_EVENT_ID + LEFT JOIN HANDLINE_SPECS HLSP ON HLSP.FISHING_EVENT_ID = FE.FISHING_EVENT_ID + LEFT JOIN TRIP T ON T.TRIP_ID = FE.TRIP_ID + LEFT JOIN TRIP_ACTIVITY TA ON TA.TRIP_ID = FE.TRIP_ID + LEFT JOIN ACTIVITY A ON A.ACTIVITY_CODE = TA.ACTIVITY_CODE + LEFT JOIN TRIP_SURVEY TRS ON TRS.TRIP_ID = T.TRIP_ID + LEFT JOIN REASON R ON FE.REASON_CODE = R.REASON_CODE + LEFT JOIN HOOK H ON H.HOOK_CODE = LLSP.HOOK_CODE + LEFT JOIN HOOKSIZE HSZ ON HSZ.HOOKSIZE_CODE = LLSP.HOOKSIZE_CODE + LEFT JOIN GROUPING G ON G.GROUPING_CODE = FE.GROUPING_CODE + LEFT JOIN SURVEY S1 ON S1.SURVEY_ID = TRS.SURVEY_ID --AND S1.SURVEY_SERIES_ID = G.SURVEY_SERIES_ID + LEFT JOIN SURVEY_GROUPING SG ON SG.SURVEY_ID = S1.SURVEY_ID AND SG.GROUPING_CODE = FE.GROUPING_CODE + LEFT JOIN FISHING_EVENT_GROUPING FEG ON FEG.FISHING_EVENT_ID = FE.FISHING_EVENT_ID AND SG.GROUPING_CODE = FEG.GROUPING_CODE + LEFT JOIN GROUPING G2 ON G2.GROUPING_CODE = FEG.GROUPING_CODE + LEFT JOIN SURVEY S ON S.SURVEY_ID = TRS.SURVEY_ID + LEFT JOIN BOOT_DEFAULTS BD ON BD.SURVEY_ID = ISNULL(S1.SURVEY_ID, S.SURVEY_ID) + WHERE FE.FISHING_EVENT_ID IS NOT NULL + -- insert ssid here + -- insert fe_vector here + -- insert major here + ORDER BY A.ACTIVITY_DESC, T.TRIP_START_DATE, FE.FISHING_EVENT_ID diff --git a/inst/sql/get-ll-hook-data-generalized.sql b/inst/sql/get-ll-hook-data-generalized.sql new file mode 100644 index 0000000..96af678 --- /dev/null +++ b/inst/sql/get-ll-hook-data-generalized.sql @@ -0,0 +1,62 @@ +-- Adapted from query written by Norm Olsen for Marie-Pierre Etienne (Yelloweye Rockfish outside hbll and iphc surveys) + SELECT FE.FISHING_EVENT_ID, + FE.FE_MAJOR_LEVEL_ID, + T.TRIP_ID, + Nall count_all_animals, + Nsp count_all_species, + Nb count_bait_only, + Ne count_empty_hooks, + Nnf count_not_fishing, + Nbr count_bent_broken + FROM SURVEY S + INNER JOIN TRIP_SURVEY TS ON + S.SURVEY_ID = TS.SURVEY_ID + INNER JOIN TRIP T ON + TS.TRIP_ID = T.TRIP_ID + INNER JOIN FISHING_EVENT FE ON + 
T.TRIP_ID = FE.TRIP_ID + INNER JOIN ( + SELECT TRIP_ID, + VESSEL_ID, + FE_MAJOR_LEVEL_ID, + SUM(Nall) AS Nall, + SUM(Nsp) AS Nsp, + SUM(Ne) AS Ne, + SUM(Nb) AS Nb, + SUM(Nnf) AS Nnf, + SUM(Nbr) AS Nbr + FROM ( + SELECT T.TRIP_ID, + T.VESSEL_ID, + FE.FE_MAJOR_LEVEL_ID, + FE.FISHING_EVENT_ID, + SUM(CASE WHEN SPECIES_CODE IS NOT NULL THEN 1 ELSE 0 END) AS Nsp, -- all species recorded + SUM(CASE WHEN HOOK_YIELD_CODE IN (3,4,5,8) THEN 1 ELSE 0 END) AS Nall, -- all animals + SUM(CASE HOOK_YIELD_CODE WHEN 1 THEN 1 ELSE 0 END) AS Ne, + SUM(CASE HOOK_YIELD_CODE WHEN 2 THEN 1 WHEN 6 THEN 1 ELSE 0 END) AS Nb, -- bait only and bait skin + SUM(CASE WHEN HOOK_CONDITION_CODE IN (1,2,3,7) THEN 1 ELSE 0 END) AS Nbr, + SUM(CASE HOOK_YIELD_CODE WHEN 9 THEN 1 ELSE 0 END) AS Nnf -- not fishing + --SUM(CASE HOOK_YIELD_CODE WHEN 2 THEN 1 ELSE 0 END) AS Nb (only 'Bait only' vs. bait only and bait skin) + FROM SURVEY S + INNER JOIN TRIP_SURVEY TS ON + S.SURVEY_ID = TS.SURVEY_ID + INNER JOIN TRIP T ON + TS.TRIP_ID = T.TRIP_ID + INNER JOIN FISHING_EVENT FE ON + T.TRIP_ID = FE.TRIP_ID + LEFT JOIN FISHING_EVENT_CATCH FEC ON + FE.FISHING_EVENT_ID = FEC.FISHING_EVENT_ID + LEFT JOIN CATCH C ON + FEC.CATCH_ID = C.CATCH_ID + LEFT JOIN HOOK_SPECS HS ON + FE.FISHING_EVENT_ID = HS.FISHING_EVENT_ID + LEFT JOIN LONGLINE_SPECS LS + ON FE.FISHING_EVENT_ID = LS.FISHING_EVENT_ID + WHERE FE_PARENT_EVENT_ID IS NOT NULL AND + FE_MINOR_LEVEL_ID IS NOT NULL + -- insert ssid here + GROUP BY T.TRIP_ID, T.VESSEL_ID, FE.FE_MAJOR_LEVEL_ID, FE.FISHING_EVENT_ID) T + GROUP BY TRIP_ID, VESSEL_ID, FE_MAJOR_LEVEL_ID) C ON + T.TRIP_ID = C.TRIP_ID AND T.VESSEL_ID = C.VESSEL_ID AND FE.FE_MAJOR_LEVEL_ID = C.FE_MAJOR_LEVEL_ID + WHERE FE_PARENT_EVENT_ID IS NULL + -- insert ssid here diff --git a/inst/sql/get-ll-sub-level-hook-data.sql b/inst/sql/get-ll-sub-level-hook-data.sql new file mode 100644 index 0000000..b51f7d2 --- /dev/null +++ b/inst/sql/get-ll-sub-level-hook-data.sql @@ -0,0 +1,66 @@ +-- Adapted from query written by 
Norm Olsen for Marie-Pierre Etienne (Yelloweye Rockfish outside hbll and iphc surveys) + SELECT + FE.FE_MAJOR_LEVEL_ID, + FE.FE_SUB_LEVEL_ID, + T.TRIP_ID, + Nall count_all_animals, + Nsp count_all_species, + Nb count_bait_only, + Ne count_empty_hooks, + Nnf count_not_fishing, + Nbr count_bent_broken + FROM SURVEY S + INNER JOIN TRIP_SURVEY TS ON + S.SURVEY_ID = TS.SURVEY_ID + INNER JOIN TRIP T ON + TS.TRIP_ID = T.TRIP_ID + INNER JOIN FISHING_EVENT FE ON + T.TRIP_ID = FE.TRIP_ID + INNER JOIN ( + SELECT TRIP_ID, + VESSEL_ID, + FE_MAJOR_LEVEL_ID, + FE_SUB_LEVEL_ID, + SUM(Nall) AS Nall, + SUM(Nsp) AS Nsp, + SUM(Ne) AS Ne, + SUM(Nb) AS Nb, + SUM(Nnf) AS Nnf, + SUM(Nbr) AS Nbr + FROM ( + SELECT T.TRIP_ID, + T.VESSEL_ID, + FE.FE_MAJOR_LEVEL_ID, + FE.FE_SUB_LEVEL_ID, + FE.FISHING_EVENT_ID, + SUM(CASE WHEN SPECIES_CODE IS NOT NULL THEN 1 ELSE 0 END) AS Nsp, -- all species recorded + SUM(CASE WHEN HOOK_YIELD_CODE IN (3,4,5,8) THEN 1 ELSE 0 END) AS Nall, -- all animals + SUM(CASE HOOK_YIELD_CODE WHEN 1 THEN 1 ELSE 0 END) AS Ne, + SUM(CASE HOOK_YIELD_CODE WHEN 2 THEN 1 WHEN 6 THEN 1 ELSE 0 END) AS Nb, -- bait only and bait skin + SUM(CASE WHEN HOOK_CONDITION_CODE IN (1,2,3,7) THEN 1 ELSE 0 END) AS Nbr, + SUM(CASE HOOK_YIELD_CODE WHEN 9 THEN 1 ELSE 0 END) AS Nnf -- not fishing + --SUM(CASE HOOK_YIELD_CODE WHEN 2 THEN 1 ELSE 0 END) AS Nb (only 'Bait only' vs. 
bait only and bait skin) + FROM SURVEY S + INNER JOIN TRIP_SURVEY TS ON + S.SURVEY_ID = TS.SURVEY_ID + INNER JOIN TRIP T ON + TS.TRIP_ID = T.TRIP_ID + INNER JOIN FISHING_EVENT FE ON + T.TRIP_ID = FE.TRIP_ID + LEFT JOIN FISHING_EVENT_CATCH FEC ON + FE.FISHING_EVENT_ID = FEC.FISHING_EVENT_ID + LEFT JOIN CATCH C ON + FEC.CATCH_ID = C.CATCH_ID + LEFT JOIN HOOK_SPECS HS ON + FE.FISHING_EVENT_ID = HS.FISHING_EVENT_ID + LEFT JOIN LONGLINE_SPECS LS + ON FE.FISHING_EVENT_ID = LS.FISHING_EVENT_ID + WHERE FE_PARENT_EVENT_ID IS NOT NULL AND + FE_MINOR_LEVEL_ID IS NOT NULL + -- insert ssid here + GROUP BY T.TRIP_ID, T.VESSEL_ID, FE.FE_MAJOR_LEVEL_ID, FE.FISHING_EVENT_ID, FE.FE_SUB_LEVEL_ID) T + GROUP BY TRIP_ID, VESSEL_ID, FE_MAJOR_LEVEL_ID, FE_SUB_LEVEL_ID) C ON + T.TRIP_ID = C.TRIP_ID AND T.VESSEL_ID = C.VESSEL_ID AND FE.FE_MAJOR_LEVEL_ID = C.FE_MAJOR_LEVEL_ID + AND FE.FE_SUB_LEVEL_ID = C.FE_SUB_LEVEL_ID + WHERE FE_MINOR_LEVEL_ID IS NULL + -- insert ssid here diff --git a/inst/sql/get-spp-sample-length-type.sql b/inst/sql/get-spp-sample-length-type.sql index dd8516c..bf62b7b 100644 --- a/inst/sql/get-spp-sample-length-type.sql +++ b/inst/sql/get-spp-sample-length-type.sql @@ -1,6 +1,8 @@ -SELECT COUNT(Fork_Length) Fork_Length +SELECT SPECIES_CODE + ,COUNT(Fork_Length) Fork_Length ,COUNT(Standard_Length) Standard_Length ,COUNT(Total_Length) Total_Length ,COUNT(Second_Dorsal_Length) Second_Dorsal_Length FROM B22_Specimens - -- insert species here + -- insert species here + GROUP BY SPECIES_CODE diff --git a/inst/sql/get-sub-level-catch.sql b/inst/sql/get-sub-level-catch.sql new file mode 100644 index 0000000..1f04614 --- /dev/null +++ b/inst/sql/get-sub-level-catch.sql @@ -0,0 +1,18 @@ +SELECT +FEC.TRIP_ID, +FE.FE_PARENT_EVENT_ID, +FE.FE_MAJOR_LEVEL_ID, +FE.FE_SUB_LEVEL_ID, +C.SPECIES_CODE, +SUM(CATCH_COUNT) AS CATCH_COUNT +FROM FISHING_EVENT_CATCH FEC +INNER JOIN FISHING_EVENT FE ON FE.FISHING_EVENT_ID = FEC.FISHING_EVENT_ID +INNER JOIN CATCH C ON C.CATCH_ID = FEC.CATCH_ID 
+INNER JOIN TRIP_SURVEY TS ON TS.TRIP_ID = FEC.TRIP_ID +GROUP BY FEC.TRIP_ID, +---FEC.FISHING_EVENT_ID, +FE.FE_PARENT_EVENT_ID, +FE.FE_MAJOR_LEVEL_ID, +FE.FE_SUB_LEVEL_ID, +C.SPECIES_CODE +ORDER BY FEC.TRIP_ID, FE.FE_MAJOR_LEVEL_ID diff --git a/man/assign_areas.Rd b/man/assign_areas.Rd index 9268c71..9e53f6e 100644 --- a/man/assign_areas.Rd +++ b/man/assign_areas.Rd @@ -19,7 +19,9 @@ descriptions.} Assign areas } \examples{ -x <- c("5D: NORTHERN HECATE STRAIT", "3C: S.W. VANCOUVER ISLAND", - "3D: N.W. VANCOUVER ISLAND") +x <- c( + "5D: NORTHERN HECATE STRAIT", "3C: S.W. VANCOUVER ISLAND", + "3D: N.W. VANCOUVER ISLAND" +) assign_areas(x) } diff --git a/man/correct_ssids.Rd b/man/correct_ssids.Rd new file mode 100644 index 0000000..eebfd55 --- /dev/null +++ b/man/correct_ssids.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/correct-ssids.R +\name{correct_ssids} +\alias{correct_ssids} +\title{Custom fixes for problem surveys with shared trip ids resulting in assignment to wrong ssid} +\usage{ +correct_ssids(dat, specimens = FALSE) +} +\arguments{ +\item{dat}{df containing these columns: fishing_event_ids, survey_series_id, survey_id, +major_stat_area_code, minor_stat_area_code} + +\item{specimens}{Defaults to FALSE where checks for duplication of fishing_event_ids} +} +\description{ +Custom fixes for problem surveys with shared trip ids resulting in assignment to wrong ssid +} diff --git a/man/get_all.Rd b/man/get_all.Rd new file mode 100644 index 0000000..132a150 --- /dev/null +++ b/man/get_all.Rd @@ -0,0 +1,141 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get-all-survey-samples.R, +% R/get-all-survey-sets.R +\name{get_all_survey_samples} +\alias{get_all_survey_samples} +\alias{get_all_survey_sets} +\title{Get all data} +\usage{ +get_all_survey_samples( + species, + ssid = NULL, + major = NULL, + usability = NULL, + unsorted_only = FALSE, + random_only = FALSE, + grouping_only = 
FALSE, + include_event_info = FALSE, + include_activity_matches = FALSE, + remove_bad_data = TRUE, + remove_duplicates = TRUE, + return_dna_info = FALSE, + drop_na_columns = TRUE, + quiet_option = "message" +) + +get_all_survey_sets( + species, + ssid = NULL, + major = NULL, + years = NULL, + join_sample_ids = FALSE, + remove_false_zeros = TRUE, + remove_bad_data = TRUE, + remove_duplicates = TRUE, + include_activity_matches = FALSE, + usability = NULL, + grouping_only = FALSE, + drop_na_columns = TRUE, + quiet_option = "message" +) +} +\arguments{ +\item{species}{One or more species common names (e.g. "pacific ocean +perch") or one or more species codes (e.g. \code{396}). Species codes can be +specified as numeric vectors \verb{c(396, 442}) or characters \code{c("396", "442")}. +Numeric values shorter than 3 digits will be expanded to 3 digits and +converted to character objects (\code{1} turns into \code{"001"}). Species common +names and species codes should not be mixed. If any element is missing a +species code, then all elements will be assumed to be species common +names. Does not work with non-numeric species codes, so in those cases the +common name will be needed.} + +\item{ssid}{A numeric vector of survey series IDs. Run \code{\link[=get_ssids]{get_ssids()}} for a +look-up table of available survey series IDs with surveys series +descriptions. Default is to return all data from all surveys. Some of the +most useful ids include: contemporary trawl (1, 3, 4, 16), historic trawl +(2), IPHC (14), sablefish (35), and HBLL (22, 36, 39, 40).} + +\item{major}{Character string (or vector) of major stat area code(s) to +include (characters). Use get_major_areas() to lookup area codes with +descriptions. Default is NULL.} + +\item{usability}{A vector of usability codes to include. Defaults to NULL, +but typical set for a design-based trawl survey index is \code{c(0, 1, 2, 6)}. 
+IPHC codes may be different to other surveys and the modern Sablefish survey +doesn't seem to assign usabilities.} + +\item{unsorted_only}{Defaults to FALSE, which will return all specimens +collected on research trips. TRUE returns only unsorted (\code{1}) and \code{NA} +specimens for both \code{species_category_code} and \code{sample_source_code}.} + +\item{random_only}{Defaults to FALSE, which will return all specimens +collected on research trips. TRUE returns only randomly sampled +specimens (\code{sample_type_code} = \verb{1, 2, 6, 7, or 8}).} + +\item{grouping_only}{Defaults to FALSE, which will return all specimens or sets +collected on research trips. TRUE returns only sets or specimens from fishing +events with grouping codes that match those expected for a survey. Can also be +achieved by filtering for specimens where \code{!is.na(grouping_code)}.} + +\item{include_event_info}{Logical for whether to append all relevant fishing +event info (location, timing, effort, catch, etc.). Defaults to FALSE.} + +\item{include_activity_matches}{Get all surveys with activity codes that +match chosen ssids.} + +\item{remove_bad_data}{Remove known bad data, such as unrealistic +length or weight values and duplications due to trips that include multiple +surveys. Default is TRUE.} + +\item{remove_duplicates}{Logical for whether to remove duplicated event +records due to overlapping survey stratifications when original_ind = 'N'. +Default is TRUE. This option only remains possible when ssids are supplied +and activity matches aren't included. Otherwise turns on automatically.} + +\item{return_dna_info}{Should DNA container ids and sample type be returned? +This can create duplication of specimen ids for some species. Defaults to +FALSE.} + +\item{drop_na_columns}{Logical for removing all columns that only contain NAs.
+Defaults to TRUE.} + +\item{quiet_option}{Default option, \code{"message"}, suppresses messages from +sections of code with lots of \code{join_by} messages. Any other string will allow +messages.} + +\item{years}{Default is NULL, which returns all years.} + +\item{join_sample_ids}{This option was problematic, so now reverts to FALSE.} + +\item{remove_false_zeros}{Default of \code{TRUE} will make sure weights > 0 don't have +associated counts of 0 and vice versa. Mostly useful for trawl data where +counts are only taken for small catches.} +} +\description{ +These functions get all survey set or sample data for a set of species by +major area, activity, or specific surveys. The main functions in this package +focus on retrieving the more commonly used types of data and are often limited +to sets and samples that conform to current design-based standards and survey +grids. These functions will retrieve everything and therefore require careful +consideration of what data types are reasonable to include depending on the +purpose. For this reason these functions return a lot of columns, although the +exact number depends on which types of surveys are being returned. +} +\examples{ +\dontrun{ +## Import survey catch density and location data by tow or set for plotting +## Specify single or multiple species by common name or species code and +## single or multiple survey series id(s). +## Notes: +## `area_km` is the stratum area used in design-based index calculation. +## `area_swept` is in m^2 and is used to calculate density for trawl surveys +## It is based on `area_swept1` (`doorspread_m` x `tow_length_m`) except +## when `tow_length_m` is missing, and then we use `area_swept2` +## (`doorspread` x `duration_min` x `speed_mpm`). +## `duration_min` is derived in the SQL procedure "proc_catmat_2011" and +## differs slightly from the difference between `time_deployed` and +## `time_retrieved`.
+} + +} diff --git a/man/get_parent_level_counts.Rd b/man/get_parent_level_counts.Rd new file mode 100644 index 0000000..43e97a9 --- /dev/null +++ b/man/get_parent_level_counts.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/parent-level-counts.R +\name{get_parent_level_counts} +\alias{get_parent_level_counts} +\title{summarize sub/minor level counts from fishing event data at the parent event level +includes correction for a typo in dataframe +retrieves missing fishing_event_ids for sablefish surveys using major_level_ids} +\usage{ +get_parent_level_counts(fe) +} +\arguments{ +\item{fe}{df retrieved with get-event-data.sql} +} +\description{ +summarize sub/minor level counts from fishing event data at the parent event level +includes correction for a typo in dataframe +retrieves missing fishing_event_ids for sablefish surveys using major_level_ids +} diff --git a/man/get_skate_level_counts.Rd b/man/get_skate_level_counts.Rd new file mode 100644 index 0000000..7f0a886 --- /dev/null +++ b/man/get_skate_level_counts.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/skate-level-counts.R +\name{get_skate_level_counts} +\alias{get_skate_level_counts} +\title{summarize fishing event data at the skate level +retrieves missing fishing_event_ids for sablefish surveys using major_level_ids} +\usage{ +get_skate_level_counts(fe) +} +\arguments{ +\item{fe}{df retrieved with get-event-data.sql} +} +\description{ +summarize fishing event data at the skate level +retrieves missing fishing_event_ids for sablefish surveys using major_level_ids +} diff --git a/man/survey_blocks.Rd b/man/survey_blocks.Rd index 004cb05..476ac44 100644 --- a/man/survey_blocks.Rd +++ b/man/survey_blocks.Rd @@ -27,6 +27,7 @@ as documented in \code{data-raw/survey_blocks.R}. 
} \examples{ \dontshow{if (requireNamespace("sf", quietly = TRUE)) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +requireNamespace("ggplot2", quietly = TRUE) library(sf) library(ggplot2) gfdata::survey_blocks |> diff --git a/vignettes/.gitignore b/vignettes/.gitignore index 097b241..2d19fc7 100644 --- a/vignettes/.gitignore +++ b/vignettes/.gitignore @@ -1,2 +1 @@ *.html -*.R diff --git a/vignettes/gfdata-vignette-get-all.Rmd b/vignettes/gfdata-vignette-get-all.Rmd new file mode 100644 index 0000000..ffb046c --- /dev/null +++ b/vignettes/gfdata-vignette-get-all.Rmd @@ -0,0 +1,724 @@ +--- +title: "gfdata `get_all` vignette" +author: "Philina English" +date: "2024-10-11" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{gfdata `get_all` Vignette} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + + + +## Why use a `get_all_*()` function? + +The original `get_*()` survey functions are limited to returning sets and specimen samples that conform to current survey design specifications based on assigned grouping and usability codes. +This works well for some surveys and data uses that depend on sampling a consistent survey footprint across years such as generating design-based abundance indexes. +Other uses can harness information from samples that were collected outside the current survey footprint or from different types of surveys, some of which don't consistently apply these codes. +The `get_all_*()` functions have been designed to retrieve all fishery-independent survey data relevant to a particular species, or set of species, and to do so more quickly and comprehensively than the original functions. +When retrieving data for multiple species at once, these functions will be dramatically faster than the original `get_*()` functions. +This is because sql scripts are called once for all species rather than repeatedly for each species.
+When retrieving data for single species run times will depend on which surveys and arguments are used. +The extent of the data returned can be specific to a single survey, a single major stat area, any combination of these, or generalized to get everything in the database that is appropriately formatted. + +Additional variables are also returned to support modelling objectives and decisions regarding which data should be retained for specific purposes. + +### More flexibility + +The original functions required the user to specify survey series ids (`ssid`) and were limited in which ssids they could accept and return. + +For `get_all_*()` functions, we have the option to set `ssid = NULL` which will return all fishery-independent samples and catch data in the database, as long as the survey series ids (and for catch data, survey ids as well) have been assigned in the database. +These include trap (sablefish), longline, jig, and most contemporary and historic trawl surveys (exceptions include the Nearshore Shrimp trawl survey for which survey ids are missing). +If a character string of major stat area codes is provided to the argument `major`, then all sets or samples from that area will be returned. + + +### Design-based analyses + +Because the original `get_*()` survey functions were intended to only return sets and specimen samples that conform to current survey design specifications, they cannot retrieve sets and samples from cells that fall outside the latest definition of a particular survey's design. +When this behaviour is desired, it can be reproduced using the `get_all_*()` functions with the filtering options `usability = c(0, 1, 2, 6)` and `grouping_only = TRUE`. +This works reliably for most groundfish bottom trawl and longline surveys (`ssid = c(1, 2, 3, 4, 16, 22, 36, 39, 40)`).
+However, when these filtering options are applied to certain surveys, a small proportion of the data returned by the original function may be missed (e.g., IPHC: `ssid = 14`) and no data is returned for surveys for which grouping or usability codes do not appear in the database (e.g., the jig survey: `ssid = c(82:87)`). + +If using the `get_all_*()` functions to generate design-based indices, the strata area variable is now called `grouping_area_km2` (instead of `area_km2` to avoid confusion with `area_swept*` variables) and, in case design changes occur that are not incorporated into the usability codes, one should also always check for differences between the `grouping_code` and `grouping_code_updated` variables. +The `grouping_code_updated` generally contains a subset of the former, likely as a result of a shrinking footprint or the dropping of an entire stratum from a survey's definition. +This is currently the case for the offshore shrimp, also known as multi-species small-mesh (MSSM), surveys (`ssid = c(6, 7)`), which needs to be filtered for only those sets with updated grouping codes (`!is.na(grouping_code_updated)`) in order to match the current survey design. +Consulting data stewards for specific surveys may be helpful in understanding differences between grouping codes. + +To retrieve specimen samples that conform to design specifications, the arguments `unsorted_only = TRUE` and `random_only = TRUE` should be used in addition to usability and grouping options. +When doing so, the `get_all_survey_samples()` function will return > 70 additional specimens over the original function for each of the longline surveys (`ssid = c(22, 36, 39, 40)`). +This is because the original `get_survey_samples()` function used a stricter method for filtering based on grouping codes. +This stricter filtering matches how `get_survey_sets()` filtered for the current trawl survey footprint, but not how it filtered sets for longline surveys.
+If desired, this stricter filtering can be achieved for both sets and samples from any survey by filtering for `!is.na(grouping_code_updated)`. + + +### Non-standard data + + +In contrast, the default behaviour of the `get_all_*()` survey functions is to return all data collected on any given survey, whether or not it conforms to current design. +This includes sets and samples from grid cells that fall within subsequently established Rockfish Conservation areas (RCAs), and data that differ at the skate level. +The original functions were not built to retrieve data that differed at the skate level, like gear comparison studies (e.g., `ssid = 48`). +The `get_all_*()` functions will automatically return catch information at the skate level, instead of the fishing event level, for sets within a single function call whenever gear variables (currently checking for differences in hook code and size) differ between skates. + + + + +## Set up + +If you don't already have the package installed, see the general gfdata vignette for instructions. +Here we will load gfdata along with the package dplyr. + + + ``` r + library(gfdata) + library(dplyr) + library(tibble) + ``` + + + +The available arguments are described in the help documentation: + + + ``` r + ?get_all_survey_sets() + ?get_all_survey_samples() + ``` + + +## Examples + +### What survey data is available for a species? + +As an example, we might want to determine what survey set data are in our database for Bluntnose Sixgill Sharks (*Hexanchus griseus*). +For now, we will leave the default settings that pull all surveys and all areas. +Beware that some records in the database are from outside Canadian waters. +If desired, returned data can be filtered using the `major_stat_area_code` to retain only Canadian records (see `get_major_areas()` to identify which codes to use). + +#### Original `get_survey_sets()` function + +To start with, we check what the original `get_survey_sets()` function returns for this species.
+By default this function returns just the most commonly used groundfish surveys: synoptic trawl (`ssid = c(1, 3, 4, 16)`), one historical trawl (`2`), and five longline--IPHC (`14`) and PHMA (`22, 36, 39, 40`) surveys. +The first thing to note here is that this function will only return one row per fishing event (unless overlapping survey series or sample_ids were requested). +This function will also return all sets for any survey series, even when the species has never been recorded on that survey. + + + ``` r + d0 <- get_survey_sets("Bluntnose Sixgill Shark") + nrow(d0) #> number of rows + ``` + + ``` + #> [1] 15349 + ``` + + ``` r + length(unique(d0$fishing_event_id)) #> number of fishing events + ``` + + ``` + #> [1] 15349 + ``` + + ``` r + sort(unique(d0$survey_series_id)) #> all default survey series were returned + ``` + + ``` + #> [1] 1 2 3 4 14 16 22 36 39 40 + ``` + +In contrast, `get_all_survey_sets()` only returns set data for survey series that have captured the species at least once. +Both `*_survey_sets()` functions return all sets for any survey series returned, including those that did not record the species. +So, to make sets that did capture the species visible to `head()`, we will sort by descending `catch_count`. 
+ + + ``` r + d0 <- d0 |> rename(ssid = survey_series_id) |> + relocate(year, fishing_event_id, catch_count, catch_weight, ssid, survey_abbrev, + survey_series_desc) |> + arrange(-catch_count, -fishing_event_id) + head(d0, n = 8L) + ``` + + ``` + #> # A tibble: 8 x 36 + #> year fishing_event_id catch_count catch_weight ssid survey_abbrev survey_series_desc survey_id + #> + #> 1 2018 5092608 24 0 14 IPHC FISS International Pacif~ 538 + #> 2 2018 5087665 14 0 14 IPHC FISS International Pacif~ 538 + #> 3 2018 5093313 13 0 14 IPHC FISS International Pacif~ 538 + #> 4 2018 5089779 10 0 14 IPHC FISS International Pacif~ 538 + #> 5 2018 5074269 10 0 14 IPHC FISS International Pacif~ 538 + #> 6 2018 5090481 9 0 14 IPHC FISS International Pacif~ 538 + #> 7 2018 5074976 8 0 14 IPHC FISS International Pacif~ 538 + #> 8 2018 5091185 6 0 14 IPHC FISS International Pacif~ 538 + #> # i 28 more variables: species_code , survey_desc , trip_id , + #> # fe_major_level_id , latitude , longitude , grouping_code , + #> # major_stat_area_code , minor_stat_area_code , depth_m , duration_min , + #> # doorspread_m , speed_mpm , tow_length_m , density_kgpm2 , + #> # density_pcpm2 , skate_count , hook_count , density_ppkm2 , month , + #> # day , time_deployed , time_retrieved , latitude_end , + #> # longitude_end , species_common_name , species_science_name , ... + ``` + +Notice that `catch_weight` sometimes contains zeros when `catch_count` is at least 1. +This is because the original SQL code assume NULL values to be zeros. +In many cases catch weights are missing because they are not collected on that type of survey. +However, even for surveys where weights are the usual unit of measurement, a particular catch may have been too large or small for the scale and therefore recorded only as a count. +For `get_all_survey_sets()`, the default setting is `remove_false_zeros = TRUE`, which removes these misleading zeros from the data. 
+ +#### Using `get_all_survey_sets()` + +Messages and warnings will alert the user about nuances in the data requested and returned. +For example, this function call results in multiple rows of data that share the same fishing_event_id and a warning that suggests possible reasons for this. +In this case, the number of rows of data exceeds the number of fishing events because catch is being returned at the skate level for some fishing events. +This will happen any time skates within a fishing event differ in their gear (currently just working off differences in hook type and size). + + + ``` r + d <- get_all_survey_sets("Bluntnose Sixgill Shark") + ``` + + ``` + #> [1] "Returning all sets/events/skates (including those with no catch) from all survey series that recorded Bluntnose Sixgill Shark at least once." + ``` + + ``` + #> Warning in get_all_survey_sets("Bluntnose Sixgill Shark"): Duplicate fishing_event_ids are still + #> present despite `remove_duplicates = TRUE`. This may be because of overlapping survey + #> stratifications or multiple skates per event (specifically when at least one survey included used + #> skates with differences in gear type), but could also be due to trips participating in more than + #> one type of survey. If the latter, location, gear, or `reason_desc` columns should be used to + #> choose which events to keep. After selecting specific survey stratifications and determining that + #> all relevant variables are accurate, the remaining duplications can be filtered using `dat <- + #> dat[!duplicated(dat$fishing_event_id), ]`. + ``` + + ``` + #> Warning in get_all_survey_sets("Bluntnose Sixgill Shark"): All sablefish research related sets are + #> returned as survey_series_id 35. To separate types of sets, use reason_desc and grouping_code + #> variables. 
+ ``` + + ``` r + nrow(d) #> number of rows + ``` + + ``` + #> [1] 18967 + ``` + + ``` r + length(unique(d$fishing_event_id)) #> number of fishing events + ``` + + ``` + #> [1] 18840 + ``` + + ``` r + sort(unique(d$survey_series_id)) #> only returns survey series that caught the species + ``` + + ``` + #> [1] 4 6 7 14 34 35 36 39 40 45 48 76 80 + ``` + +Now, when we view the data `catch_weight` appears appropriately as `NA` when this data was not collected. +We now also get catches at the skate level (multiple skates make up each fishing event) for the dogfish comparison work (`ssid = c(48)`) which was not (and cannot be) returned by `get_survey_sets()` due to the gear differences between skates. + + + ``` r + d <- d |> rename(ssid = survey_series_id) |> + relocate(year, fishing_event_id, catch_count, catch_weight, ssid, survey_abbrev, + activity_desc, skate_id) |> + arrange(-catch_count, -fishing_event_id) + head(d, n = 8L) + ``` + + ``` + #> # A tibble: 8 x 75 + #> year fishing_event_id catch_count catch_weight ssid survey_abbrev activity_desc skate_id + #> + #> 1 2018 5092608 24 NA 14 IPHC FISS INTERNATIONAL PACIFI~ NA + #> 2 2018 5087665 14 NA 14 IPHC FISS INTERNATIONAL PACIFI~ NA + #> 3 2018 5093313 13 NA 14 IPHC FISS INTERNATIONAL PACIFI~ NA + #> 4 2018 5089779 10 NA 14 IPHC FISS INTERNATIONAL PACIFI~ NA + #> 5 2018 5074269 10 NA 14 IPHC FISS INTERNATIONAL PACIFI~ NA + #> 6 2018 5090481 9 NA 14 IPHC FISS INTERNATIONAL PACIFI~ NA + #> 7 2018 5074976 8 NA 14 IPHC FISS INTERNATIONAL PACIFI~ NA + #> 8 2023 5795618 6 NA 48 OTHER DOGFISH GEAR/TIMING ~ 5788696 + #> # i 67 more variables: species_common_name , species_code , fe_major_level_id , + #> # trip_id , survey_series_og , survey_id , activity_code , reason_desc , + #> # trip_year , month , day , time_deployed , time_retrieved , + #> # time_end_deployment , time_begin_retrieval , latitude , longitude , + #> # latitude_end , longitude_end , major_stat_area_code , + #> # minor_stat_area_code , depth_m , 
depth_begin , depth_end , vessel_id , + #> # captain_id , duration_min , tow_length_m , mouth_width_m , ... + ``` +So, which surveys encountered the most of this species? +Some surveys only count individuals and others only weigh the total catch, so we will summarize both count and weight variables. + + + ``` r + d |> group_by(ssid, survey_series_desc) |> + mutate(event_skate_id = paste0(fishing_event_id, "-", skate_id)) |> + summarise(individuals = sum(catch_count, na.rm = TRUE), + weight = sum(catch_weight), + events = length(unique(fishing_event_id)), + skates = length(unique(event_skate_id)), + rows = n()) |> + arrange(-individuals, -weight) + ``` + + ``` + #> # A tibble: 13 x 7 + #> # Groups: ssid [13] + #> ssid survey_series_desc individuals weight events skates rows + #> + #> 1 14 "International Pacific Halibut Commission Fishery-I~ 175 NA 3278 3278 3278 + #> 2 48 "Dogfish Gear/Timing Comparison Surveys" 38 NA 145 271 271 + #> 3 40 "Hard Bottom Longline Inside South " 16 NA 528 528 528 + #> 4 76 "Strait of Georgia Dogfish Longline" 13 NA 351 351 351 + #> 5 45 "Strait of Georgia Synoptic Bottom Trawl" 2 19.4 98 98 98 + #> 6 39 "Hard Bottom Longline Inside North " 1 200 769 769 769 + #> 7 34 "Strait of Georgia Ecosystem Research Initiative Ac~ 1 36.7 167 167 167 + #> 8 6 "Queen Charlotte Sound Multispecies Small-mesh Bott~ 1 7.14 1295 1295 1295 + #> 9 35 "Sablefish Research and Assessment" 1 5.8 3582 3582 3582 + #> 10 4 "West Coast Vancouver Island Synoptic Bottom Trawl" 1 1.52 1737 1737 1737 + #> 11 36 "Hard Bottom Longline Outside South" 1 NA 1536 1536 1536 + #> 12 7 "West Coast Vancouver Island Multispecies Small-mes~ 0 15.7 5084 5084 5084 + #> 13 80 "Eulachon Migration Study Bottom Trawl (South)" 0 4.5 271 271 271 + ``` + +We can also tally the number of unique fishing events versus the number of rows of data returned to see which surveys have been returned at the skate level. +This was the case for SSID `48`. 
+ +The vast majority of records for Bluntnose Sixgill Shark come from the IPHC, followed by the Dogfish and Hard Bottom Longline surveys, both conducted in the Strait of Georgia (aka. Inside South). +Because the IPHC covers a wider area, we can explore the spatial distribution of catches within that survey only, and confirm that they are most frequently caught in the Strait of Georgia, major stat area `"01"`. + + + ``` r + d |> filter(ssid == 14) |> + group_by(major_stat_area_code) |> + summarise(individuals = sum(catch_count, na.rm = TRUE), + weight = sum(catch_weight), + events = length(unique(fishing_event_id))) |> + arrange(-individuals, -weight) + ``` + + ``` + #> # A tibble: 8 x 4 + #> major_stat_area_code individuals weight events + #> + #> 1 01 137 NA 43 + #> 2 06 14 NA 831 + #> 3 03 10 NA 383 + #> 4 04 10 NA 262 + #> 5 07 2 NA 757 + #> 6 05 1 NA 362 + #> 7 09 1 NA 201 + #> 8 08 0 0 439 + ``` + + +### What survey samples are available for a species within a specific area? + +As an example, we might want to determine what survey sample data exists for Pacific Spiny Dogfish in the Strait of Georgia. +The area argument is `major` and accepts character vectors of major stat area codes. +A table of options can be retrieved with `get_major_area()`. + + + ``` r + d2 <- get_all_survey_samples("north pacific spiny dogfish", + major = c("01")) + ``` + + ``` + #> [1] "Returning all north pacific spiny dogfish specimens from major area(s) 01 from any survey series." 
+ ``` + + + ``` r + d2 |> group_by(survey_series_id, survey_series_desc) |> + summarise(specimens = length(unique(specimen_id)), + lengths = sum(!is.na(length)), + weights = sum(!is.na(weight)), + age_structures = sum(age_specimen_collected) + ) |> + arrange(-specimens)|> + rename(ssid = survey_series_id) + ``` + + ``` + #> # A tibble: 15 x 6 + #> # Groups: ssid [15] + #> ssid survey_series_desc specimens lengths weights age_structures + #> + #> 1 76 "Strait of Georgia Dogfish Longline" 54716 54690 6582 19053 + #> 2 40 "Hard Bottom Longline Inside South " 36566 36561 3 0 + #> 3 39 "Hard Bottom Longline Inside North " 35624 35497 408 400 + #> 4 48 "Dogfish Gear/Timing Comparison Surveys" 16695 16681 3322 884 + #> 5 45 "Strait of Georgia Synoptic Bottom Trawl" 1721 1721 1243 505 + #> 6 15 "Strait of Georgia Lingcod Young-of-year Bottom T~ 1371 1181 0 0 + #> 7 34 "Strait of Georgia Ecosystem Research Initiative ~ 925 877 402 0 + #> 8 50 "Yelloweye Rockfish Genetics" 786 786 0 0 + #> 9 87 "Jig Survey - 4B Stat Area 19" 314 300 0 0 + #> 10 86 "Jig Survey - 4B Stat Area 18" 176 164 0 0 + #> 11 51 "Combined Submersible And Longline Fishing Survey" 169 169 168 0 + #> 12 82 "Jig Survey - 4B Stat Area 12" 14 14 0 0 + #> 13 85 "Jig Survey - 4B Stat Area 16" 5 4 0 0 + #> 14 68 "Joint Canada/US Hake Acoustic" 1 1 1 0 + #> 15 84 "Jig Survey - 4B Stat Area 15" 1 0 0 0 + ``` + +This should return all fishery-independent specimen records. +We haven't counted actual ages for this species, because none are available. +For both `get_all_*` functions, the default is to drop all columns with no data, so in this case the column named `age` is missing. +However, this column could have been retained by using the argument `drop_na_columns = FALSE`. + +If you want to focus on specimens that come from design-based survey sets you can add arguments that filter for unsorted random samples that come from events that have grouping codes that match those expected for the current survey design. 
+Alternatively, this can be achieved by filtering for specimens where `!is.na(grouping_code)` or `!is.na(grouping_code_updated)` and checking that the `sample_type_comment` and `sample_source_desc` notes are consistent with the specimens being from random samples. +Here, we will use the built in filter arguments, but also add the additional constraint of filtering based on the updated grouping codes. +Note: Some surveys do not use grouping codes, and therefore won't be returned when the `grouping_only` option is used. +In this case, all of SSID `51` and `82:87` are now missing. + + + ``` r + d3 <- get_all_survey_samples("north pacific spiny dogfish", + major = c("01"), + usability = c(0, 1, 2, 6), + unsorted_only = TRUE, + random_only = TRUE, + grouping_only = TRUE) + ``` + + ``` + #> [1] "Looking for samples that are usable (0, 1, 2, 6) unsorted random with originally specified grouping codes." + #> [1] "Returning all north pacific spiny dogfish specimens from major area(s) 01 from any survey series." 
+ ``` + + + ``` r + d3 |> filter(!is.na(grouping_code_updated)) |> + group_by(survey_series_id, survey_series_desc) |> + summarise(specimens = length(unique(specimen_id)), + lengths = sum(!is.na(length)), + weights = sum(!is.na(weight)), + age_structures = sum(age_specimen_collected)) |> + arrange(-specimens) |> + rename(ssid = survey_series_id) + ``` + + ``` + #> # A tibble: 7 x 6 + #> # Groups: ssid [7] + #> ssid survey_series_desc specimens lengths weights age_structures + #> + #> 1 40 "Hard Bottom Longline Inside South " 35070 35065 3 0 + #> 2 39 "Hard Bottom Longline Inside North " 29149 29027 399 400 + #> 3 76 "Strait of Georgia Dogfish Longline" 18384 18376 0 15521 + #> 4 45 "Strait of Georgia Synoptic Bottom Trawl" 1721 1721 1243 505 + #> 5 15 "Strait of Georgia Lingcod Young-of-year Bottom Tr~ 1071 1071 0 0 + #> 6 34 "Strait of Georgia Ecosystem Research Initiative A~ 215 215 0 0 + #> 7 68 "Joint Canada/US Hake Acoustic" 1 1 1 0 + ``` + +If you want to retrieve additional event or skate-level covariates for use in model-based analyses, than use the argument `include_event_info = TRUE`. +For example, when this is applied to the various longline surveys in the Strait of Georgia, one can test the effects of variables like depth, date, hook type and size on the sex and sizes of fish caught. + + + ``` r + d4 <- get_all_survey_samples("north pacific spiny dogfish", + major = c("01"), + ssid = c(39, 40, 48, 76), + include_event_info = TRUE) + ``` + + ``` + #> [1] "Returning all north pacific spiny dogfish specimens from within major area(s) 01 and belonging to survey series 39, 40, 48, 76." + #> [1] "Specimens found. Fetching additional event info." 
+ ``` + + + ``` r + d4 |> group_by(survey_series_id, activity_desc, hook_desc, hooksize_desc) |> + summarise(specimens = length(unique(specimen_id)), + years = paste(min(year, na.rm = TRUE), "-", max(year, na.rm = TRUE))) |> + arrange(-specimens) |> + rename(ssid = survey_series_id, hooksize = hooksize_desc) |> + print() + ``` + + ``` + #> # A tibble: 8 x 6 + #> # Groups: ssid, activity_desc, hook_desc [7] + #> ssid activity_desc hook_desc hooksize specimens years + #> + #> 1 40 HARD BOTTOM LONGLINE HOOK SURVEY - INSIDE SOUTH CIRCLE HOOK 13/0 36566 2005 - 2022 + #> 2 76 STRAIT OF GEORGIA DOGFISH LONGLINE SURVEY CIRCLE HOOK 14/0 36332 2005 - 2019 + #> 3 39 HARD BOTTOM LONGLINE HOOK SURVEY - INSIDE NORTH CIRCLE HOOK 13/0 35624 2003 - 2023 + #> 4 76 STRAIT OF GEORGIA DOGFISH LONGLINE SURVEY J-HOOK 12/0 18384 1986 - 1989 + #> 5 48 DOGFISH GEAR/TIMING COMPARISON SURVEYS CIRCLE HOOK 14/0 8474 2004 - 2023 + #> 6 48 DOGFISH GEAR/TIMING COMPARISON SURVEYS CIRCLE HOOK 13/0 7254 2019 - 2023 + #> 7 48 DOGFISH GEAR/TIMING COMPARISON SURVEYS J-HOOK 12/0 948 2004 - 2023 + #> 8 48 DOGFISH GEAR/TIMING COMPARISON SURVEYS 19 2004 - 2004 + ``` + +These are the variables returned by default: + + + ``` r + glimpse(d2) + ``` + + ``` + #> Rows: 149,084 + #> Columns: 50 + #> $ species_common_name "north pacific spiny dogfish", "north pacific spiny dogfish",~ + #> $ survey_series_id 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 1~ + #> $ sex 1, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 2, 1, 2, 1, 2, 1, 1, 2~ + #> $ length NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~ + #> $ weight NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~ + #> $ survey_series_og 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 1~ + #> $ activity_desc "STRAIT OF GEORGIA LINGCOD YOUNG-OF-YEAR BOTTOM TRAWL SURVEY"~ + #> $ activity_code 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 3~ + #> $ fishing_event_id 3132875, 3132875, 3132875, 3132875, 3132875, 
3132875, 3132875~ + #> $ trip_year 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2~ + #> $ sample_date 2006-08-04, 2006-08-04, 2006-08-04, 2006-08-04, 2006-08-04, ~ + #> $ species_code "044", "044", "044", "044", "044", "044", "044", "044", "044"~ + #> $ species_science_name "squalus suckleyi", "squalus suckleyi", "squalus suckleyi", "~ + #> $ specimen_id 12291537, 12291539, 12291540, 12291542, 12291543, 12291544, 1~ + #> $ sample_id 399729, 399729, 399729, 399729, 399729, 399729, 399729, 39972~ + #> $ fork_length 72, 72, 73, 74, 75, 75, 76, 80, 85, 88, 89, 70, 77, 77, 78, 7~ + #> $ total_length NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~ + #> $ maturity_code 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~ + #> $ maturity_name NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~ + #> $ maturity_desc NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~ + #> $ maturity_convention_code 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9~ + #> $ maturity_convention_desc "MATURITIES NOT LOOKED AT", "MATURITIES NOT LOOKED AT", "MATU~ + #> $ maturity_convention_maxvalue 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~ + #> $ major_stat_area_code "01", "01", "01", "01", "01", "01", "01", "01", "01", "01", "~ + #> $ major_stat_area_name "4B: STRAIT OF GEORGIA", "4B: STRAIT OF GEORGIA", "4B: STRAIT~ + #> $ minor_stat_area_code "14", "14", "14", "14", "14", "14", "14", "14", "14", "14", "~ + #> $ gear 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~ + #> $ reason_desc NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~ + #> $ survey_id 481, 481, 481, 481, 481, 481, 481, 481, 481, 481, 481, 481, 4~ + #> $ trip_id 73330, 73330, 73330, 73330, 73330, 73330, 73330, 73330, 73330~ + #> $ trip_sub_type_code 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2~ + #> $ fe_parent_event_id NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~ + #> $ 
fe_major_level_id 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 4~ + #> $ fe_sub_level_id NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~ + #> $ sample_type_code 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~ + #> $ sample_type_comment "TOTAL CATCH", "TOTAL CATCH", "TOTAL CATCH", "TOTAL CATCH~ + #> $ species_category_code 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~ + #> $ sample_source_code 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~ + #> $ age_specimen_collected 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~ + #> $ usability_code 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~ + #> $ grouping_code 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 1~ + #> $ grouping_desc "Lingcod YOY depth stratum 1: 16 - 25 m", "Lingcod YOY depth ~ + #> $ grouping_code_updated NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~ + #> $ grouping_desc_updated NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~ + #> $ original_ind "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "~ + #> $ length_type "total_length", "total_length", "total_length", "total_length~ + #> $ sample_source_desc "Unsorted", "Unsorted", "Unsorted", "Unsorted", "Unsorted", "~ + #> $ usability_desc "FULLY USABLE", "FULLY USABLE", "FULLY USABLE", "FULLY USABLE~ + #> $ survey_series_desc "Strait of Georgia Lingcod Young-of-year Bottom Trawl", "Stra~ + #> $ survey_abbrev "OTHER", "OTHER", "OTHER", "OTHER", "OTHER", "OTHER", "OTHER"~ + ``` + +And these additional variables were returned for longline surveys when `include_event_info = TRUE`: + + + ``` r + glimpse(d4[, !names(d4) %in% names(d2)]) + ``` + + ``` + #> Rows: 143,601 + #> Columns: 34 + #> $ catch_weight NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~ + #> $ catch_count 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 5~ + #> $ year 2023, 2023, 2023, 2023, 2023, 
2023, 2023, 2023, 2023, 2023, 2023, 202~ + #> $ month 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, ~ + #> $ day 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ~ + #> $ time_deployed 2023-09-02 14:43:16, 2023-09-02 14:43:16, 2023-09-02 14:43:16, 2023-~ + #> $ time_retrieved 2023-09-02 16:42:55, 2023-09-02 16:42:55, 2023-09-02 16:42:55, 2023-~ + #> $ time_end_deployment 2023-09-02 14:43:16, 2023-09-02 14:43:16, 2023-09-02 14:43:16, 2023-~ + #> $ time_begin_retrieval 2023-09-02 16:42:55, 2023-09-02 16:42:55, 2023-09-02 16:42:55, 2023-~ + #> $ latitude 50.21935, 50.21935, 50.21935, 50.21935, 50.21935, 50.21935, 50.21935,~ + #> $ longitude -125.3644, -125.3644, -125.3644, -125.3644, -125.3644, -125.3644, -12~ + #> $ latitude_end 50.22198, 50.22198, 50.22198, 50.22198, 50.22198, 50.22198, 50.22198,~ + #> $ longitude_end -125.3671, -125.3671, -125.3671, -125.3671, -125.3671, -125.3671, -12~ + #> $ depth_m 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 7~ + #> $ depth_begin 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 6~ + #> $ depth_end 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,~ + #> $ vessel_id 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 201~ + #> $ captain_id 969, 969, 969, 969, 969, 969, 969, 969, 969, 969, 969, 969, 969, 969,~ + #> $ duration_min 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119,~ + #> $ tow_length_m 350, 350, 350, 350, 350, 350, 350, 350, 350, 350, 350, 350, 350, 350,~ + #> $ hook_code 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~ + #> $ lglsp_hook_count 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225,~ + #> $ hook_desc "CIRCLE HOOK", "CIRCLE HOOK", "CIRCLE HOOK", "CIRCLE HOOK", "CIRCLE H~ + #> $ hooksize_desc "13/0", "13/0", "13/0", "13/0", "13/0", "13/0", "13/0", "13/0", "13/0~ + #> $ grouping_depth_id "2", "2", "2", "2", "2", "2", "2", "2", "2", 
"2", "2", "2", "2", "2",~ + #> $ grouping_area_km2 844, 844, 844, 844, 844, 844, 844, 844, 844, 844, 844, 844, 844, 844,~ + #> $ skate_count 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~ + #> $ mean_per_skate 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236,~ + #> $ minor_id_count 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236,~ + #> $ minor_id_max 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236,~ + #> $ diff 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~ + #> $ event_level_count NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~ + #> $ skate_id NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~ + #> $ hook_area_swept_km2 0.005262028, 0.005262028, 0.005262028, 0.005262028, 0.005262028, 0.00~ + ``` + +The collection of variables that are returned can change depending on the records that are retrieved. For example, variables that are specific to longline surveys are omitted when only trawl survey sets are returned. + +### Surveys with overlapping stratifications + +Some fishing events are assigned to multiple surveys, which may or may not be fully or partially overlapping, and are defined by the same activity code in the database. To get all sets with matching activity codes one can use `include_activity_matches = TRUE`. It will return all events that share the same `activity_code` as any SSIDs requested. This works when retrieving either sets or samples. + + + ``` r + d5 <- get_all_survey_sets("north pacific spiny dogfish", + ssid = c(7), + include_activity_matches = TRUE, + remove_duplicates = TRUE #> default + ) + ``` + + ``` + #> [1] "north pacific spiny dogfish have been recorded by survey series 7, 6, 67 at least once. " + #> [1] "Returning all relevant sets/events/skates including those with no catch." 
+ ``` + + ``` + #> Warning in get_all_survey_sets("north pacific spiny dogfish", ssid = c(7), : Duplicate + #> fishing_event_ids are still present despite `remove_duplicates = TRUE`. This may be because of + #> overlapping survey stratifications or multiple skates per event (specifically when at least one + #> survey included used skates with differences in gear type), but could also be due to trips + #> participating in more than one type of survey. If the latter, location, gear, or `reason_desc` + #> columns should be used to choose which events to keep. After selecting specific survey + #> stratifications and determining that all relevant variables are accurate, the remaining + #> duplications can be filtered using `dat <- dat[!duplicated(dat$fishing_event_id), ]`. + ``` + + + ``` r + d5 |> group_by(survey_series_id, survey_abbrev, activity_desc) |> + summarise(events = length(unique(fishing_event_id)), + years = paste(min(year), "-", max(year)), + rows = n()) |> + arrange(-events) |> + rename(ssid = survey_series_id) + ``` + + ``` + #> # A tibble: 2 x 6 + #> # Groups: ssid, survey_abbrev [2] + #> ssid survey_abbrev activity_desc events years rows + #> + #> 1 7 MSSM WCVI MULTISPECIES SMALL-MESH (AKA SHRIMP) BOTTOM TRAWL SURVEY 5084 1975 - ~ 5084 + #> 2 6 MSSM QCS MULTISPECIES SMALL-MESH (AKA SHRIMP) BOTTOM TRAWL SURVEY 1295 1998 - ~ 1295 + ``` + +You will get a warning that some fishing events are duplicated even though `remove_duplicates = TRUE`. We can look at one of the duplicated events and see that it lacks location information, which means that it couldn't be accurately assigned to either region of shrimp survey, so it has been returned as potentially belonging to both. 
+ + + ``` r + dd <- d5[duplicated(d5$fishing_event_id),] + glimpse(dd) + ``` + + ``` + #> Rows: 1 + #> Columns: 54 + #> $ species_common_name "north pacific spiny dogfish" + #> $ catch_count NA + #> $ catch_weight 1.9 + #> $ survey_series_id 7 + #> $ survey_abbrev "MSSM WCVI" + #> $ year 2000 + #> $ fishing_event_id 901698 + #> $ species_code "044" + #> $ fe_major_level_id 9 + #> $ trip_id 60021 + #> $ survey_series_og 7 + #> $ activity_desc "MULTISPECIES SMALL-MESH (AKA SHRIMP) BOTTOM TRAWL SURVEY" + #> $ activity_code 25 + #> $ reason_desc "QUANT. BIOMASS SURVEY" + #> $ trip_year 2000 + #> $ month NA + #> $ day NA + #> $ time_deployed NA + #> $ time_retrieved NA + #> $ time_end_deployment NA + #> $ time_begin_retrieval NA + #> $ latitude NA + #> $ longitude NA + #> $ latitude_end NA + #> $ longitude_end NA + #> $ major_stat_area_code "00" + #> $ minor_stat_area_code "00" + #> $ depth_m NA + #> $ depth_begin NA + #> $ depth_end NA + #> $ vessel_id 2000 + #> $ captain_id NA + #> $ duration_min NA + #> $ tow_length_m NA + #> $ mouth_width_m 10.6 + #> $ doorspread_m 29.6 + #> $ speed_mpm 86.66668 + #> $ usability_code 13 + #> $ grouping_code NA + #> $ grouping_desc NA + #> $ grouping_code_updated NA + #> $ grouping_desc_updated NA + #> $ grouping_area_km2 NA + #> $ original_ind "Y" + #> $ survey_series_desc "West Coast Vancouver Island Multispecies Small-mesh Bottom Trawl" + #> $ species_science_name "squalus suckleyi" + #> $ species_desc "north pacific spiny dogfish" + #> $ usability_desc "UNUSABLE FOR CPUE ESTIMATION" + #> $ area_swept1 NA + #> $ area_swept2 NA + #> $ area_swept NA + #> $ area_swept_km2 NA + #> $ density_kgpm2 NA + #> $ density_pcpm2 NA + ``` + +NOTE: All activity matches are always returned by default whenever any one of the sablefish surveys (ssid = 35, 41, 42, 43) is requested. 
+This is because SSIDs for that survey were inconsistently assigned and frequently share trip ids, which results in duplication and/or assignment to the wrong survey series. +In order to accurately separate the types of sablefish surveys one needs to split data from this survey by the `reason_desc` variable. + + +## Troubleshooting + +### Beware of duplication of fishing events and specimens + +A risk in using `get_all_*()` functions is that, in the attempt to return a comprehensive data set, some fishing events and specimen ids may be duplicated (this also occurs with the original functions but usually for different reasons). +Some vessel trips conduct sampling for multiple survey series, and unless joining is based on grouping codes (which aren't used consistently for some surveys) the only way to connect a fishing event to a survey series id is through the vessel trip. +This can result in events and specimens getting assigned to both surveys that were conducted on the same trip. +The most common instances of this (e.g., sablefish, jig, and offshore shrimp surveys) have custom corrections coded into an internal function `correct_ssids()` that is applied within the `get_all_*()` functions. +Duplication can also occur due to missing covariates (e.g., both event level and survey defaults are missing for `doorspread_m` on a couple sets for some trawl survey series), or for specimens when multiple vials of DNA were collected and `return_dna_info = TRUE`. +It is recommended to always check for unexpected duplication of observations (usually `fishing_event_id` or `specimen_id`) before beginning any analysis. +To return both copies of each duplicated record, the following can be used: `d[duplicated(d$specimen_id) | duplicated(d$specimen_id, fromLast=TRUE), ]`. + + +### Error messages + +Any error message that mentions SQL Server suggests either that the network connection or server timed out, or that the SQL query was flawed.
One way this can happen is providing an invalid ssid (e.g., a character "4" instead of a numeric 4), invalid major area code (e.g., a numeric 1 instead of a character "01"), or a search that is insufficiently limited in its scope. + +>Error: nanodbc/nanodbc.cpp:2823: 08S01 +>[Microsoft][ODBC SQL Server Driver][DBNETLIB]ConnectionRead (recv()). +>[Microsoft][ODBC SQL Server Driver][DBNETLIB]General network error. Check your network documentation. + +If the SQL search is successfully returned to R, but your computer has insufficient memory to handle the amount of data returned, you may see an error like this: + +>Error: cannot allocate vector of size XXX Mb + + diff --git a/vignettes/gfdata-vignette-get-all.Rmd.orig b/vignettes/gfdata-vignette-get-all.Rmd.orig new file mode 100644 index 0000000..ccb0469 --- /dev/null +++ b/vignettes/gfdata-vignette-get-all.Rmd.orig @@ -0,0 +1,345 @@ +--- +title: "gfdata `get_all` vignette" +author: "Philina English" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{gfdata `get_all' Vignette} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, echo = FALSE} +knitr::opts_chunk$set( + collapse = FALSE, + comment = "#>", + cache = TRUE, + size = "small", + cols.print = 6, + eval = TRUE, + autodep = TRUE, + indent = " ", + fig.path = "vignettes/get-all-figs/", + cache.path = "vignettes/get-all-cache/" +) +options(width = 100, + cols.print = 6, + rows.print = 7) +``` + +## Why use a `get_all_*()` function? + +The original `get_*()` survey functions are limited to returning sets and specimen samples that conform to current survey design specifications based on assigned grouping and usability codes. +This works well for some surveys and data uses that depend on sampling a consistent survey footprint across years such as generating design-based abundance indexes. 
+Other uses can harness information from samples that were collected outside the current survey footprint or from different types of surveys, some of which don't consistently apply these codes. +The `get_all_*()` functions have been designed to retrieve all fishery-independent survey data relevant to a particular species, or set of species, and to do so more quickly and comprehensively than the original functions. +When retrieving data for multiple species at once, these functions will be dramatically faster than the original `get_*()` functions. +This is because SQL scripts are called once for all species rather than repeatedly for each species. +When retrieving data for a single species, run times will depend on which surveys and arguments are used. +The extent of the data returned can be specific to a single survey, a single major stat area, any combination of these, or generalized to get everything in the database that is appropriately formatted. + +Additional variables are also returned to support modelling objectives and decisions regarding which data should be retained for specific purposes. + +### More flexibility + +The original functions required the user to specify survey series ids (`ssid`) and were limited in which they could accept and return. + +For `get_all_*()` functions, we have the option to set `ssid = NULL` which will return all fishery-independent samples and catch data in the database, as long as the survey series ids (and for catch data, survey ids as well) have been assigned in the database. +These include trap (sablefish), longline, jig, and most contemporary and historic trawl surveys (exceptions include the Nearshore Shrimp trawl survey for which survey ids are missing). +If a character string of major stat area codes is provided to the argument `major`, then all sets or samples from that area will be returned.
+ + +### Design-based analyses + +Because the original `get_*()` survey functions were intended to only return sets and specimen samples that conform to current survey design specifications, they cannot retrieve sets and samples from cells that fall outside the latest definition of a particular survey's design. +When this behaviour is desired, it can be reproduced using the `get_all_*()` functions with the filtering options `usability = c(0, 1, 2, 6)` and `grouping_only = TRUE`. +This works reliably for most groundfish bottom trawl and longline surveys (`ssid = c(1, 2, 3, 4, 16, 22, 36, 39, 40)`). +However, when these filtering options are applied to certain surveys, a small proportion of the data returned by the original function may be missed (e.g., IPHC: `ssid = 14`) and no data is returned for surveys for which grouping or usability codes do not appear in the database (e.g., the jig survey: `ssid = c(82:87)`). + +If using the `get_all_*()` functions to generate design-based indices, the strata area variable is now called `grouping_area_km2` (instead of `area_km2` to avoid confusion with `area_swept*` variables) and, in case design changes occur that are not incorporated into the usability codes, one should also always check for differences between the `grouping_code` and `grouping_code_updated` variables. +The `grouping_code_updated` generally contains a subset of the former, likely as a result of a shrinking footprint or the dropping of an entire stratum from a survey's definition. +This is currently the case for the offshore shrimp, also known as multi-species small-mesh (MSSM), surveys (`ssid = c(6, 7)`), which need to be filtered for only those sets with updated grouping codes (`!is.na(grouping_code_updated)`) in order to match the current survey design. +Consulting data stewards for specific surveys may be helpful in understanding differences between grouping codes.
+ +To retrieve specimen samples that conform to design specifications, the arguments `unsorted_only = TRUE` and `random_only = TRUE` should be used in addition to usability and grouping options. +When doing so, the `get_all_survey_samples()` function will return > 70 additional specimens over the original function for each of the longline surveys (`ssid = c(22, 36, 39, 40)`). +This is because the original `get_survey_samples()` function used a stricter method for filtering based on grouping codes. +This stricter filtering matches how `get_survey_sets()` filtered for the current trawl survey footprint, but not how it filtered sets for longline surveys. +If desired, this stricter filtering can be achieved for both sets and samples from any survey by filtering for `!is.na(grouping_code_updated)`. + + +### Non-standard data + + +In contrast, the default behaviour of the `get_all_*()` survey functions is to return all data collected on any given survey, whether or not it conforms to current design. +This includes sets and samples from grid cells that fall within subsequently established Rockfish Conservation areas (RCAs), and data that differ at the skate level. +The original functions were not built to retrieve data that differed at the skate level, like gear comparison studies (e.g., `ssid = 48`). +The `get_all_*()` functions will automatically return catch information at the skate level, instead of the fishing event level, for sets within a single function call whenever gear variables (currently checking for differences in hook code and size) differ between skates. + + +```{r} +``` + +## Set up + +If you don't already have the package installed, see the general gfdata vignette for instructions. +Here we will load gfdata along with the package dplyr.
+ +```{r, cache=FALSE, warning = FALSE, message = FALSE} +library(gfdata) +library(dplyr) +library(tibble) +``` + +```{r, echo=FALSE, cache=FALSE} +.error <- tryCatch(get_ssids(), error = function(e) "Error") +.eval <- !identical(class(.error), "character") +``` + +The available arguments are described in the help documentation: + +```{r, eval = FALSE} +?get_all_survey_sets() +?get_all_survey_samples() +``` + + +## Examples + +### What survey data is available for a species? + +As an example, we might want to determine what survey set data are in our database for Bluntnose Sixgill Sharks (*Hexanchus griseus*). +For now, we will leave the default settings that pull all surveys and all areas. +Beware that some records in the database are from outside Canadian waters. +If desired, returned data can be filtered using the `major_stat_area_code` to retain only Canadian records (see `get_major_areas()` to identify which codes to use). + +#### Original `get_survey_sets()` function + +To start with, we check what the original `get_survey_sets()` function returns for this species. +By default this function returns just the most commonly used groundfish surveys: synoptic trawl (`ssid = c(1, 3, 4, 16)`), one historical trawl (`2`), and five longline--IPHC (`14`) and PHMA (`22, 36, 39, 40`) surveys. +The first thing to note here is that this function will only return one row per fishing event (unless overlapping survey series or sample_ids were requested). +This function will also return all sets for any survey series, even when the species has never been recorded on that survey. + +```{r, eval=.eval, warning=TRUE} +d0 <- get_survey_sets("Bluntnose Sixgill Shark") +nrow(d0) #> number of rows +length(unique(d0$fishing_event_id)) #> number of fishing events +sort(unique(d0$survey_series_id)) #> all default survey series were returned +``` + +In contrast, `get_all_survey_sets()` only returns set data for survey series that have captured the species at least once. 
+Both `*_survey_sets()` functions return all sets for any survey series returned, including those that did not record the species. +So, to make sets that did capture the species visible to `head()`, we will sort by descending `catch_count`. + +```{r, eval=.eval, warning=TRUE} +d0 <- d0 |> rename(ssid = survey_series_id) |> + relocate(year, fishing_event_id, catch_count, catch_weight, ssid, survey_abbrev, + survey_series_desc) |> + arrange(-catch_count, -fishing_event_id) +head(d0, n = 8L) +``` + +Notice that `catch_weight` sometimes contains zeros when `catch_count` is at least 1. +This is because the original SQL code assumes NULL values to be zeros. +In many cases catch weights are missing because they are not collected on that type of survey. +However, even for surveys where weights are the usual unit of measurement, a particular catch may have been too large or small for the scale and therefore recorded only as a count. +For `get_all_survey_sets()`, the default setting is `remove_false_zeros = TRUE`, which removes these misleading zeros from the data. + +#### Using `get_all_survey_sets()` + +Messages and warnings will alert the user about nuances in the data requested and returned. +For example, this function call results in multiple rows of data that share the same fishing_event_id and a warning that suggests possible reasons for this. +In this case, the number of rows of data exceeds the number of fishing events because catch is being returned at the skate level for some fishing events. +This will happen any time skates within a fishing event differ in their gear (currently just working off differences in hook type and size).
+ +```{r, eval=.eval, warning=TRUE} +d <- get_all_survey_sets("Bluntnose Sixgill Shark") +nrow(d) #> number of rows +length(unique(d$fishing_event_id)) #> number of fishing events +sort(unique(d$survey_series_id)) #> only returns survey series that caught the species +``` + +Now, when we view the data `catch_weight` appears appropriately as `NA` when this data was not collected. +We now also get catches at the skate level (multiple skates make up each fishing event) for the dogfish comparison work (`ssid = c(48)`) which was not (and cannot be) returned by `get_survey_sets()` due to the gear differences between skates. + +```{r, eval=.eval, warning=TRUE, cache=FALSE} +d <- d |> rename(ssid = survey_series_id) |> + relocate(year, fishing_event_id, catch_count, catch_weight, ssid, survey_abbrev, + activity_desc, skate_id) |> + arrange(-catch_count, -fishing_event_id) +head(d, n = 8L) +``` +So, which surveys encountered the most of this species? +Some surveys only count individuals and others only weigh the total catch, so we will summarize both count and weight variables. + +```{r, eval=.eval, message=FALSE, cache=FALSE} +d |> group_by(ssid, survey_series_desc) |> + mutate(event_skate_id = paste0(fishing_event_id, "-", skate_id)) |> + summarise(individuals = sum(catch_count, na.rm = TRUE), + weight = sum(catch_weight), + events = length(unique(fishing_event_id)), + skates = length(unique(event_skate_id)), + rows = n()) |> + arrange(-individuals, -weight) +``` + +We can also tally the number of unique fishing events versus the number of rows of data returned to see which surveys have been returned at the skate level. +This was the case for SSID `48`. + +The vast majority of records for Bluntnose Sixgill Shark come from the IPHC, followed by the Dogfish and Hard Bottom Longline surveys, both conducted in the Strait of Georgia (aka. Inside South). 
+Because the IPHC covers a wider area, we can explore the spatial distribution of catches within that survey only, and confirm that they are most frequently caught in the Strait of Georgia, major stat area `"01"`. + +```{r, eval=.eval, message=FALSE, cache=FALSE} +d |> filter(ssid == 14) |> + group_by(major_stat_area_code) |> + summarise(individuals = sum(catch_count, na.rm = TRUE), + weight = sum(catch_weight), + events = length(unique(fishing_event_id))) |> + arrange(-individuals, -weight) +``` + + +### What survey samples are available for a species within a specific area? + +As an example, we might want to determine what survey sample data exists for Pacific Spiny Dogfish in the Strait of Georgia. +The area argument is `major` and accepts character vectors of major stat area codes. +A table of options can be retrieved with `get_major_area()`. + +```{r, eval=.eval} +d2 <- get_all_survey_samples("north pacific spiny dogfish", + major = c("01")) +``` + +```{r, eval=.eval, message=FALSE, cache=FALSE} +d2 |> group_by(survey_series_id, survey_series_desc) |> + summarise(specimens = length(unique(specimen_id)), + lengths = sum(!is.na(length)), + weights = sum(!is.na(weight)), + age_structures = sum(age_specimen_collected) + ) |> + arrange(-specimens)|> + rename(ssid = survey_series_id) +``` + +This should return all fishery-independent specimen records. +We haven't counted actual ages for this species, because none are available. +For both `get_all_*` functions, the default is to drop all columns with no data, so in this case the column named `age` is missing. +However, this column could have been retained by using the argument `drop_na_columns = FALSE`. + +If you want to focus on specimens that come from design-based survey sets you can add arguments that filter for unsorted random samples that come from events that have grouping codes that match those expected for the current survey design. 
+Alternatively, this can be achieved by filtering for specimens where `!is.na(grouping_code)` or `!is.na(grouping_code_updated)` and checking that the `sample_type_comment` and `sample_source_desc` notes are consistent with the specimens being from random samples. +Here, we will use the built-in filter arguments, but also add the additional constraint of filtering based on the updated grouping codes. +Note: Some surveys do not use grouping codes, and therefore won't be returned when the `grouping_only` option is used. +In this case, all of SSID `51` and `82:87` are now missing. + +```{r, eval=.eval, message=FALSE} +d3 <- get_all_survey_samples("north pacific spiny dogfish", + major = c("01"), + usability = c(0, 1, 2, 6), + unsorted_only = TRUE, + random_only = TRUE, + grouping_only = TRUE) +``` + +```{r, eval=.eval, message=FALSE, cache=FALSE} +d3 |> filter(!is.na(grouping_code_updated)) |> + group_by(survey_series_id, survey_series_desc) |> + summarise(specimens = length(unique(specimen_id)), + lengths = sum(!is.na(length)), + weights = sum(!is.na(weight)), + age_structures = sum(age_specimen_collected)) |> + arrange(-specimens) |> + rename(ssid = survey_series_id) +``` + +If you want to retrieve additional event or skate-level covariates for use in model-based analyses, then use the argument `include_event_info = TRUE`. +For example, when this is applied to the various longline surveys in the Strait of Georgia, one can test the effects of variables like depth, date, hook type and size on the sex and sizes of fish caught.
+ +```{r, eval=.eval, message=FALSE} +d4 <- get_all_survey_samples("north pacific spiny dogfish", + major = c("01"), + ssid = c(39, 40, 48, 76), + include_event_info = TRUE) +``` + +```{r, eval=.eval, message=FALSE, cache=FALSE} +d4 |> group_by(survey_series_id, activity_desc, hook_desc, hooksize_desc) |> + summarise(specimens = length(unique(specimen_id)), + years = paste(min(year, na.rm = TRUE), "-", max(year, na.rm = TRUE))) |> + arrange(-specimens) |> + rename(ssid = survey_series_id, hooksize = hooksize_desc) |> + print() +``` + +These are the variables returned by default: + +```{r, eval=.eval, message=FALSE, cache=FALSE} +glimpse(d2) +``` + +And these additional variables were returned for longline surveys when `include_event_info = TRUE`: + +```{r, eval=.eval, message=FALSE, cache=FALSE} +glimpse(d4[, !names(d4) %in% names(d2)]) +``` + +The collection of variables that are returned can change depending on the records that are retrieved. For example, variables that are specific to longline surveys are omitted when only trawl survey sets are returned. + +### Surveys with overlapping stratifications + +Some fishing events are assigned to multiple surveys, which may or may not be fully or partially overlapping, and are defined by the same activity code in the database. To get all sets with matching activity codes one can use `include_activity_matches = TRUE`. It will return all events that share the same `activity_code` as any SSIDs requested. This works when retrieving either sets or samples. 
+ +```{r, eval=.eval, message=FALSE} +d5 <- get_all_survey_sets("north pacific spiny dogfish", + ssid = c(7), + include_activity_matches = TRUE, + remove_duplicates = TRUE #> default + ) +``` + +```{r, eval=.eval, message=FALSE, cache=FALSE} +d5 |> group_by(survey_series_id, survey_abbrev, activity_desc) |> + summarise(events = length(unique(fishing_event_id)), + years = paste(min(year), "-", max(year)), + rows = n()) |> + arrange(-events) |> + rename(ssid = survey_series_id) +``` + +You will get a warning that some fishing events are duplicated even though `remove_duplicates = TRUE`. We can look at one of the duplicated events and see that it lacks location information, which means that it couldn't be accurately assigned to either region of the shrimp survey, so it has been returned as potentially belonging to both. + +```{r, eval=.eval, message=FALSE, cache=FALSE} +dd <- d5[duplicated(d5$fishing_event_id),] +glimpse(dd) +``` + +NOTE: All activity matches are always returned by default whenever any one of the sablefish surveys (ssid = 35, 41, 42, 43) is requested. +This is because SSIDs for that survey were inconsistently assigned and frequently share trip ids, which results in duplication and/or assignment to the wrong survey series. +In order to accurately separate the types of sablefish surveys one needs to split data from this survey by the `reason_desc` variable. + + +## Troubleshooting + +### Beware of duplication of fishing events and specimens + +A risk in using `get_all_*()` functions is that, in the attempt to return a comprehensive data set, some fishing events and specimen ids may be duplicated (this also occurs with the original functions but usually for different reasons). +Some vessel trips conduct sampling for multiple survey series, and unless joining is based on grouping codes (which aren't used consistently for some surveys) the only way to connect a fishing event to a survey series id is through the vessel trip.
+This can result in events and specimens getting assigned to both surveys that were conducted on the same trip. +The most common instances of this (e.g., sablefish, jig, and offshore shrimp surveys) have custom corrections coded into an internal function `correct_ssids()` that is applied within the `get_all_*()` functions. +Duplication can also occur due to missing covariates (e.g., both event level and survey defaults are missing for `doorspread_m` on a couple sets for some trawl survey series), or for specimens when multiple vials of DNA were collected and `return_dna_info = TRUE`. +It is recommended to always check for unexpected duplication of observations (usually `fishing_event_id` or `specimen_id`) before beginning any analysis. +To return both copies of each duplicated record, the following can be used: `d[duplicated(d$specimen_id) | duplicated(d$specimen_id, fromLast=TRUE), ]`. + + +### Error messages + +Any error message that mentions SQL Server suggests either that the network connection or server timed out, or that the SQL query was flawed. One way this can happen is providing an invalid ssid (e.g., a character "4" instead of a numeric 4), invalid major area code (e.g., a numeric 1 instead of a character "01"), or a search that is insufficiently limited in its scope. + +>Error: nanodbc/nanodbc.cpp:2823: 08S01 +>[Microsoft][ODBC SQL Server Driver][DBNETLIB]ConnectionRead (recv()). +>[Microsoft][ODBC SQL Server Driver][DBNETLIB]General network error. Check your network documentation.
+ +If the SQL search is successfully returned to R, but your computer has insufficient memory to handle the amount of data returned, you may see an error like this: + +>Error: cannot allocate vector of size XXX Mb + diff --git a/vignettes/gfdata-vignette.Rmd b/vignettes/gfdata-vignette.Rmd index aca7c39..d3cd8f9 100644 --- a/vignettes/gfdata-vignette.Rmd +++ b/vignettes/gfdata-vignette.Rmd @@ -1,7 +1,7 @@ --- title: "gfdata Vignette" author: "Elise Keppel" -date: "`r Sys.Date()`" +date: "2024-10-15" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{gfdata Vignette} @@ -9,34 +9,24 @@ vignette: > %\VignetteEncoding{UTF-8} --- -```{r, echo = FALSE} -knitr::opts_chunk$set( - collapse = FALSE, - comment = "#>", - cache = TRUE, - autodep = TRUE, - fig.path = "knitr-figs/", - cache.path = "knitr-cache/" -) -``` -```{r, echo=FALSE} -.error <- tryCatch(get_ssids(), error = function(e) "Error") -.eval <- !identical(class(.error), "character") -``` + + # Setup If you don't already have the package installed, then run: -```{r, eval=FALSE} + +``` r # install.packages("devtools") devtools::install_github("pbs-assess/gfdata") ``` First we will load the package along with dplyr since we will use it within our code later. -```{r, cache=FALSE, warning = FALSE, message = FALSE} + +``` r library(gfdata) library(dplyr) ``` @@ -71,11 +61,33 @@ Advis. Sec. REs. Doc. 2019/nnn. 
iv + 327 p.\ \ The complete list of `get_*()` functions in gfdata is: -```{r list-get} + +``` r fns <- ls("package:gfdata") sort(fns[grepl("get", fns)]) ``` +``` +#> [1] "get_active_survey_blocks" "get_age_methods" "get_age_precision" +#> [4] "get_all_stomachs" "get_all_survey_samples" "get_all_survey_sets" +#> [7] "get_catch" "get_catch_spatial" "get_comm_gear_types" +#> [10] "get_commercial_hooks_per_fe" "get_commercial_samples" "get_cpue_historical" +#> [13] "get_cpue_historical_hake" "get_cpue_historical_hl" "get_cpue_index" +#> [16] "get_cpue_index_hl" "get_cpue_spatial" "get_cpue_spatial_ll" +#> [19] "get_eulachon_specimens" "get_fishery_ids" "get_fishery_sectors" +#> [22] "get_gear_types" "get_hake_catch" "get_hake_survey_samples" +#> [25] "get_ll_hook_data" "get_major_areas" "get_management" +#> [28] "get_management_areas" "get_other_surveys" "get_parent_level_counts" +#> [31] "get_sable_landings" "get_sablefish_surveys" "get_sample_trips" +#> [34] "get_sensor_attributes" "get_sensor_data_ll_ctd" "get_sensor_data_ll_ctd_fe" +#> [37] "get_sensor_data_ll_td" "get_sensor_data_ll_td_fe" "get_sensor_data_trawl" +#> [40] "get_sensor_data_trawl_fe" "get_skate_level_counts" "get_species" +#> [43] "get_species_groups" "get_spp_sample_length_type" "get_ssids" +#> [46] "get_strata_areas" "get_survey_blocks" "get_survey_gear_types" +#> [49] "get_survey_ids" "get_survey_index" "get_survey_samples" +#> [52] "get_survey_sets" "get_survey_stomachs" "get_table" +``` + The `get_*()` functions extract data by species, and some functions have arguments for additional filtering, such as survey series, @@ -85,7 +97,8 @@ the `get_*()` functions can extract data for one or multiple species. 
All functions can be viewed with the available arguments in the help documentation for each set of functions with: -```{r, eval = FALSE} + +``` r ?get_data ?get_environmental_data ?get_lookup_tables @@ -102,11 +115,45 @@ As an example, we could extract Pacific cod survey sample data with the following function call if we were on a DFO laptop, with appropriate database permissions, and on the PBS network. -```{r, eval=.eval} + +``` r dat <- get_survey_samples("pacific cod") +``` + +``` +#> All or majority of length measurements are Fork_Length +``` + +``` +#> Warning in get_survey_samples("pacific cod"): Duplicate specimen IDs are present because of +#> overlapping survey stratifications. If working with the data yourelf, filter them after selecting +#> specific surveys. For example, `dat <- dat[!duplicated(dat$specimen_id), ]`. The tidying and +#> plotting functions within gfplot will do this for you. +``` + +``` r head(dat) ``` +``` +#> # A tibble: 6 x 40 +#> trip_start_date fishing_event_id year month gear survey_series_id survey_abbrev +#> +#> 1 2014-10-01 00:00:00 3420684 2014 10 5 76 DOG +#> 2 2019-10-01 00:00:00 5167333 2019 10 5 76 DOG +#> 3 2003-08-13 00:00:00 309491 2003 8 5 39 HBLL INS N +#> 4 2003-08-13 00:00:00 309493 2003 8 5 39 HBLL INS N +#> 5 2003-08-13 00:00:00 309503 2003 8 5 39 HBLL INS N +#> 6 2003-08-13 00:00:00 309506 2003 8 5 39 HBLL INS N +#> # i 33 more variables: survey_series_desc , survey_id , major_stat_area_code , +#> # major_stat_area_name , minor_stat_area_code , species_code , +#> # species_common_name , species_science_name , specimen_id , sample_id , +#> # sex , age_specimen_collected , age , sampling_desc , +#> # ageing_method_code , length , weight , maturity_code , maturity_name , +#> # maturity_desc , maturity_convention_code , maturity_convention_desc , +#> # maturity_convention_maxvalue , trip_sub_type_code , sample_type_code , ... 
+``` + Note that there are some duplicate records in the databases due to relating a record to multiple stratification schemes for alternative analyses. If this occurs, a warning is given. @@ -116,7 +163,8 @@ occurs, a warning is given. Either species name or species code can be given as an argument, and species name, if used, is not case-sensitive. The following all do the same thing: -```{r, eval=FALSE} + +``` r get_survey_samples("pacific cod") get_survey_samples("Pacific cod") get_survey_samples("PaCiFiC cOD") @@ -126,7 +174,8 @@ get_survey_samples(222) To extract multiple species at once, give a list as the species argument: -```{r, eval=FALSE} + +``` r get_survey_samples(c("pacific ocean perch", "pacific cod")) get_survey_samples(c(396, 222)) get_survey_samples(c(222, "pacific cod")) @@ -136,27 +185,111 @@ We can further restrict the data extraction to a single trawl survey series by including the ssid (survey series id) argument. For a list of survey series id codes, run the lookup function `get_ssids()`. -```{r, eval=.eval} + +``` r ssids <- get_ssids() head(ssids) ``` +``` +#> # A tibble: 6 x 3 +#> SURVEY_SERIES_ID SURVEY_SERIES_DESC SURVEY_ABBREV +#> +#> 1 0 Individual survey without a series OTHER +#> 2 1 Queen Charlotte Sound Synoptic Bottom Trawl SYN QCS +#> 3 2 Hecate Strait Multispecies Assemblage Bottom Trawl HS MSA +#> 4 3 Hecate Strait Synoptic Bottom Trawl SYN HS +#> 5 4 West Coast Vancouver Island Synoptic Bottom Trawl SYN WCVI +#> 6 5 Hecate Strait Pacific Cod Monitoring Bottom Trawl OTHER +``` + Select desired ssid and include as argument (i.e. 
the Queen Charlotte Sound bottom trawl survey): -```{r, eval = FALSE} + +``` r dat <- get_survey_samples(222, ssid = 1) +``` + +``` +#> All or majority of length measurements are Fork_Length +``` + +``` r head(dat) ``` -```{r, echo = FALSE, eval=.eval} -dat <- dat %>% filter(survey_series_id == 1) ``` +#> # A tibble: 6 x 40 +#> trip_start_date fishing_event_id year month gear survey_series_id survey_abbrev +#> +#> 1 2003-07-03 00:00:00 308673 2003 7 1 1 SYN QCS +#> 2 2003-07-03 00:00:00 308673 2003 7 1 1 SYN QCS +#> 3 2003-07-03 00:00:00 308673 2003 7 1 1 SYN QCS +#> 4 2003-07-03 00:00:00 308673 2003 7 1 1 SYN QCS +#> 5 2003-07-03 00:00:00 308673 2003 7 1 1 SYN QCS +#> 6 2003-07-03 00:00:00 308673 2003 7 1 1 SYN QCS +#> # i 33 more variables: survey_series_desc , survey_id , major_stat_area_code , +#> # major_stat_area_name , minor_stat_area_code , species_code , +#> # species_common_name , species_science_name , specimen_id , sample_id , +#> # sex , age_specimen_collected , age , sampling_desc , +#> # ageing_method_code , length , weight , maturity_code , maturity_name , +#> # maturity_desc , maturity_convention_code , maturity_convention_desc , +#> # maturity_convention_maxvalue , trip_sub_type_code , sample_type_code , ... 
+``` + -```{r, eval=.eval} + + +``` r glimpse(dat) ``` +``` +#> Rows: 11,509 +#> Columns: 40 +#> $ trip_start_date 2003-07-03, 2003-07-03, 2003-07-03, 2003-07-03, 2003-07-03, ~ +#> $ fishing_event_id 308673, 308673, 308673, 308673, 308673, 308673, 308673, 30867~ +#> $ year 2003, 2003, 2003, 2003, 2003, 2003, 2003, 2003, 2003, 2003, 2~ +#> $ month 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7~ +#> $ gear 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~ +#> $ survey_series_id 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~ +#> $ survey_abbrev "SYN QCS", "SYN QCS", "SYN QCS", "SYN QCS", "SYN QCS", "SYN Q~ +#> $ survey_series_desc "Queen Charlotte Sound Synoptic Bottom Trawl", "Queen Charlot~ +#> $ survey_id 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~ +#> $ major_stat_area_code "05", "05", "05", "05", "05", "05", "05", "05", "05", "05", "~ +#> $ major_stat_area_name "5A: SOUTHERN Q.C. SOUND", "5A: SOUTHERN Q.C. SOUND", "5A: SO~ +#> $ minor_stat_area_code "11", "11", "11", "11", "11", "11", "11", "11", "11", "11", "~ +#> $ species_code "222", "222", "222", "222", "222", "222", "222", "222", "222"~ +#> $ species_common_name "pacific cod", "pacific cod", "pacific cod", "pacific cod", "~ +#> $ species_science_name "gadus macrocephalus", "gadus macrocephalus", "gadus macrocep~ +#> $ specimen_id 7607986, 7607987, 7607988, 7607989, 7607990, 7607991, 7607992~ +#> $ sample_id 233551, 233551, 233551, 233551, 233551, 233551, 233551, 23355~ +#> $ sex 2, 2, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 1, 2, 1, 2, 2, 2, 2, 2~ +#> $ age_specimen_collected 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~ +#> $ age NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~ +#> $ sampling_desc "UNSORTED", "UNSORTED", "UNSORTED", "UNSORTED", "UNSORTED", "~ +#> $ ageing_method_code NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~ +#> $ length 21, 21, 23, 23, 23, 23, 23, 24, 24, 25, 26, 26, 27, 28, 28, 2~ 
+#> $ weight NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~ +#> $ maturity_code 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~ +#> $ maturity_name NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~ +#> $ maturity_desc NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~ +#> $ maturity_convention_code 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9~ +#> $ maturity_convention_desc "MATURITIES NOT LOOKED AT", "MATURITIES NOT LOOKED AT", "MATU~ +#> $ maturity_convention_maxvalue 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~ +#> $ trip_sub_type_code 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3~ +#> $ sample_type_code 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~ +#> $ species_category_code 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~ +#> $ sample_source_code 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~ +#> $ dna_sample_type NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~ +#> $ dna_container_id NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~ +#> $ usability_code 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~ +#> $ grouping_code 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 1~ +#> $ length_type "Fork_Length", "Fork_Length", "Fork_Length", "Fork_Length", "~ +#> $ species_ageing_group "pcod_lingcod", "pcod_lingcod", "pcod_lingcod", "pcod_lingcod~ +``` + # Caching the data from the SQL servers In addition to the individual `get_*()` functions, there is a function @@ -171,20 +304,220 @@ given species into a series of `.rds` files into whatever folder you specify to the `path` argument. I'll wrap it in a quick check just to make sure we don't download the data twice if we build this document again. 
-```{r, eval = FALSE, eval=.eval} + +``` r cache_pbs_data("pacific cod", path = "pcod-cache") ``` +``` +#> Extracting data for +``` + +``` +#> Extracting survey samples +``` + +``` +#> All or majority of length measurements are Fork_Length +``` + +``` +#> Warning in get_survey_samples(this_sp): Duplicate specimen IDs are present because of overlapping +#> survey stratifications. If working with the data yourelf, filter them after selecting specific +#> surveys. For example, `dat <- dat[!duplicated(dat$specimen_id), ]`. The tidying and plotting +#> functions within gfplot will do this for you. +``` + +``` +#> Extracting commercial samples +``` + +``` +#> All or majority of length measurements are Fork_Length +``` + +``` +#> Extracting catch +``` + +``` +#> Extracting spatial CPUE +``` + +``` +#> Extracting spatial LL CPUE +``` + +``` +#> Extracting spatial catch +``` + +``` +#> Extracting survey indexes +``` + +``` +#> Extracting aging precision +``` + +``` +#> All data extracted and saved in the folder `pcod-cache`. 
+``` + And to call the list of output files: -```{r, eval=.eval} + +``` r dat <- readRDS(file.path("pcod-cache", "pacific-cod.rds")) head(dat) ``` +``` +#> $survey_samples +#> # A tibble: 77,134 x 40 +#> trip_start_date fishing_event_id year month gear survey_series_id survey_abbrev +#> +#> 1 2014-10-01 00:00:00 3420684 2014 10 5 76 DOG +#> 2 2019-10-01 00:00:00 5167333 2019 10 5 76 DOG +#> 3 2003-08-13 00:00:00 309491 2003 8 5 39 HBLL INS N +#> 4 2003-08-13 00:00:00 309493 2003 8 5 39 HBLL INS N +#> 5 2003-08-13 00:00:00 309503 2003 8 5 39 HBLL INS N +#> 6 2003-08-13 00:00:00 309506 2003 8 5 39 HBLL INS N +#> 7 2003-08-13 00:00:00 309516 2003 8 5 39 HBLL INS N +#> 8 2003-08-13 00:00:00 309517 2003 8 5 39 HBLL INS N +#> 9 2003-08-13 00:00:00 309524 2003 8 5 39 HBLL INS N +#> 10 2003-08-13 00:00:00 309478 2003 8 5 39 HBLL INS N +#> # i 77,124 more rows +#> # i 33 more variables: survey_series_desc , survey_id , major_stat_area_code , +#> # major_stat_area_name , minor_stat_area_code , species_code , +#> # species_common_name , species_science_name , specimen_id , sample_id , +#> # sex , age_specimen_collected , age , sampling_desc , +#> # ageing_method_code , length , weight , maturity_code , maturity_name , +#> # maturity_desc , maturity_convention_code , maturity_convention_desc , ... 
+#> +#> $commercial_samples +#> # A tibble: 155,282 x 54 +#> trip_start_date trip_end_date trip_year year month day time_deployed +#> +#> 1 1962-04-19 00:00:00 1962-04-30 00:00:00 1962 1962 4 2 NA +#> 2 1962-04-19 00:00:00 1962-04-30 00:00:00 1962 1962 4 2 NA +#> 3 1962-04-19 00:00:00 1962-04-30 00:00:00 1962 1962 4 2 NA +#> 4 1962-04-19 00:00:00 1962-04-30 00:00:00 1962 1962 4 2 NA +#> 5 1962-04-19 00:00:00 1962-04-30 00:00:00 1962 1962 4 2 NA +#> 6 1962-04-19 00:00:00 1962-04-30 00:00:00 1962 1962 4 2 NA +#> 7 1962-04-19 00:00:00 1962-04-30 00:00:00 1962 1962 4 2 NA +#> 8 1962-04-19 00:00:00 1962-04-30 00:00:00 1962 1962 4 2 NA +#> 9 1962-04-19 00:00:00 1962-04-30 00:00:00 1962 1962 4 2 NA +#> 10 1962-04-19 00:00:00 1962-04-30 00:00:00 1962 1962 4 2 NA +#> # i 155,272 more rows +#> # i 47 more variables: time_retrieved , trip_id , fishing_event_id , +#> # latitude , lat_start , lat_end , longitude , lon_start , +#> # lon_end , best_depth , gear_code , gear_desc , species_code , +#> # species_common_name , species_science_name , sample_id , specimen_id , +#> # sex , age_specimen_collected , age , ageing_method_code , length , +#> # weight , maturity_code , maturity_convention_code , ... 
+#> +#> $catch +#> # A tibble: 333,430 x 27 +#> database_name trip_id fishing_event_id fishery_sector trip_category gear best_date +#> +#> 1 GFCatch 54000002 1 GROUNDFISH TRAWL BOTTO~ 1954-01-04 00:00:00 +#> 2 GFCatch 54000054 1 GROUNDFISH TRAWL BOTTO~ 1954-01-04 00:00:00 +#> 3 GFCatch 54000001 1 GROUNDFISH TRAWL BOTTO~ 1954-01-04 00:00:00 +#> 4 GFCatch 54000055 1 GROUNDFISH TRAWL BOTTO~ 1954-01-04 00:00:00 +#> 5 GFCatch 54000034 1 GROUNDFISH TRAWL BOTTO~ 1954-01-05 00:00:00 +#> 6 GFCatch 54000003 1 GROUNDFISH TRAWL BOTTO~ 1954-01-07 00:00:00 +#> 7 GFCatch 54000036 4 GROUNDFISH TRAWL BOTTO~ 1954-01-07 00:00:00 +#> 8 GFCatch 54000061 1 GROUNDFISH TRAWL BOTTO~ 1954-01-07 00:00:00 +#> 9 GFCatch 54000036 3 GROUNDFISH TRAWL BOTTO~ 1954-01-07 00:00:00 +#> 10 GFCatch 54000037 1 GROUNDFISH TRAWL BOTTO~ 1954-01-07 00:00:00 +#> # i 333,420 more rows +#> # i 20 more variables: fe_start_date , fe_end_date , lat , lon , +#> # best_depth , species_code , dfo_stat_area_code , dfo_stat_subarea_code , +#> # species_scientific_name , species_common_name , landed_kg , discarded_kg , +#> # landed_pcs , discarded_pcs , major_stat_area_code , minor_stat_area_code , +#> # major_stat_area_name , vessel_name , vessel_registration_number , year +#> +#> $cpue_spatial +#> # A tibble: 77,392 x 11 +#> year best_date major_stat_area_code trip_id fishing_event_id lat lon +#> +#> 1 2007 2007-05-08 09:50:00 06 83395 732412 51.4 -129. +#> 2 2007 2007-05-08 11:15:00 06 83395 732415 51.4 -129. +#> 3 2007 2007-05-07 16:30:00 06 83395 732418 51.3 -129. +#> 4 2007 2007-05-08 19:55:00 06 83395 732419 51.3 -129. +#> 5 2007 2007-05-07 09:30:00 07 83739 736980 52.4 -130. +#> 6 2007 2007-05-11 08:07:00 05 83739 736990 50.8 -129. +#> 7 2007 2007-05-08 17:26:00 08 83916 726785 54.5 -131. +#> 8 2007 2007-05-07 13:17:00 08 83916 726787 54.5 -131. +#> 9 2007 2007-05-08 11:14:00 08 83916 726788 54.5 -131. +#> 10 2007 2007-05-08 07:15:00 08 83916 726790 54.5 -131. 
+#> # i 77,382 more rows +#> # i 4 more variables: vessel_registration_number , species_scientific_name , +#> # species_common_name , cpue +#> +#> $cpue_spatial_ll +#> # A tibble: 8,899 x 16 +#> year best_date fishery_sector vessel_registration_num~1 gear trip_id fishing_event_id +#> +#> 1 2006 2006-03-12 12:00:00 halibut 30850 long~ 60792 512243 +#> 2 2006 2006-03-06 09:30:00 halibut 22452 long~ 60809 512797 +#> 3 2006 2006-03-07 10:10:00 halibut 22452 long~ 60809 512802 +#> 4 2006 2006-03-07 12:00:00 halibut 22452 long~ 60809 512803 +#> 5 2006 2006-03-09 20:00:00 halibut 22452 long~ 60809 512806 +#> 6 2006 2006-03-10 10:45:00 halibut 23703 long~ 60944 511671 +#> 7 2006 2006-03-11 14:45:00 halibut 23703 long~ 60944 511683 +#> 8 2006 2006-03-12 09:15:00 halibut 23703 long~ 60944 511701 +#> 9 2006 2006-03-12 17:00:00 halibut 23703 long~ 60944 511705 +#> 10 2006 2006-03-13 23:59:00 halibut 22452 long~ 61099 513046 +#> # i 8,889 more rows +#> # i abbreviated name: 1: vessel_registration_number +#> # i 9 more variables: lat , lon , species_code , species_scientific_name , +#> # species_common_name , landed_round_kg , cpue , total_released_pcs , +#> # major_stat_area_code +#> +#> $catch_spatial +#> # A tibble: 77,392 x 11 +#> year best_date major_stat_area_code trip_id fishing_event_id lat lon +#> +#> 1 2007 2007-12-28 14:25:00 01 99701 881231 48.9 -123. +#> 2 2007 2007-12-28 16:23:00 01 99701 881232 48.9 -123. +#> 3 2007 2007-12-28 18:13:00 01 99701 881233 48.9 -123. +#> 4 2007 2007-12-28 20:10:00 01 99701 881234 48.9 -123. +#> 5 2007 2007-12-29 08:20:00 01 99701 881235 48.9 -123. +#> 6 2007 2007-12-29 10:01:00 01 99701 881236 48.9 -123. +#> 7 2007 2007-12-28 08:35:00 01 99705 881398 49.3 -123. +#> 8 2007 2007-12-29 15:01:00 01 99705 881402 49.2 -123. +#> 9 2007 2007-12-29 10:48:00 01 99711 881395 49.3 -123. +#> 10 2007 2007-04-05 17:50:00 01 82028 880764 48.9 -123. 
+#> # i 77,382 more rows +#> # i 4 more variables: vessel_registration_number , species_scientific_name , +#> # species_common_name , catch +``` + And to call one object/dataframe (i.e. our survey sample data) from the list: -```{r, eval=.eval} + +``` r dat <- dat$survey_samples head(dat) ``` + +``` +#> # A tibble: 6 x 40 +#> trip_start_date fishing_event_id year month gear survey_series_id survey_abbrev +#> +#> 1 2014-10-01 00:00:00 3420684 2014 10 5 76 DOG +#> 2 2019-10-01 00:00:00 5167333 2019 10 5 76 DOG +#> 3 2003-08-13 00:00:00 309491 2003 8 5 39 HBLL INS N +#> 4 2003-08-13 00:00:00 309493 2003 8 5 39 HBLL INS N +#> 5 2003-08-13 00:00:00 309503 2003 8 5 39 HBLL INS N +#> 6 2003-08-13 00:00:00 309506 2003 8 5 39 HBLL INS N +#> # i 33 more variables: survey_series_desc , survey_id , major_stat_area_code , +#> # major_stat_area_name , minor_stat_area_code , species_code , +#> # species_common_name , species_science_name , specimen_id , sample_id , +#> # sex , age_specimen_collected , age , sampling_desc , +#> # ageing_method_code , length , weight , maturity_code , maturity_name , +#> # maturity_desc , maturity_convention_code , maturity_convention_desc , +#> # maturity_convention_maxvalue , trip_sub_type_code , sample_type_code , ... 
+``` diff --git a/vignettes/gfdata-vignette.Rmd.orig b/vignettes/gfdata-vignette.Rmd.orig new file mode 100644 index 0000000..aeafceb --- /dev/null +++ b/vignettes/gfdata-vignette.Rmd.orig @@ -0,0 +1,191 @@ +--- +title: "gfdata Vignette" +author: "Elise Keppel" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{gfdata Vignette} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, echo = FALSE} +knitr::opts_chunk$set( + collapse = FALSE, + comment = "#>", + eval = TRUE, + cache = TRUE, + autodep = TRUE, + fig.path = "vignettes/knitr-figs/", + cache.path = "vignettes/knitr-cache/" +) +``` + +```{r, echo=FALSE, cache=FALSE} +.error <- tryCatch(get_ssids(), error = function(e) "Error") +.eval <- !identical(class(.error), "character") +``` + +# Setup + +If you don't already have the package installed, then run: + +```{r, eval=FALSE} +# install.packages("devtools") +devtools::install_github("pbs-assess/gfdata") +``` + +First we will load the package along with dplyr since we will use it within our code later. + +```{r, cache=FALSE, warning = FALSE, message = FALSE} +library(gfdata) +library(dplyr) +``` + + +# An overview of gfdata + +Commercial and research catch, effort, and biological data for groundfish are +archived by the DFO Pacific Groundfish Data Unit (Fisheries and Oceans Canada, Science +Branch, Pacific Region) and housed in a number of relational databases archived +on-site at the Pacific Biological Station, Nanaimo, BC. + +The gfdata package was +developed to automate data extraction from these databases in a consistent, +reproducible manner with a series of `get_*()` functions. The functions extract +data using SQL queries, developed with support from the Groundfish Data Unit.
+The standardized datasets are designed to feed directly into functions in the +gfplot package, or can of course also be analyzed outside of gfplot.\ +\ +The SQL code called in the `get_*()` functions can be viewed here:\ +\ +\ +How the various functions fit together:\ +\ +\ +Detailed information on the data extraction and the `get_*()` functions can be +found in: + +Anderson, S.C., Keppel, E.A., Edwards, A.M. "A reproducible data +synopsis for over 100 species of British Columbia groundfish". DFO Can. Sci. +Advis. Sec. Res. Doc. 2019/nnn. iv + 327 p.\ +\ +The complete list of `get_*()` functions in gfdata is: + +```{r list-get} +fns <- ls("package:gfdata") +sort(fns[grepl("get", fns)]) +``` + + +The `get_*()` functions extract data by species, and some +functions have arguments for additional filtering, such as survey series, +management area, years, gear type, or environmental data type. In all cases, +the `get_*()` functions can extract data for one or multiple species. + +All functions can be viewed with the available arguments in the help +documentation for each set of functions with: + +```{r, eval = FALSE} +?get_data +?get_environmental_data +?get_lookup_tables +``` + + +In addition, a number of the `get` functions retain many relevant database +columns that users can filter on with, for example, `dplyr::filter(dat, x = "y")`. + + +# Example + +As an example, we could extract Pacific cod survey sample data with the +following function call if we were on a DFO laptop, with appropriate database +permissions, and on the PBS network. + +```{r, eval=.eval} +dat <- get_survey_samples("pacific cod") +head(dat) +``` + +Note that there are some duplicate records in the databases due to relating a +record to multiple stratification schemes for alternative analyses. If this +occurs, a warning is given. + +> "Duplicate specimen IDs are present because of overlapping survey stratifications. If working with the data yourelf, filter them after selecting specific surveys.
For example, `dat <- dat[!duplicated(dat$specimen_id), ]`. The tidying and plotting functions within gfplot will do this for you." + +Either species name or species code can be given as an argument, and +species name, if used, is not case-sensitive. The following all do the same thing: + +```{r, eval=FALSE} +get_survey_samples("pacific cod") +get_survey_samples("Pacific cod") +get_survey_samples("PaCiFiC cOD") +get_survey_samples("222") +get_survey_samples(222) +``` + +To extract multiple species at once, give a list as the species argument: + +```{r, eval=FALSE} +get_survey_samples(c("pacific ocean perch", "pacific cod")) +get_survey_samples(c(396, 222)) +get_survey_samples(c(222, "pacific cod")) +``` + +We can further restrict the data extraction to a single trawl survey series +by including the ssid (survey series id) argument. For a list of +survey series id codes, run the lookup function `get_ssids()`. + +```{r, eval=.eval} +ssids <- get_ssids() +head(ssids) +``` + +Select desired ssid and include as argument (i.e. the Queen Charlotte +Sound bottom trawl survey): + +```{r, eval = .eval} +dat <- get_survey_samples(222, ssid = 1) +head(dat) +``` + +```{r, echo = FALSE, eval=.eval} +dat <- dat %>% filter(survey_series_id == 1) +``` + +```{r, eval=.eval} +glimpse(dat) +``` + +# Caching the data from the SQL servers + +In addition to the individual `get_*()` functions, there is a function +`cache_pbs_data()` that runs all the `get_*()` functions and caches the data +in a folder that you specify. This is useful to be able to have the data +available for working on later when not on the PBS network, and it saves +running the SQL queries (though the data do get updated occassionally and the +most up-to-date data should usually be extracted for analysis). + +The helper function `cache_pbs_data()` will extract all of the data for the +given species into a series of `.rds` files into whatever folder you specify to +the `path` argument. 
I'll wrap it in a quick check just to make sure we don't +download the data twice if we build this document again. + +```{r, eval = FALSE, eval=.eval} +cache_pbs_data("pacific cod", path = "pcod-cache") +``` + +And to call the list of output files: + +```{r, eval=.eval} +dat <- readRDS(file.path("pcod-cache", "pacific-cod.rds")) +head(dat) +``` + +And to call one object/dataframe (i.e. our survey sample data) from the list: + +```{r, eval=.eval} +dat <- dat$survey_samples +head(dat) +``` diff --git a/vignettes/precompile.R b/vignettes/precompile.R new file mode 100644 index 0000000..3fe8920 --- /dev/null +++ b/vignettes/precompile.R @@ -0,0 +1,5 @@ +# Pre-compiled vignettes that take a long time to build and require network access +# Only run this if you're on the network, otherwise you will overwrite the data stored in the output file. +knitr::knit("vignettes/gfdata-vignette.Rmd.orig", output = "vignettes/gfdata-vignette.Rmd") +knitr::knit("vignettes/gfdata-vignette-get-all.Rmd.orig", output = "vignettes/gfdata-vignette-get-all.Rmd") +