Merge pull request #368 from USEPA/12-7-23-cm

12 7 23 cm
USEPA · Dec 15, 2023 · b2c0bb7 · b2c0bb7
2 parents 6089220 + 785c5e4
commit b2c0bb7
Show file tree

Hide file tree

Showing 37 changed files with 71,336 additions and 71,055 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -39,6 +39,7 @@ export(TADA_GetMeasureUnitRef)
 export(TADA_GetNutrientSummationRef)
 export(TADA_GetSynonymRef)
 export(TADA_GetTemplate)
+export(TADA_GetUSGSSynonymRef)
 export(TADA_GetWQXCharValRef)
 export(TADA_HarmonizeSynonyms)
 export(TADA_Histogram)

diff --git a/R/CensoredDataSuite.R b/R/CensoredDataSuite.R
@@ -46,18 +46,21 @@ TADA_IDCensoredData <- function(.data) {
 
   if (dim(cens)[1] > 0) {
     ## Bring in det cond reference table
-    cond.ref <- TADA_GetDetCondRef() %>%
+    cond.ref <- utils::read.csv(system.file("extdata", "WQXResultDetectionConditionRef.csv", package = "TADA")) %>%
       dplyr::rename(ResultDetectionConditionText = Name) %>%
       dplyr::select(ResultDetectionConditionText, TADA.Detection_Type)
 
     ## Join to censored data
     cens <- dplyr::left_join(cens, cond.ref, by = "ResultDetectionConditionText")
 
-    ## Flag censored data that does not havedet cond populated
+    ## Flag censored data that does not have det cond populated
     cens$TADA.Detection_Type <- ifelse(is.na(cens$ResultDetectionConditionText), "ResultDetectionConditionText missing", cens$TADA.Detection_Type)
 
-    ## Fill in detection type when result measure value = "ND"
-    cens$TADA.Detection_Type <- ifelse(cens$ResultMeasureValue %in% c("ND"), "Non-Detect", cens$TADA.Detection_Type)
+    ## Fill in detection type when text result measure value indicates it is a nondetect
+    cens$TADA.Detection_Type <- ifelse(cens$ResultMeasureValue %in% 
+                                         c("ND", "BPQL", "BDL"), 
+                                       "Non-Detect", 
+                                       cens$TADA.Detection_Type)
 
     ## Let user know when detection condition text is missing from one or more results
     # NOTE that at this point, TADA.Detection_Type may be NA if there are detection conditions in dataset that are not present in domain table
@@ -75,7 +78,7 @@ TADA_IDCensoredData <- function(.data) {
     }
 
     ## Bring in det limit type reference table
-    limtype.ref <- TADA_GetDetLimitRef() %>%
+    limtype.ref <- utils::read.csv(system.file("extdata", "WQXDetectionQuantitationLimitTypeRef.csv", package = "TADA")) %>%
       dplyr::rename(DetectionQuantitationLimitTypeName = Name) %>%
       dplyr::select(DetectionQuantitationLimitTypeName, TADA.Limit_Type)
 

diff --git a/R/DataDiscoveryRetrieval.R b/R/DataDiscoveryRetrieval.R
@@ -493,7 +493,6 @@ TADA_BigDataRetrieval <- function(startDate = "null",
   }
 
   if (!"null" %in% statecode) {
-    # state_cd_cont = utils::read.csv(file = "inst/extdata/statecode.csv",colClasses=c("STATE"="character"))
     load(system.file("extdata", "statecodes_df.Rdata", package = "TADA"))
     statecode <- as.character(statecode)
     statecodes_sub <- statecodes_df %>% dplyr::filter(STUSAB %in% statecode)
@@ -744,6 +743,7 @@ TADA_JoinWQPProfiles <- function(FullPhysChem = "null",
   if (length(Projects.df) > 1) {
     if (nrow(Projects.df) > 0) {
       join3 <- join2 %>%
+
         dplyr::left_join(
           dplyr::select(
             Projects.df, OrganizationIdentifier, OrganizationFormalName,
@@ -755,7 +755,9 @@ TADA_JoinWQPProfiles <- function(FullPhysChem = "null",
             "OrganizationIdentifier", "OrganizationFormalName",
             "ProjectIdentifier", "ProjectName"
           ),
-          multiple = "all"
+          multiple = "all",
+          # need to specify that this is expected to be a many-to-many relationship
+          relationship = "many-to-many"
         )
     } else {
       join3 <- join2

diff --git a/R/Maintenance.R b/R/Maintenance.R
@@ -1,5 +1,6 @@
 #' Update TADA Reference Files
 #' @return Saves updated reference files
+#' Updating is only needed for the reference tables in WQXRefTables.R.
 #'
 TADA_UpdateAllRefs <- function() {
   TADA_UpdateWQXCharValRef()
@@ -87,6 +88,24 @@ TADA_UpdateExampleData <- function() {
   rm(Data_NCTCShepherdstown_HUC12)
 }
 
+## Maintenance helper: find char-frac-spec-unit combos that occur in a random
+## test dataset but are not present in the TADA HarmonizationTemplate.
+## Add new combinations when found to the HarmonizationTemplate.csv and
+## NPsummation_key.csv (if relevant to TN or TP summation).
+
+FindSynonyms <- function() {
+  # columns that together define one char-frac-spec-unit combination
+  combo_cols <- c(
+    "TADA.CharacteristicName",
+    "TADA.ResultSampleFractionText",
+    "TADA.MethodSpecificationName",
+    "TADA.ResultMeasure.MeasureUnitCode"
+  )
+
+  # test dataset from TADA_RandomNationalTestingSet(), passed through
+  # TADA_RunKeyFlagFunctions()
+  flagged <- TADA_RunKeyFlagFunctions(TADA_RandomNationalTestingSet())
+
+  # internal synonym reference table and the characteristics it contains
+  syn_ref <- TADA_GetSynonymRef()
+  known_chars <- unique(syn_ref$TADA.CharacteristicName)
+
+  # unique combos in the test data, limited to characteristics in the reference
+  in_ref <- flagged$TADA.CharacteristicName %in% known_chars
+  observed <- unique(flagged[in_ref, combo_cols])
+
+  # keep every observed combo (all.x = TRUE); combos with no HarmonizationGroup
+  # after the merge have no match in the reference table
+  joined <- merge(observed, syn_ref, all.x = TRUE)
+  new_combos <- joined[is.na(joined$HarmonizationGroup), combo_cols]
+
+  if (nrow(new_combos) > 0) {
+    print("New combinations found in random dataset test.")
+  }
+
+  new_combos
+}
+
 
 # TADA_OvernightTesting
 #
@@ -108,7 +127,7 @@ TADA_UpdateExampleData <- function() {
 #
 #   for (i in 1:num_iterations) {
 #
-#     testing <- TADA_RandomTestingSet()
+#     testing <- TADA_RandomNationalTestingSet()
 #
 #     testing2 <- TADA_FlagMeasureQualifierCode(testing)
 #

diff --git a/R/ResultFlagsDependent.R b/R/ResultFlagsDependent.R
@@ -61,7 +61,7 @@ TADA_FlagFraction <- function(.data, clean = TRUE, flaggedonly = FALSE) {
     .data <- dplyr::select(.data, -TADA.SampleFraction.Flag)
   }
   # read in sample fraction reference table from extdata and filter
-  frac.ref <- TADA_GetWQXCharValRef() %>%
+  frac.ref <- utils::read.csv(system.file("extdata", "WQXcharValRef.csv", package = "TADA")) %>%
     dplyr::filter(Type == "CharacteristicFraction")
 
   # join "Status" column to .data by CharacteristicName and Value (SampleFraction)
@@ -204,7 +204,7 @@ TADA_FlagSpeciation <- function(.data, clean = c("invalid_only", "nonstandardize
   }
 
   # read in speciation reference table from extdata and filter
-  spec.ref <- TADA_GetWQXCharValRef() %>%
+  spec.ref <- utils::read.csv(system.file("extdata", "WQXcharValRef.csv", package = "TADA")) %>%
     dplyr::filter(Type == "CharacteristicSpeciation")
 
   # join "Status" column to .data by CharacteristicName and Value (Speciation)
@@ -362,7 +362,7 @@ TADA_FlagResultUnit <- function(.data, clean = c("invalid_only", "nonstandardize
   }
 
   # read in unit reference table from extdata and filter
-  unit.ref <- TADA_GetWQXCharValRef() %>%
+  unit.ref <- utils::read.csv(system.file("extdata", "WQXcharValRef.csv", package = "TADA")) %>%
     dplyr::filter(Type == "CharacteristicUnit")
 
   # join "Status" column to .data by CharacteristicName, Source (Media), and Value (unit)
@@ -501,7 +501,7 @@ TADA_FindQCActivities <- function(.data, clean = FALSE, flaggedonly = FALSE) {
   }
 
   # load in ActivityTypeRef Table
-  qc.ref <- TADA_GetActivityTypeRef() %>%
+  qc.ref <- utils::read.csv(system.file("extdata", "WQXActivityTypeRef.csv", package = "TADA")) %>%
     dplyr::rename(ActivityTypeCode = Code) %>%
     dplyr::select(ActivityTypeCode, TADA.ActivityType.Flag)
 
@@ -663,7 +663,7 @@ TADA_FlagMeasureQualifierCode <- function(.data, clean = FALSE, flaggedonly = FA
   }
 
   # load in ResultMeasureQualifier Flag Table
-  qc.ref <- TADA_GetMeasureQualifierCodeRef() %>%
+  qc.ref <- utils::read.csv(system.file("extdata", "WQXMeasureQualifierCodeRef.csv", package = "TADA")) %>%
     dplyr::rename(MeasureQualifierCode = Code) %>%
     dplyr::select(MeasureQualifierCode, TADA.MeasureQualifierCode.Flag)
 

diff --git a/R/ResultFlagsIndependent.R b/R/ResultFlagsIndependent.R
@@ -69,7 +69,7 @@ TADA_FlagMethod <- function(.data, clean = TRUE, flaggedonly = FALSE) {
     .data <- dplyr::select(.data, -TADA.AnalyticalMethod.Flag)
   }
   # read in WQX val reference table and filter
-  meth.ref <- TADA_GetWQXCharValRef() %>%
+  meth.ref <- utils::read.csv(system.file("extdata", "WQXcharValRef.csv", package = "TADA")) %>%
     dplyr::filter(Type == "CharacteristicMethod")
 
   # join "TADA.WQXVal.Flag" column to .data by CharacteristicName, Source (Media), and Value (unit)
@@ -350,21 +350,20 @@ TADA_FlagAboveThreshold <- function(.data, clean = TRUE, flaggedonly = FALSE) {
     .data <- dplyr::select(.data, -TADA.ResultValueAboveUpperThreshold.Flag)
   }
 
-  # filter WQXcharVal.ref to include only valid CharacteristicUnit in water media
-  unit.ref <- TADA_GetWQXCharValRef() %>%
-    dplyr::filter(Type == "CharacteristicUnit" & Source == "WATER" &
-      Status == "Valid")
+  # filter WQXcharVal.ref to include only valid CharacteristicUnit
+  unit.ref <- utils::read.csv(system.file("extdata", "WQXcharValRef.csv", package = "TADA")) %>%
+    dplyr::filter(Type == "CharacteristicUnit" & Status == "Accepted")
 
   # join unit.ref to raw.data
   check.data <- merge(.data, unit.ref[, c(
     "Characteristic", "Source",
-    "Value", "Maximum"
+    "Value.Unit", "Maximum"
   )],
   by.x = c(
     "TADA.CharacteristicName", "TADA.ActivityMediaName",
     "TADA.ResultMeasure.MeasureUnitCode"
   ),
-  by.y = c("Characteristic", "Source", "Value"), all.x = TRUE
+  by.y = c("Characteristic", "Source", "Value.Unit"), all.x = TRUE
   )
 
   # Create flag column, flag rows where ResultMeasureValue > Maximum
@@ -392,7 +391,6 @@ TADA_FlagAboveThreshold <- function(.data, clean = TRUE, flaggedonly = FALSE) {
     if (flaggedonly == TRUE) {
       print("This dataframe is empty because no data above the WQX Upper Threshold was found in your dataframe")
       emptyflag.data <- dplyr::filter(flag.data, TADA.ResultValueAboveUpperThreshold.Flag %in% "Y")
-      # emptyflag.data <- dplyr::select(emptyflag.data, -TADA.ResultValueAboveUpperThreshold.Flag)
       emptyflag.data <- TADA_OrderCols(emptyflag.data)
       return(emptyflag.data)
     }
@@ -507,20 +505,19 @@ TADA_FlagBelowThreshold <- function(.data, clean = TRUE, flaggedonly = FALSE) {
   }
 
   # filter WQXcharVal.ref to include only valid CharacteristicUnit in water media
-  unit.ref <- TADA_GetWQXCharValRef() %>%
-    dplyr::filter(Type == "CharacteristicUnit" & Source == "WATER" &
-      Status == "Valid")
+  unit.ref <- utils::read.csv(system.file("extdata", "WQXcharValRef.csv", package = "TADA")) %>%
+    dplyr::filter(Type == "CharacteristicUnit" & Status == "Accepted")
 
   # join unit.ref to raw.data
   check.data <- merge(.data, unit.ref[, c(
     "Characteristic", "Source",
-    "Value", "Minimum"
+    "Value.Unit", "Minimum"
   )],
   by.x = c(
     "TADA.CharacteristicName", "TADA.ActivityMediaName",
     "TADA.ResultMeasure.MeasureUnitCode"
   ),
-  by.y = c("Characteristic", "Source", "Value"), all.x = TRUE
+  by.y = c("Characteristic", "Source", "Value.Unit"), all.x = TRUE
   )
 
   # Create flag column, flag rows where TADA.ResultMeasureValue < Minimum

diff --git a/R/TADARefTables.R b/R/TADARefTables.R
@@ -0,0 +1,135 @@
+#' Nutrient Summation Reference Key
+#'
+#' Function returns the nutrient summation reference dataframe, read from the
+#' NPsummation_key.csv file stored in the extdata folder of the TADA package.
+#' This dataframe is used in TADA_CalculateTotalNitrogen as the basis for the
+#' combinations added together to get total nitrogen. Users may customize this
+#' reference table for their own dataset and use the custom dataframe as an
+#' input in TADA_CalculateTotalNitrogen.
+#'
+#' @return Dataframe of nutrient summation combinations
+#'
+#' @export
+
+TADA_GetNutrientSummationRef <- function() {
+  # locate the key inside the installed package, then read it as a dataframe
+  key_path <- system.file("extdata", "NPsummation_key.csv", package = "TADA")
+  utils::read.csv(key_path)
+}
+
+
+#' Generate Unique Synonym Reference Table
+#'
+#' Function generates a synonym reference table containing all unique
+#' combinations of TADA.CharacteristicName, TADA.ResultSampleFractionText,
+#' TADA.MethodSpecificationName, and TADA.ResultMeasure.MeasureUnitCode. The
+#' function also joins in some TADA-specific suggested synonyms for nutrients
+#' and priority parameters. These target synonyms (denoted in the reference
+#' table with the prefix "Target.") are intended to help the user aggregate
+#' synonymous data that may be uploaded with slightly different metadata
+#' conventions and prepare nutrient data for total N and P summations. Users can
+#' review how their input data relates to target synonyms for
+#' TADA.CharacteristicName, TADA.ResultSampleFractionText,
+#' TADA.MethodSpecificationName, and TADA.ResultMeasure.MeasureUnitCode. Once
+#' the synonym table is created, users may optionally edit the target columns in
+#' the reference table to meet their needs. Additionally, the function assumes
+#' the user has already removed any data containing invalid
+#' characteristic-unit-fraction-speciation combinations (i.e. user has already
+#' run TADA_FlagFraction, TADA_FlagSpeciation, TADA_FlagResultUnit, etc.). If
+#' none of the TADA.MethodSpeciation.Flag, TADA.SampleFraction.Flag, or
+#' TADA.ResultUnit.Flag columns are present, a warning message is printed and
+#' the reference table is still returned.
+#'
+#' @param .data TADA dataframe. If a data frame is not provided, the function
+#' will return the default internal reference table.
+#'
+#' @return Synonym Reference Table unique to the input dataframe. When .data is
+#' not supplied, the full internal reference table (HarmonizationTemplate.csv)
+#' is returned instead.
+#'
+#' @export
+#'
+#' @examples
+#' # Load example dataset:
+#' data(Data_6Tribes_5y)
+#'
+#' # Create a synonym reference table for flagged, cleaned dataframe:
+#' Data_6Tribes_5yClean <- subset(Data_6Tribes_5y, !is.na(Data_6Tribes_5y$TADA.ResultMeasureValue))
+#' Data_6Tribes_5yClean <- TADA_FlagFraction(Data_6Tribes_5yClean, clean = TRUE)
+#' Data_6Tribes_5yClean <- TADA_FlagResultUnit(Data_6Tribes_5yClean, clean = "invalid_only")
+#' Data_6Tribes_5yClean <- TADA_FlagSpeciation(Data_6Tribes_5yClean, clean = "invalid_only")
+#' Data_6Tribes_5yClean <- TADA_FlagMethod(Data_6Tribes_5yClean, clean = TRUE)
+#' CreateRefTable <- TADA_GetSynonymRef(Data_6Tribes_5yClean)
+#'
+#' # Get internal synonym reference table
+#' reference <- TADA_GetSynonymRef()
+TADA_GetSynonymRef <- function(.data) {
+  # raw harmonization template shipped with the package (read once, used both
+  # for the no-input return and for the join below)
+  harm_raw <- utils::read.csv(system.file("extdata", "HarmonizationTemplate.csv", package = "TADA"))
+
+  # no input dataframe: return the full internal reference table
+  if (missing(.data)) {
+    return(harm_raw)
+  }
+
+  # check .data is data.frame
+  TADA_CheckType(.data, "data.frame", "Input object")
+
+  # check .data has the required columns
+  expected_cols <- c(
+    "TADA.CharacteristicName",
+    "TADA.ResultSampleFractionText",
+    "TADA.MethodSpecificationName",
+    "TADA.ResultMeasure.MeasureUnitCode"
+  )
+  TADA_CheckColumns(.data, expected_cols)
+
+  # QC flag columns that may or may not be present in .data
+  flag_cols <- c("TADA.MethodSpeciation.Flag", "TADA.SampleFraction.Flag", "TADA.ResultUnit.Flag")
+  is_flag <- names(.data) %in% flag_cols
+
+  if (!any(is_flag)) {
+    print("Warning: This dataframe is missing TADA QC flagging columns, indicating that you have not yet run the TADA_FlagResultUnit, TADA_FlagFraction, or TADA_FlagSpeciation functions. It is highly recommended you run these flagging functions and remove Invalid combinations before proceeding to this step.")
+  } else {
+    # check to see if any invalid data flags exist.
+    # drop = FALSE keeps a data frame even when only one flag column is present;
+    # the check is skipped entirely when there are no flag columns because
+    # pivot_longer() needs at least one column to pivot.
+    flag_data <- .data[, is_flag, drop = FALSE]
+    check_inv <- flag_data %>%
+      tidyr::pivot_longer(cols = dplyr::all_of(names(flag_data)), names_to = "Flag_Column") %>%
+      dplyr::filter(value == "Invalid")
+
+    if (nrow(check_inv) > 0) {
+      check_inv <- check_inv %>%
+        dplyr::group_by(Flag_Column) %>%
+        dplyr::summarise("Result Count" = length(value))
+      print("Warning: Your dataframe contains invalid metadata combinations in the following flag columns:")
+      print(as.data.frame(check_inv))
+    }
+  }
+
+  # execute function after checks are passed
+  # join the unique char-frac-spec-unit combos in .data to the template,
+  # keeping combos that have no match in the template (all.x = TRUE)
+  join_data <- merge(unique(.data[, expected_cols]),
+                     harm_raw,
+                     by = expected_cols,
+                     all.x = TRUE
+  )
+
+  # trim join_data to include only unique combos of char-frac-spec-unit
+  unique_data <- dplyr::distinct(join_data)
+
+  # return columns in the same order as the template
+  unique_data <- unique_data[, names(harm_raw)]
+
+  return(unique_data)
+}
+
+
+#' USGS Units and Speciations Synonym Reference Table
+#'
+#' This internal reference file includes USGS only units/speciations. It was
+#' created in July 2023 using the pcodes domain table from NWIS
+#' (https://help.waterdata.usgs.gov/codes-and-parameters/parameters). All USGS units
+#' and speciations are given a target unit and speciation that is synonymous, but
+#' adheres to the WQX schema (WQX measure unit domain table).
+#'
+#' This reference file is used in the TADA_ConvertResultUnits() function where
+#' synonymous units and speciations are harmonized before units are then also
+#' harmonized/converted to WQX targets.
+#'
+#' The table is read from the USGS_units_speciation.csv file stored in the
+#' extdata folder of the TADA package.
+#'
+#' @return Dataframe of USGS only units and speciations and their WQX compatible
+#' targets/synonyms.
+#'
+#' @export
+
+TADA_GetUSGSSynonymRef <- function() {
+  # locate the reference file inside the installed package and read it
+  ref_path <- system.file("extdata", "USGS_units_speciation.csv", package = "TADA")
+  utils::read.csv(ref_path)
+}