ropensci · RayStick · Feb 17, 2025 · Feb 17, 2025 · Feb 17, 2025 · Feb 17, 2025
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -22,6 +22,5 @@
 ^man/data_load\.Rd$
 ^man/output_copy\.Rd$
 ^man/empty_plot\.Rd$
-^man/ref_plot\.Rd$
 ^man/user_categorisation\.Rd$
 ^man/user_categorisation_loop\.Rd$
diff --git a/R/data-package_data.R b/R/data-package_data.R
@@ -54,8 +54,8 @@
 #' metadata_map \cr \cr
 #' This data was created with these two steps:
 #' \enumerate{
-#'  \item \code{domain_list <-
-#'  read.csv('inst/inputs/domain_list_demo.csv',header=FALSE)}
+#'  \item \code{domain_list <- read.csv(system.file('inputs',
+#'  'domain_list_demo.csv', package = 'mapmetadata'))}
 #'  \item \code{usethis::use_data(domain_list)}
 #' }
 #' @docType data
@@ -64,7 +64,7 @@
 #'
 #' @keywords internal
 #'
-#' @format A data frame with 8 rows and 1 column
+#' @format A data frame with 8 rows and 2 columns
 #'
 #' @source The csv was manually created
 "domain_list"
@@ -78,7 +78,8 @@
 #'   \item Navigate to the dataset of interest, select 'Download data' and
 #'   download the Structural Metadata file
 #'   \item Shorten name of downloaded file e.g. 360_NCCHD_Metadata.csv
-#'   \item \code{metadata <- read.csv("inst/inputs/360_NCCHD_Metadata.csv")}
+#'   \item \code{metadata <- read.csv(system.file('inputs',
+#'   '360_NCCHD_Metadata.csv', package = 'mapmetadata'))}
 #'   \item \code{usethis::use_data(metadata)}
 #' }
 #'
@@ -101,7 +102,8 @@
 #' domain code rather than asking the user to categorise.\cr\cr
 #' This data was created with these two steps:
 #' \enumerate{
-#'  \item \code{look_up <- read.csv('inst/inputs/look_up.csv')}
+#'  \item \code{look_up <- read.csv(system.file('inputs',
+#'  'look_up.csv',package = 'mapmetadata'))}
 #'  \item \code{usethis::use_data(look_up)}
 #' }
 #' @docType data
@@ -110,7 +112,7 @@
 #'
 #' @keywords internal
 #'
-#' @format A data frame with a variable number of rows and 3 columns
+#' @format A data frame with a variable number of rows and 2 columns
 #'
 #' @source The csv was manually created
 "look_up"
diff --git a/R/globals.R b/R/globals.R
@@ -6,4 +6,5 @@ utils::globalVariables(c("Column.description",
                          "no_count",
                          "note",
                          "total_variables",
-                         "variable"))
+                         "variable",
+                         "Variable"))
diff --git a/R/inputs_collect.R b/R/inputs_collect.R
@@ -14,6 +14,7 @@
 #' @importFrom cli cli_alert_info
 #' @importFrom utils read.csv
 #' @importFrom tools file_path_sans_ext
+#' @importFrom dplyr left_join
 #' @keywords internal
 #' @dev generate help files for unexported objects, for developers
 
@@ -44,11 +45,11 @@ data_load <- function(metadata_file, domain_file, look_up_file, quiet = FALSE) {
     # Verify the metadata file name pattern and that it is a .csv file
     if (!grepl("^[0-9]+_.*_Metadata\\.csv$", metadata_base) ||
           tools::file_ext(metadata_file) != "csv") {
-      stop(paste("Metadata file name must be a .csv file in the format",
+      stop(paste("metadata_file name must be a .csv file in the format",
                  "ID_Name_Metadata.csv where ID is an integer"))
     } else {
       if (!file.exists(metadata_file)) {
-        stop("Metadata filename is the correct format but it does not exist!")
+        stop("metadata_file is the correct filename but it does not exist!")
       }
     }
 
@@ -57,26 +58,35 @@ data_load <- function(metadata_file, domain_file, look_up_file, quiet = FALSE) {
     column_names <- colnames(metadata)
     expected_column_names <- c("Section", "Column.name", "Data.type",
                                "Column.description", "Sensitive")
-    if (!all(column_names == expected_column_names)) {
-      stop("Metadata file does not have expected column names")
+    if (!identical(sort(column_names), sort(expected_column_names))) {
+      stop("metadata_file does not have expected column names")
     }
 
     metadata_base_0suffix <- sub("_Metadata.csv$", "", metadata_base)
     metadata_desc <- gsub(" ", "", metadata_base_0suffix)
 
-    # Check if the domain_file is a csv and has one column
+    # Check if the domain_file is a csv and has two columns
     if (file.exists(domain_file) && tools::file_ext(domain_file) == "csv") {
-      domains <- read.csv(domain_file, header = FALSE)
-      if (ncol(domains) == 1) {
+      domains <- read.csv(domain_file)
+      column_names <- colnames(domains)
+      expected_column_names <- c("Domain_Code", "Domain_Name")
+      if (identical(sort(column_names), sort(expected_column_names))) {
         domain_list_desc <- file_path_sans_ext(basename(domain_file))
       } else {
-        stop("The domain_file should only have one column.")
+        stop("domain_file does not have the expected column names")
       }
     } else {
-      stop("This domain_file does not exist or is not in csv format.")
+      stop("domain_file does not exist or is not in csv format.")
     }
   }
 
+  # Check the domain_file 'Code' column is correct
+  code_expected <- as.integer(seq(1, nrow(domains), 1))
+  if (!identical(code_expected, domains$Domain_Code)) {
+    stop(paste("'Code' column in domain_file is not as expected.\n",
+               "Expected 1:", nrow(domains)))
+  }
+
   # Collect look up table
   if (is.null(look_up_file)) {
     if (!quiet) {
@@ -86,15 +96,25 @@ data_load <- function(metadata_file, domain_file, look_up_file, quiet = FALSE) {
   } else {
     if (!quiet) {
       cli_alert_info("Using look up file inputted by user")
-      if (file.exists(look_up_file) && tools::file_ext(look_up_file) == "csv") {
-        lookup <- read.csv(look_up_file)
-        expected_column_names <- c("variable", "domain_label", "domain_code")
-        if (!all(colnames(lookup) == expected_column_names)) {
-          stop("Look_up file does not have expected column names")
-        }
-      } else {
-        stop("This look_up_file does not exist or is not in csv format.")
+    }
+    # Check look_up file given by user
+    if (file.exists(look_up_file) && tools::file_ext(look_up_file) == "csv") {
+      lookup <- read.csv(look_up_file)
+      expected_column_names <- c("Variable", "Domain_Name")
+      if (!all(colnames(lookup) == expected_column_names)) {
+        stop("look_up file does not have expected column names")
+      }
+      # Add Domain_Code column into lookup table
+      lookup <- lookup %>% left_join(domains, by = "Domain_Name")
+      # Check for look_up rows not covered by domain_list
+      no_match <- lookup[!lookup$Domain_Name %in% domains$Domain_Name, ]
+      if (nrow(no_match) != 0) {
+        warning(paste("There are domain names in the look_up_file that are not",
+                      "included in the domain_file. If this is not expected,",
+                      "check for mistakes."))
       }
+    } else {
+      stop("look_up_file does not exist or is not in csv format.")
     }
   }
 

diff --git a/R/map_compare.R b/R/map_compare.R
@@ -81,46 +81,28 @@ map_compare <- function(session_dir,
     stop("Cannot locate all four input files.")
   }
 
-  # Read csv files
+  ## Check if domain_file exists
+  if (!file.exists(domain_file))  {
+    stop("Cannot locate domain_file.")
+  }
+
+  ## Check if metadata_file exists
+  if (!file.exists(metadata_file))  {
+    stop("Cannot locate metadata_file.")
+  }
+
+  # Read files
   csv_1a <- read.csv(csv_1a_path)
   csv_2a <- read.csv(csv_2a_path)
   csv_1b <- read.csv(csv_1b_path)
   csv_2b <- read.csv(csv_2b_path)
 
-  # Verify the metadata file name pattern and that it is a .csv file
-  metadata_base <- basename(metadata_file)
-  if (!grepl("^[0-9]+_.*_Metadata\\.csv$", metadata_base) ||
-        tools::file_ext(metadata_file) != "csv") {
-    stop(paste("Metadata file name must be a .csv file in the format",
-               "ID_Name_Metadata.csv where ID is an integer"))
-  } else {
-    if (!file.exists(metadata_file)) {
-      stop("Metadata filename is the correct format but it does not exist!")
-    }
-  }
-
-  # Check if metadata column names match what is expected
   metadata <- read.csv(metadata_file)
-  column_names <- colnames(metadata)
-  expected_column_names <- c("Section", "Column.name", "Data.type",
-                             "Column.description", "Sensitive")
-  if (!all(column_names == expected_column_names)) {
-    stop("Metadata file does not have expected column names")
-  }
-
+  metadata_base <- basename(metadata_file)
   metadata_base_0suffix <- sub("_Metadata.csv$", "", metadata_base)
   metadata_desc <- gsub(" ", "", metadata_base_0suffix)
 
-  # Check if the domain_file is a csv and has one column
-  if (file.exists(domain_file) && tools::file_ext(domain_file) == "csv") {
-    domains <- read.csv(domain_file, header = FALSE)
-    if (ncol(domains) == 1) {
-    } else {
-      stop("The domain_file should only have one column.")
-    }
-  } else {
-    stop("This domain_file does not exist or is not in csv format.")
-  }
+  domains <- read.csv(domain_file)
 
   # CHECK IF A VALID COMPARISON BETWEEN SESSIONS IS POSSIBLE ----
 
@@ -152,7 +134,7 @@ map_compare <- function(session_dir,
     input_1 = nrow(csv_1b),
     input_2 = nrow(csv_2b),
     severity = "danger",
-    severity_text = "Different number of variavles!"
+    severity_text = "Different number of variables!"
   )
 
   ##  Check if sessions can be compared (warnings for user to check):
@@ -166,8 +148,10 @@ map_compare <- function(session_dir,
 
   # DISPLAY TO USER ----
 
-  ## Use 'ref_plot.R' to plot domains for the user's ref (save df for later use)
-  df_plots <- ref_plot(domains)
+  ## Extract domains and plot for user's reference
+  domain_table <- tableGrob(domains, rows = NULL)
+  grid.arrange(domain_table, nrow = 1, ncol = 1)
+  n_codes <- nrow(domains)
 
   # EXTRACT TABLE INFO FROM METADATA ----
   table_name <- csv_1a$table[1]
@@ -182,8 +166,7 @@ map_compare <- function(session_dir,
 
   # FIND MISMATCHES AND ASK FOR CONSENSUS DECISION ----
   for (variable in seq_len(nrow(ses_join))) {
-    consensus <- consensus_on_mismatch(ses_join, table_df, variable,
-                                       max(df_plots$code$code))
+    consensus <- consensus_on_mismatch(ses_join, table_df, variable, n_codes)
     ses_join$domain_code_join[variable] <- consensus$domain_code_join
     ses_join$note_join[variable] <- consensus$note_join
   } # end of loop for variable

diff --git a/R/metadata_map.R b/R/metadata_map.R
@@ -15,12 +15,12 @@
 #' variables, and categorisations for the same variable can be copied from one
 #' table to another. \cr \cr
 #' Example inputs are provided within the package data, for the user to run this
-#' function in a demo mode.
+#' function in a demo mode. Refer to the package website for more guidance.
 #' @param metadata_file This should be a csv download from HDRUK gateway
 #' (in the form of ID_Dataset_Metadata.csv). Run '?mapmetadata::metadata' to
 #' see how the metadata_file for the demo was created.
-#' @param domain_file This should be a csv file created by the user, with each
-#' domain on a separate line, no header. Run '?mapmetadata::domain_list' to
+#' @param domain_file This should be a csv file created by the user, with two
+#' columns (Domain_Code and Domain_Name). Run '?mapmetadata::domain_list' to
 #' see how the domain_file for the demo was created.
 #' @param look_up_file The lookup file makes auto-categorisations intended for
 #' variables that appear regularly in health datasets. It only works for 1:1
@@ -41,8 +41,8 @@
 #' 'L-OUTPUT_' which gives each categorisation its own row. Default is TRUE.
 #' @param quiet Default is FALSE. Change to TRUE to quiet the cli_alert_info
 #' and cli_alert_success messages.
-#' @return The function will return two csv files: 'OUTPUT_' which contains the
-#' mappings and 'LOG_' which contains details about the dataset and session.
+#' @return A html plot summarising the dataset. Various csv and png outputs to
+#' summarise the user's mapping session for a specific table in the dataset.
 #' @examples
 #' # Demo run requires no function inputs but requires user interaction.
 #' # See package documentation to guide user inputs.
@@ -56,6 +56,8 @@
 #' @importFrom utils packageVersion write.csv browseURL menu select.list
 #' @importFrom ggplot2 ggsave
 #' @importFrom htmlwidgets saveWidget
+#' @importFrom gridExtra tableGrob grid.arrange
+#' @importFrom graphics plot.new
 
 metadata_map <- function(
     metadata_file = NULL,
@@ -153,16 +155,10 @@
   log_output_df <- get("log_output_df")
   output_df <- get("output_df")
 
-  ## Use 'ref_plot.R' to plot domains for the user's ref (save df for later use)
-  df_plots <- ref_plot(data$domains)
-
-  ## Check if look_up_file and domain_file are compatible
-  mismatch <- setdiff(data$lookup$domain_code, df_plots$code$code)
-  if (length(mismatch) > 0) {
-    print(mismatch)
-    stop(paste("The look_up_file and domain_file are not compatible. These",
-               "look up codes are not listed in the domain codes:\n"))
-  }
+  ## Extract domains and plot for user's reference
+  domain_table <- tableGrob(data$domains, rows = NULL)
+  grid.arrange(domain_table, nrow = 1, ncol = 1)
+  n_codes <- nrow(data$domains)
 
   ## CHOOSE TABLE TO PROCESS
 
@@ -207,7 +203,7 @@
     df_prev_exist,
     df_prev,
     lookup = data$lookup,
-    df_plots,
+    n_codes,
     output_df
   )
 
@@ -239,7 +235,7 @@
         table_df$Column.name[v_auto],
         table_df$Column.description[v_auto],
         table_df$Data.type[v_auto],
-        max(df_plots$code$code)
+        n_codes
       )
       ##### input user responses into output
       output_df$domain_code[v_auto] <- decision_output$decision
@@ -277,7 +273,7 @@
           table_df$Column.name[v_not_auto],
           table_df$Column.description[v_not_auto],
           table_df$Data.type[v_not_auto],
-          max(df_plots$code$code)
+          n_codes
         )
         ##### input user responses into output
         output_df$domain_code[v_not_auto] <- decision_output$decision
@@ -320,7 +316,7 @@
 
   ### Create and save a summary plot
   end_plot_save <- end_plot(df = output_df, table_name,
-                            ref_table = df_plots$domain_table)
+                            ref_table = domain_table)
   ggsave(
     plot = end_plot_save,
     filename = png_path,

diff --git a/R/plotting.R b/R/plotting.R
@@ -32,31 +32,6 @@ empty_plot <- function(dataframe, bar_title) {
   barplot_html
 }
 
-#' Internal: ref_plot
-#'
-#' This function is called within the metadata_map function. \cr \cr
-#' It plots a reference table to guide the user in their categorisation of
-#' domains. \cr \cr
-#' This reference table is based on the user inputted domains and the default
-#' domains provided by this package.  \cr \cr
-#' @param domains The output of load_data
-#' @return A reference table that appears in the Plots tab. A list of 2
-#' containing the derivatives for this plot, used later in metadata_map'
-#' @importFrom gridExtra tableGrob grid.arrange
-#' @importFrom graphics plot.new
-#' @keywords internal
-#' @dev generate help files for unexported objects, for developers
-
-ref_plot <- function(domains) {
-  colnames(domains)[1] <- "Domain Name"
-  plot.new()
-  code <- data.frame(code = seq_len(nrow(domains)))
-  domain_table <- tableGrob(cbind(code, domains), rows = NULL)
-  grid.arrange(domain_table, nrow = 1, ncol = 1)
-
-  return(list(code = code, domain_table = domain_table))
-}
-
 #' Internal: end_plot
 #'
 #' This function is called within the metadata_map function. \cr \cr