Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Simplify domain_list and look_up for user and code base #200

Merged
merged 3 commits into from
Feb 17, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,5 @@
^man/data_load\.Rd$
^man/output_copy\.Rd$
^man/empty_plot\.Rd$
^man/ref_plot\.Rd$
^man/user_categorisation\.Rd$
^man/user_categorisation_loop\.Rd$
14 changes: 8 additions & 6 deletions R/data-package_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,8 @@
#' metadata_map \cr \cr
#' This data was created with these two steps:
#' \enumerate{
#' \item \code{domain_list <-
#' read.csv('inst/inputs/domain_list_demo.csv',header=FALSE)}
#' \item \code{domain_list <- read.csv(system.file('inputs',
#' 'domain_list_demo.csv', package = 'mapmetadata'))}
#' \item \code{usethis::use_data(domain_list)}
#' }
#' @docType data
Expand All @@ -64,7 +64,7 @@
#'
#' @keywords internal
#'
#' @format A data frame with 8 rows and 1 column
#' @format A data frame with 8 rows and 2 columns
#'
#' @source The csv was manually created
"domain_list"
Expand All @@ -78,7 +78,8 @@
#' \item Navigate to the dataset of interest, select 'Download data' and
#' download the Structural Metadata file
#' \item Shorten name of downloaded file e.g. 360_NCCHD_Metadata.csv
#' \item \code{metadata <- read.csv("inst/inputs/360_NCCHD_Metadata.csv")}
#' \item \code{metadata <- read.csv(system.file('inputs',
#' '360_NCCHD_Metadata.csv', package = 'mapmetadata'))}
#' \item \code{usethis::use_data(metadata)}
#' }
#'
Expand All @@ -101,7 +102,8 @@
#' domain code rather than asking the user to categorise.\cr\cr
#' This data was created with these two steps:
#' \enumerate{
#' \item \code{look_up <- read.csv('inst/inputs/look_up.csv')}
#' \item \code{look_up <- read.csv(system.file('inputs',
#' 'look_up.csv',package = 'mapmetadata'))}
#' \item \code{usethis::use_data(look_up)}
#' }
#' @docType data
Expand All @@ -110,7 +112,7 @@
#'
#' @keywords internal
#'
#' @format A data frame with a variable number of rows and 3 columns
#' @format A data frame with a variable number of rows and 2 columns
#'
#' @source The csv was manually created
"look_up"
3 changes: 2 additions & 1 deletion R/globals.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@ utils::globalVariables(c("Column.description",
"no_count",
"note",
"total_variables",
"variable"))
"variable",
"Variable"))
54 changes: 37 additions & 17 deletions R/inputs_collect.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#' @importFrom cli cli_alert_info
#' @importFrom utils read.csv
#' @importFrom tools file_path_sans_ext
#' @importFrom dplyr left_join
#' @keywords internal
#' @dev generate help files for unexported objects, for developers

Expand Down Expand Up @@ -44,11 +45,11 @@ data_load <- function(metadata_file, domain_file, look_up_file, quiet = FALSE) {
# Verify the metadata file name pattern and that it is a .csv file
if (!grepl("^[0-9]+_.*_Metadata\\.csv$", metadata_base) ||
tools::file_ext(metadata_file) != "csv") {
stop(paste("Metadata file name must be a .csv file in the format",
stop(paste("metadata_file name must be a .csv file in the format",
"ID_Name_Metadata.csv where ID is an integer"))
} else {
if (!file.exists(metadata_file)) {
stop("Metadata filename is the correct format but it does not exist!")
stop("metadata_file is the correct filename but it does not exist!")
}
}

Expand All @@ -57,26 +58,35 @@ data_load <- function(metadata_file, domain_file, look_up_file, quiet = FALSE) {
column_names <- colnames(metadata)
expected_column_names <- c("Section", "Column.name", "Data.type",
"Column.description", "Sensitive")
if (!all(column_names == expected_column_names)) {
stop("Metadata file does not have expected column names")
if (!identical(sort(column_names), sort(expected_column_names))) {
stop("metadata_file does not have expected column names")
}

metadata_base_0suffix <- sub("_Metadata.csv$", "", metadata_base)
metadata_desc <- gsub(" ", "", metadata_base_0suffix)

# Check if the domain_file is a csv and has one column
# Check if the domain_file is a csv and has two columns
if (file.exists(domain_file) && tools::file_ext(domain_file) == "csv") {
domains <- read.csv(domain_file, header = FALSE)
if (ncol(domains) == 1) {
domains <- read.csv(domain_file)
column_names <- colnames(domains)
expected_column_names <- c("Domain_Code", "Domain_Name")
if (identical(sort(column_names), sort(expected_column_names))) {
domain_list_desc <- file_path_sans_ext(basename(domain_file))
} else {
stop("The domain_file should only have one column.")
stop("domain_file does not have the expected column names")
}
} else {
stop("This domain_file does not exist or is not in csv format.")
stop("domain_file does not exist or is not in csv format.")
}
}

# Check the domain_file 'Code' column is correct
code_expected <- as.integer(seq(1, nrow(domains), 1))
if (!identical(code_expected, domains$Domain_Code)) {
stop(paste("'Code' column in domain_file is not as expected.\n",
"Expected 1:", nrow(domains)))
}

# Collect look up table
if (is.null(look_up_file)) {
if (!quiet) {
Expand All @@ -86,15 +96,25 @@ data_load <- function(metadata_file, domain_file, look_up_file, quiet = FALSE) {
} else {
if (!quiet) {
cli_alert_info("Using look up file inputted by user")
if (file.exists(look_up_file) && tools::file_ext(look_up_file) == "csv") {
lookup <- read.csv(look_up_file)
expected_column_names <- c("variable", "domain_label", "domain_code")
if (!all(colnames(lookup) == expected_column_names)) {
stop("Look_up file does not have expected column names")
}
} else {
stop("This look_up_file does not exist or is not in csv format.")
}
# Check look_up file given by user
if (file.exists(look_up_file) && tools::file_ext(look_up_file) == "csv") {
lookup <- read.csv(look_up_file)
expected_column_names <- c("Variable", "Domain_Name")
if (!all(colnames(lookup) == expected_column_names)) {
stop("look_up file does not have expected column names")
}
# Add Domain_Code column into lookup table
lookup <- lookup %>% left_join(domains, by = "Domain_Name")
# Check for look_up rows not covered by domain_list
no_match <- lookup[!lookup$Domain_Name %in% domains$Domain_Name, ]
if (nrow(no_match) != 0) {
warning(paste("There are domain names in the look_up_file that are not",
"included in the domain_file. If this is not expected,",
"check for mistakes."))
}
} else {
stop("look_up_file does not exist or is not in csv format.")
}
}

Expand Down
55 changes: 19 additions & 36 deletions R/map_compare.R
Original file line number Diff line number Diff line change
Expand Up @@ -81,46 +81,28 @@ map_compare <- function(session_dir,
stop("Cannot locate all four input files.")
}

# Read csv files
## Check if domain_file exists
if (!file.exists(domain_file)) {
stop("Cannot locate domain_file.")
}

## Check if metadata_file exists
if (!file.exists(metadata_file)) {
stop("Cannot locate metadata_file.")
}

# Read files
csv_1a <- read.csv(csv_1a_path)
csv_2a <- read.csv(csv_2a_path)
csv_1b <- read.csv(csv_1b_path)
csv_2b <- read.csv(csv_2b_path)

# Verify the metadata file name pattern and that it is a .csv file
metadata_base <- basename(metadata_file)
if (!grepl("^[0-9]+_.*_Metadata\\.csv$", metadata_base) ||
tools::file_ext(metadata_file) != "csv") {
stop(paste("Metadata file name must be a .csv file in the format",
"ID_Name_Metadata.csv where ID is an integer"))
} else {
if (!file.exists(metadata_file)) {
stop("Metadata filename is the correct format but it does not exist!")
}
}

# Check if metadata column names match what is expected
metadata <- read.csv(metadata_file)
column_names <- colnames(metadata)
expected_column_names <- c("Section", "Column.name", "Data.type",
"Column.description", "Sensitive")
if (!all(column_names == expected_column_names)) {
stop("Metadata file does not have expected column names")
}

metadata_base <- basename(metadata_file)
metadata_base_0suffix <- sub("_Metadata.csv$", "", metadata_base)
metadata_desc <- gsub(" ", "", metadata_base_0suffix)

# Check if the domain_file is a csv and has one column
if (file.exists(domain_file) && tools::file_ext(domain_file) == "csv") {
domains <- read.csv(domain_file, header = FALSE)
if (ncol(domains) == 1) {
} else {
stop("The domain_file should only have one column.")
}
} else {
stop("This domain_file does not exist or is not in csv format.")
}
domains <- read.csv(domain_file)

# CHECK IF A VALID COMPARISON BETWEEN SESSIONS IS POSSIBLE ----

Expand Down Expand Up @@ -152,7 +134,7 @@ map_compare <- function(session_dir,
input_1 = nrow(csv_1b),
input_2 = nrow(csv_2b),
severity = "danger",
severity_text = "Different number of variavles!"
severity_text = "Different number of variables!"
)

## Check if sessions can be compared (warnings for user to check):
Expand All @@ -166,8 +148,10 @@ map_compare <- function(session_dir,

# DISPLAY TO USER ----

## Use 'ref_plot.R' to plot domains for the user's ref (save df for later use)
df_plots <- ref_plot(domains)
## Extract domains and plot for user's reference
domain_table <- tableGrob(domains, rows = NULL)
grid.arrange(domain_table, nrow = 1, ncol = 1)
n_codes <- nrow(domains)

# EXTRACT TABLE INFO FROM METADATA ----
table_name <- csv_1a$table[1]
Expand All @@ -182,8 +166,7 @@ map_compare <- function(session_dir,

# FIND MISMATCHES AND ASK FOR CONSENSUS DECISION ----
for (variable in seq_len(nrow(ses_join))) {
consensus <- consensus_on_mismatch(ses_join, table_df, variable,
max(df_plots$code$code))
consensus <- consensus_on_mismatch(ses_join, table_df, variable, n_codes)
ses_join$domain_code_join[variable] <- consensus$domain_code_join
ses_join$note_join[variable] <- consensus$note_join
} # end of loop for variable
Expand Down
34 changes: 15 additions & 19 deletions R/metadata_map.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,12 @@
#' variables, and categorisations for the same variable can be copied from one
#' table to another. \cr \cr
#' Example inputs are provided within the package data, for the user to run this
#' function in a demo mode.
#' function in a demo mode. Refer to the package website for more guidance.
#' @param metadata_file This should be a csv download from HDRUK gateway
#' (in the form of ID_Dataset_Metadata.csv). Run '?mapmetadata::metadata' to
#' see how the metadata_file for the demo was created.
#' @param domain_file This should be a csv file created by the user, with each
#' domain on a separate line, no header. Run '?mapmetadata::domain_list' to
#' @param domain_file This should be a csv file created by the user, with two
#' columns (Domain_Code and Domain_Name). Run '?mapmetadata::domain_list' to
#' see how the domain_file for the demo was created.
#' @param look_up_file The lookup file makes auto-categorisations intended for
#' variables that appear regularly in health datasets. It only works for 1:1
Expand All @@ -41,8 +41,8 @@
#' 'L-OUTPUT_' which gives each categorisation its own row. Default is TRUE.
#' @param quiet Default is FALSE. Change to TRUE to quiet the cli_alert_info
#' and cli_alert_success messages.
#' @return The function will return two csv files: 'OUTPUT_' which contains the
#' mappings and 'LOG_' which contains details about the dataset and session.
#' @return A html plot summarising the dataset. Various csv and png outputs to
#' summarise the user's mapping session for a specific table in the dataset.
#' @examples
#' # Demo run requires no function inputs but requires user interaction.
#' # See package documentation to guide user inputs.
Expand All @@ -56,6 +56,8 @@
#' @importFrom utils packageVersion write.csv browseURL menu select.list
#' @importFrom ggplot2 ggsave
#' @importFrom htmlwidgets saveWidget
#' @importFrom gridExtra tableGrob grid.arrange
#' @importFrom graphics plot.new

metadata_map <- function(
metadata_file = NULL,
Expand Down Expand Up @@ -153,16 +155,10 @@
log_output_df <- get("log_output_df")
output_df <- get("output_df")

## Use 'ref_plot.R' to plot domains for the user's ref (save df for later use)
df_plots <- ref_plot(data$domains)

## Check if look_up_file and domain_file are compatible
mismatch <- setdiff(data$lookup$domain_code, df_plots$code$code)
if (length(mismatch) > 0) {
print(mismatch)
stop(paste("The look_up_file and domain_file are not compatible. These",
"look up codes are not listed in the domain codes:\n"))
}
## Extract domains and plot for user's reference
domain_table <- tableGrob(data$domains, rows = NULL)
grid.arrange(domain_table, nrow = 1, ncol = 1)
n_codes <- nrow(data$domains)

## CHOOSE TABLE TO PROCESS

Expand Down Expand Up @@ -207,7 +203,7 @@
df_prev_exist,
df_prev,
lookup = data$lookup,
df_plots,
n_codes,
output_df
)

Expand Down Expand Up @@ -239,7 +235,7 @@
table_df$Column.name[v_auto],
table_df$Column.description[v_auto],
table_df$Data.type[v_auto],
max(df_plots$code$code)
n_codes

Check warning on line 238 in R/metadata_map.R

View check run for this annotation

Codecov / codecov/patch

R/metadata_map.R#L238

Added line #L238 was not covered by tests
)
##### input user responses into output
output_df$domain_code[v_auto] <- decision_output$decision
Expand Down Expand Up @@ -277,7 +273,7 @@
table_df$Column.name[v_not_auto],
table_df$Column.description[v_not_auto],
table_df$Data.type[v_not_auto],
max(df_plots$code$code)
n_codes

Check warning on line 276 in R/metadata_map.R

View check run for this annotation

Codecov / codecov/patch

R/metadata_map.R#L276

Added line #L276 was not covered by tests
)
##### input user responses into output
output_df$domain_code[v_not_auto] <- decision_output$decision
Expand Down Expand Up @@ -320,7 +316,7 @@

### Create and save a summary plot
end_plot_save <- end_plot(df = output_df, table_name,
ref_table = df_plots$domain_table)
ref_table = domain_table)
ggsave(
plot = end_plot_save,
filename = png_path,
Expand Down
25 changes: 0 additions & 25 deletions R/plotting.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,31 +32,6 @@ empty_plot <- function(dataframe, bar_title) {
barplot_html
}

#' Internal: ref_plot
#'
#' This function is called within the metadata_map function. \cr \cr
#' It plots a reference table to guide the user in their categorisation of
#' domains. \cr \cr
#' This reference table is based on the user inputted domains and the default
#' domains provided by this package. \cr \cr
#' @param domains The output of load_data
#' @return A reference table that appears in the Plots tab. A list of 2
#' containing the derivatives for this plot, used later in metadata_map'
#' @importFrom gridExtra tableGrob grid.arrange
#' @importFrom graphics plot.new
#' @keywords internal
#' @dev generate help files for unexported objects, for developers

ref_plot <- function(domains) {
colnames(domains)[1] <- "Domain Name"
plot.new()
code <- data.frame(code = seq_len(nrow(domains)))
domain_table <- tableGrob(cbind(code, domains), rows = NULL)
grid.arrange(domain_table, nrow = 1, ncol = 1)

return(list(code = code, domain_table = domain_table))
}

#' Internal: end_plot
#'
#' This function is called within the metadata_map function. \cr \cr
Expand Down
Loading
Loading