Skip to content

Commit 6fc615a

Browse files
authored
Merge pull request #200 from aim-rsf/improve_lookup_table
Simplify domain_list and look_up for user and code base
2 parents 687baa9 + 6386539 commit 6fc615a

25 files changed

+295
-324
lines changed

.Rbuildignore

-1
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,5 @@
2222
^man/data_load\.Rd$
2323
^man/output_copy\.Rd$
2424
^man/empty_plot\.Rd$
25-
^man/ref_plot\.Rd$
2625
^man/user_categorisation\.Rd$
2726
^man/user_categorisation_loop\.Rd$

R/data-package_data.R

+8-6
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,8 @@
5454
#' metadata_map \cr \cr
5555
#' This data was created with these two steps:
5656
#' \enumerate{
57-
#' \item \code{domain_list <-
58-
#' read.csv('inst/inputs/domain_list_demo.csv',header=FALSE)}
57+
#' \item \code{domain_list <- read.csv(system.file('inputs',
58+
#' 'domain_list_demo.csv', package = 'mapmetadata'))}
5959
#' \item \code{usethis::use_data(domain_list)}
6060
#' }
6161
#' @docType data
@@ -64,7 +64,7 @@
6464
#'
6565
#' @keywords internal
6666
#'
67-
#' @format A data frame with 8 rows and 1 column
67+
#' @format A data frame with 8 rows and 2 columns
6868
#'
6969
#' @source The csv was manually created
7070
"domain_list"
@@ -78,7 +78,8 @@
7878
#' \item Navigate to the dataset of interest, select 'Download data' and
7979
#' download the Structural Metadata file
8080
#' \item Shorten name of downloaded file e.g. 360_NCCHD_Metadata.csv
81-
#' \item \code{metadata <- read.csv("inst/inputs/360_NCCHD_Metadata.csv")}
81+
#' \item \code{metadata <- read.csv(system.file('inputs',
82+
#' '360_NCCHD_Metadata.csv', package = 'mapmetadata'))}
8283
#' \item \code{usethis::use_data(metadata)}
8384
#' }
8485
#'
@@ -101,7 +102,8 @@
101102
#' domain code rather than asking the user to categorise.\cr\cr
102103
#' This data was created with these two steps:
103104
#' \enumerate{
104-
#' \item \code{look_up <- read.csv('inst/inputs/look_up.csv')}
105+
#' \item \code{look_up <- read.csv(system.file('inputs',
106+
#' 'look_up.csv',package = 'mapmetadata'))}
105107
#' \item \code{usethis::use_data(look_up)}
106108
#' }
107109
#' @docType data
@@ -110,7 +112,7 @@
110112
#'
111113
#' @keywords internal
112114
#'
113-
#' @format A data frame with a variable number of rows and 3 columns
115+
#' @format A data frame with a variable number of rows and 2 columns
114116
#'
115117
#' @source The csv was manually created
116118
"look_up"

R/globals.R

+2-1
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,5 @@ utils::globalVariables(c("Column.description",
66
"no_count",
77
"note",
88
"total_variables",
9-
"variable"))
9+
"variable",
10+
"Variable"))

R/inputs_collect.R

+37-17
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#' @importFrom cli cli_alert_info
1515
#' @importFrom utils read.csv
1616
#' @importFrom tools file_path_sans_ext
17+
#' @importFrom dplyr left_join
1718
#' @keywords internal
1819
#' @dev generate help files for unexported objects, for developers
1920

@@ -44,11 +45,11 @@ data_load <- function(metadata_file, domain_file, look_up_file, quiet = FALSE) {
4445
# Verify the metadata file name pattern and that it is a .csv file
4546
if (!grepl("^[0-9]+_.*_Metadata\\.csv$", metadata_base) ||
4647
tools::file_ext(metadata_file) != "csv") {
47-
stop(paste("Metadata file name must be a .csv file in the format",
48+
stop(paste("metadata_file name must be a .csv file in the format",
4849
"ID_Name_Metadata.csv where ID is an integer"))
4950
} else {
5051
if (!file.exists(metadata_file)) {
51-
stop("Metadata filename is the correct format but it does not exist!")
52+
stop("metadata_file is the correct filename but it does not exist!")
5253
}
5354
}
5455

@@ -57,26 +58,35 @@ data_load <- function(metadata_file, domain_file, look_up_file, quiet = FALSE) {
5758
column_names <- colnames(metadata)
5859
expected_column_names <- c("Section", "Column.name", "Data.type",
5960
"Column.description", "Sensitive")
60-
if (!all(column_names == expected_column_names)) {
61-
stop("Metadata file does not have expected column names")
61+
if (!identical(sort(column_names), sort(expected_column_names))) {
62+
stop("metadata_file does not have expected column names")
6263
}
6364

6465
metadata_base_0suffix <- sub("_Metadata.csv$", "", metadata_base)
6566
metadata_desc <- gsub(" ", "", metadata_base_0suffix)
6667

67-
# Check if the domain_file is a csv and has one column
68+
# Check if the domain_file is a csv and has two columns
6869
if (file.exists(domain_file) && tools::file_ext(domain_file) == "csv") {
69-
domains <- read.csv(domain_file, header = FALSE)
70-
if (ncol(domains) == 1) {
70+
domains <- read.csv(domain_file)
71+
column_names <- colnames(domains)
72+
expected_column_names <- c("Domain_Code", "Domain_Name")
73+
if (identical(sort(column_names), sort(expected_column_names))) {
7174
domain_list_desc <- file_path_sans_ext(basename(domain_file))
7275
} else {
73-
stop("The domain_file should only have one column.")
76+
stop("domain_file does not have the expected column names")
7477
}
7578
} else {
76-
stop("This domain_file does not exist or is not in csv format.")
79+
stop("domain_file does not exist or is not in csv format.")
7780
}
7881
}
7982

83+
# Check the domain_file 'Code' column is correct
84+
code_expected <- as.integer(seq(1, nrow(domains), 1))
85+
if (!identical(code_expected, domains$Domain_Code)) {
86+
stop(paste("'Code' column in domain_file is not as expected.\n",
87+
"Expected 1:", nrow(domains)))
88+
}
89+
8090
# Collect look up table
8191
if (is.null(look_up_file)) {
8292
if (!quiet) {
@@ -86,15 +96,25 @@ data_load <- function(metadata_file, domain_file, look_up_file, quiet = FALSE) {
8696
} else {
8797
if (!quiet) {
8898
cli_alert_info("Using look up file inputted by user")
89-
if (file.exists(look_up_file) && tools::file_ext(look_up_file) == "csv") {
90-
lookup <- read.csv(look_up_file)
91-
expected_column_names <- c("variable", "domain_label", "domain_code")
92-
if (!all(colnames(lookup) == expected_column_names)) {
93-
stop("Look_up file does not have expected column names")
94-
}
95-
} else {
96-
stop("This look_up_file does not exist or is not in csv format.")
99+
}
100+
# Check look_up file given by user
101+
if (file.exists(look_up_file) && tools::file_ext(look_up_file) == "csv") {
102+
lookup <- read.csv(look_up_file)
103+
expected_column_names <- c("Variable", "Domain_Name")
104+
if (!all(colnames(lookup) == expected_column_names)) {
105+
stop("look_up file does not have expected column names")
106+
}
107+
# Add Domain_Code column into lookup table
108+
lookup <- lookup %>% left_join(domains, by = "Domain_Name")
109+
# Check for look_up rows not covered by domain_list
110+
no_match <- lookup[!lookup$Domain_Name %in% domains$Domain_Name, ]
111+
if (nrow(no_match) != 0) {
112+
warning(paste("There are domain names in the look_up_file that are not",
113+
"included in the domain_file. If this is not expected,",
114+
"check for mistakes."))
97115
}
116+
} else {
117+
stop("look_up_file does not exist or is not in csv format.")
98118
}
99119
}
100120

R/map_compare.R

+19-36
Original file line numberDiff line numberDiff line change
@@ -81,46 +81,28 @@ map_compare <- function(session_dir,
8181
stop("Cannot locate all four input files.")
8282
}
8383

84-
# Read csv files
84+
## Check if domain_file exists
85+
if (!file.exists(domain_file)) {
86+
stop("Cannot locate domain_file.")
87+
}
88+
89+
## Check if metadata_file exists
90+
if (!file.exists(metadata_file)) {
91+
stop("Cannot locate metadata_file.")
92+
}
93+
94+
# Read files
8595
csv_1a <- read.csv(csv_1a_path)
8696
csv_2a <- read.csv(csv_2a_path)
8797
csv_1b <- read.csv(csv_1b_path)
8898
csv_2b <- read.csv(csv_2b_path)
8999

90-
# Verify the metadata file name pattern and that it is a .csv file
91-
metadata_base <- basename(metadata_file)
92-
if (!grepl("^[0-9]+_.*_Metadata\\.csv$", metadata_base) ||
93-
tools::file_ext(metadata_file) != "csv") {
94-
stop(paste("Metadata file name must be a .csv file in the format",
95-
"ID_Name_Metadata.csv where ID is an integer"))
96-
} else {
97-
if (!file.exists(metadata_file)) {
98-
stop("Metadata filename is the correct format but it does not exist!")
99-
}
100-
}
101-
102-
# Check if metadata column names match what is expected
103100
metadata <- read.csv(metadata_file)
104-
column_names <- colnames(metadata)
105-
expected_column_names <- c("Section", "Column.name", "Data.type",
106-
"Column.description", "Sensitive")
107-
if (!all(column_names == expected_column_names)) {
108-
stop("Metadata file does not have expected column names")
109-
}
110-
101+
metadata_base <- basename(metadata_file)
111102
metadata_base_0suffix <- sub("_Metadata.csv$", "", metadata_base)
112103
metadata_desc <- gsub(" ", "", metadata_base_0suffix)
113104

114-
# Check if the domain_file is a csv and has one column
115-
if (file.exists(domain_file) && tools::file_ext(domain_file) == "csv") {
116-
domains <- read.csv(domain_file, header = FALSE)
117-
if (ncol(domains) == 1) {
118-
} else {
119-
stop("The domain_file should only have one column.")
120-
}
121-
} else {
122-
stop("This domain_file does not exist or is not in csv format.")
123-
}
105+
domains <- read.csv(domain_file)
124106

125107
# CHECK IF A VALID COMPARISON BETWEEN SESSIONS IS POSSIBLE ----
126108

@@ -152,7 +134,7 @@ map_compare <- function(session_dir,
152134
input_1 = nrow(csv_1b),
153135
input_2 = nrow(csv_2b),
154136
severity = "danger",
155-
severity_text = "Different number of variavles!"
137+
severity_text = "Different number of variables!"
156138
)
157139

158140
## Check if sessions can be compared (warnings for user to check):
@@ -166,8 +148,10 @@ map_compare <- function(session_dir,
166148

167149
# DISPLAY TO USER ----
168150

169-
## Use 'ref_plot.R' to plot domains for the user's ref (save df for later use)
170-
df_plots <- ref_plot(domains)
151+
## Extract domains and plot for user's reference
152+
domain_table <- tableGrob(domains, rows = NULL)
153+
grid.arrange(domain_table, nrow = 1, ncol = 1)
154+
n_codes <- nrow(domains)
171155

172156
# EXTRACT TABLE INFO FROM METADATA ----
173157
table_name <- csv_1a$table[1]
@@ -182,8 +166,7 @@ map_compare <- function(session_dir,
182166

183167
# FIND MISMATCHES AND ASK FOR CONSENSUS DECISION ----
184168
for (variable in seq_len(nrow(ses_join))) {
185-
consensus <- consensus_on_mismatch(ses_join, table_df, variable,
186-
max(df_plots$code$code))
169+
consensus <- consensus_on_mismatch(ses_join, table_df, variable, n_codes)
187170
ses_join$domain_code_join[variable] <- consensus$domain_code_join
188171
ses_join$note_join[variable] <- consensus$note_join
189172
} # end of loop for variable

R/metadata_map.R

+15-19
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,12 @@ select.list <- NULL
1515
#' variables, and categorisations for the same variable can be copied from one
1616
#' table to another. \cr \cr
1717
#' Example inputs are provided within the package data, for the user to run this
18-
#' function in a demo mode.
18+
#' function in a demo mode. Refer to the package website for more guidance.
1919
#' @param metadata_file This should be a csv download from HDRUK gateway
2020
#' (in the form of ID_Dataset_Metadata.csv). Run '?mapmetadata::metadata' to
2121
#' see how the metadata_file for the demo was created.
22-
#' @param domain_file This should be a csv file created by the user, with each
23-
#' domain on a separate line, no header. Run '?mapmetadata::domain_list' to
22+
#' @param domain_file This should be a csv file created by the user, with two
23+
#' columns (Domain_Code and Domain_Name). Run '?mapmetadata::domain_list' to
2424
#' see how the domain_file for the demo was created.
2525
#' @param look_up_file The lookup file makes auto-categorisations intended for
2626
#' variables that appear regularly in health datasets. It only works for 1:1
@@ -41,8 +41,8 @@ select.list <- NULL
4141
#' 'L-OUTPUT_' which gives each categorisation its own row. Default is TRUE.
4242
#' @param quiet Default is FALSE. Change to TRUE to quiet the cli_alert_info
4343
#' and cli_alert_success messages.
44-
#' @return The function will return two csv files: 'OUTPUT_' which contains the
45-
#' mappings and 'LOG_' which contains details about the dataset and session.
44+
#' @return A html plot summarising the dataset. Various csv and png outputs to
45+
#' summarise the user's mapping session for a specific table in the dataset.
4646
#' @examples
4747
#' # Demo run requires no function inputs but requires user interaction.
4848
#' # See package documentation to guide user inputs.
@@ -56,6 +56,8 @@ select.list <- NULL
5656
#' @importFrom utils packageVersion write.csv browseURL menu select.list
5757
#' @importFrom ggplot2 ggsave
5858
#' @importFrom htmlwidgets saveWidget
59+
#' @importFrom gridExtra tableGrob grid.arrange
60+
#' @importFrom graphics plot.new
5961

6062
metadata_map <- function(
6163
metadata_file = NULL,
@@ -153,16 +155,10 @@ metadata_map <- function(
153155
log_output_df <- get("log_output_df")
154156
output_df <- get("output_df")
155157

156-
## Use 'ref_plot.R' to plot domains for the user's ref (save df for later use)
157-
df_plots <- ref_plot(data$domains)
158-
159-
## Check if look_up_file and domain_file are compatible
160-
mismatch <- setdiff(data$lookup$domain_code, df_plots$code$code)
161-
if (length(mismatch) > 0) {
162-
print(mismatch)
163-
stop(paste("The look_up_file and domain_file are not compatible. These",
164-
"look up codes are not listed in the domain codes:\n"))
165-
}
158+
## Extract domains and plot for user's reference
159+
domain_table <- tableGrob(data$domains, rows = NULL)
160+
grid.arrange(domain_table, nrow = 1, ncol = 1)
161+
n_codes <- nrow(data$domains)
166162

167163
## CHOOSE TABLE TO PROCESS
168164

@@ -207,7 +203,7 @@ metadata_map <- function(
207203
df_prev_exist,
208204
df_prev,
209205
lookup = data$lookup,
210-
df_plots,
206+
n_codes,
211207
output_df
212208
)
213209

@@ -239,7 +235,7 @@ metadata_map <- function(
239235
table_df$Column.name[v_auto],
240236
table_df$Column.description[v_auto],
241237
table_df$Data.type[v_auto],
242-
max(df_plots$code$code)
238+
n_codes
243239
)
244240
##### input user responses into output
245241
output_df$domain_code[v_auto] <- decision_output$decision
@@ -277,7 +273,7 @@ metadata_map <- function(
277273
table_df$Column.name[v_not_auto],
278274
table_df$Column.description[v_not_auto],
279275
table_df$Data.type[v_not_auto],
280-
max(df_plots$code$code)
276+
n_codes
281277
)
282278
##### input user responses into output
283279
output_df$domain_code[v_not_auto] <- decision_output$decision
@@ -320,7 +316,7 @@ metadata_map <- function(
320316

321317
### Create and save a summary plot
322318
end_plot_save <- end_plot(df = output_df, table_name,
323-
ref_table = df_plots$domain_table)
319+
ref_table = domain_table)
324320
ggsave(
325321
plot = end_plot_save,
326322
filename = png_path,

R/plotting.R

-25
Original file line numberDiff line numberDiff line change
@@ -32,31 +32,6 @@ empty_plot <- function(dataframe, bar_title) {
3232
barplot_html
3333
}
3434

35-
#' Internal: ref_plot
36-
#'
37-
#' This function is called within the metadata_map function. \cr \cr
38-
#' It plots a reference table to guide the user in their categorisation of
39-
#' domains. \cr \cr
40-
#' This reference table is based on the user inputted domains and the default
41-
#' domains provided by this package. \cr \cr
42-
#' @param domains The output of load_data
43-
#' @return A reference table that appears in the Plots tab. A list of 2
44-
#' containing the derivatives for this plot, used later in metadata_map'
45-
#' @importFrom gridExtra tableGrob grid.arrange
46-
#' @importFrom graphics plot.new
47-
#' @keywords internal
48-
#' @dev generate help files for unexported objects, for developers
49-
50-
ref_plot <- function(domains) {
51-
colnames(domains)[1] <- "Domain Name"
52-
plot.new()
53-
code <- data.frame(code = seq_len(nrow(domains)))
54-
domain_table <- tableGrob(cbind(code, domains), rows = NULL)
55-
grid.arrange(domain_table, nrow = 1, ncol = 1)
56-
57-
return(list(code = code, domain_table = domain_table))
58-
}
59-
6035
#' Internal: end_plot
6136
#'
6237
#' This function is called within the metadata_map function. \cr \cr

0 commit comments

Comments
 (0)