From f799e6cb1286be736cccf75ac4278caa0a7067e8 Mon Sep 17 00:00:00 2001 From: gbganalyst Date: Wed, 6 Mar 2024 10:58:24 +0100 Subject: [PATCH 1/2] updated the code repo --- DESCRIPTION | 5 +- NAMESPACE | 5 + R/bulkreadr-package.R | 6 +- R/fill_missing_values.R | 110 +++++++++---- R/onload.R | 2 +- R/to_date.R | 21 --- R/utils.R | 46 ++++++ README.Rmd | 4 +- README.md | 5 +- _pkgdown.yml | 9 +- man/fill_missing_values.Rd | 43 ++++-- tests/testthat/extdata/airquality_mean.csv | 84 +++++----- tests/testthat/extdata/airquality_median.csv | 154 ------------------- tests/testthat/test-fill_missing_values.R | 21 +-- tests/testthat/test-to_date.R | 10 -- tests/testthat/test-utils.R | 28 ++++ vignettes/bulkreadr.Rmd | 2 +- vignettes/other-functions.Rmd | 36 +++-- 18 files changed, 276 insertions(+), 315 deletions(-) delete mode 100644 R/to_date.R delete mode 100644 tests/testthat/extdata/airquality_median.csv delete mode 100644 tests/testthat/test-to_date.R create mode 100644 tests/testthat/test-utils.R diff --git a/DESCRIPTION b/DESCRIPTION index 4df9c3c..2d0b212 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -36,13 +36,16 @@ Imports: labelled, lubridate, magrittr, + methods, openxlsx, readr, readxl, + rlang, sjlabelled, stats, stringr, - tibble + tibble, + tidyr Suggests: knitr, rmarkdown, diff --git a/NAMESPACE b/NAMESPACE index 87f38ca..164715c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -15,6 +15,8 @@ export(read_spss_data) export(read_stata_data) importFrom(curl,has_internet) importFrom(dplyr,across) +importFrom(dplyr,case_when) +importFrom(dplyr,everything) importFrom(dplyr,group_by) importFrom(dplyr,group_split) importFrom(dplyr,mutate) @@ -35,6 +37,7 @@ importFrom(lubridate,as_date) importFrom(lubridate,is.Date) importFrom(lubridate,parse_date_time) importFrom(magrittr,"%>%") +importFrom(methods,as) importFrom(openxlsx,convertToDate) importFrom(purrr,map_df) importFrom(purrr,map_vec) @@ -42,7 +45,9 @@ importFrom(readr,read_csv) importFrom(readxl,excel_sheets) 
importFrom(readxl,read_excel) importFrom(readxl,read_xlsx) +importFrom(rlang,"%||%") importFrom(sjlabelled,label_to_colnames) importFrom(stats,median) importFrom(stringr,str_length) importFrom(tibble,tibble) +importFrom(tidyr,replace_na) diff --git a/R/bulkreadr-package.R b/R/bulkreadr-package.R index 0003998..b34b29b 100644 --- a/R/bulkreadr-package.R +++ b/R/bulkreadr-package.R @@ -4,7 +4,8 @@ ## usethis namespace: start #' #' @importFrom curl has_internet -#' @importFrom dplyr group_by group_split mutate across select rename +#' @importFrom dplyr group_by group_split mutate across select rename case_when +#' @importFrom dplyr everything #' @importFrom fs dir_ls #' @importFrom googlesheets4 gs4_has_token read_sheet sheet_names #' @importFrom haven read_sav read_dta as_factor is.labelled @@ -12,14 +13,17 @@ #' @importFrom labelled generate_dictionary lookfor #' @importFrom lubridate as_date is.Date parse_date_time #' @importFrom magrittr %>% +#' @importFrom methods as #' @importFrom openxlsx convertToDate #' @importFrom purrr map_df map_vec #' @importFrom readr read_csv #' @importFrom readxl excel_sheets read_excel read_xlsx +#' @importFrom rlang %||% #' @importFrom sjlabelled label_to_colnames #' @importFrom stats median #' @importFrom stringr str_length #' @importFrom tibble tibble +#' @importFrom tidyr replace_na ## usethis namespace: end NULL diff --git a/R/fill_missing_values.R b/R/fill_missing_values.R index 2d04b8e..7c53f49 100644 --- a/R/fill_missing_values.R +++ b/R/fill_missing_values.R @@ -1,13 +1,30 @@ -#' Fill missing values in a dataframe +#' Fill missing values in a data frame #' -#' `fill_missing_values()` is an efficient function that addresses missing values in a dataframe. It uses imputation by function, also known as column-based imputation, to fill numeric variables with the mean or median, and non-numeric variables with the mode. 
This approach ensures accurate and consistent replacements derived from individual columns, resulting in a complete and reliable dataset for improved analysis and decision-making. +#' `fill_missing_values()` is an efficient function that addresses missing +#' values in a data frame. It uses imputation by function, also known as +#' column-based imputation, to impute the missing values. For continuous +#' variables, it supports various methods of imputation, including minimum, +#' maximum, mean, median, harmonic mean, and geometric mean. For categorical +#' variables, missing values are replaced with the mode of the column. This +#' approach ensures accurate and consistent replacements derived from individual +#' columns, resulting in a complete and reliable dataset for improved analysis +#' and decision-making. #' -#' @param df The input dataframe to be processed. -#' @param use_mean Logical. If `TRUE`, missing values in numeric columns will be replaced with the mean. -#' If `FALSE`, missing values in numeric columns will be replaced with the median. +#' @param df A dataframe to process for missing value imputation. +#' +#' @param selected_variables An optional vector of variable names within `df` for +#' which missing values should be imputed. If `NULL` (default), imputation is +#' applied to all variables in the data frame. +#' +#' @param method A character string specifying the imputation method for continuous +#' variables. Supported methods are "min", "max", "mean", "median", "harmonic", +#' and "geometric". The default method is "mean". For categorical variables, the +#' mode is always used. +#' +#' @return A data frame with missing values imputed according to the specified `method`. #' -#' @return A dataframe with missing values filled. 
#' @export +#' #' @examples #' #' library(dplyr) @@ -22,19 +39,22 @@ #' NA, "virginica", "setosa") #' ) #' -#' # Using mean to fill missing values for numeric variables +#' # Impute using the mean method for continuous variables #' -#' result_df_mean <- fill_missing_values(df, use_mean = TRUE) +#' result_df_mean <- fill_missing_values(df, method = "mean") #' #' result_df_mean #' -#' # Using median to fill missing values for numeric variables +#' # Impute using the geometric mean for continuous variables and specify +#' # variables `Petal_Length` and `Petal_Width`. #' -#' result_df_median <- fill_missing_values(df, use_mean = FALSE) +#' result_df_geomean <- fill_missing_values(df, selected_variables = c +#' ("Petal_Length", "Petal_Width"), method = "geometric") #' -#' result_df_median +#' result_df_geomean #' #' # Impute missing values (NAs) in a grouped data frame +#' #' # You can do that by using the following: #' #' sample_iris <- tibble::tibble( @@ -48,28 +68,50 @@ #' sample_iris %>% #' group_by(Species) %>% #' group_split() %>% -#' map_df(fill_missing_values) -#' -#' -fill_missing_values <- function(df, use_mean = TRUE) { - if (missing(df)) { - stop("argument 'df' is missing, with no default") - } else { - # Loop over each column in the dataframe - for (col in names(df)) { - if (is.numeric(df[[col]])) { # Check if column is numeric - # Fill missing values with mean or median based on the flag 'use_mean' - if (use_mean) { - df[[col]][is.na(df[[col]])] <- mean(df[[col]], na.rm = TRUE) - } else { - df[[col]][is.na(df[[col]])] <- median(df[[col]], na.rm = TRUE) - } - } else { - # Fill missing values with mode - df[[col]][is.na(df[[col]])] <- names(which.max(table(df[[col]]))) - } - } - return(df) +#' map_df(fill_missing_values, method = "median") +#' +#' +fill_missing_values <- function(df, selected_variables = NULL, method = "mean") { + # Validate method input for continuous variables + valid_methods <- c("min", "max", "mean", "median", "harmonic", "geometric") + 
if (!(method %in% valid_methods)) { + stop("Invalid method. Choose from 'min', 'max', 'mean', 'median', 'harmonic', 'geometric'") } -} + # Calculate the replacement value based on the specified method + + impute_continuous <- function(x, method) { + if (!is.numeric(x)) { + return(x) + } # Skip non-numeric columns + + replacement_value <- switch(method, + min = min(x, na.rm = TRUE), + max = max(x, na.rm = TRUE), + mean = mean(x, na.rm = TRUE), + median = median(x, na.rm = TRUE), + harmonic = harmonic_mean(x), + geometric = geometric_mean(x), + x + ) # Default to return x as is + + + # Explicitly cast the replacement value to the same type as x + + replacement_value_casted <- as(replacement_value, class(x[!is.na(x)][1])) + + # Use the casted replacement value for NA replacement + + replace_na(x, replacement_value_casted) + } + + + df %>% + mutate(across( + .cols = {{ selected_variables }} %||% everything(), + .fns = ~ case_when( + is.numeric(.) ~ impute_continuous(., method), + TRUE ~ replace_na(., get_mode(.)) + ) + )) +} diff --git a/R/onload.R b/R/onload.R index 082cb99..8cca202 100644 --- a/R/onload.R +++ b/R/onload.R @@ -1,3 +1,3 @@ .onAttach <- function(libname, pkgname) { - packageStartupMessage('Welcome to bulkreadr package! To learn more, please run:\nvignette("bulkreadr")') + packageStartupMessage('Welcome to bulkreadr package! To learn more, please run:\nbrowseURL("https://gbganalyst.github.io/bulkreadr")\nto visit the package website.') } diff --git a/R/to_date.R b/R/to_date.R deleted file mode 100644 index f030631..0000000 --- a/R/to_date.R +++ /dev/null @@ -1,21 +0,0 @@ -to_date <- function(x, origin = "1900-01-01", ...) 
{ - if (is.Date(x)) { - return(x) - } - if(is.na(x)){ - return(as_date(x)) - } - if(class(x)[1] == "POSIXct"){ - return(as_date(x)) - } - if (str_length(x) >= 4 && is.na(as.numeric(x))) { - return(lubridate::parse_date_time(x, orders = c("dmy", "ymd", "mdy", "ym"))) - } - if (str_length(x) == 4) { - return(lubridate::parse_date_time(x, orders = "y")) - } - else { - return(openxlsx::convertToDate(x)) - } -} - diff --git a/R/utils.R b/R/utils.R index 9d91d9d..cba8de3 100644 --- a/R/utils.R +++ b/R/utils.R @@ -14,4 +14,50 @@ check_file <- function(path) { path } +# For date + +to_date <- function(x, origin = "1900-01-01", ...) { + if (is.Date(x)) { + return(x) + } + if(is.na(x)){ + return(as_date(x)) + } + if(class(x)[1] == "POSIXct"){ + return(as_date(x)) + } + if (str_length(x) >= 4 && is.na(as.numeric(x))) { + return(lubridate::parse_date_time(x, orders = c("dmy", "ymd", "mdy", "ym"))) + } + if (str_length(x) == 4) { + return(lubridate::parse_date_time(x, orders = "y")) + } + else { + return(openxlsx::convertToDate(x)) + } +} + + +# For descriptive statistics + +# Harmonic mean of the non-missing values of x (NAs are excluded from both the count and the reciprocal sum) +harmonic_mean <- function(x) { + n <- sum(!is.na(x)) + sum_reciprocal <- sum(1 / x, na.rm = TRUE) + n / sum_reciprocal +} + +# Geometric mean of the positive, non-missing values of x; returns NA when there are none +geometric_mean <- function(x) { + x_positive <- x[!is.na(x) & x > 0] # Ensure only positive, non-missing values are considered + if(length(x_positive) == 0) return(NA) # Avoid -Inf or NaN for non-positive sets + (prod(x_positive, na.rm = TRUE))^(1 / length(x_positive)) +} + +# Most frequent non-missing value of x, used for categorical imputation (ties go to the value seen first) +get_mode <- function(x) { + ux <- unique(x[!is.na(x)]) + ux[which.max(tabulate(match(x, ux)))][1] # [1] turns the empty result for an all-NA x into a typed NA +} + diff --git a/README.Rmd b/README.Rmd index b061c0d..032c5ee 100644 --- a/README.Rmd +++ b/README.Rmd @@ -62,7 +62,7 @@ if(!require("devtools")){ devtools::install_github("gbganalyst/bulkreadr") ``` -## How to load the package +## Usage Now that you have installed `bulkreadr` package, you can 
simply load it by using: @@ -70,6 +70,8 @@ Now that you have installed `bulkreadr` package, you can simply load it by using library(bulkreadr) ``` +To get started with `bulkreadr`, see the [articles](https://gbganalyst.github.io/bulkreadr/articles/index.html). + ## Context bulkreadr is designed to integrate with and augment the capabilities of established packages such as `readxl`, `readr`, and `googlesheets4`, offering enhanced functionality for reading bulk data within the R programming environment. diff --git a/README.md b/README.md index 9ce4910..3fe69b5 100644 --- a/README.md +++ b/README.md @@ -62,7 +62,7 @@ if(!require("devtools")){ devtools::install_github("gbganalyst/bulkreadr") ``` -## How to load the package +## Usage Now that you have installed `bulkreadr` package, you can simply load it by using: @@ -71,6 +71,9 @@ by using: library(bulkreadr) ``` +To get started with `bulkreadr`, see the +[articles](https://gbganalyst.github.io/bulkreadr/articles/index.html). + ## Context bulkreadr is designed to integrate with and augment the capabilities of diff --git a/_pkgdown.yml b/_pkgdown.yml index 3a3bfab..ab446b5 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -6,20 +6,20 @@ reference: - title: Spreadsheets desc: > - Functions that operate on spreasheets + Functions for reading bulk data in spreadsheets contents: - read_excel_workbook - read_excel_files_from_dir - title: Google Sheets desc: > - A function that operates on Google Sheets + A function for reading and appending data across multiple sheets within Google Sheets. contents: - read_gsheets - title: Flat files desc: > - A function that operates on csv files + A function designed to read and consolidate multiple CSV files within a specified directory. 
contents: - read_csv_files_from_dir @@ -48,7 +48,6 @@ reference: - fill_missing_values - inspect_na - articles: - title: Get started navbar: ~ @@ -65,7 +64,7 @@ navbar: href: index.html - text: "Functions" href: reference/index.html - - text: "Vignettes" + - text: "Articles" href: articles/index.html - text: "News" href: news/index.html diff --git a/man/fill_missing_values.Rd b/man/fill_missing_values.Rd index 550c5ad..cb730c9 100644 --- a/man/fill_missing_values.Rd +++ b/man/fill_missing_values.Rd @@ -2,21 +2,35 @@ % Please edit documentation in R/fill_missing_values.R \name{fill_missing_values} \alias{fill_missing_values} -\title{Fill missing values in a dataframe} +\title{Fill missing values in a data frame} \usage{ -fill_missing_values(df, use_mean = TRUE) +fill_missing_values(df, selected_variables = NULL, method = "mean") } \arguments{ -\item{df}{The input dataframe to be processed.} +\item{df}{A dataframe to process for missing value imputation.} -\item{use_mean}{Logical. If \code{TRUE}, missing values in numeric columns will be replaced with the mean. -If \code{FALSE}, missing values in numeric columns will be replaced with the median.} +\item{selected_variables}{An optional vector of variable names within \code{df} for +which missing values should be imputed. If \code{NULL} (default), imputation is +applied to all variables in the data frame.} + +\item{method}{A character string specifying the imputation method for continuous +variables. Supported methods are "min", "max", "mean", "median", "harmonic", +and "geometric". The default method is "mean". For categorical variables, the +mode is always used.} } \value{ -A dataframe with missing values filled. +A data frame with missing values imputed according to the specified \code{method}. } \description{ -\code{fill_missing_values()} is an efficient function that addresses missing values in a dataframe. 
It uses imputation by function, also known as column-based imputation, to fill numeric variables with the mean or median, and non-numeric variables with the mode. This approach ensures accurate and consistent replacements derived from individual columns, resulting in a complete and reliable dataset for improved analysis and decision-making. +\code{fill_missing_values()} is an efficient function that addresses missing +values in a data frame. It uses imputation by function, also known as +column-based imputation, to impute the missing values. For continuous +variables, it supports various methods of imputation, including minimum, +maximum, mean, median, harmonic mean, and geometric mean. For categorical +variables, missing values are replaced with the mode of the column. This +approach ensures accurate and consistent replacements derived from individual +columns, resulting in a complete and reliable dataset for improved analysis +and decision-making. } \examples{ @@ -32,19 +46,22 @@ Species = c("setosa", NA, "versicolor", "setosa", NA, "virginica", "setosa") ) -# Using mean to fill missing values for numeric variables +# Impute using the mean method for continuous variables -result_df_mean <- fill_missing_values(df, use_mean = TRUE) +result_df_mean <- fill_missing_values(df, method = "mean") result_df_mean -# Using median to fill missing values for numeric variables +# Impute using the geometric mean for continuous variables and specify +# variables `Petal_Length` and `Petal_Width`. 
-result_df_median <- fill_missing_values(df, use_mean = FALSE) +result_df_geomean <- fill_missing_values(df, selected_variables = c +("Petal_Length", "Petal_Width"), method = "geometric") -result_df_median +result_df_geomean # Impute missing values (NAs) in a grouped data frame + # You can do that by using the following: sample_iris <- tibble::tibble( @@ -58,7 +75,7 @@ Species = c("setosa", "setosa", "versicolor", "setosa", sample_iris \%>\% group_by(Species) \%>\% group_split() \%>\% -map_df(fill_missing_values) +map_df(fill_missing_values, method = "median") } diff --git a/tests/testthat/extdata/airquality_mean.csv b/tests/testthat/extdata/airquality_mean.csv index b9b181a..6ce4a8b 100644 --- a/tests/testthat/extdata/airquality_mean.csv +++ b/tests/testthat/extdata/airquality_mean.csv @@ -3,13 +3,13 @@ Ozone,Solar.R,Wind,Temp,Month,Day 36,118,8,72,5,2 12,149,12.6,74,5,3 18,313,11.5,62,5,4 -42.12931034482759,185.93150684931507,14.3,56,5,5 -28,185.93150684931507,14.9,66,5,6 +42,185,14.3,56,5,5 +28,185,14.9,66,5,6 23,299,8.6,65,5,7 19,99,13.8,59,5,8 8,19,20.1,61,5,9 -42.12931034482759,194,8.6,69,5,10 -7,185.93150684931507,6.9,74,5,11 +42,194,8.6,69,5,10 +7,185,6.9,74,5,11 16,256,9.7,69,5,12 11,290,9.2,66,5,13 14,274,10.9,68,5,14 @@ -23,57 +23,57 @@ Ozone,Solar.R,Wind,Temp,Month,Day 11,320,16.6,73,5,22 4,25,9.7,61,5,23 32,92,12,61,5,24 -42.12931034482759,66,16.6,57,5,25 -42.12931034482759,266,14.9,58,5,26 -42.12931034482759,185.93150684931507,8,57,5,27 +42,66,16.6,57,5,25 +42,266,14.9,58,5,26 +42,185,8,57,5,27 23,13,12,67,5,28 45,252,14.9,81,5,29 115,223,5.7,79,5,30 37,279,7.4,76,5,31 -42.12931034482759,286,8.6,78,6,1 -42.12931034482759,287,9.7,74,6,2 -42.12931034482759,242,16.1,67,6,3 -42.12931034482759,186,9.2,84,6,4 -42.12931034482759,220,8.6,85,6,5 -42.12931034482759,264,14.3,79,6,6 +42,286,8.6,78,6,1 +42,287,9.7,74,6,2 +42,242,16.1,67,6,3 +42,186,9.2,84,6,4 +42,220,8.6,85,6,5 +42,264,14.3,79,6,6 29,127,9.7,82,6,7 -42.12931034482759,273,6.9,87,6,8 
+42,273,6.9,87,6,8 71,291,13.8,90,6,9 39,323,11.5,87,6,10 -42.12931034482759,259,10.9,93,6,11 -42.12931034482759,250,9.2,92,6,12 +42,259,10.9,93,6,11 +42,250,9.2,92,6,12 23,148,8,82,6,13 -42.12931034482759,332,13.8,80,6,14 -42.12931034482759,322,11.5,79,6,15 +42,332,13.8,80,6,14 +42,322,11.5,79,6,15 21,191,14.9,77,6,16 37,284,20.7,72,6,17 20,37,9.2,65,6,18 12,120,11.5,73,6,19 13,137,10.3,76,6,20 -42.12931034482759,150,6.3,77,6,21 -42.12931034482759,59,1.7,76,6,22 -42.12931034482759,91,4.6,76,6,23 -42.12931034482759,250,6.3,76,6,24 -42.12931034482759,135,8,75,6,25 -42.12931034482759,127,8,78,6,26 -42.12931034482759,47,10.3,73,6,27 -42.12931034482759,98,11.5,80,6,28 -42.12931034482759,31,14.9,77,6,29 -42.12931034482759,138,8,83,6,30 +42,150,6.3,77,6,21 +42,59,1.7,76,6,22 +42,91,4.6,76,6,23 +42,250,6.3,76,6,24 +42,135,8,75,6,25 +42,127,8,78,6,26 +42,47,10.3,73,6,27 +42,98,11.5,80,6,28 +42,31,14.9,77,6,29 +42,138,8,83,6,30 135,269,4.1,84,7,1 49,248,9.2,85,7,2 32,236,9.2,81,7,3 -42.12931034482759,101,10.9,84,7,4 +42,101,10.9,84,7,4 64,175,4.6,83,7,5 40,314,10.9,83,7,6 77,276,5.1,88,7,7 97,267,6.3,92,7,8 97,272,5.7,92,7,9 85,175,7.4,89,7,10 -42.12931034482759,139,8.6,82,7,11 +42,139,8.6,82,7,11 10,264,14.3,73,7,12 27,175,14.9,81,7,13 -42.12931034482759,291,14.9,91,7,14 +42,291,14.9,91,7,14 7,48,14.3,80,7,15 48,260,6.9,81,7,16 35,274,10.3,82,7,17 @@ -81,8 +81,8 @@ Ozone,Solar.R,Wind,Temp,Month,Day 79,187,5.1,87,7,19 63,220,11.5,85,7,20 16,7,6.9,74,7,21 -42.12931034482759,258,9.7,81,7,22 -42.12931034482759,295,11.5,82,7,23 +42,258,9.7,81,7,22 +42,295,11.5,82,7,23 80,294,8.6,86,7,24 108,223,8,85,7,25 20,81,8.6,82,7,26 @@ -94,18 +94,18 @@ Ozone,Solar.R,Wind,Temp,Month,Day 39,83,6.9,81,8,1 9,24,13.8,81,8,2 16,77,7.4,82,8,3 -78,185.93150684931507,6.9,86,8,4 -35,185.93150684931507,7.4,85,8,5 -66,185.93150684931507,4.6,87,8,6 +78,185,6.9,86,8,4 +35,185,7.4,85,8,5 +66,185,4.6,87,8,6 122,255,4,89,8,7 89,229,10.3,90,8,8 110,207,8,90,8,9 -42.12931034482759,222,8.6,92,8,10 
-42.12931034482759,137,11.5,86,8,11 +42,222,8.6,92,8,10 +42,137,11.5,86,8,11 44,192,11.5,86,8,12 28,273,11.5,82,8,13 65,157,9.7,80,8,14 -42.12931034482759,64,11.5,79,8,15 +42,64,11.5,79,8,15 22,71,10.3,77,8,16 59,51,6.3,79,8,17 23,115,7.4,76,8,18 @@ -113,11 +113,11 @@ Ozone,Solar.R,Wind,Temp,Month,Day 44,190,10.3,78,8,20 21,259,15.5,77,8,21 9,36,14.3,72,8,22 -42.12931034482759,255,12.6,75,8,23 +42,255,12.6,75,8,23 45,212,9.7,79,8,24 168,238,3.4,81,8,25 73,215,8,86,8,26 -42.12931034482759,153,5.7,88,8,27 +42,153,5.7,88,8,27 76,203,9.7,97,8,28 118,225,2.3,94,8,29 84,237,6.3,96,8,30 @@ -148,7 +148,7 @@ Ozone,Solar.R,Wind,Temp,Month,Day 7,49,10.3,69,9,24 14,20,16.6,63,9,25 30,193,6.9,70,9,26 -42.12931034482759,145,13.2,77,9,27 +42,145,13.2,77,9,27 14,191,14.3,75,9,28 18,131,8,76,9,29 20,223,11.5,68,9,30 diff --git a/tests/testthat/extdata/airquality_median.csv b/tests/testthat/extdata/airquality_median.csv deleted file mode 100644 index 857338c..0000000 --- a/tests/testthat/extdata/airquality_median.csv +++ /dev/null @@ -1,154 +0,0 @@ -Ozone,Solar.R,Wind,Temp,Month,Day -41,190,7.4,67,5,1 -36,118,8,72,5,2 -12,149,12.6,74,5,3 -18,313,11.5,62,5,4 -31.5,205,14.3,56,5,5 -28,205,14.9,66,5,6 -23,299,8.6,65,5,7 -19,99,13.8,59,5,8 -8,19,20.1,61,5,9 -31.5,194,8.6,69,5,10 -7,205,6.9,74,5,11 -16,256,9.7,69,5,12 -11,290,9.2,66,5,13 -14,274,10.9,68,5,14 -18,65,13.2,58,5,15 -14,334,11.5,64,5,16 -34,307,12,66,5,17 -6,78,18.4,57,5,18 -30,322,11.5,68,5,19 -11,44,9.7,62,5,20 -1,8,9.7,59,5,21 -11,320,16.6,73,5,22 -4,25,9.7,61,5,23 -32,92,12,61,5,24 -31.5,66,16.6,57,5,25 -31.5,266,14.9,58,5,26 -31.5,205,8,57,5,27 -23,13,12,67,5,28 -45,252,14.9,81,5,29 -115,223,5.7,79,5,30 -37,279,7.4,76,5,31 -31.5,286,8.6,78,6,1 -31.5,287,9.7,74,6,2 -31.5,242,16.1,67,6,3 -31.5,186,9.2,84,6,4 -31.5,220,8.6,85,6,5 -31.5,264,14.3,79,6,6 -29,127,9.7,82,6,7 -31.5,273,6.9,87,6,8 -71,291,13.8,90,6,9 -39,323,11.5,87,6,10 -31.5,259,10.9,93,6,11 -31.5,250,9.2,92,6,12 -23,148,8,82,6,13 -31.5,332,13.8,80,6,14 
-31.5,322,11.5,79,6,15 -21,191,14.9,77,6,16 -37,284,20.7,72,6,17 -20,37,9.2,65,6,18 -12,120,11.5,73,6,19 -13,137,10.3,76,6,20 -31.5,150,6.3,77,6,21 -31.5,59,1.7,76,6,22 -31.5,91,4.6,76,6,23 -31.5,250,6.3,76,6,24 -31.5,135,8,75,6,25 -31.5,127,8,78,6,26 -31.5,47,10.3,73,6,27 -31.5,98,11.5,80,6,28 -31.5,31,14.9,77,6,29 -31.5,138,8,83,6,30 -135,269,4.1,84,7,1 -49,248,9.2,85,7,2 -32,236,9.2,81,7,3 -31.5,101,10.9,84,7,4 -64,175,4.6,83,7,5 -40,314,10.9,83,7,6 -77,276,5.1,88,7,7 -97,267,6.3,92,7,8 -97,272,5.7,92,7,9 -85,175,7.4,89,7,10 -31.5,139,8.6,82,7,11 -10,264,14.3,73,7,12 -27,175,14.9,81,7,13 -31.5,291,14.9,91,7,14 -7,48,14.3,80,7,15 -48,260,6.9,81,7,16 -35,274,10.3,82,7,17 -61,285,6.3,84,7,18 -79,187,5.1,87,7,19 -63,220,11.5,85,7,20 -16,7,6.9,74,7,21 -31.5,258,9.7,81,7,22 -31.5,295,11.5,82,7,23 -80,294,8.6,86,7,24 -108,223,8,85,7,25 -20,81,8.6,82,7,26 -52,82,12,86,7,27 -82,213,7.4,88,7,28 -50,275,7.4,86,7,29 -64,253,7.4,83,7,30 -59,254,9.2,81,7,31 -39,83,6.9,81,8,1 -9,24,13.8,81,8,2 -16,77,7.4,82,8,3 -78,205,6.9,86,8,4 -35,205,7.4,85,8,5 -66,205,4.6,87,8,6 -122,255,4,89,8,7 -89,229,10.3,90,8,8 -110,207,8,90,8,9 -31.5,222,8.6,92,8,10 -31.5,137,11.5,86,8,11 -44,192,11.5,86,8,12 -28,273,11.5,82,8,13 -65,157,9.7,80,8,14 -31.5,64,11.5,79,8,15 -22,71,10.3,77,8,16 -59,51,6.3,79,8,17 -23,115,7.4,76,8,18 -31,244,10.9,78,8,19 -44,190,10.3,78,8,20 -21,259,15.5,77,8,21 -9,36,14.3,72,8,22 -31.5,255,12.6,75,8,23 -45,212,9.7,79,8,24 -168,238,3.4,81,8,25 -73,215,8,86,8,26 -31.5,153,5.7,88,8,27 -76,203,9.7,97,8,28 -118,225,2.3,94,8,29 -84,237,6.3,96,8,30 -85,188,6.3,94,8,31 -96,167,6.9,91,9,1 -78,197,5.1,92,9,2 -73,183,2.8,93,9,3 -91,189,4.6,93,9,4 -47,95,7.4,87,9,5 -32,92,15.5,84,9,6 -20,252,10.9,80,9,7 -23,220,10.3,78,9,8 -21,230,10.9,75,9,9 -24,259,9.7,73,9,10 -44,236,14.9,81,9,11 -21,259,15.5,76,9,12 -28,238,6.3,77,9,13 -9,24,10.9,71,9,14 -13,112,11.5,71,9,15 -46,237,6.9,78,9,16 -18,224,13.8,67,9,17 -13,27,10.3,76,9,18 -24,238,10.3,68,9,19 -16,201,8,82,9,20 -13,238,12.6,64,9,21 
-23,14,9.2,71,9,22 -36,139,10.3,81,9,23 -7,49,10.3,69,9,24 -14,20,16.6,63,9,25 -30,193,6.9,70,9,26 -31.5,145,13.2,77,9,27 -14,191,14.3,75,9,28 -18,131,8,76,9,29 -20,223,11.5,68,9,30 diff --git a/tests/testthat/test-fill_missing_values.R b/tests/testthat/test-fill_missing_values.R index 6d9b69d..fe8a169 100644 --- a/tests/testthat/test-fill_missing_values.R +++ b/tests/testthat/test-fill_missing_values.R @@ -1,27 +1,12 @@ -test_that("multiplication works", { - expect_equal(2 * 2, 4) -}) - - -test_that("fill_missing_values fill missingness in data frame columns", { +test_that("fill_missing_values() fill missingness in data frame columns", { path_test_mean <- test_path("extdata", "airquality_mean.csv") - path_test_median <- test_path("extdata", "airquality_median.csv") - - train_data_mean <- fill_missing_values(tibble(airquality), use_mean = TRUE) - - train_data_median <- fill_missing_values(tibble( airquality), use_mean = FALSE) - + train_data_mean <- fill_missing_values(tibble(airquality)) test_data_mean <- readr::read_csv(path_test_mean, show_col_types = FALSE) - test_data_median <- readr::read_csv(path_test_median, show_col_types = FALSE) - expect_equal(train_data_mean, test_data_mean) - expect_equal(train_data_median, test_data_median) - - -}) + }) diff --git a/tests/testthat/test-to_date.R b/tests/testthat/test-to_date.R deleted file mode 100644 index 1c7a002..0000000 --- a/tests/testthat/test-to_date.R +++ /dev/null @@ -1,10 +0,0 @@ -test_that("to_date parsing an input scalar into POSIXct date object", { - - train <- suppressWarnings(to_date("2022-09-22")) - - test <- as.POSIXct("2022-09-22",tz = "UTC") - - expect_equal(train, test) -}) - - diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R new file mode 100644 index 0000000..067adfd --- /dev/null +++ b/tests/testthat/test-utils.R @@ -0,0 +1,28 @@ +test_that("to_date parsing an input scalar into POSIXct date object", { + + train <- suppressWarnings(to_date("2022-09-22")) + + test <- 
as.POSIXct("2022-09-22",tz = "UTC") + + expect_equal(train, test) +}) + + +test_that("is_string() works", { + expect_true(is_string("string")) +}) + + +x <- c(1, 2, 3, 4, 3, 2, 3, 2, 3, 4) + +test_that("harmonic_mean() return an harmonic mean of a vector", { + expect_equal(harmonic_mean(x), 2.3076923) +}) + +test_that("geometric_mean() returns geometric mean of a vector", { + expect_equal(geometric_mean(x), 2.5209806) +}) + +test_that("get_mode() returns the mode of a vector", { + expect_equal(get_mode(x), 3) +}) diff --git a/vignettes/bulkreadr.Rmd b/vignettes/bulkreadr.Rmd index 4c227a9..9a9641a 100644 --- a/vignettes/bulkreadr.Rmd +++ b/vignettes/bulkreadr.Rmd @@ -2,7 +2,7 @@ title: "Introduction to bulkreadr" output: rmarkdown::html_vignette description: > - Start here if this is your first time using bulkreadr. You'll learn how to + Start here if this is your first time using `bulkreadr`. You'll learn how to use functions like `read_excel_workbook()` and `read_excel_files_from_dir()` for importing data from Excel and `read_gsheets()` for Google Sheets, allowing for data importation from multiple sheets. For handling CSV diff --git a/vignettes/other-functions.Rmd b/vignettes/other-functions.Rmd index 689ba8a..b0a5f08 100644 --- a/vignettes/other-functions.Rmd +++ b/vignettes/other-functions.Rmd @@ -94,7 +94,8 @@ airquality %>% ## fill_missing_values() -`fill_missing_values()` in an efficient function that addresses missing values in a dataframe. It uses imputation by function, meaning it replaces missing data in numeric variables with either the mean or the median, and in non-numeric variables with the mode. The function takes a column-based imputation approach, ensuring that replacement values are derived from the respective columns, resulting in accurate and consistent data. This method enhances the integrity of the dataset and promotes sound decision-making and analysis in data processing workflows. 
+`fill_missing_values()` is an efficient function that addresses missing values in a data frame. It uses imputation by function, also known as column-based imputation, to impute the missing values. It supports various imputation methods for continuous variables, including minimum, maximum, mean, median, harmonic mean, and geometric mean. For categorical variables, missing values are replaced with the mode of the column. This approach ensures accurate and consistent replacements derived from individual columns, resulting in a complete and reliable dataset for improved analysis and decision-making. + ```{r example 6} @@ -115,30 +116,42 @@ df ``` -**Using mean to fill missing values for numeric variables** +**Impute using the mean method for continuous variables** ```{r} -result_df_mean <- fill_missing_values(df, use_mean = TRUE) - -result_df_mean +#' df <- tibble::tibble( +#' Sepal_Length = c(5.2, 5, 5.7, NA, 6.2, 6.7, 5.5), +#' Petal_Length = c(1.5, 1.4, 4.2, 1.4, NA, 5.8, 3.7), +#' Petal_Width = c(NA, 0.2, 1.2, 0.2, 1.3, 1.8, NA), +#' Species = c("setosa", NA, "versicolor", "setosa", +#' NA, "virginica", "setosa") +#' ) ``` -**Using median to fill missing values for numeric variables** - ```{r} -result_df_median <- fill_missing_values(df, use_mean = FALSE) +result_df_mean <- fill_missing_values(df, method = "mean") -result_df_median +result_df_mean ``` +**Impute using the geometric mean for continuous variables and specify variables `Petal_Length` and `Petal_Width`** + +```{r} + +result_df_geomean <- fill_missing_values(df, selected_variables = c +("Petal_Length", "Petal_Width"), method = "geometric") + +result_df_geomean +``` ### Impute missing values (NAs) in a grouped data frame -You can use the `fill_missing_values()` in a grouped data frame by using other grouping and map functions. Here is an example of how to do this: +You can use the `fill_missing_values()` in a grouped data frame by using other +grouping and map functions. 
Here is an example of how to do this: ```{r} sample_iris <- tibble::tibble( @@ -159,6 +172,5 @@ sample_iris sample_iris %>% group_by(Species) %>% group_split() %>% - map_df(fill_missing_values) + map_df(fill_missing_values, method = "median") ``` - From 6b1696be1ff6f9f8606968a5c7a8a01684bd2910 Mon Sep 17 00:00:00 2001 From: gbganalyst Date: Wed, 6 Mar 2024 12:01:38 +0100 Subject: [PATCH 2/2] bulkreadr version 1.1.1 --- DESCRIPTION | 2 +- NEWS.md | 24 ++++++++++++++++++++++++ R/fill_missing_values.R | 5 +++++ 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 2d0b212..594b328 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: bulkreadr Title: The Ultimate Tool for Reading Data in Bulk -Version: 1.1.0 +Version: 1.1.1 Authors@R: c( person("Ezekiel", "Ogundepo", , "gbganalyst@gmail.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0003-3974-2733")), diff --git a/NEWS.md b/NEWS.md index b9651ec..6da807b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,27 @@ +# bulkreadr 1.1.1 (2024-03-01) + +We are pleased to announce the release of `bulkreadr` version 1.1.1. This version introduces significant enhancements and features aimed at improving the functionality and user experience of the package. + +* **Enhanced `fill_missing_values()` Functionality**: The `fill_missing_values()` function has been significantly improved to support various imputation methods, empowering users to handle missing data with greater precision and flexibility. In addition to the previously supported "mean" and "median" imputation methods, the function now accommodates the following strategies: + +`Minimum Value (Min)`: Imputes missing entries with the minimum value observed within each respective column. + +`Maximum Value (Max)`: Fills missing data points with the maximum value found in each column. +`Mean`: Continues to offer the average value imputation, replacing missing values with the mean of the available data in each column. 
+ +`Median`: Imputes missing entries by employing the median value of each column, providing a robust alternative to mean imputation, especially in the presence of outliers. + +`Harmonic Mean`: Offers a sophisticated option for imputing missing values using the harmonic mean, ideal for data distributions where this approach is more representative. + +`Geometric Mean`: Completes our enhanced range of imputation methods by allowing for the replacement of missing values with the geometric mean, suited for datasets where the product of values is of interest. + +These enhancements are designed to provide users with a comprehensive toolkit for data imputation, ensuring that `fill_missing_values()` can be effectively tailored to meet the unique demands of diverse datasets and analysis requirements. + + +* **Package Website Launch:** To better serve our users and provide detailed documentation, we have launched the official `bulkreadr` package website. The website offers comprehensive guides, function references, and examples to help users maximize the package's potential. Visit us at [https://gbganalyst.github.io/bulkreadr](https://gbganalyst.github.io/bulkreadr) for more information. + +We believe these updates will significantly enhance your data analysis workflows and look forward to your feedback. + # bulkreadr 1.1.0 (2023-11-13) This update includes the following new features: diff --git a/R/fill_missing_values.R b/R/fill_missing_values.R index 7c53f49..1344d89 100644 --- a/R/fill_missing_values.R +++ b/R/fill_missing_values.R @@ -72,6 +72,11 @@ #' #' fill_missing_values <- function(df, selected_variables = NULL, method = "mean") { + + if (missing(df)) { + stop("argument 'df' is missing, with no default") + } + # Validate method input for continuous variables valid_methods <- c("min", "max", "mean", "median", "harmonic", "geometric") if (!(method %in% valid_methods)) {