Merge pull request #20 from gbganalyst/pkgdown

bulkreadr version 1.1.1
gbganalyst · Mar 6, 2024 · 8334531 · 8334531
2 parents 3421e4f + 6b1696b
commit 8334531
Show file tree

Hide file tree

Showing 19 changed files with 302 additions and 312 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: bulkreadr
 Title: The Ultimate Tool for Reading Data in Bulk
-Version: 1.1.0
+Version: 1.1.1
 Authors@R: c(
     person("Ezekiel", "Ogundepo", , "gbganalyst@gmail.com", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0003-3974-2733")),
@@ -36,13 +36,16 @@ Imports:
     labelled,
     lubridate,
     magrittr,
+    methods,
     openxlsx,
     readr,
     readxl,
+    rlang,
     sjlabelled,
     stats,
     stringr,
-    tibble
+    tibble,
+    tidyr
 Suggests:
     knitr,
     rmarkdown,

diff --git a/NAMESPACE b/NAMESPACE
@@ -15,6 +15,8 @@ export(read_spss_data)
 export(read_stata_data)
 importFrom(curl,has_internet)
 importFrom(dplyr,across)
+importFrom(dplyr,case_when)
+importFrom(dplyr,everything)
 importFrom(dplyr,group_by)
 importFrom(dplyr,group_split)
 importFrom(dplyr,mutate)
@@ -35,14 +37,17 @@ importFrom(lubridate,as_date)
 importFrom(lubridate,is.Date)
 importFrom(lubridate,parse_date_time)
 importFrom(magrittr,"%>%")
+importFrom(methods,as)
 importFrom(openxlsx,convertToDate)
 importFrom(purrr,map_df)
 importFrom(purrr,map_vec)
 importFrom(readr,read_csv)
 importFrom(readxl,excel_sheets)
 importFrom(readxl,read_excel)
 importFrom(readxl,read_xlsx)
+importFrom(rlang,"%||%")
 importFrom(sjlabelled,label_to_colnames)
 importFrom(stats,median)
 importFrom(stringr,str_length)
 importFrom(tibble,tibble)
+importFrom(tidyr,replace_na)
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,27 @@
+# bulkreadr 1.1.1 (2024-03-01)
+
+We are pleased to announce the release of `bulkreadr` version 1.1.1. This version introduces significant enhancements and features aimed at improving the functionality and user experience of the package.
+
+* **Enhanced `fill_missing_values()` Functionality**: The `fill_missing_values()` function has been significantly improved to support various imputation methods, empowering users to handle missing data with greater precision and flexibility. The imputation method for continuous variables is now chosen with the new `method` argument, which replaces the former `use_mean` flag. In addition to the previously supported "mean" and "median" imputation methods, the function now accommodates further strategies; the full list is:
+
+`Minimum Value (Min)`: Imputes missing entries with the minimum value observed within each respective column.
+
+`Maximum Value (Max)`: Fills missing data points with the maximum value found in each column.
+
+`Mean`: Continues to offer the average value imputation, replacing missing values with the mean of the available data in each column.
+
+`Median`: Imputes missing entries by employing the median value of each column, providing a robust alternative to mean imputation, especially in the presence of outliers.
+
+`Harmonic Mean`: Offers a sophisticated option for imputing missing values using the harmonic mean, ideal for data distributions where this approach is more representative.
+
+`Geometric Mean`: Completes our enhanced range of imputation methods by allowing for the replacement of missing values with the geometric mean, suited for datasets where the product of values is of interest.
+
+These enhancements are designed to provide users with a comprehensive toolkit for data imputation, ensuring that `fill_missing_values()` can be effectively tailored to meet the unique demands of diverse datasets and analysis requirements.
+
+
+* **Package Website Launch:** To better serve our users and provide detailed documentation, we have launched the official `bulkreadr` package website. The website offers comprehensive guides, function references, and examples to help users maximize the package's potential. Visit us at [https://gbganalyst.github.io/bulkreadr](https://gbganalyst.github.io/bulkreadr) for more information.
+
+We believe these updates will significantly enhance your data analysis workflows and look forward to your feedback.
+
 # bulkreadr 1.1.0 (2023-11-13)
 
 This update includes the following new features:

diff --git a/R/bulkreadr-package.R b/R/bulkreadr-package.R
@@ -4,22 +4,26 @@
 ## usethis namespace: start
 #'
 #' @importFrom curl has_internet
-#' @importFrom dplyr group_by group_split mutate across select rename
+#' @importFrom dplyr group_by group_split mutate across select rename case_when
+#' @importFrom dplyr everything
 #' @importFrom fs dir_ls
 #' @importFrom googlesheets4 gs4_has_token read_sheet sheet_names
 #' @importFrom haven read_sav read_dta as_factor is.labelled
 #' @importFrom inspectdf inspect_na
 #' @importFrom labelled generate_dictionary lookfor
 #' @importFrom lubridate as_date is.Date parse_date_time
 #' @importFrom magrittr %>%
+#' @importFrom methods as
 #' @importFrom openxlsx convertToDate
 #' @importFrom purrr map_df map_vec
 #' @importFrom readr read_csv
 #' @importFrom readxl excel_sheets read_excel read_xlsx
+#' @importFrom rlang %||%
 #' @importFrom sjlabelled label_to_colnames
 #' @importFrom stats median
 #' @importFrom stringr str_length
 #' @importFrom tibble tibble
+#' @importFrom tidyr replace_na
 ## usethis namespace: end
 NULL
 
diff --git a/R/fill_missing_values.R b/R/fill_missing_values.R
@@ -1,13 +1,30 @@
-#' Fill missing values in a dataframe
+#' Fill missing values in a data frame
 #'
-#' `fill_missing_values()` is an efficient function that addresses missing values in a dataframe. It uses imputation by function, also known as column-based imputation, to fill numeric variables with the mean or median, and non-numeric variables with the mode. This approach ensures accurate and consistent replacements derived from individual columns, resulting in a complete and reliable dataset for improved analysis and decision-making.
+#' `fill_missing_values()` is an efficient function that addresses missing
+#' values in a data frame. It uses imputation by function, also known as
+#' column-based imputation, to impute the missing values. For continuous
+#' variables, it supports various methods of imputation, including minimum,
+#' maximum, mean, median, harmonic mean, and geometric mean. For categorical
+#' variables, missing values are replaced with the mode of the column. This
+#' approach ensures accurate and consistent replacements derived from individual
+#' columns, resulting in a complete and reliable dataset for improved analysis
+#' and decision-making.
 #'
-#' @param df The input dataframe to be processed.
-#' @param use_mean Logical. If `TRUE`, missing values in numeric columns will be replaced with the mean.
-#'  If `FALSE`, missing values in numeric columns will be replaced with the median.
+#' @param df A data frame to process for missing value imputation.
+#'
+#' @param selected_variables An optional vector of variable names within `df` for
+#'   which missing values should be imputed. If `NULL` (default), imputation is
+#'   applied to all variables in the data frame.
+#'
+#' @param method A character string specifying the imputation method for continuous
+#'   variables. Supported methods are "min", "max", "mean", "median", "harmonic",
+#'   and "geometric". The default method is "mean". For categorical variables, the
+#'   mode is always used.
+#'
+#' @return  A data frame with missing values imputed according to the specified `method`.
 #'
-#' @return A dataframe with missing values filled.
 #' @export
+#'
 #' @examples
 #'
 #' library(dplyr)
@@ -22,19 +39,22 @@
 #'            NA, "virginica", "setosa")
 #' )
 #'
-#' # Using mean to fill missing values for numeric variables
+#' # Impute using the mean method for continuous variables
 #'
-#' result_df_mean <- fill_missing_values(df, use_mean = TRUE)
+#' result_df_mean <- fill_missing_values(df, method = "mean")
 #'
 #' result_df_mean
 #'
-#' # Using median to fill missing values for numeric variables
+#' # Impute using the geometric mean for continuous variables and specify
+#' # variables `Petal_Length` and `Petal_Width`.
 #'
-#' result_df_median <- fill_missing_values(df, use_mean = FALSE)
+#' result_df_geomean <- fill_missing_values(
+#'   df,
+#'   selected_variables = c("Petal_Length", "Petal_Width"),
+#'   method = "geometric"
+#' )
 #'
-#' result_df_median
+#' result_df_geomean
 #'
 #' # Impute missing values (NAs) in a grouped data frame
+#'
 #' # You can do that by using the following:
 #'
 #' sample_iris <- tibble::tibble(
@@ -48,28 +68,55 @@
 #' sample_iris %>%
 #' group_by(Species) %>%
 #' group_split() %>%
-#' map_df(fill_missing_values)
+#' map_df(fill_missing_values, method = "median")
 #'
 #'
-fill_missing_values <- function(df, use_mean = TRUE) {
+fill_missing_values <- function(df, selected_variables = NULL, method = "mean") {
+
   if (missing(df)) {
     stop("argument 'df' is missing, with no default")
-  } else {
-    # Loop over each column in the dataframe
-    for (col in names(df)) {
-      if (is.numeric(df[[col]])) { # Check if column is numeric
-        # Fill missing values with mean or median based on the flag 'use_mean'
-        if (use_mean) {
-          df[[col]][is.na(df[[col]])] <- mean(df[[col]], na.rm = TRUE)
-        } else {
-          df[[col]][is.na(df[[col]])] <- median(df[[col]], na.rm = TRUE)
-        }
-      } else {
-        # Fill missing values with mode
-        df[[col]][is.na(df[[col]])] <- names(which.max(table(df[[col]])))
-      }
-    }
-    return(df)
   }
-}
 
+  # Validate method input for continuous variables. `method` must be a single
+  # string: a longer vector would make the `if` condition itself fail instead
+  # of producing the intended error message.
+  valid_methods <- c("min", "max", "mean", "median", "harmonic", "geometric")
+  is_single_string <- is.character(method) && length(method) == 1
+  if (!is_single_string || !(method %in% valid_methods)) {
+    stop("Invalid method. Choose from 'min', 'max', 'mean', 'median', 'harmonic', 'geometric'")
+  }
+
+  # Replace the NAs of a numeric vector with the statistic named by `method`.
+  # Non-numeric vectors are returned as is. So are vectors without any observed
+  # value: min()/max()/mean() of an empty set would otherwise yield Inf, -Inf
+  # or NaN, and that value would be written into every row.
+  impute_continuous <- function(x, method) {
+    if (!is.numeric(x) || all(is.na(x))) {
+      return(x)
+    }
+
+    # `method` has already been validated, so no default branch is needed
+    replacement_value <- switch(method,
+      min = min(x, na.rm = TRUE),
+      max = max(x, na.rm = TRUE),
+      mean = mean(x, na.rm = TRUE),
+      median = median(x, na.rm = TRUE),
+      harmonic = harmonic_mean(x),
+      geometric = geometric_mean(x)
+    )
+
+    # Cast the replacement to the class of the observed values, so that e.g. an
+    # integer column receives an integer replacement.
+    # NOTE(review): presumably needed because replace_na() is strict about the
+    # replacement type -- confirm against the tidyr version in use.
+    replacement_value_casted <- as(replacement_value, class(x[!is.na(x)][1]))
+
+    replace_na(x, replacement_value_casted)
+  }
+
+  # Numeric columns use `method`; every other column is filled with its mode.
+  # A plain if/else evaluates only the branch that applies to the column.
+  impute_column <- function(column) {
+    if (is.numeric(column)) {
+      impute_continuous(column, method)
+    } else {
+      replace_na(column, get_mode(column))
+    }
+  }
+
+  # Impute the selected columns, or every column when none were selected
+  df %>%
+    mutate(across(
+      .cols = {{ selected_variables }} %||% everything(),
+      .fns = impute_column
+    ))
+}
diff --git a/R/onload.R b/R/onload.R
@@ -1,3 +1,3 @@
 .onAttach <- function(libname, pkgname) {
-  packageStartupMessage('Welcome to bulkreadr package! To learn more, please run:\nvignette("bulkreadr")')
+  packageStartupMessage('Welcome to bulkreadr package! To learn more, please run:\nbrowseURL("https://gbganalyst.github.io/bulkreadr")\nto visit the package website.')
 }
diff --git a/R/to_date.R b/R/to_date.R
diff --git a/R/utils.R b/R/utils.R
@@ -14,4 +14,50 @@ check_file <- function(path) {
   path
 }
 
+# For date
+
+# Convert a single value to a date.
+#
+# `x` is tested with scalar `if` conditions, so it is expected to be one value.
+# `origin` and `...` are accepted but are not used by the conversion.
+# Branches, in order: Date values pass through; NA and POSIXct values go
+# through as_date(); text that is not a number is parsed as a date string;
+# a four-character number is parsed as a year; anything else is handed to
+# openxlsx::convertToDate().
+to_date <- function(x, origin = "1900-01-01", ...) {
+  if (is.Date(x)) {
+    return(x)
+  }
+  if (is.na(x)) {
+    return(as_date(x))
+  }
+  # inherits() also matches objects that carry extra classes ahead of POSIXct
+  if (inherits(x, "POSIXct")) {
+    return(as_date(x))
+  }
+  # A failed numeric coercion marks `x` as text. The coercion warning is the
+  # expected outcome of this probe, so it is silenced for this call only.
+  if (str_length(x) >= 4 && is.na(suppressWarnings(as.numeric(x)))) {
+    return(lubridate::parse_date_time(x, orders = c("dmy", "ymd", "mdy", "ym")))
+  }
+  if (str_length(x) == 4) {
+    return(lubridate::parse_date_time(x, orders = "y"))
+  }
+  # NOTE(review): remaining inputs are presumably Excel serial date numbers
+  openxlsx::convertToDate(x)
+}
+
+
+# For descriptive statistics
+
+# Harmonic mean of the non-missing values of `x`: n / sum(1 / x).
+# `n` counts only the observed values so that it matches the reciprocal sum,
+# which is also taken over the observed values only.
+# Returns NA when `x` has no observed values.
+harmonic_mean <- function(x) {
+  observed <- x[!is.na(x)]
+  if (length(observed) == 0) return(NA_real_)
+  length(observed) / sum(1 / observed)
+}
+
+# Geometric mean of the strictly positive, non-missing values of `x`.
+# NAs are dropped before counting, so the root matches the number of values
+# that are actually combined. The mean is taken on the log scale, which avoids
+# overflow of a long product. Returns NA when `x` has no positive values.
+geometric_mean <- function(x) {
+  x_positive <- x[!is.na(x) & x > 0]
+  if (length(x_positive) == 0) return(NA_real_)
+  exp(mean(log(x_positive)))
+}
+
+# Most frequent non-missing value of `x`, used for categorical imputation.
+# Ties resolve to the candidate that appears first in `x`.
+get_mode <- function(x) {
+  candidates <- unique(x[!is.na(x)])
+  frequencies <- tabulate(match(x, candidates))
+  candidates[which.max(frequencies)]
+}
+
 
diff --git a/README.Rmd b/README.Rmd
@@ -62,14 +62,16 @@ if(!require("devtools")){
 devtools::install_github("gbganalyst/bulkreadr")
 ```
 
-## How to load the package
+## Usage
 
 Now that you have installed `bulkreadr` package, you can simply load it by using:
 
 ```{r pkgload}
 library(bulkreadr)
 ```
 
+To get started with `bulkreadr`, see the [articles](https://gbganalyst.github.io/bulkreadr/articles/index.html).
+
 ## Context
 
 bulkreadr is designed to integrate with and augment the capabilities of established packages such as `readxl`, `readr`, and `googlesheets4`, offering enhanced functionality for reading bulk data within the R programming environment.

diff --git a/README.md b/README.md
@@ -62,7 +62,7 @@ if(!require("devtools")){
 devtools::install_github("gbganalyst/bulkreadr")
 ```
 
-## How to load the package
+## Usage
 
 Now that you have installed `bulkreadr` package, you can simply load it
 by using:
@@ -71,6 +71,9 @@ by using:
 library(bulkreadr)
 ```
 
+To get started with `bulkreadr`, see the
+[articles](https://gbganalyst.github.io/bulkreadr/articles/index.html).
+
 ## Context
 
 bulkreadr is designed to integrate with and augment the capabilities of