Merge pull request #399 from Olink-Proteomics/optimization_develop_ch…

…eck_names_of_df Optimization develop check names of df
Olink-Proteomics · Jul 8, 2024 · 8a4b4e4 · 8a4b4e4
2 parents 6b71d3e + 97bdfe7
commit 8a4b4e4
Show file tree

Hide file tree

Showing 15 changed files with 1,080 additions and 186 deletions.
diff --git a/OlinkAnalyze/R/read_npx_delim.R b/OlinkAnalyze/R/read_npx_delim.R
@@ -89,6 +89,8 @@ read_npx_delim <- function(file,
 
   # additional checks ----
 
+  # only one column
+
   if (length(names(df_olink)) == 1L) {
 
     cli::cli_warn(
@@ -102,6 +104,10 @@ read_npx_delim <- function(file,
 
   }
 
+  # top row is as expected for the corresponding format
+
+  read_npx_format_colnames(df = df_olink, file = file)
+
   # convert df class ----
 
   # if needed convert the object to the requested output

diff --git a/OlinkAnalyze/R/read_npx_excel.R b/OlinkAnalyze/R/read_npx_excel.R
@@ -87,6 +87,10 @@ read_npx_excel <- function(file,
 
   }
 
+  # top row is as expected for the corresponding format
+
+  read_npx_format_colnames(df = df_olink, file = file)
+
   # if needed convert the object to the requested output
   df_olink <- convert_read_npx_output(df = df_olink,
                                       out_df = out_df)

diff --git a/OlinkAnalyze/R/read_npx_legacy.R b/OlinkAnalyze/R/read_npx_legacy.R
@@ -441,6 +441,14 @@ read_npx_legacy <- function(file,
                             olink_platform = NULL,
                             data_type = NULL,
                             quiet = TRUE) {
+  cli::cli_warn(
+    c("You are using the function read_npx_legacy()!",
+      "This function imports Olink data in wide format from MS Excel files
+      exported by \"Olink NPX Manager\" or \"Olink NPX Signature\" version
+      earlier than 1.8.0, but fails for data exported from more recent software
+      versions.")
+  )
+
   # check input ----
 
   check_is_scalar_boolean(bool = quiet,

diff --git a/OlinkAnalyze/R/read_npx_utils.R b/OlinkAnalyze/R/read_npx_utils.R
@@ -0,0 +1,291 @@
+#' Help function checking that the requested output class of the read_npx*
+#' functions is acceptable.
+#'
+#' @author Klev Diamanti
+#'
+#' @param out_df The class of output data frame. One of `tibble` (default) or
+#' `arrow` for ArrowObject.
+#'
+#' @return Nothing if `out_df` is acceptable; otherwise an error is thrown.
+#'
+check_out_df_arg <- function(out_df) {
+
+  # check that out_df is a string
+  check_is_scalar_character(string = out_df,
+                            error = TRUE)
+
+  if (!(out_df %in% read_npx_df_output)) {
+
+    cli::cli_abort(
+      message = c(
+        "x" = "Unknown output argument {.arg {rlang::caller_arg(out_df)}}!",
+        "i" = "Acceptable {.arg {rlang::caller_arg(out_df)}}:
+        {read_npx_df_output}"
+      ),
+      call = rlang::caller_env(),
+      wrap = FALSE
+    )
+
+  }
+
+}
+
+#' Help function converting the output data frame from a read_npx* function to a
+#' tibble or an ArrowObject.
+#'
+#' @author
+#'   Klev Diamanti
+#'
+#' @param df The data frame to be converted.
+#' @param out_df The class of output data frame. One of `tibble` (default) or
+#' `arrow` for ArrowObject.
+#'
+#' @return The data frame in the requested class.
+#'
+convert_read_npx_output <- function(df,
+                                    out_df) {
+
+  # check that out_df is ok
+  check_out_df_arg(out_df = out_df)
+
+  if (check_is_arrow_or_tibble(df = df, error = FALSE)) {
+
+    if (out_df == "tibble") {
+
+      return(dplyr::as_tibble(df))
+
+    } else if (out_df == "arrow") {
+
+      return(arrow::as_arrow_table(df))
+
+    }
+
+  } else {
+
+    # df did not pass check_is_arrow_or_tibble(): throw an error
+    cli::cli_abort(
+      message = c(
+        "x" = "Unexpected input data frame {.arg {rlang::caller_arg(df)}}!",
+        "i" = "Expecting: { cli::ansi_collapse(x = read_npx_df_output,
+                                               last = \", or \") }"
+      ),
+      call = rlang::caller_env(),
+      wrap = FALSE
+    )
+
+  }
+
+}
+
+#' Help function checking that the olink_platform is acceptable.
+#'
+#' @param x The name of the Olink platform. One of `Explore 3072`, `Explore HT`,
+#' `Target 96`, `Target 48`, `Flex` or `Focus`.
+#' @param broader_platform Name of the broader Olink platform. One of `qPCR` or
+#' `NGS`.
+#'
+#' @return
+#' Nothing if platform is ok, otherwise an error.
+#'
+check_olink_platform <- function(x,
+                                 broader_platform = NULL) {
+
+  # input check ----
+
+  check_is_scalar_character(string = x,
+                            error = TRUE)
+  if (!is.null(broader_platform)) {
+    check_olink_broader_platform(x = broader_platform)
+  }
+
+  # check platform ----
+
+  # use all rows of accepted_olink_platforms (defined outside this file), or
+  # only those matching broader_platform when one was provided.
+  if (is.null(broader_platform)) {
+
+    olink_platforms <- accepted_olink_platforms
+
+  } else {
+
+    olink_platforms <- accepted_olink_platforms |>
+      dplyr::filter(.data[["broader_platform"]] == .env[["broader_platform"]])
+
+  }
+
+  # throw an error if x is not among the remaining platform names
+  if (!(x %in% olink_platforms$name)) {
+
+    cli::cli_abort(
+      message = c(
+        "x" = "Unexpected Olink platform {.arg {rlang::caller_arg(x)}}!",
+        "i" = "Expected one of: {olink_platforms$name}"
+      ),
+      call = rlang::caller_env(),
+      wrap = FALSE
+    )
+
+  }
+
+}
+
+#' Help function checking that the Olink data_type is acceptable.
+#'
+#' @param x The name of the Olink data type. One of `NPX`, `Quantified` or `Ct`.
+#' @param broader_platform Name of the broader Olink platform. One of `qPCR` or
+#' `NGS`.
+#'
+#' @return
+#' Nothing if data_type is ok, otherwise an error.
+#'
+check_olink_data_type <- function(x,
+                                  broader_platform = NULL) {
+
+  # input check ----
+
+  check_is_scalar_character(string = x,
+                            error = TRUE)
+  if (!is.null(broader_platform)) {
+    check_olink_broader_platform(x = broader_platform)
+  }
+
+  # check data_type ----
+
+  # use all rows of accepted_olink_platforms (defined outside this file), or
+  # only those matching broader_platform when one was provided.
+  if (is.null(broader_platform)) {
+
+    olink_quant_methods <- accepted_olink_platforms
+
+  } else {
+
+    olink_quant_methods <- accepted_olink_platforms |>
+      dplyr::filter(
+        .data[["broader_platform"]] == .env[["broader_platform"]]
+      )
+  }
+
+  olink_quant_methods <- olink_quant_methods |>
+    dplyr::pull(
+      .data[["quant_method"]]
+    ) |>
+    unlist() |>
+    unique()
+
+  # throw an error if x is not among the unique quant_method values
+  if (!(x %in% olink_quant_methods)) {
+
+    cli::cli_abort(
+      message = c(
+        "x" = "Unexpected Olink data type {.arg {rlang::caller_arg(x)}}!",
+        "i" = "Expected one of: {olink_quant_methods}"
+      ),
+      call = rlang::caller_env(),
+      wrap = FALSE
+    )
+
+  }
+
+}
+
+#' Help function checking that the broader Olink platform is acceptable.
+#'
+#' @param x Name of the broader Olink platform. One of `qPCR` or `NGS`.
+#'
+#' @return
+#' Nothing if `x` is an accepted broader Olink platform, otherwise an error.
+#'
+check_olink_broader_platform <- function(x) {
+
+  # input check ----
+
+  check_is_scalar_character(string = x,
+                            error = TRUE)
+
+  # check broader platform ----
+
+  if (!(x %in% unique(accepted_olink_platforms$broader_platform))) {
+
+    cli::cli_abort(
+      message = c(
+        "x" = "Unexpected Olink broader platform
+        {.arg {rlang::caller_arg(x)}}!",
+        "i" = "Expected one of:
+        {unique(accepted_olink_platforms$broader_platform)}"
+      ),
+      call = rlang::caller_env(),
+      wrap = FALSE
+    )
+
+  }
+
+}
+
+#' Help function checking the top row of a data set: the first row of wide
+#' files, or NA and empty strings in the column names of long files.
+#'
+#' @author Klev Diamanti
+#'
+#' @param df Tibble or ArrowObject with Olink data in wide or long format.
+#' @param file Path to Olink software output file in wide or long format.
+#' Expecting file extensions `csv`, `txt`, `xls`, `xlsx`, `parquet` or `zip`.
+#'
+#' @return Error if file contains problematic column names. `NULL` otherwise.
+#'
+read_npx_format_colnames <- function(df,
+                                     file) {
+
+  # check input ----
+  check_is_arrow_or_tibble(df = df,
+                           error = TRUE)
+
+  # check columns names ----
+
+  # data with all column names starting with "V" is treated as wide format
+  # wide: row 1 needs >= 3 columns and exactly 2 populated cells (A1 and B1)
+  # long: no column name may be empty or fail check_is_character()
+  if (all(grepl(pattern = "^V", x = names(df)))) { # wide format
+
+    # get first row of df
+    df_row_1 <- df |>
+      dplyr::slice_head(n = 1L) |>
+      dplyr::collect()
+
+    num_of_cells_with_vals <- ncol(df_row_1) -
+      sum(is.na(df_row_1)) - sum(df_row_1 == "", na.rm = TRUE)
+
+    if (ncol(df_row_1) < 3L || num_of_cells_with_vals != 2L) {
+
+      cli::cli_abort(
+        message = c(
+          "x" = "Unexpected first row in file {.file {file}}!",
+          "i" = "Detected file in wide format. Expected only cells in A1 and B1
+          to be populated."
+        ),
+        call = rlang::caller_env(),
+        wrap = FALSE
+      )
+
+    }
+
+  } else { # long format
+
+    if (any(names(df) == "") == TRUE
+        || check_is_character(string = names(df),
+                              error = FALSE) == FALSE) {
+
+      cli::cli_abort(
+        message = c(
+          "x" = "Unexpected columns in file {.file {file}}!",
+          "i" = "The dataset contains column names that are `NA` or `empty
+          string` (\"\")."
+        ),
+        call = rlang::caller_env(),
+        wrap = FALSE
+      )
+
+    }
+
+  }
+
+}
diff --git a/OlinkAnalyze/R/sysdata.rda b/OlinkAnalyze/R/sysdata.rda
diff --git a/OlinkAnalyze/data-raw/column_name_dict.R b/OlinkAnalyze/data-raw/column_name_dict.R
@@ -16,6 +16,11 @@ column_name_dict <- list(
                   "sample_type",
                   "sampletype",
                   NA_character_),
+  assay_type = c("Assay_Type",
+                 "AssayType",
+                 "assay_type",
+                 "assaytype",
+                 NA_character_),
   olink_id = c("OlinkID",
                "OID",
                "olinkid",

diff --git a/OlinkAnalyze/man/check_olink_broader_platform.Rd b/OlinkAnalyze/man/check_olink_broader_platform.Rd