Merge pull request #349 from Olink-Proteomics/optimization_develop_in…

…st_extdata MacOS still has issues with arrow. All other tests pass seamlessly.
Olink-Proteomics · Apr 17, 2024 · 4270d6e · 4270d6e
2 parents 331ce07 + dd2d55f
commit 4270d6e
Show file tree

Hide file tree

Showing 33 changed files with 2,278 additions and 371,244 deletions.
diff --git a/OlinkAnalyze/DESCRIPTION b/OlinkAnalyze/DESCRIPTION
@@ -58,7 +58,7 @@ Config/testthat/parallel: true
 Config/testthat/start-first: read_npx_wide
 Encoding: UTF-8
 LazyData: true
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.1
 Depends: R (>= 4.0)
 Imports: 
     arrow,

diff --git a/OlinkAnalyze/R/read_npx.R b/OlinkAnalyze/R/read_npx.R
@@ -60,7 +60,7 @@
 #' @examples
 #' \donttest{
 #' file <- system.file("extdata",
-#'                     "Example_NPX_Data2_1.csv",
+#'                     "npx_data_long_csv.csv",
 #'                     package = "OlinkAnalyze")
 #'
 #' read_NPX(filename = file,

diff --git a/OlinkAnalyze/R/read_npx_format.R b/OlinkAnalyze/R/read_npx_format.R
@@ -496,7 +496,9 @@ read_npx_format_get_format <- function(df,
 
   }
 
-  # Check if the detected file format matches the customer input ----
+  # Checks ----
+
+  ## Check if the detected file format matches the customer input ----
 
   if (!is.null(long_format)) {
 
@@ -547,6 +549,26 @@ read_npx_format_get_format <- function(df,
 
   }
 
+  ## Check that long format data do not have NA colnames ----
+
+  check_na_colname <- df[1L, ] |>
+    as.character() |>
+    is.na() |>
+    any() && (is_long_format == TRUE)
+
+  if (check_na_colname) {
+
+    cli::cli_abort(
+      message = c(
+        "x" = "`NA` column names in long format file: {.file {file}}!",
+        "i" = "Please inspect the input file!"
+      ),
+      call = rlang::caller_env(),
+      wrap = FALSE
+    )
+
+  }
+
   # Return ----
 
   # return the cells of the data that determine the file format and a boolean

diff --git a/OlinkAnalyze/R/read_npx_wide.R b/OlinkAnalyze/R/read_npx_wide.R
@@ -866,8 +866,9 @@ read_npx_wide_top <- function(df,
 
   ## check df_top_oid ----
 
-  # no NAs are allowed in df_top_oid
-  if (any(is.na(df_top_oid))) {
+  # no NAs are allowed in df_top_oid in any column other than "Uniprot ID"
+  # the latter because the assay NT-proBNP does not have a Uniprot ID
+  if (any(is.na(dplyr::select(df_top_oid, -dplyr::all_of("Uniprot ID"))))) {
 
     cli::cli_abort(
       message = c(

diff --git a/OlinkAnalyze/R/sysdata.rda b/OlinkAnalyze/R/sysdata.rda
diff --git a/OlinkAnalyze/data-raw/ref_manifest.rds b/OlinkAnalyze/data-raw/ref_manifest.rds
diff --git a/OlinkAnalyze/data-raw/ref_npx_data1.rds b/OlinkAnalyze/data-raw/ref_npx_data1.rds
diff --git a/OlinkAnalyze/data-raw/ref_npx_data2.rds b/OlinkAnalyze/data-raw/ref_npx_data2.rds
diff --git a/OlinkAnalyze/data-raw/sample_manifest.R b/OlinkAnalyze/data-raw/sample_manifest.R
@@ -0,0 +1,78 @@
+# readme ----
+
+# This script creates a synthetic manifest file to generate the sample manifest
+# data/manifest.rda which is used throughout OlinkAnalyze.
+#
+# As this script did not exist prior to 2024-04-08, we have stored the original
+# manifest.rds file under data-raw/ref_manifest.rds to compare to the dataset
+# generated by this script.
+#
+# A new data/manifest.rda will be generated ONLY IF manifest from this script
+# matches ref_manifest!
+#
+
+# manifest ----
+
+## create random manifest ----
+
+# 23 subjects (letters A-W) with 6 visits each: one row per subject and visit
+n_sub <- 23L
+n_vis <- 6L
+sub_ids <- rep(x = LETTERS[seq_len(n_sub)], each = n_vis)
+vis_ids <- rep(x = seq_len(n_vis), times = n_sub)
+# Site1 gets the first half of the subjects (rounded up), Site2 the rest
+site_rows <- c(ceiling(n_sub / 2), floor(n_sub / 2)) * n_vis
+manifest <- dplyr::tibble(
+  SubjectID = sub_ids,
+  Visit = vis_ids,
+  SampleID = paste(sub_ids, vis_ids),
+  Site = rep(x = c("Site1", "Site2"), times = site_rows)
+)
+# clean up: only manifest should remain
+rm(n_sub, n_vis, sub_ids, vis_ids, site_rows)
+
+# compare to reference manifest ----
+
+## load reference manifest ----
+
+# locate the reference manifest within the OlinkAnalyze package (the call
+# errors when the file is not found) and read it in one step, so that no
+# temporary file-path variable is left behind
+ref_manifest <- readRDS(
+  file = system.file("data-raw",
+                     "ref_manifest.rds",
+                     package = "OlinkAnalyze",
+                     mustWork = TRUE)
+)
+
+## check columns ----
+
+# the column names of the two data frames have to agree in both content and
+# order
+stopifnot(identical(colnames(manifest), colnames(ref_manifest)))
+
+# check identical ----
+
+# at this stage manifest should be identical to the reference dataset
+# ref_manifest. We simply allow some rounding error on the 4th decimal digit.
+
+# all.equal() returns TRUE or a character vector describing the differences,
+# so its result is stored and tested with isTRUE() instead of `== TRUE`.
+manifest_eq <- all.equal(target = ref_manifest,
+                         current = manifest,
+                         tolerance = 1e-4,
+                         check.attributes = TRUE,
+                         check.names = TRUE)
+
+#### IMPORTANT
+# It is extremely important that the two datasets are identical with some minor
+# rounding tolerance!! Otherwise stop and report the differences found.
+if (!isTRUE(manifest_eq)) {
+  stop(paste(c("manifest != ref_manifest:", manifest_eq), collapse = "\n"))
+}
+
+# save to data/manifest.rda ----
+
+if (isTRUE(manifest_eq)) {
+  usethis::use_data(manifest, overwrite = TRUE, compress = "xz", version = 2L)
+}
diff --git a/OlinkAnalyze/data-raw/sample_npx_data1.R b/OlinkAnalyze/data-raw/sample_npx_data1.R
@@ -0,0 +1,209 @@
+# readme ----
+
+# This script uses the raw data files:
+# 1. inst/extdata/npx_data1_meta.csv
+# 2. inst/extdata/npx_data1.xlsx
+# to generate the sample dataset data/npx_data1.rda which is used throughout
+# OlinkAnalyze.
+#
+# As this script did not exist prior to 2024-04-08, we have stored the original
+# npx_data1.rds file under data-raw/ref_npx_data1.rds to compare to the dataset
+# generated by this script.
+#
+# A new data/npx_data1.rda will be generated ONLY IF npx_data1 from this script
+# matches ref_npx_data1!
+#
+
+# manifest ----
+
+## load manifest ----
+
+# read the semicolon-delimited manifest found within the OlinkAnalyze package;
+# empty cells and the string "NA" are both read as missing values
+manifest_data1 <- read.delim(
+  file = system.file("extdata",
+                     "npx_data1_meta.csv",
+                     package = "OlinkAnalyze",
+                     mustWork = TRUE),
+  header = TRUE,
+  sep = ";",
+  na.strings = c("", "NA")
+)
+
+## modify manifest ----
+
+# remove duplicate entries
+manifest_data1 <- dplyr::distinct(manifest_data1)
+
+# make all columns character vectors
+manifest_data1 <- dplyr::mutate(
+  manifest_data1,
+  dplyr::across(
+    dplyr::everything(),
+    as.character
+  )
+)
+
+# rename the project to "data1" to match reference; missing projects stay NA
+manifest_data1 <- dplyr::mutate(
+  manifest_data1,
+  Project = dplyr::if_else(is.na(.data[["Project"]]),
+                           NA_character_,
+                           "data1")
+)
+
+# npx_data1 ----
+
+# note that this data frame is quite large and for the purposes of this package
+# we will use only 2 panels.
+#
+# we want the outcome from this section to be identical to the reference data
+# frame npx_data1
+
+## load npx_data1 ----
+
+npx_data1_file <- system.file(
+  "extdata", "npx_data1.xlsx",
+  package = "OlinkAnalyze", mustWork = TRUE
+)
+
+# read as a tibble; the file is declared as NPX data that is not in long format
+npx_data1 <- read_npx(
+  filename = npx_data1_file, out_df = "tibble", long_format = FALSE,
+  data_type = "NPX", olink_platform = "Target 96"
+)
+# ignore the following:
+# 1. warning message about 2 duplicate samples. this is driven by control
+#    samples
+# 2. warning that the olink platform could not be determined from the file and
+#    that the user input "Target 96" should be accepted
+
+## modify npx_data1 ----
+
+# values used only within this section; removed again at its end
+panels_to_keep <- c("Olink CARDIOMETABOLIC", "Olink INFLAMMATION")
+cols_to_numeric <- c("NPX", "LOD", "MissingFreq")
+
+# keep only data from 2 panels: cardiometabolic and inflammation
+npx_data1 <- dplyr::filter(
+  npx_data1,
+  .data[["Panel"]] %in% panels_to_keep
+)
+
+npx_data1 <- dplyr::mutate(
+  npx_data1,
+  # make Panel as a title: first letter of every word capital and the remaining
+  # lower case
+  Panel = stringr::str_to_title(string = .data[["Panel"]]),
+  # Panel_Version is NA from read_npx as it cannot be determined from the input
+  # file, so we have to input it manually based on the title-cased Panel
+  Panel_Version = dplyr::case_match(
+    .data[["Panel"]],
+    "Olink Cardiometabolic" ~ "v.1201",
+    "Olink Inflammation" ~ "v.1002",
+    .default = NA_character_
+  ),
+  # Convert NPX, LOD and MissingFreq to numeric and keep only 5 sign digits
+  dplyr::across(
+    dplyr::all_of(cols_to_numeric),
+    \(x) signif(as.numeric(x), digits = 5L)
+  )
+)
+
+# remove columns missing from the reference npx_data1
+npx_data1 <- dplyr::select(
+  npx_data1,
+  -dplyr::all_of("Olink NPX Signature Version")
+)
+rm(panels_to_keep, cols_to_numeric)
+
+## join with manifest ----
+
+# bring in sample info from manifest file; every NPX row may match at most one
+# manifest row, and rows without a matching SampleID are dropped
+npx_data1 <- dplyr::inner_join(
+  x = npx_data1,
+  y = manifest_data1,
+  by = "SampleID",
+  relationship = "many-to-one"
+)
+
+## order dataset ----
+
+# order df to match reference npx_data1
+npx_data1 <- dplyr::arrange(
+  npx_data1,
+  .data[["OlinkID"]], .data[["PlateID"]], .data[["SampleID"]]
+)
+
+rm(npx_data1_file, manifest_data1)
+
+# compare to reference npx_data1 ----
+
+## load reference npx_data1 ----
+
+# read the reference dataset found within the OlinkAnalyze package in a single
+# step, so that no temporary file-path variable is left behind; the call
+# errors when the file cannot be found
+ref_npx_data1 <- readRDS(
+  file = system.file("data-raw",
+                     "ref_npx_data1.rds",
+                     package = "OlinkAnalyze",
+                     mustWork = TRUE)
+)
+
+## check columns ----
+
+# 1. every column of npx_data1 has to be present in the reference
+# 2. npx_data1 is expected to have 16 columns
+# 3. the reference is expected to have 17 columns
+stopifnot(
+  all(colnames(npx_data1) %in% colnames(ref_npx_data1)),
+  ncol(npx_data1) == 16L,
+  ncol(ref_npx_data1) == 17L
+)
+
+## modify reference npx_data1 ----
+
+# selecting only columns that are present in npx_data1. this should result in
+# removing only column "Index" from ref_npx_data1 and ordering its columns
+# similarly to npx_data1
+ref_npx_data1 <- dplyr::select(
+  ref_npx_data1,
+  dplyr::all_of(
+    colnames(npx_data1)
+  )
+)
+
+# order df to match npx_data1, which has been arranged by the same three
+# columns
+ref_npx_data1 <- dplyr::arrange(
+  ref_npx_data1,
+  .data[["OlinkID"]], .data[["PlateID"]], .data[["SampleID"]]
+)
+
+# check identical ----
+
+# at this stage npx_data1 should be identical to the reference dataset
+# ref_npx_data1. We simply allow some rounding error on the 4th decimal digit.
+
+# all.equal() returns TRUE or a character vector describing the differences,
+# so its result is stored and tested with isTRUE() instead of `== TRUE`.
+npx_data1_eq <- all.equal(target = ref_npx_data1,
+                          current = npx_data1,
+                          tolerance = 1e-4,
+                          check.attributes = TRUE,
+                          check.names = TRUE)
+
+#### IMPORTANT
+# It is extremely important that the two datasets are identical with some minor
+# rounding tolerance!! Otherwise stop and report the differences found.
+if (!isTRUE(npx_data1_eq)) {
+  stop(paste(c("npx_data1 != ref_npx_data1:", npx_data1_eq), collapse = "\n"))
+}
+
+# save to data/npx_data1.rda ----
+
+if (isTRUE(npx_data1_eq)) {
+  usethis::use_data(npx_data1, overwrite = TRUE, compress = "xz", version = 2L)
+}