Skip to content

Commit

Permalink
Merge pull request #349 from Olink-Proteomics/optimization_develop_in…
Browse files Browse the repository at this point in the history
…st_extdata

MacOS still has issues with arrow. All other tests pass seamlessly.
  • Loading branch information
klevdiamanti authored Apr 17, 2024
2 parents 331ce07 + dd2d55f commit 4270d6e
Show file tree
Hide file tree
Showing 33 changed files with 2,278 additions and 371,244 deletions.
2 changes: 1 addition & 1 deletion OlinkAnalyze/DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ Config/testthat/parallel: true
Config/testthat/start-first: read_npx_wide
Encoding: UTF-8
LazyData: true
RoxygenNote: 7.2.3
RoxygenNote: 7.3.1
Depends: R (>= 4.0)
Imports:
arrow,
Expand Down
2 changes: 1 addition & 1 deletion OlinkAnalyze/R/read_npx.R
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@
#' @examples
#' \donttest{
#' file <- system.file("extdata",
#' "Example_NPX_Data2_1.csv",
#' "npx_data_long_csv.csv",
#' package = "OlinkAnalyze")
#'
#' read_NPX(filename = file,
Expand Down
24 changes: 23 additions & 1 deletion OlinkAnalyze/R/read_npx_format.R
Original file line number Diff line number Diff line change
Expand Up @@ -496,7 +496,9 @@ read_npx_format_get_format <- function(df,

}

# Check if the detected file format matches the customer input ----
# Checks ----

## Check if the detected file format matches the customer input ----

if (!is.null(long_format)) {

Expand Down Expand Up @@ -547,6 +549,26 @@ read_npx_format_get_format <- function(df,

}

## Check that long format data do not have NA colnames ----

check_na_colname <- df[1L, ] |>
as.character() |>
is.na() |>
any() && (is_long_format == TRUE)

if (check_na_colname) {

cli::cli_abort(
message = c(
"x" = "`NA` column names in long format file: {.file {file}}!",
"i" = "Please inspect the input file!"
),
call = rlang::caller_env(),
wrap = FALSE
)

}

# Return ----

# return the cells of the data that determine the file format and a boolean
Expand Down
5 changes: 3 additions & 2 deletions OlinkAnalyze/R/read_npx_wide.R
Original file line number Diff line number Diff line change
Expand Up @@ -866,8 +866,9 @@ read_npx_wide_top <- function(df,

## check df_top_oid ----

# no NAs are allowed in df_top_oid
if (any(is.na(df_top_oid))) {
# no NAs are allowed in df_top_oid in any column other than "Uniprot ID"
# the latter because the assay NT-proBNP does not have a Uniprot ID
if (any(is.na(dplyr::select(df_top_oid, -dplyr::all_of("Uniprot ID"))))) {

cli::cli_abort(
message = c(
Expand Down
Binary file modified OlinkAnalyze/R/sysdata.rda
Binary file not shown.
Binary file added OlinkAnalyze/data-raw/ref_manifest.rds
Binary file not shown.
Binary file added OlinkAnalyze/data-raw/ref_npx_data1.rds
Binary file not shown.
Binary file added OlinkAnalyze/data-raw/ref_npx_data2.rds
Binary file not shown.
78 changes: 78 additions & 0 deletions OlinkAnalyze/data-raw/sample_manifest.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# readme ----

# This script creates a synthetic manifest file to generate the sample manifest
# data/manifest.rda which is used throughout OlinkAnalyze.
#
# As this script did not exist prior to 2024-04-08, we have stored the original
# manifest.rds file under data-raw/ref_manifest.rds to compare to the dataset
# generated by this script.
#
# A new data/manifest.rda will be generated ONLY IF manifest from this script
# matches ref_manifest!
#

# manifest ----

## create random manifest ----

# Study design: every subject is seen at every visit, so the manifest has
# n_subject * n_visit rows.
n_subject <- 23L
n_visit <- 6L

# Number of subjects per site. When n_subject is odd, Site1 receives the extra
# subject.
n_per_site <- c(Site1 = ceiling(n_subject / 2), Site2 = floor(n_subject / 2))

manifest <- dplyr::tibble(
  # one letter per subject, each repeated once per visit
  SubjectID = rep(x = LETTERS[seq_len(n_subject)], each = n_visit),
  # visit numbers cycle 1..n_visit within each subject
  Visit = rep(x = seq_len(n_visit), times = n_subject),
  # "<SubjectID> <Visit>", joined with the default separator of paste()
  SampleID = paste(SubjectID, Visit),
  # rows are ordered by subject, so the sites form two contiguous runs
  Site = rep(x = names(n_per_site), times = n_per_site * n_visit)
)

# clean up
rm(n_subject, n_visit, n_per_site)

# compare to reference manifest ----

## load reference manifest ----

# Read the stored reference manifest. mustWork = TRUE makes system.file() fail
# when the file cannot be located instead of returning an empty string.
ref_manifest <- readRDS(
  file = system.file("data-raw",
                     "ref_manifest.rds",
                     package = "OlinkAnalyze",
                     mustWork = TRUE)
)

## check columns ----

# the generated manifest must have exactly the same column names, in the same
# order, as the reference
stopifnot(
  identical(colnames(manifest), colnames(ref_manifest))
)

# check identical ----

# At this stage manifest should match the reference dataset ref_manifest.
# Differences that all.equal() measures as smaller than `tolerance` (1e-4) are
# ignored.
#
# all.equal() returns TRUE on a match and otherwise a character vector that
# describes the differences. Its result is therefore kept, collapsed to a
# single logical with isTRUE(), and the differences are reported when the
# comparison fails (instead of being discarded inside stopifnot()).

manifest_diff <- all.equal(target = ref_manifest,
                           current = manifest,
                           tolerance = 1e-4,
                           check.attributes = TRUE,
                           check.names = TRUE)

manifest_eq <- isTRUE(manifest_diff)

if (!manifest_eq) {
  stop(
    "manifest does not match ref_manifest:\n",
    paste0("  ", manifest_diff, collapse = "\n"),
    call. = FALSE
  )
}

# clean up
rm(manifest_diff)

#### IMPORTANT
# It is extremely important that the two datasets are identical with some minor
# rounding tolerance!!

# save to data/manifest.rda ----

# manifest_eq is always TRUE here when the script is sourced, because a
# mismatch stops above. The guard protects against running this section
# interactively after a failed comparison.
if (manifest_eq) {
  usethis::use_data(manifest,
                    overwrite = TRUE,
                    compress = "xz",
                    version = 2L)
}
209 changes: 209 additions & 0 deletions OlinkAnalyze/data-raw/sample_npx_data1.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
# readme ----

# This script uses the raw data files:
# 1. inst/extdata/npx_data1_meta_original.csv
# 2. inst/extdata/npx_data1_original.xlsx
# to generate the sample dataset data/npx_data1.rda which is used throughout
# OlinkAnalyze.
#
# As this script did not exist prior to 2024-04-08, we have stored the original
# npx_data1.rds file under data-raw/ref_npx_data1.rds to compare to the dataset
# generated by this script.
#
# A new data/npx_data1.rda will be generated ONLY IF npx_data1 from this script
# matches ref_npx_data1!
#

# manifest ----

## load manifest ----

# NOTE(review): the readme at the top of this script lists
# npx_data1_meta_original.csv, but the file read here is npx_data1_meta.csv -
# confirm which name is current.
manifest_data1_file <- system.file("extdata",
                                   "npx_data1_meta.csv",
                                   package = "OlinkAnalyze",
                                   mustWork = TRUE)

## read and modify manifest ----

manifest_data1 <- manifest_data1_file |>
  # semicolon-delimited file with a header row; empty cells and the string
  # "NA" are read as missing values
  read.delim(header = TRUE, sep = ";", na.strings = c("", "NA")) |>
  # drop duplicated rows
  dplyr::distinct() |>
  dplyr::mutate(
    # every column becomes a character vector
    dplyr::across(dplyr::everything(), as.character),
    # every non-missing project is renamed to "data1" to match the reference;
    # missing projects stay missing
    Project = dplyr::if_else(!is.na(.data$Project), "data1", NA_character_)
  )

## clean up
rm(manifest_data1_file)

# npx_data1 ----

# note that this data frame is quite large and for the purposes of this package
# we will use only 2 panels.
#
# we want the outcome from this section to be identical to the reference data
# frame npx_data1

## load npx_data1 ----

# NOTE(review): the readme at the top of this script lists
# npx_data1_original.xlsx, but the file read here is npx_data1.xlsx - confirm
# which name is current.
npx_data1 <- read_npx(
  filename = system.file("extdata",
                         "npx_data1.xlsx",
                         package = "OlinkAnalyze",
                         mustWork = TRUE),
  out_df = "tibble",
  long_format = FALSE,
  data_type = "NPX",
  olink_platform = "Target 96"
)
# Per the original author's note, two warnings from this call can be ignored:
#   1. a warning about 2 duplicate samples, which is driven by control samples
#   2. a warning that the Olink platform could not be determined from the file
#      and that the user input "Target 96" should be accepted

## modify npx_data1, join with manifest and order ----

npx_data1 <- npx_data1 |>
  # keep only data from 2 panels: cardiometabolic and inflammation
  dplyr::filter(
    .data$Panel %in% c("Olink CARDIOMETABOLIC", "Olink INFLAMMATION")
  ) |>
  dplyr::mutate(
    # title case: first letter of every word capital, the rest lower case
    Panel = stringr::str_to_title(string = .data$Panel),
    # Panel_Version is NA from read_npx as it cannot be determined from the
    # input file, so it is filled in manually from the title-cased panel name
    Panel_Version = dplyr::case_match(
      .data$Panel,
      "Olink Cardiometabolic" ~ "v.1201",
      "Olink Inflammation" ~ "v.1002",
      .default = NA_character_
    ),
    # NPX, LOD and MissingFreq become numeric with 5 significant digits
    dplyr::across(
      dplyr::all_of(c("NPX", "LOD", "MissingFreq")),
      \(x) signif(as.numeric(x), digits = 5L)
    )
  ) |>
  # remove the column that is missing from the reference npx_data1
  dplyr::select(
    !dplyr::all_of("Olink NPX Signature Version")
  ) |>
  # bring in sample info from the manifest; each sample may appear on many
  # rows of npx_data1 but must match at most one manifest row
  dplyr::inner_join(
    manifest_data1,
    by = "SampleID",
    relationship = "many-to-one"
  ) |>
  # order rows to match the reference npx_data1
  dplyr::arrange(
    .data$OlinkID, .data$PlateID, .data$SampleID
  )

# clean up
rm(manifest_data1)

# compare to reference npx_data1 ----

## load reference npx_data1 ----

# Read the stored reference dataset. mustWork = TRUE makes system.file() fail
# when the file cannot be located instead of returning an empty string.
ref_npx_data1 <- readRDS(
  file = system.file("data-raw",
                     "ref_npx_data1.rds",
                     package = "OlinkAnalyze",
                     mustWork = TRUE)
)

## check columns ----

stopifnot(
  # every generated column must exist in the reference
  all(colnames(npx_data1) %in% colnames(ref_npx_data1)),
  # expected widths: 16 generated columns, 17 reference columns
  ncol(npx_data1) == 16L,
  ncol(ref_npx_data1) == 17L
)

## modify reference npx_data1 ----

ref_npx_data1 <- ref_npx_data1 |>
  # keep only the columns present in npx_data1, in the column order of
  # npx_data1. per the original note this is expected to drop only the column
  # "Index" from the reference
  dplyr::select(
    dplyr::all_of(colnames(npx_data1))
  ) |>
  # same row order as npx_data1
  dplyr::arrange(
    .data$OlinkID, .data$PlateID, .data$SampleID
  )

# check identical ----

# At this stage npx_data1 should match the reference dataset ref_npx_data1.
# Differences that all.equal() measures as smaller than `tolerance` (1e-4) are
# ignored.
#
# all.equal() returns TRUE on a match and otherwise a character vector that
# describes the differences. Its result is therefore kept, collapsed to a
# single logical with isTRUE(), and the differences are reported when the
# comparison fails (instead of being discarded inside stopifnot()).

npx_data1_diff <- all.equal(target = ref_npx_data1,
                            current = npx_data1,
                            tolerance = 1e-4,
                            check.attributes = TRUE,
                            check.names = TRUE)

npx_data1_eq <- isTRUE(npx_data1_diff)

if (!npx_data1_eq) {
  stop(
    "npx_data1 does not match ref_npx_data1:\n",
    paste0("  ", npx_data1_diff, collapse = "\n"),
    call. = FALSE
  )
}

# clean up
rm(npx_data1_diff)

#### IMPORTANT
# It is extremely important that the two datasets are identical with some minor
# rounding tolerance!!

# save to data/npx_data1.rda ----

# npx_data1_eq is always TRUE here when the script is sourced, because a
# mismatch stops above. The guard protects against running this section
# interactively after a failed comparison.
if (npx_data1_eq) {
  usethis::use_data(npx_data1,
                    overwrite = TRUE,
                    compress = "xz",
                    version = 2L)
}
Loading

0 comments on commit 4270d6e

Please sign in to comment.