diff --git a/OlinkAnalyze/DESCRIPTION b/OlinkAnalyze/DESCRIPTION index d5fe32ab..a0df5bbb 100644 --- a/OlinkAnalyze/DESCRIPTION +++ b/OlinkAnalyze/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: OlinkAnalyze Title: Facilitate Analysis of Proteomic Data from Olink -Version: 3.8.2 +Version: 3.9.0 Authors@R: c( person("Kathleen", "Nevola", , "biostattools@olink.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-5183-6444", Github = "kathy-nevola")), @@ -27,6 +27,8 @@ Authors@R: c( comment = c(Github = "leiliuC")), person("Kristyn", "Chin", role = "aut", comment = c(Github = "kristynchin-olink")), + person("Danai", "Topouza", role = "aut", + comment = c(Github = "dtopouza", ORCID = "0000-0002-6897-9281")), person("Kristian", "Hodén", role = "ctb", comment = c(ORCID = "0000-0003-0354-0662", Github = "kristianHoden")), person("Per", "Eriksson", role = "ctb", @@ -43,8 +45,6 @@ Authors@R: c( comment = c(Github = "olofmansson")), person("Ola", "Caster", role = "ctb", comment = c(Github = "OlaCaster")), - person("Danai", "Topouza", role = "ctb", - comment = c(Github = "dtopouza", ORCID = "0000-0002-6897-9281")), person("Olink", role = c("cph", "fnd")) ) Description: A collection of functions to facilitate analysis of proteomic diff --git a/OlinkAnalyze/NEWS.md b/OlinkAnalyze/NEWS.md index 736fb720..8aa7c62f 100644 --- a/OlinkAnalyze/NEWS.md +++ b/OlinkAnalyze/NEWS.md @@ -1,3 +1,20 @@ +# Olink Analyze 3.9.0 +## Minor Changes +* Explore HT recommended bridging samples have been added to Introduction to Bridging tutorial (#409, @kathy-nevola) +* Support for CSVs with SampleQC column was added to read_NPX (#406, @kathy-nevola) +* Support for Olink Analyze Export parquets was added to read_NPX (#408, @kathy-nevola) +* Quantitative value csvs will now give a warning about limited support for Quant data (#406, @kathy-nevola) +* Instructions for importing multiple NPX files has been added to the overview tutorial (#403, @dtopouza) +* Additional background information was added to the LOD tutorial to clarify how LOD is calculated from counts (#404, @kathy-nevola) +* LOD can now be calculated using fixed LOD, negative controls, or both methods (#390, @kathy-nevola) +* An error message will now appear when running anova and control assays are present (#416, @dtopouza) +* Danai Topouza's role has been changed from contributor to author (#415, @kathy-nevola) + +## Bug Fixes +* URLs in tutorials will now direct to updated olink.com locations (#402, @kathy-nevola) +* Instructions to export parquet files with LOD have been updated (#408, @kathy-nevola) +* removed scale_name argument when ggplot2 3.5+ is installed (#421, @kathy-nevola) + # Olink Analyze 3.8.2 ## Bug Fixes * update to URL hyperlink in LOD tutorial to include https diff --git a/OlinkAnalyze/R/Olink_anova.R b/OlinkAnalyze/R/Olink_anova.R index 55bf9eab..97258ddc 100644 --- a/OlinkAnalyze/R/Olink_anova.R +++ b/OlinkAnalyze/R/Olink_anova.R @@ -5,6 +5,8 @@ #'Samples that have no variable information or missing factor levels are automatically removed from the analysis (specified in a message if verbose = TRUE). #'Character columns in the input dataframe are automatically converted to factors (specified in a message if verbose = TRUE). #'Numerical variables are not converted to factors. +#'Control samples should be removed before using this function. +#'Control assays (AssayType is not "assay", or Assay contains "control" or "ctrl") should be removed before using this function. #'If a numerical variable is to be used as a factor, this conversion needs to be done on the dataframe before the function call. \cr\cr #'Crossed analysis, i.e. A*B formula notation, is inferred from the variable argument in the following cases: \cr #'\itemize{ @@ -54,7 +56,7 @@ #' #' library(dplyr) #' -#' npx_df <- npx_data1 %>% filter(!grepl('control',SampleID, ignore.case = TRUE)) +#' npx_df <- npx_data1 |> filter(!grepl('control|ctrl',SampleID, ignore.case = TRUE)) #' #' #One-way ANOVA, no covariates. #' #Results in a model NPX~Time @@ -106,10 +108,28 @@ olink_anova <- function(df, stop('The df and variable arguments need to be specified.') } + # Stop if internal controls (assays) have not been removed + if ("AssayType" %in% names(df)) { + if (any(df$AssayType != "assay")) { + ctrl_assays <- df |> + dplyr::filter(AssayType != "assay") + + stop(paste0('Control assays have not been removed from the dataset.\n Assays with AssayType != "assay" should be excluded.\n The following ', length(unique(ctrl_assays$Assay)) ,' control assays were found:\n ', + paste(strwrap(toString(unique(ctrl_assays$Assay)), width = 80), collapse = "\n"))) + } + } else if (any(stringr::str_detect(df$Assay, stringr::regex("control|ctrl", ignore_case = TRUE)))) { + ctrl_assays <- df |> + dplyr::filter(stringr::str_detect(df$Assay, stringr::regex("control|ctrl", ignore_case = TRUE))) + + stop(paste0('Control assays have not been removed from the dataset.\n Assays with "control" in their Assay field should be excluded.\n The following ', length(unique(ctrl_assays$Assay)) ,' control assays were found:\n ', + paste(strwrap(toString(unique(ctrl_assays$Assay)), width = 80), collapse = "\n"))) + } + + withCallingHandlers({ - + #Filtering on valid OlinkID - df <- df %>% + df <- df |> dplyr::filter(stringr::str_detect(OlinkID, "OID[0-9]{5}")) @@ -176,13 +196,13 @@ olink_anova <- function(df, for(effect in single_fixed_effects){ - current_nas <- df %>% - dplyr::filter(!(OlinkID %in% npxCheck$all_nas)) %>% #Exclude assays that have all NA:s - dplyr::group_by(OlinkID, !!rlang::ensym(effect)) %>% - dplyr::summarise(n = dplyr::n(), n_na = sum(is.na(NPX))) %>% - dplyr::ungroup() %>% - dplyr::filter(n == n_na) %>% - dplyr::distinct(OlinkID) %>% + current_nas <- df |> + dplyr::filter(!(OlinkID %in% npxCheck$all_nas)) |> #Exclude assays that have all NA:s + dplyr::group_by(OlinkID, !!rlang::ensym(effect)) |> + dplyr::summarise(n = dplyr::n(), n_na = sum(is.na(NPX))) |> + dplyr::ungroup() |> + dplyr::filter(n == n_na) |> + dplyr::distinct(OlinkID) |> dplyr::pull(OlinkID) @@ -198,12 +218,12 @@ olink_anova <- function(df, call. = FALSE) } - number_of_samples_w_more_than_one_level <- df %>% - dplyr::group_by(SampleID) %>% - dplyr::summarise(n_levels = dplyr::n_distinct(!!rlang::ensym(effect), na.rm = TRUE)) %>% - dplyr::ungroup() %>% - dplyr::filter(n_levels > 1) %>% - nrow(.) + number_of_samples_w_more_than_one_level <- df |> + dplyr::group_by(SampleID) |> + dplyr::summarise(n_levels = dplyr::n_distinct(!!rlang::ensym(effect), na.rm = TRUE)) |> + dplyr::ungroup() |> + dplyr::filter(n_levels > 1) |> + nrow() if (number_of_samples_w_more_than_one_level > 0) { stop(paste0("There are ", @@ -259,7 +279,7 @@ olink_anova <- function(df, if(!is.null(covariates) & any(grepl(":", covariates))){ - covariate_filter_string <- covariates[str_detect(covariates, ':')] + covariate_filter_string <- covariates[stringr::str_detect(covariates, ':')] covariate_filter_string <- sub("(.*)\\:(.*)$", "\\2:\\1", covariate_filter_string) covariate_filter_string <- c(covariates, covariate_filter_string) @@ -267,35 +287,33 @@ olink_anova <- function(df, covariate_filter_string <- covariates } - p.val <- df %>% - dplyr::filter(!(OlinkID %in% npxCheck$all_nas)) %>% #Exclude assays that have all NA:s - dplyr::filter(!(OlinkID %in% nas_in_var)) %>% - dplyr::group_by(Assay, OlinkID, UniProt, Panel) %>% - dplyr::do(generics::tidy(car::Anova(stats::lm(as.formula(formula_string), - data=., - contrasts = sapply(fact.vars,function(x) return(contr.sum), - simplify = FALSE)),type=3))) %>% - - dplyr::ungroup() %>% - dplyr::filter(!term %in% c('(Intercept)','Residuals')) %>% - dplyr::mutate(covariates = term %in% covariate_filter_string) %>% - dplyr::group_by(covariates) %>% - dplyr::mutate(Adjusted_pval = p.adjust(p.value,method="fdr")) %>% - dplyr::mutate(Threshold = ifelse(Adjusted_pval<0.05,"Significant","Non-significant")) %>% + p.val <- df |> + dplyr::filter(!(OlinkID %in% npxCheck$all_nas)) |> #Exclude assays that have all NA:s + dplyr::filter(!(OlinkID %in% nas_in_var)) |> + dplyr::group_by(Assay, OlinkID, UniProt, Panel) |> + dplyr::group_modify(~ internal_anova(x = .x, + formula_string = formula_string, + fact.vars = fact.vars))|> + dplyr::ungroup()|> + dplyr::filter(!term %in% c('(Intercept)','Residuals')) |> + dplyr::mutate(covariates = term %in% covariate_filter_string) |> + dplyr::group_by(covariates) |> + dplyr::mutate(Adjusted_pval = p.adjust(p.value,method="fdr")) |> + dplyr::mutate(Threshold = ifelse(Adjusted_pval<0.05,"Significant","Non-significant")) |> dplyr::mutate(Adjusted_pval = ifelse(covariates,NA,Adjusted_pval), - Threshold = ifelse(covariates,NA,Threshold)) %>% - dplyr::ungroup() %>% - dplyr::select(-covariates) %>% - dplyr::mutate(meansq=sumsq/df) %>% + Threshold = ifelse(covariates,NA,Threshold)) |> + dplyr::ungroup() |> + dplyr::select(-covariates) |> + dplyr::mutate(meansq=sumsq/df) |> dplyr::select(Assay,OlinkID,UniProt,Panel,term,df,sumsq, - meansq,statistic,p.value,Adjusted_pval,Threshold) %>% + meansq,statistic,p.value,Adjusted_pval,Threshold) |> dplyr::arrange(Adjusted_pval) if(return.covariates){ return(p.val) } else{ - return(p.val %>% + return(p.val |> dplyr::filter(!term%in%covariate_filter_string)) } @@ -310,6 +328,8 @@ olink_anova <- function(df, #'Performs a post hoc ANOVA test using emmeans::emmeans with Tukey p-value adjustment per assay (by OlinkID) for each panel at confidence level 0.95. #'See \code{olink_anova} for details of input notation. \cr\cr #'The function handles both factor and numerical variables and/or covariates. +#'Control samples should be removed before using this function. +#'Control assays (AssayType is not "assay", or Assay contains "control" or "ctrl") should be removed before using this function. #'The posthoc test for a numerical variable compares the difference in means of the outcome variable (default: NPX) for 1 standard deviation difference in the numerical variable, e.g. #'mean NPX at mean(numerical variable) versus mean NPX at mean(numerical variable) + 1*SD(numerical variable). #' @@ -349,7 +369,7 @@ olink_anova <- function(df, #' #' library(dplyr) #' -#' npx_df <- npx_data1 %>% filter(!grepl('control',SampleID, ignore.case = TRUE)) +#' npx_df <- npx_data1 |> filter(!grepl('control|ctrl',SampleID, ignore.case = TRUE)) #' #' #Two-way ANOVA, one main effect (Site) covariate. #' #Results in model NPX~Treatment*Time+Site. @@ -361,10 +381,10 @@ olink_anova <- function(df, #' #on the interaction effect Treatment:Time with covariate Site. #' #' #Filtering out significant and relevant results. -#' significant_assays <- anova_results %>% -#' filter(Threshold == 'Significant' & term == 'Treatment:Time') %>% -#' select(OlinkID) %>% -#' distinct() %>% +#' significant_assays <- anova_results |> +#' filter(Threshold == 'Significant' & term == 'Treatment:Time') |> +#' select(OlinkID) |> +#' distinct() |> #' pull() #' #' #Posthoc, all pairwise comparisons @@ -443,18 +463,39 @@ olink_anova_posthoc <- function(df, stop("All effect terms must be included in the variable argument or model formula.") } + # Stop if internal controls (assays) have not been removed + if ("AssayType" %in% names(df)) { + if (any(df$AssayType != "assay")) { + ctrl_assays <- df |> + dplyr::filter(AssayType != "assay") + + stop(paste0( + 'Control assays have not been removed from the dataset.\n Assays with AssayType != "assay" should be excluded.\n The following ', length(unique(ctrl_assays$Assay)), " control assays were found:\n ", + paste(strwrap(toString(unique(ctrl_assays$Assay)), width = 80), collapse = "\n") + )) + } + } else if (any(stringr::str_detect(df$Assay, stringr::regex("control|ctrl", ignore_case = TRUE)))) { + ctrl_assays <- df |> + dplyr::filter(stringr::str_detect(df$Assay, stringr::regex("control|ctrl", ignore_case = TRUE))) + + stop(paste0( + 'Control assays have not been removed from the dataset.\n Assays with "control" in their Assay field should be excluded.\n The following ', length(unique(ctrl_assays$Assay)), " control assays were found:\n ", + paste(strwrap(toString(unique(ctrl_assays$Assay)), width = 80), collapse = "\n") + )) + } + withCallingHandlers({ - + #Filtering on valid OlinkID - df <- df %>% + df <- df |> dplyr::filter(stringr::str_detect(OlinkID, "OID[0-9]{5}")) if(is.null(olinkid_list)){ - olinkid_list <- df %>% - dplyr::select(OlinkID) %>% - dplyr::distinct() %>% + olinkid_list <- df |> + dplyr::select(OlinkID) |> + dplyr::distinct() |> dplyr::pull() } @@ -547,37 +588,37 @@ olink_anova_posthoc <- function(df, e_form <- as.formula(paste0("pairwise~", paste(effect,collapse="+"))) } - anova_posthoc_results <- df %>% - dplyr::filter(OlinkID %in% olinkid_list) %>% - dplyr::filter(!(OlinkID %in% npxCheck$all_nas)) %>% #Exclude assays that have all NA:s - dplyr::mutate(OlinkID = factor(OlinkID, levels = olinkid_list)) %>% - dplyr::group_by(Assay, OlinkID, UniProt, Panel) %>% + anova_posthoc_results <- df |> + dplyr::filter(OlinkID %in% olinkid_list) |> + dplyr::filter(!(OlinkID %in% npxCheck$all_nas)) |> #Exclude assays that have all NA:s + dplyr::mutate(OlinkID = factor(OlinkID, levels = olinkid_list)) |> + dplyr::group_by(Assay, OlinkID, UniProt, Panel) |> dplyr::do(data.frame(emmeans::emmeans(stats::lm(as.formula(formula_string),data=.), specs=e_form, cov.reduce = function(x) round(c(mean(x),mean(x)+sd(x)),4), infer=c(TRUE,TRUE), adjust=post_hoc_padjust_method)[[c("contrasts","emmeans")[1+as.numeric(mean_return)]]], - stringsAsFactors=FALSE)) %>% - dplyr::ungroup() %>% - dplyr::mutate(term=paste(effect,collapse=":")) %>% + stringsAsFactors=FALSE)) |> + dplyr::ungroup() |> + dplyr::mutate(term=paste(effect,collapse=":")) |> dplyr::rename(conf.low=lower.CL, conf.high=upper.CL) if(mean_return){ - anova_posthoc_results <- anova_posthoc_results %>% + anova_posthoc_results <- anova_posthoc_results |> dplyr::select(all_of(c("Assay", "OlinkID", "UniProt", "Panel", "term", effect, "emmean", "conf.low", "conf.high"))) } else if(!mean_return){ - anova_posthoc_results <- anova_posthoc_results %>% - dplyr::rename(Adjusted_pval = p.value) %>% - dplyr::arrange(Adjusted_pval) %>% + anova_posthoc_results <- anova_posthoc_results |> + dplyr::rename(Adjusted_pval = p.value) |> + dplyr::arrange(Adjusted_pval) |> dplyr::mutate(Threshold = if_else(Adjusted_pval < 0.05, 'Significant', - 'Non-significant')) %>% + 'Non-significant')) |> dplyr::select(tidyselect::any_of(c("Assay", "OlinkID", "UniProt", "Panel", "term", "contrast", effect, "estimate", "conf.low", "conf.high", "Adjusted_pval","Threshold"))) - if(post_hoc_padjust_method=="none") anova_posthoc_results <- anova_posthoc_results %>% rename(pvalue=Adjusted_pval) + if(post_hoc_padjust_method=="none") anova_posthoc_results <- anova_posthoc_results |> rename(pvalue=Adjusted_pval) } return(anova_posthoc_results) @@ -588,3 +629,20 @@ olink_anova_posthoc <- function(df, }) } + + +#' Internal Anova function +#' +#' @param x grouped data frame +#' @param formula_string anova formula +#' @param fact.vars variables in factor form +#' +#' @return anova results +#' @noRd +internal_anova <- function(x, formula_string, fact.vars){ + generics::tidy(car::Anova(stats::lm(as.formula(formula_string), + data=x, + contrasts = sapply(fact.vars,function(x) return(contr.sum), + simplify = FALSE)),type=3)) +} + diff --git a/OlinkAnalyze/R/Olink_boxplot.R b/OlinkAnalyze/R/Olink_boxplot.R index 5408ad47..19cb48e1 100644 --- a/OlinkAnalyze/R/Olink_boxplot.R +++ b/OlinkAnalyze/R/Olink_boxplot.R @@ -13,7 +13,6 @@ #' @param ... coloroption passed to specify color order #' #' @return A list of objects of class “ggplot” (the actual ggplot object is entry 1 in the list). Box and whisker plot of NPX (y-axis) by variable (x-axis) for each Assay -#' @importFrom magrittr %>% #' @importFrom dplyr filter mutate select #' @importFrom stringr str_detect #' @importFrom tidyr unite @@ -26,12 +25,12 @@ #' \donttest{ #' #' library(dplyr) -#' -#' anova_results <- olink_anova(npx_data1, variable = "Site") -#' significant_assays <- anova_results %>% -#' filter(Threshold == 'Significant') %>% +#' npx_df <- npx_data1 |> filter(!grepl('control|ctrl',SampleID, ignore.case = TRUE)) +#' anova_results <- olink_anova(npx_df, variable = "Site") +#' significant_assays <- anova_results |> +#' filter(Threshold == 'Significant') |> #' pull(OlinkID) -#' olink_boxplot(npx_data1, +#' olink_boxplot(npx_df, #' variable = "Site", #' olinkid_list = significant_assays, #' verbose = TRUE, @@ -87,9 +86,9 @@ olink_boxplot <- function(df, #Filtering on valid OlinkID - df <- df %>% + df <- df |> dplyr::filter(stringr::str_detect(OlinkID, - "OID[0-9]{5}")) %>% + "OID[0-9]{5}")) |> dplyr::filter(!(OlinkID %in% npx_check$all_nas)) #Column setup @@ -146,16 +145,16 @@ olink_boxplot <- function(df, assays_for_plotting <- olinkid_list[c(from_protein:(to_protein-1))] - npx_for_plotting <- df %>% - dplyr::filter(OlinkID %in% assays_for_plotting) %>% - dplyr::mutate(OlinkID = factor(OlinkID, levels = assays_for_plotting)) %>% - dplyr::select(OlinkID, UniProt, Assay, NPX, eval(variable)) %>% - with(., .[order(OlinkID),]) %>% - tidyr::unite(c(Assay, OlinkID), col = 'Name_OID', sep = ' ', remove = FALSE) %>% + npx_for_plotting <- df |> + dplyr::filter(OlinkID %in% assays_for_plotting) |> + dplyr::mutate(OlinkID = factor(OlinkID, levels = assays_for_plotting)) |> + dplyr::select(OlinkID, UniProt, Assay, NPX, eval(variable)) |> + arrange(OlinkID) |> + tidyr::unite(c(Assay, OlinkID), col = 'Name_OID', sep = ' ', remove = FALSE) |> dplyr::mutate(Name_OID = forcats::as_factor(Name_OID)) if(is.null(posthoc_results) && is.null(ttest_results)) { - boxplot <- npx_for_plotting %>% + boxplot <- npx_for_plotting |> ggplot2::ggplot(ggplot2::aes(y = NPX, !!x_variable[[1]])) + ggplot2::geom_boxplot(ggplot2::aes(fill = !!fill_variable[[1]])) + @@ -167,55 +166,55 @@ olink_boxplot <- function(df, } else if (!is.null(posthoc_results) && is.null(ttest_results)){ - star.info <- data.frame(x.vals = levels(npx_for_plotting %>% - dplyr::pull(eval(variable)) %>% + star.info <- data.frame(x.vals = levels(npx_for_plotting |> + dplyr::pull(eval(variable)) |> addNA()), - id = 1:length(levels(npx_for_plotting %>% - dplyr::pull(eval(variable)) %>% - addNA()))) %>% + id = 1:length(levels(npx_for_plotting |> + dplyr::pull(eval(variable)) |> + addNA()))) |> dplyr::mutate(x.vals = replace(x.vals, is.na(x.vals), "NA")) - posthoc.results_temp <- posthoc_results %>% - dplyr::filter(OlinkID %in% assays_for_plotting) %>% - tidyr::unite(c(Assay, OlinkID), col = 'Name_OID', sep = ' ', remove = FALSE) %>% + posthoc.results_temp <- posthoc_results |> + dplyr::filter(OlinkID %in% assays_for_plotting) |> + tidyr::unite(c(Assay, OlinkID), col = 'Name_OID', sep = ' ', remove = FALSE) |> dplyr::mutate(Name_OID = forcats::as_factor(Name_OID)) - scale_inf <- npx_for_plotting %>% - dplyr::group_by(Name_OID) %>% - dplyr::summarise(maxNPX = max(NPX), rangeNPX = diff(range(NPX))) %>% + scale_inf <- npx_for_plotting |> + dplyr::group_by(Name_OID) |> + dplyr::summarise(maxNPX = max(NPX), rangeNPX = diff(range(NPX))) |> dplyr::ungroup() - line.data <- posthoc.results_temp %>% - dplyr::left_join(scale_inf, by = "Name_OID") %>% + line.data <- posthoc.results_temp |> + dplyr::left_join(scale_inf, by = "Name_OID") |> dplyr::mutate(C1=sapply(strsplit(as.character(contrast)," - "),function(x) x[1]), - C2=sapply(strsplit(as.character(contrast)," - "),function(x) x[2])) %>% - dplyr::group_by(Name_OID, contrast) %>% - dplyr::mutate(c.sort=min(C1,C2)) %>% - dplyr::mutate(p.value=paste0(myRound(Adjusted_pval)," Contrast: ", contrast)) %>% - dplyr::ungroup() %>% - dplyr::group_by(Name_OID) %>% - dplyr::arrange(c.sort) %>% - dplyr::mutate(rowNum=n():1) %>% - dplyr::ungroup() %>% - dplyr::mutate(y.anchor = maxNPX + rowNum * rangeNPX *(.5)/max(rowNum)) %>% - dplyr::select(Name_OID,contrast,Adjusted_pval,C1,C2,p.value,Threshold,c.sort,y.anchor) %>% - tidyr::pivot_longer(-c(Name_OID,Threshold,contrast,Adjusted_pval,p.value,c.sort,y.anchor),names_to="tmp",values_to = "x.vals") %>% + C2=sapply(strsplit(as.character(contrast)," - "),function(x) x[2])) |> + dplyr::group_by(Name_OID, contrast) |> + dplyr::mutate(c.sort=min(C1,C2)) |> + dplyr::mutate(p.value=paste0(myRound(Adjusted_pval)," Contrast: ", contrast)) |> + dplyr::ungroup() |> + dplyr::group_by(Name_OID) |> + dplyr::arrange(c.sort) |> + dplyr::mutate(rowNum=n():1) |> + dplyr::ungroup() |> + dplyr::mutate(y.anchor = maxNPX + rowNum * rangeNPX *(.5)/max(rowNum)) |> + dplyr::select(Name_OID,contrast,Adjusted_pval,C1,C2,p.value,Threshold,c.sort,y.anchor) |> + tidyr::pivot_longer(-c(Name_OID,Threshold,contrast,Adjusted_pval,p.value,c.sort,y.anchor),names_to="tmp",values_to = "x.vals") |> dplyr::mutate(Star = dplyr::case_when(Adjusted_pval <0.05 & Adjusted_pval > 0.01 ~ "*", Adjusted_pval <= 0.01 & Adjusted_pval > 0.005 ~ "**", Adjusted_pval <= 0.005 ~ "***", - Adjusted_pval >= 0.05 ~ NA_character_)) %>% - dplyr::left_join(star.info, by = "x.vals") %>% - dplyr::group_by(contrast,Name_OID) %>% - dplyr::mutate(x.m = sum(id)/2) %>% - dplyr::ungroup() %>% + Adjusted_pval >= 0.05 ~ NA_character_)) |> + dplyr::left_join(star.info, by = "x.vals") |> + dplyr::group_by(contrast,Name_OID) |> + dplyr::mutate(x.m = sum(id)/2) |> + dplyr::ungroup() |> dplyr::filter(Threshold=="Significant") - boxplot <- npx_for_plotting %>% + boxplot <- npx_for_plotting |> ggplot2::ggplot(ggplot2::aes(y = NPX, x = !!rlang::ensym(variable))) + ggplot2::geom_boxplot(ggplot2::aes(fill = !!rlang::ensym(variable))) + ggplot2::geom_line(data=line.data, ggplot2::aes(x = x.vals, y = y.anchor, group = p.value)) + - ggplot2::geom_text(data=line.data %>% dplyr::filter(tmp == "C1"), ggplot2::aes(group = p.value, x = x.m, y = y.anchor+0.1, label = Star)) + + ggplot2::geom_text(data=line.data |> dplyr::filter(tmp == "C1"), ggplot2::aes(group = p.value, x = x.m, y = y.anchor+0.1, label = Star)) + OlinkAnalyze::set_plot_theme() + OlinkAnalyze::olink_fill_discrete(...) + ggplot2::theme(axis.ticks.x = element_blank(), @@ -224,58 +223,58 @@ olink_boxplot <- function(df, } else if (is.null(posthoc_results) && !is.null(ttest_results)){ - star.info <- data.frame(x.vals = npx_for_plotting %>% - dplyr::pull(eval(variable)) %>% + star.info <- data.frame(x.vals = npx_for_plotting |> + dplyr::pull(eval(variable)) |> unique(), - id = 1:length(npx_for_plotting %>% - dplyr::pull(eval(variable)) %>% - unique())) %>% + id = 1:length(npx_for_plotting |> + dplyr::pull(eval(variable)) |> + unique())) |> dplyr::mutate(x.vals = replace(x.vals, is.na(x.vals), "NA")) - ttest_results_temp <- ttest_results %>% - dplyr::filter(OlinkID %in% assays_for_plotting) %>% - tidyr::unite(c(Assay, OlinkID), col = 'Name_OID', sep = ' ', remove = FALSE) %>% + ttest_results_temp <- ttest_results |> + dplyr::filter(OlinkID %in% assays_for_plotting) |> + tidyr::unite(c(Assay, OlinkID), col = 'Name_OID', sep = ' ', remove = FALSE) |> dplyr::mutate(Name_OID = forcats::as_factor(Name_OID)) - scale_inf <- npx_for_plotting %>% - dplyr::group_by(Name_OID) %>% - dplyr::summarise(maxNPX = max(NPX), rangeNPX = diff(range(NPX))) %>% + scale_inf <- npx_for_plotting |> + dplyr::group_by(Name_OID) |> + dplyr::summarise(maxNPX = max(NPX), rangeNPX = diff(range(NPX))) |> dplyr::ungroup() - ttest_variable <- npx_for_plotting %>% - dplyr::select(!!rlang::ensym(variable)) %>% - unique() %>% - na.omit() %>% + ttest_variable <- npx_for_plotting |> + dplyr::select(!!rlang::ensym(variable)) |> + unique() |> + na.omit() |> dplyr::pull(!!rlang::ensym(variable)) - line.data <- ttest_results_temp %>% - dplyr::select(Name_OID, Assay, OlinkID, UniProt, Panel, Adjusted_pval,Threshold) %>% + line.data <- ttest_results_temp |> + dplyr::select(Name_OID, Assay, OlinkID, UniProt, Panel, Adjusted_pval,Threshold) |> dplyr::mutate(C1 = ttest_variable[1], - C2 = ttest_variable[2]) %>% - dplyr::left_join(scale_inf, by = "Name_OID") %>% - dplyr::group_by(Name_OID) %>% - dplyr::mutate(c.sort=min(C1,C2)) %>% - dplyr::ungroup() %>% - dplyr::mutate(y.anchor = maxNPX + rangeNPX *(.2)) %>% - dplyr::select(Name_OID,Adjusted_pval,C1,C2,Threshold,c.sort,y.anchor) %>% - tidyr::pivot_longer(-c(Name_OID, Threshold, Adjusted_pval, c.sort, y.anchor), names_to = "tmp", values_to = "x.vals") %>% + C2 = ttest_variable[2]) |> + dplyr::left_join(scale_inf, by = "Name_OID") |> + dplyr::group_by(Name_OID) |> + dplyr::mutate(c.sort=min(C1,C2)) |> + dplyr::ungroup() |> + dplyr::mutate(y.anchor = maxNPX + rangeNPX *(.2)) |> + dplyr::select(Name_OID,Adjusted_pval,C1,C2,Threshold,c.sort,y.anchor) |> + tidyr::pivot_longer(-c(Name_OID, Threshold, Adjusted_pval, c.sort, y.anchor), names_to = "tmp", values_to = "x.vals") |> dplyr::mutate(Star = case_when(Adjusted_pval <0.05 & Adjusted_pval > 0.01 ~ "*", Adjusted_pval <= 0.01 & Adjusted_pval > 0.005 ~ "**", Adjusted_pval <= 0.005 ~ "***", - Adjusted_pval >= 0.05 ~ NA_character_)) %>% - dplyr::left_join(star.info, by = "x.vals") %>% - dplyr::group_by(Name_OID) %>% - dplyr::mutate(x.m = sum(id)/2) %>% - dplyr::ungroup() %>% + Adjusted_pval >= 0.05 ~ NA_character_)) |> + dplyr::left_join(star.info, by = "x.vals") |> + dplyr::group_by(Name_OID) |> + dplyr::mutate(x.m = sum(id)/2) |> + dplyr::ungroup() |> dplyr::filter(Threshold=="Significant") - boxplot <- npx_for_plotting %>% + boxplot <- npx_for_plotting |> ggplot2::ggplot(ggplot2::aes(y = NPX, x = !!rlang::ensym(variable))) + ggplot2::geom_boxplot(aes(fill = !!rlang::ensym(variable))) + ggplot2::geom_line(data=line.data, ggplot2::aes(x = x.vals, y = y.anchor, group = Name_OID )) + - ggplot2::geom_text(data=line.data %>% dplyr::filter(tmp == "C1"),aes(group = Name_OID , x = x.m, y = y.anchor+0.1, label = Star)) + + ggplot2::geom_text(data=line.data |> dplyr::filter(tmp == "C1"),aes(group = Name_OID , x = x.m, y = y.anchor+0.1, label = Star)) + OlinkAnalyze::set_plot_theme() + OlinkAnalyze::olink_fill_discrete(...) + ggplot2::theme(axis.ticks.x = element_blank(), diff --git a/OlinkAnalyze/R/Olink_color.R b/OlinkAnalyze/R/Olink_color.R index 58981a83..9aa01742 100644 --- a/OlinkAnalyze/R/Olink_color.R +++ b/OlinkAnalyze/R/Olink_color.R @@ -144,8 +144,17 @@ olink_pal <- function(alpha = 1, coloroption = NULL) { olink_color_discrete <- function(..., alpha = 1, coloroption = NULL) { + # Add support for older and newer versions of ggplot + if (utils::packageVersion("ggplot2")< "3.5.0"){ + ggplot2::discrete_scale(aesthetics = "colour", + scale_name = "olink", + palette = olink_pal(alpha, coloroption), ...) + } else{ + ggplot2::discrete_scale(aesthetics = "colour", + palette = olink_pal(alpha, coloroption), ...) + } + - ggplot2::discrete_scale(aesthetics = "colour", scale_name = 'olink', palette = olink_pal(alpha, coloroption), ...) } @@ -202,8 +211,14 @@ olink_color_gradient <- function(..., alpha = 1, coloroption = NULL) { #' @importFrom ggplot2 discrete_scale olink_fill_discrete <- function(..., alpha = 1, coloroption = NULL) { - ggplot2::discrete_scale(aesthetics = "fill", scale_name = 'olink', + if (utils::packageVersion("ggplot2")< "3.5.0"){ + ggplot2::discrete_scale(aesthetics = "fill", + scale_name = "olink", palette = olink_pal(alpha, coloroption), ...) + } else { + ggplot2::discrete_scale(aesthetics = "fill", + palette = olink_pal(alpha, coloroption), ...) + } } #' Olink fill scale for continuous ggplots diff --git a/OlinkAnalyze/R/Read_NPX_data.R b/OlinkAnalyze/R/Read_NPX_data.R index ed8cbd9a..6cbd9bd0 100644 --- a/OlinkAnalyze/R/Read_NPX_data.R +++ b/OlinkAnalyze/R/Read_NPX_data.R @@ -76,8 +76,10 @@ read_NPX_explore <- function(filename) { # Check that all column names are present # We have had 5 column names versions so far: 1, 1.1 and 2, 2.1, and 3 # please add newer versions to the list chronologically - header_standard <- c("SampleID", "OlinkID", "UniProt", "Assay", - "Panel", "PlateID", "QC_Warning", "NPX") + header_base <- c("SampleID", "OlinkID", "UniProt", "Assay", + "Panel", "PlateID", "NPX") + + header_standard <- c(header_base, "QC_Warning") header_quant_standard <- c("SampleID", "OlinkID", "UniProt", "Assay", "Panel", "PlateID", "QC_Warning") @@ -190,7 +192,41 @@ read_NPX_explore <- function(filename) { "PlateLOD", "QC.Deviation.Inc.Ctrl", "QC.Deviation.Det.Ctrl", - "Olink.NPX.Signature.Version") + "Olink.NPX.Signature.Version"), + "header_csv_hp" = c(header_base, + "SampleType", + "WellID", + "DataAnalysisRefID", + "AssayType", + "Block", + "Counts", + "ExtNPX", + "Normalization", + "PCNormalizedNPX", + "AssayQC", + "SampleQC", + "SoftwareVersion", + "SoftwareName"), + "header_ext_csv_hp" = c(header_base, + "SampleType", + "WellID", + "DataAnalysisRefID", + "AssayType", + "Block", + "Counts", + "ExtNPX", + "Normalization", + "PCNormalizedNPX", + "AssayQC", + "SampleQC", + "SoftwareVersion", + "SoftwareName", + "IntraCV", + "InterCV", + "SampleBlockQCWarn", + "SampleBlockQCFail", + "BlockQCFail", + "AssayQCWarn") ) @@ -248,6 +284,11 @@ read_NPX_explore <- function(filename) { dplyr::mutate(Panel = trimws(Panel, which = "right")) %>% dplyr::select(-Panel_Start, -Panel_End) } + if ("Quantified_value" %in% names(out)){ + message("QUANT data detected. Some downstream functions may not be supported.") + out <- out %>% + mutate(Quantified_value = as.numeric(Quantified_value)) + } return(out) diff --git a/OlinkAnalyze/R/olink_lod.R b/OlinkAnalyze/R/olink_lod.R index 13c002f4..39c4ad5a 100644 --- a/OlinkAnalyze/R/olink_lod.R +++ b/OlinkAnalyze/R/olink_lod.R @@ -2,12 +2,20 @@ #' #' @param data npx data file #' @param lod_file_path location of lod file from Olink. Only needed if -#' lod_method = "FixedLOD". Default `NULL`. +#' lod_method = "FixedLOD" or "Both". Default `NULL`. #' @param lod_method method for calculating LOD using either "FixedLOD" or -#' negative controls ("NCLOD"). Default `NCLOD`. +#' negative controls ("NCLOD"), or both ("Both"). Default `NCLOD`. #' -#' @return A dataframe with 2 additional columns, LOD and PCNormalizedLOD. When -#' `Normalization = "Plate Control"`, LOD and PCNormalizedLOD are identical. +#' @return A dataframe with 2 additional columns, LOD and PCNormalizedLOD if `lod_method` is FixedLOD or NCLOD. When `Normalization = "Plate Control"`, LOD and PCNormalizedLOD are identical. +#' +#' If `lod_method` is "Both", 4 additional columns will be added: +#' \itemize{ +#' \item NCLOD - LOD calculated from negative controls and normalized based on normalization column +#' \item NCPCNormalizedLOD - PC Normalized LOD calculated from negative controls +#' \item FixedLOD - LOD calculated from fixed LOD file and normalized based on normalization column +#' \item FixedPCNormalizedLOD - PC Normalized LOD calculated from fixed LOD file +#' } +#' #' #' @export #' @examples @@ -33,38 +41,72 @@ #' } #' olink_lod <- function(data, lod_file_path = NULL, lod_method = "NCLOD"){ + # lod_method must be one of options + if(!lod_method %in% c("NCLOD", "FixedLOD", "Both")){ + stop("`lod_method` must be one of \"NCLOD\", \"FixedLOD\", or \"Both\"") + } + + # If lod method is both or fixed you need the file + if(missing(lod_file_path) & lod_method %in% c("Both", "FixedLOD")) { + stop(paste0("`lod_file_path` must be specified for lod_method = \"", + lod_method, "\"")) + } + + req_columns <- c("SampleType", "OlinkID", "DataAnalysisRefID", "SampleQC", "AssayType", "NPX") + missingcols <- setdiff(req_columns, names(data)) + if (length(missingcols) != 0){ + stop(paste0("Required column(s) not found: ", paste0(missingcols, collapse = ", "))) + } + + + if (lod_method == "Both"){ + data_nc <- olink_lod_internal(data, lod_file_path, lod_method = "NCLOD") |> + rename("NCLOD" = "LOD", + "NCPCNormalizedLOD" = "PCNormalizedLOD") + data_fixed <- olink_lod_internal(data, lod_file_path, lod_method = "FixedLOD") |> + rename("FixedLOD" = "LOD", + "FixedPCNormalizedLOD" = "PCNormalizedLOD") + + data <- dplyr::full_join(data_nc, data_fixed,by = names(data)) + + }else{ + data<- olink_lod_internal(data = data, + lod_file_path = lod_file_path, + lod_method = lod_method) + } + return(data) +} + +olink_lod_internal <- function(data, lod_file_path = NULL, lod_method = "NCLOD"){ + # store original column names of `data` to restore them later original_columns <- names(data) - + # check the type of LOD calculation to perform and compute or extract: # LODNPX, LODCount and LODMethod - + lod_methods <- list(fix_lod = "FixedLOD", nc_lod = "NCLOD") - + if (lod_method == lod_methods$fix_lod) { - - if(missing(lod_file_path)) { - stop(paste0("lod_file_path must be specified for lod_method = \"", - lod_methods$fix_lod, "\"")) - } - + lod_file <- read.table(file = lod_file_path, sep = ";", header = TRUE) lod_data <- olink_fixed_lod(data_analysis_ref_id = data$DataAnalysisRefID, lod_file = lod_file) - + } else if (lod_method == lod_methods$nc_lod) { - + lod_data <- olink_nc_lod(data = data) - + } else { - + stop(paste0("lod_method must be one of \"", lod_methods$fix_lod, "\" or \"", - lod_methods$nc_lod, "\"")) - + lod_methods$nc_lod, "\"", "or \"Both\"")) + } + # If NPX is intensity normalized, then intensity normalize LOD data <- int_norm_count( data = data, @@ -86,13 +128,13 @@ olink_lod <- function(data, lod_file_path = NULL, lod_method = "NCLOD"){ c(original_columns, "LOD", "PCNormalizedLOD") ) ) - + return(data) } # extract LODNPX, LODCount and LODMethods from reference file olink_fixed_lod <- function(data_analysis_ref_id, lod_file) { - + lod_file |> dplyr::filter( .data[["DataAnalysisRefID"]] %in% data_analysis_ref_id @@ -102,13 +144,13 @@ olink_fixed_lod <- function(data_analysis_ref_id, lod_file) { c("OlinkID", "DataAnalysisRefID", "LODNPX", "LODCount", "LODMethod") ) ) - + } # compute LODNPX, LODCount and LODMethods from NCs olink_nc_lod <- function(data, min_num_nc = 10L) { # Calculate LOD in counts and NPX - + # rows from NC lod_data <- data |> dplyr::filter( @@ -117,13 +159,13 @@ olink_nc_lod <- function(data, min_num_nc = 10L) { & .data[["AssayType"]] == "assay" & !is.na(.data[["NPX"]]) ) - + # check that we have at least `min_num_nc` NCs if(length(unique(lod_data$SampleID)) < min_num_nc){ stop(paste("At least", min_num_nc, "Negative Controls are required to", "calculate LOD from Negative Controls.")) } - + # compute LOD lod_data <- lod_data |> # LOD is comnputed per assay and lot of reagents @@ -148,14 +190,14 @@ olink_nc_lod <- function(data, min_num_nc = 10L) { c("MaxCount") ) ) - + return(lod_data) - + } pc_norm_count <- function(data, lod_data){ # Calculate PC median for normalization - + pc_median <- data |> dplyr::group_by( OlinkID, PlateID @@ -169,11 +211,11 @@ pc_norm_count <- function(data, lod_data){ c("OlinkID", "PlateID", "PCMedian") ) ) - + if(nrow(pc_median) == 0L){ stop("Insufficient Plate Control data for normalization of LOD.") } - + # Calculate ExtCount for normalization ext_count <- data |> dplyr::filter( @@ -185,7 +227,7 @@ pc_norm_count <- function(data, lod_data){ "ExtCount" = "Count") ) ) - + data <- data |> dplyr::left_join( lod_data, @@ -199,9 +241,9 @@ pc_norm_count <- function(data, lod_data){ ext_count, by = c("SampleID", "WellID", "Panel", "Block", "SampleType", "PlateID") ) - + # Convert count LOD to PC norm NPX - + data <- data |> dplyr::mutate( PCNormalizedLOD = dplyr::case_when( @@ -213,45 +255,45 @@ pc_norm_count <- function(data, lod_data){ ) ) |> dplyr::mutate(LOD = PCNormalizedLOD) - + return(data) } int_norm_count <- function(data, lod_data){ - + data <- pc_norm_count(data, lod_data) - + if(any(data[["Normalization"]]=="Intensity")){ plate_median <- data |> dplyr::filter( .data[["SampleType"]] =="SAMPLE" - ) |> - dplyr::group_by( - OlinkID, PlateID - ) |> - dplyr::summarise( - PlateMedianNPX = median(.data[["PCNormalizedNPX"]], na.rm = TRUE) - ) |> - dplyr::ungroup() |> - dplyr::select( - dplyr::all_of( - c("OlinkID", "PlateID", "PlateMedianNPX")) - ) - - if(nrow(plate_median) == 0L){ - stop("Insufficient data for intensity normalization of LOD.") - } - - data <- data |> - dplyr::left_join( - plate_median, - by = c("OlinkID", "PlateID") - ) |> - dplyr::mutate( - LOD = .data[["PCNormalizedLOD"]] - .data[["PlateMedianNPX"]] - ) + ) |> + dplyr::group_by( + OlinkID, PlateID + ) |> + dplyr::summarise( + PlateMedianNPX = median(.data[["PCNormalizedNPX"]], na.rm = TRUE) + ) |> + dplyr::ungroup() |> + dplyr::select( + dplyr::all_of( + c("OlinkID", "PlateID", "PlateMedianNPX")) + ) + + if(nrow(plate_median) == 0L){ + stop("Insufficient data for intensity normalization of LOD.") + } + + data <- data |> + dplyr::left_join( + plate_median, + by = c("OlinkID", "PlateID") + ) |> + dplyr::mutate( + LOD = .data[["PCNormalizedLOD"]] - .data[["PlateMedianNPX"]] + ) } - -return(data) - -} + + return(data) + +} \ No newline at end of file diff --git a/OlinkAnalyze/R/read_npx_csv.R b/OlinkAnalyze/R/read_npx_csv.R index 6c3c4456..5ed9de79 100644 --- a/OlinkAnalyze/R/read_npx_csv.R +++ b/OlinkAnalyze/R/read_npx_csv.R @@ -47,6 +47,7 @@ read_npx_csv <- function(filename) { file = filename, header = TRUE, sep = ",", + quote = "", stringsAsFactors = FALSE, na.strings = c("NA", "") ) diff --git a/OlinkAnalyze/R/read_npx_parquet.R b/OlinkAnalyze/R/read_npx_parquet.R index 37c6814c..f94b88ba 100644 --- a/OlinkAnalyze/R/read_npx_parquet.R +++ b/OlinkAnalyze/R/read_npx_parquet.R @@ -73,7 +73,8 @@ read_npx_parquet <- function (filename) { "Extended NPX File", "CLI Data Export File", "Internal CLI Data Export File", - "R Package Export File") + "R Package Export File", + "Olink Analyze Export File") if (parquet_file$metadata$DataFileType %in% olink_files) { # Check that required columns are present diff --git a/OlinkAnalyze/cran-comments.md b/OlinkAnalyze/cran-comments.md index cd7b8962..3b342de9 100644 --- a/OlinkAnalyze/cran-comments.md +++ b/OlinkAnalyze/cran-comments.md @@ -1,6 +1,6 @@ ## Release Summary -This release updates the URL hyperlinks from Olink Analyze 3.8.1 to include https. +This release provides additional functionality and documentation to Olink Analyze 3.8.3. ## R CMD check results diff --git a/OlinkAnalyze/inst/WORDLIST b/OlinkAnalyze/inst/WORDLIST index a8dd6651..782fad45 100644 --- a/OlinkAnalyze/inst/WORDLIST +++ b/OlinkAnalyze/inst/WORDLIST @@ -184,4 +184,12 @@ Welch's WellID biomarker dataset's -lod \ No newline at end of file +lod +FixedPCNormalizedLOD +LQL +NCPCNormalizedLOD +PlateLQL +Quant +Topouza's +ctrl +lapply \ No newline at end of file diff --git a/OlinkAnalyze/inst/extdata/Example_NPX_Data.csv b/OlinkAnalyze/inst/extdata/Example_NPX_Data.csv index 4333e408..de3bf002 100644 --- a/OlinkAnalyze/inst/extdata/Example_NPX_Data.csv +++ b/OlinkAnalyze/inst/extdata/Example_NPX_Data.csv @@ -1,2 +1,2 @@ -"","SampleID","Index","OlinkID","UniProt","Assay","MissingFreq","Panel","Panel_Version","PlateID","QC_Warning","LOD","NPX","Subject","Treatment","Site","Time","Project" -"1","A1",1,"OID01216","O00533","CHL1",0.01875,"Olink CARDIOMETABOLIC","v.1201","Example_Data_1_CAM.csv","Pass",2.36846658156787,12.9561425886431,"ID1","Untreated","Site_D","Baseline","20200001" +SampleID,Index,OlinkID,UniProt,Assay,MissingFreq,Panel,Panel_Version,PlateID,QC_Warning,LOD,NPX,Subject,Treatment,Site,Time,Project +A1,1,OID01216,O00533,CHL1,0.01875,Olink CARDIOMETABOLIC,v.1201,Example_Data_1_CAM.csv,Pass,2.36846658156787,12.9561425886431,ID1,Untreated,Site_D,Baseline,P1 diff --git a/OlinkAnalyze/man/olink_anova.Rd b/OlinkAnalyze/man/olink_anova.Rd index 5d46369b..18fcfc4e 100644 --- a/OlinkAnalyze/man/olink_anova.Rd +++ b/OlinkAnalyze/man/olink_anova.Rd @@ -57,6 +57,8 @@ The function handles both factor and numerical variables and/or covariates. \cr\ Samples that have no variable information or missing factor levels are automatically removed from the analysis (specified in a message if verbose = TRUE). Character columns in the input dataframe are automatically converted to factors (specified in a message if verbose = TRUE). Numerical variables are not converted to factors. +Control samples should be removed before using this function. +Control assays (AssayType is not "assay", or Assay contains "control" or "ctrl") should be removed before using this function. If a numerical variable is to be used as a factor, this conversion needs to be done on the dataframe before the function call. \cr\cr Crossed analysis, i.e. A*B formula notation, is inferred from the variable argument in the following cases: \cr \itemize{ @@ -75,7 +77,7 @@ The threshold is determined by logic evaluation of Adjusted_pval < 0.05. Covaria library(dplyr) -npx_df <- npx_data1 \%>\% filter(!grepl('control',SampleID, ignore.case = TRUE)) +npx_df <- npx_data1 |> filter(!grepl('control|ctrl',SampleID, ignore.case = TRUE)) #One-way ANOVA, no covariates. #Results in a model NPX~Time diff --git a/OlinkAnalyze/man/olink_anova_posthoc.Rd b/OlinkAnalyze/man/olink_anova_posthoc.Rd index d0d96d64..1dd1f4fb 100644 --- a/OlinkAnalyze/man/olink_anova_posthoc.Rd +++ b/OlinkAnalyze/man/olink_anova_posthoc.Rd @@ -64,6 +64,8 @@ Columns include: Performs a post hoc ANOVA test using emmeans::emmeans with Tukey p-value adjustment per assay (by OlinkID) for each panel at confidence level 0.95. See \code{olink_anova} for details of input notation. \cr\cr The function handles both factor and numerical variables and/or covariates. +Control samples should be removed before using this function. +Control assays (AssayType is not "assay", or Assay contains "control" or "ctrl") should be removed before using this function. The posthoc test for a numerical variable compares the difference in means of the outcome variable (default: NPX) for 1 standard deviation difference in the numerical variable, e.g. mean NPX at mean(numerical variable) versus mean NPX at mean(numerical variable) + 1*SD(numerical variable). } @@ -72,7 +74,7 @@ mean NPX at mean(numerical variable) versus mean NPX at mean(numerical variable) library(dplyr) -npx_df <- npx_data1 \%>\% filter(!grepl('control',SampleID, ignore.case = TRUE)) +npx_df <- npx_data1 |> filter(!grepl('control|ctrl',SampleID, ignore.case = TRUE)) #Two-way ANOVA, one main effect (Site) covariate. #Results in model NPX~Treatment*Time+Site. @@ -84,10 +86,10 @@ anova_results <- olink_anova(df = npx_df, #on the interaction effect Treatment:Time with covariate Site. #Filtering out significant and relevant results. -significant_assays <- anova_results \%>\% -filter(Threshold == 'Significant' & term == 'Treatment:Time') \%>\% -select(OlinkID) \%>\% -distinct() \%>\% +significant_assays <- anova_results |> +filter(Threshold == 'Significant' & term == 'Treatment:Time') |> +select(OlinkID) |> +distinct() |> pull() #Posthoc, all pairwise comparisons diff --git a/OlinkAnalyze/man/olink_boxplot.Rd b/OlinkAnalyze/man/olink_boxplot.Rd index 156d9fc4..5ba74f49 100644 --- a/OlinkAnalyze/man/olink_boxplot.Rd +++ b/OlinkAnalyze/man/olink_boxplot.Rd @@ -43,12 +43,12 @@ Generates faceted boxplots of NPX vs. grouping variable(s) for a given list of p \donttest{ library(dplyr) - -anova_results <- olink_anova(npx_data1, variable = "Site") -significant_assays <- anova_results \%>\% - filter(Threshold == 'Significant') \%>\% +npx_df <- npx_data1 |> filter(!grepl('control|ctrl',SampleID, ignore.case = TRUE)) +anova_results <- olink_anova(npx_df, variable = "Site") +significant_assays <- anova_results |> + filter(Threshold == 'Significant') |> pull(OlinkID) -olink_boxplot(npx_data1, +olink_boxplot(npx_df, variable = "Site", olinkid_list = significant_assays, verbose = TRUE, diff --git a/OlinkAnalyze/man/olink_lod.Rd b/OlinkAnalyze/man/olink_lod.Rd index c468ba85..df8b787f 100644 --- a/OlinkAnalyze/man/olink_lod.Rd +++ b/OlinkAnalyze/man/olink_lod.Rd @@ -10,14 +10,21 @@ olink_lod(data, lod_file_path = NULL, lod_method = "NCLOD") \item{data}{npx data file} \item{lod_file_path}{location of lod file from Olink. Only needed if -lod_method = "FixedLOD". Default `NULL`.} +lod_method = "FixedLOD" or "Both". Default `NULL`.} \item{lod_method}{method for calculating LOD using either "FixedLOD" or -negative controls ("NCLOD"). Default `NCLOD`.} +negative controls ("NCLOD"), or both ("Both"). Default `NCLOD`.} } \value{ -A dataframe with 2 additional columns, LOD and PCNormalizedLOD. When -`Normalization = "Plate Control"`, LOD and PCNormalizedLOD are identical. +A dataframe with 2 additional columns, LOD and PCNormalizedLOD if `lod_method` is FixedLOD or NCLOD. When `Normalization = "Plate Control"`, LOD and PCNormalizedLOD are identical. + +If `lod_method` is "Both", 4 additional columns will be added: + \itemize{ + \item NCLOD - LOD calculated from negative controls and normalized based on normalization column + \item NCPCNormalizedLOD - PC Normalized LOD calculated from negative controls + \item FixedLOD - LOD calculated from fixed LOD file and normalized based on normalization column + \item FixedPCNormalizedLOD - PC Normalized LOD calculated from fixed LOD file + } } \description{ Calculate LOD using Negative Controls or Fixed LOD diff --git a/OlinkAnalyze/tests/testthat/test-Olink_anova.R b/OlinkAnalyze/tests/testthat/test-Olink_anova.R index c259ed74..3eaaf239 100644 --- a/OlinkAnalyze/tests/testthat/test-Olink_anova.R +++ b/OlinkAnalyze/tests/testthat/test-Olink_anova.R @@ -5,19 +5,54 @@ load(refRes_file) #Load data with hidden/excluded assays (all NPX=NA) load(file = '../data/npx_data_format221010.RData') +# Remove assays with NPX == NA from npx_data_format221010 for testing +npx_data_format221010_no_NA <- npx_data_format221010 |> + dplyr::filter(!is.na(NPX)) + +# Add dummy Extension control assays +extension_control <- npx_data_format221010_no_NA |> + dplyr::filter(stringr::str_detect(Assay, "Incubation control")) |> + dplyr::mutate(Assay = gsub("Incubation", "Extension", Assay)) |> + dplyr::mutate(UniProt = gsub("INC", "EXT", UniProt)) + +npx_data_format221010_ext_ctrl <- rbind( + npx_data_format221010_no_NA, + extension_control +) + +# Add AssayType +npx_data_format221010_AssayType <- npx_data_format221010_ext_ctrl |> + dplyr::mutate(AssayType = dplyr::case_when( + stringr::str_detect(Assay, "Incubation control") ~ "inc_ctrl", + stringr::str_detect(Assay, "Amplification control") ~ "amp_ctrl", + stringr::str_detect(Assay, "Extension control") ~ "ext_ctrl", + TRUE ~ "assay" + )) + +# Remove control assays from npx_data_format221010 for warning tests +npx_data_format221010_no_ctrl <- npx_data_format221010 |> + dplyr::filter(!str_detect(Assay, "control")) + +# Remove sample controls from npx_data1 to preserve test results +npx_data1 <- npx_data1 |> + dplyr::filter(!stringr::str_detect(npx_data1$SampleID, + stringr::regex("control|ctrl", + ignore_case = TRUE))) + + #Run olink_anova anova_results_1_site <- olink_anova(npx_data1, 'Site') %>% - mutate(id = as.character(OlinkID)) %>% - arrange(id) %>% - select(-id) + dplyr::mutate(id = as.character(OlinkID)) %>% + dplyr::arrange(id) %>% + dplyr::select(-id) anova_results_1_time <- olink_anova(npx_data1, 'Time') %>% - mutate(id = as.character(OlinkID)) %>% - arrange(id) %>% - select(-id) + dplyr::mutate(id = as.character(OlinkID)) %>% + dplyr::arrange(id) %>% + dplyr::select(-id) anova_results_1_siteTime <- olink_anova(npx_data1, c('Site', 'Time')) %>% - mutate(id = as.character(OlinkID)) %>% - arrange(id, term) %>% #Since OlinkID is not unique here (=> ties), term is used to break the ties - select(-id) + dplyr::mutate(id = as.character(OlinkID)) %>% + dplyr::arrange(id, term) %>% #Since OlinkID is not unique here (=> ties), term is used to break the ties + dplyr::select(-id) #Run olink_anova_posthoc anova_posthoc_1_site <- olink_anova_posthoc(npx_data1, @@ -26,10 +61,10 @@ anova_posthoc_1_site <- olink_anova_posthoc(npx_data1, head(10) %>% dplyr::pull(OlinkID)}, effect = 'Site') %>% - mutate(id = as.character(OlinkID)) %>% - arrange(id, contrast) %>% #Since OlinkID is not unique here (=> ties), contrast is used to break the ties - mutate(contrast = as.character(contrast)) %>% # In R 3.6.1 we get factors, but reference is characters - select(-id) + dplyr::mutate(id = as.character(OlinkID)) %>% + dplyr::arrange(id, contrast) %>% #Since OlinkID is not unique here (=> ties), contrast is used to break the ties + dplyr::mutate(contrast = as.character(contrast)) %>% # In R 3.6.1 we get factors, but reference is characters + dplyr::select(-id) anova_posthoc_1_time <- olink_anova_posthoc(npx_data1, @@ -38,10 +73,10 @@ anova_posthoc_1_time <- olink_anova_posthoc(npx_data1, head(10) %>% dplyr::pull(OlinkID)}, effect = 'Time') %>% - mutate(id = as.character(OlinkID)) %>% - arrange(id, contrast) %>% #Just for consistency. Not actually needed in this case - mutate(contrast = as.character(contrast)) %>% # In R 3.6.1 we get factors, but reference is characters - select(-id) + dplyr::mutate(id = as.character(OlinkID)) %>% + dplyr::arrange(id, contrast) %>% #Just for consistency. Not actually needed in this case + dplyr::mutate(contrast = as.character(contrast)) %>% # In R 3.6.1 we get factors, but reference is characters + dplyr::select(-id) test_that("olink_anova function works", { expect_equal(anova_results_1_site, ref_results$anova_results_1_site) ##result equal to testfile @@ -52,9 +87,12 @@ test_that("olink_anova function works", { expect_equal(ncol(anova_results_1_siteTime), 12) expect_error(olink_anova(npx_data1,)) ##no input data - - expect_warning(olink_anova(npx_data_format221010, 'treatment2')) # data with all NPX=NA for some assays -}) + + expect_warning(olink_anova(npx_data_format221010_no_ctrl, 'treatment2')) # data with all NPX=NA for some assays + + expect_error(olink_anova(npx_data_format221010_AssayType, variable = "Site")) # Assay controls not removed, AssayType present + expect_error(olink_anova(npx_data_format221010_ext_ctrl, variable = "Site")) # Assay controls not removed, no AssayType + }) test_that("olink_anova_posthoc function works", { expect_equal(anova_posthoc_1_site, ref_results$anova_posthoc_1_site) ## result equal to testfile - posthoc @@ -69,5 +107,13 @@ test_that("olink_anova_posthoc function works", { expect_equal(nrow(anova_posthoc_1_time), 30) expect_equal(anova_posthoc_1_time %>% ncol(), 11) - expect_warning(olink_anova_posthoc(npx_data_format221010, variable = 'treatment2', effect = 'treatment2')) # data with all NPX=NA for some assays + expect_warning(olink_anova_posthoc(npx_data_format221010_no_ctrl, variable = 'treatment2', effect = 'treatment2')) # data with all NPX=NA for some assays + + expect_error(olink_anova_posthoc(npx_data_format221010_AssayType, + variable = 'Site', + effect = 'Site')) # Assay controls not removed, AssayType present + expect_error(olink_anova_posthoc(npx_data_format221010_ext_ctrl, + variable = 'Site', + effect = 'Site')) # Assay controls not removed, no AssayType + }) diff --git a/OlinkAnalyze/vignettes/LOD.Rmd b/OlinkAnalyze/vignettes/LOD.Rmd index bc105b84..28026d64 100644 --- a/OlinkAnalyze/vignettes/LOD.Rmd +++ b/OlinkAnalyze/vignettes/LOD.Rmd @@ -35,21 +35,56 @@ library(dplyr) ``` - ## Introduction -This tutorial describes how to use Olink® Analyze to integrate Limit of Detection (LOD) into Olink® Explore HT and Olink® Explore 384/3072 datasets. Although it is recommended to use all Olink Explore data in downstream analyses, LOD information can be useful when performing technical evaluations of a dataset. - -In this tutorial, you will learn how to use `olink_lod()` to add LOD information to your Olink Explore dataset. Note that Olink Analyze does not contain example Olink Explore HT or Olink Explore 384/3072 datasets within the package, so external data will be necessary for the code below to work. The external data should contain internal and external controls for proper calculation and normalization. All file paths should be replaced with a path to your data and fixed LOD reference file (if applicable). - +This tutorial describes how to use Olink® Analyze to integrate Limit of +Detection (LOD) into Olink® Explore HT and Olink® Explore 384/3072 +datasets. Although it is recommended to use all Olink Explore data in +downstream analyses, LOD information can be useful when performing +technical evaluations of a dataset. + +In this tutorial, you will learn how to use `olink_lod()` to add LOD +information to your Olink Explore dataset. Note that Olink Analyze does +not contain example Olink Explore HT or Olink Explore 384/3072 datasets +within the package, so external data will be necessary for the code +below to work. The external data should contain internal and external +controls for proper calculation and normalization. All file paths should +be replaced with a path to your data and fixed LOD reference file (if +applicable). ## Integrating LOD -Limit of Detection (LOD) is a metric that indicates the lowest measurable value of a protein. LOD can be helpful when performing technical evaluations of NPX™ datasets, such as calculating CVs. As a note, LOD is less important in downstream statistical analyses as values under LOD typically converge across groups. As such, including data below LOD is unlikely to increase the risk of false positive discoveries. Furthermore, data below LOD can be instrumental in downstream analyses such as biomarker discovery as a protein may be well expressed in one group and not measured in another group. In this case, this protein can be a strong biomarker candidate for specific groups. - -LOD can be added to Olink Explore NPX datasets using `olink_lod()`. This function can calculate LOD from an NPX dataset using the dataset's negative controls or a list of predetermined fixed LOD values (available in the Document Download Center at [olink.com](https://olink.com/)). As the default setting, `olink_lod()` will calculate LOD using a dataset's negative controls. +Limit of Detection (LOD) is a metric that indicates the lowest +measurable value of a protein. LOD can be helpful when performing +technical evaluations of NPX™ datasets, such as calculating CVs. As a +note, LOD is less important in downstream statistical analyses as values +under LOD typically converge across groups. As such, including data +below LOD is unlikely to increase the risk of false positive +discoveries. Furthermore, data below LOD can be instrumental in +downstream analyses such as biomarker discovery as a protein may be well +expressed in one group and not measured in another group. In this case, +this protein can be a strong biomarker candidate for specific groups. + +LOD can be added to Olink Explore NPX datasets using `olink_lod()`. This +function can calculate LOD from an NPX dataset using the dataset's +negative controls or a list of predetermined fixed LOD values (available +in the Document Download Center at [olink.com](https://olink.com/knowledge/documents)). As the default setting, `olink_lod()` will calculate LOD using a dataset's +negative controls. + +Olink Explore data is commonly delivered plate control (PC) normalized +or intensity normalized (the normalization type employed is indicated in +the NPX file column Normalization), where the latter is dependent on +that the analyzed samples are randomized. These are reported in the two +respective columns PCNormalizedNPX and NPX. Please notice that for PC +normalized datasets the content in these two columns will be identical, +while for intensity normalized datasets the NPX column will include the +intensity normalized values. Similarly, the `olink_lod()` function adds +two columns to your dataset; PCNormalizedLOD and LOD respectively. For a +PC normalized dataset the content in these two columns will be +identical, while for an intensity normalized dataset the LOD column will +contain intensity normalized LOD values. Examples of results for plate +control and intensity normalization are shown in the tables below. -Olink Explore data is commonly delivered plate control (PC) normalized or intensity normalized (the normalization type employed is indicated in the NPX file column Normalization), where the latter is dependent on that the analyzed samples are randomized. These are reported in the two respective columns PCNormalizedNPX and NPX. Please notice that for PC normalized datasets the content in these two columns will be identical, while for intensity normalized datasets the NPX column will include the intensity normalized values. Similarly, the `olink_lod()` function adds two columns to your dataset; PCNormalizedLOD and LOD respectively. For a PC normalized dataset the content in these two columns will be identical, while for an intensity normalized dataset the LOD column will contain intensity normalized LOD values. Examples of results for plate control and intensity normalization are shown in the tables below. ```{r, echo=FALSE} set.seed(1234) @@ -79,28 +114,67 @@ table1 |> ``` - ## Import Olink Explore datasets -Olink Explore datasets are standard Olink Explore HT and Olink Explore 384/3072 NPX tables. The `read_NPX()` function can be used to import an NPX file in parquet form as generated by Olink® NPX Explore Software. More information on using `read_NPX()` can be found in [the Olink Analyze Overview tutorial](Vignett.html). - +Olink Explore datasets are standard Olink Explore HT and Olink Explore +384/3072 NPX tables. The `read_NPX()` function can be used to import an +NPX file in parquet form as generated by Olink® NPX Explore Software. +More information on using `read_NPX()` can be found in [the Olink +Analyze Overview tutorial](Vignett.html). ```{r dataset_generation, eval = FALSE, message=FALSE, warning=FALSE} explore_npx <- read_NPX("~/Explore_NPX_file.parquet") ``` - ## Integrating Negative Control LOD -The negative control (NC) LOD method requires at least 10 negative controls in a dataset. Negative control data is available in the standard exported Explore HT and Explore 384/3072 NPX parquet files. NCs can be identified through the SampleID and SampleType columns. +The negative control (NC) LOD method requires at least 10 negative +controls in a dataset. Negative control data is available in the +standard exported Explore HT and Explore 384/3072 NPX parquet files. NCs +can be identified through the SampleID and SampleType columns. -A negative control will not contribute to the minimum number of required NCs if the negative control does not pass sample QC criteria (sample QC failure or warning) in all of the data (i.e. all Explore HT blocks, all Explore 3072 panels, or all Explore 384 panels that were measured) +A negative control will not contribute to the minimum number of required +NCs if the negative control does not pass sample QC criteria (sample QC +failure or warning) in all of the data (i.e. all Explore HT blocks, all +Explore 3072 panels, or all Explore 384 panels that were measured) -Negative controls are used to calculate LOD from either PC normalized NPX or counts. For assays with more than 150 counts in one of the negative controls, LOD is calculated using the median PC normalized NPX and adding 3 standard deviations, or 0.2 NPX whichever is larger. For assays with fewer than 150 counts in all negative controls, LOD is calculated using the count values which are then converted into PC normalized NPX. +Negative controls are used to calculate LOD from either PC normalized +NPX or counts. For assays with more than 150 counts in one of the +negative controls, LOD is calculated using the median PC normalized NPX +and adding 3 standard deviations, or 0.2 NPX whichever is larger. For +assays with fewer than 150 counts in all negative controls, LOD is +calculated using the count values which are then converted into PC +normalized NPX. -The resulting LOD is the PC normalized negative control LOD. In the event that the Explore dataset is intensity normalized, an intensity normalization adjustment factor is applied and the resulting intensity normalized LOD is reported in the LOD column and the PC normalized LOD is reported in the PCNormalizedLOD column. +------------------------------------------------------------------------ +#### *A note on calculating LOD from counts* + +*Some assays will use count values as the LOD because the assay receives +very few counts in the negative controls. For the convenience of data +processing, the LOD in count values are converted to NPX values in the +`olink_lod()` function. The LOD value for this assay (in +counts) will become many LOD values in NPX (as extension control counts +will vary across all samples). This is due to the fact that minor changes on the counts scale can result in significant changes on the NPX scale when working with small counts. +The reason for this is that NPX is a relative scale, which is calculated by dividing the counts of the assay by the counts of the extension control. For example, +given that the extension control values remain constant, if a count +value were to change from 1 count to 2 counts, this would be a change of +1 NPX, while a change from 1000 counts to 1001 counts would be +negligible on the NPX scale.* + +*Furthermore, due to the low number of counts, the NPX values calculated +from these counts do not correlate to true background levels. The +converted NPX values should not be used as LOD values for these assays.* + +------------------------------------------------------------------------ + + +The resulting LOD is the PC normalized negative control LOD. In the +event that the Explore dataset is intensity normalized, an intensity +normalization adjustment factor is applied and the resulting intensity +normalized LOD is reported in the LOD column and the PC normalized LOD +is reported in the PCNormalizedLOD column. ```{r NCLOD_example, eval = FALSE, message=FALSE, warning=FALSE} # Integrating negative control LOD for intensity normalized data @@ -110,9 +184,21 @@ olink_lod(explore_npx, lod_method = "NCLOD") ## Integrating Fixed LOD -The fixed LOD method uses fixed LOD values that have been calculated on negative controls used in Olink reference runs using the method described above for negative control LOD. These values are specific to the Data Analysis Reference ID, which can be found in your dataset. The fixed LOD data is available in an external CSV file which can be downloaded from the Document Download Center at [olink.com](https://olink.com/). The fixed LOD values reported in this CSV file are the PC normalized LODs. - -The fixed LOD file is read into the `olink_lod()` function to be integrated into an Explore dataset. In the event that the Explore dataset is intensity normalized, an intensity normalization adjustment factor is applied and the resulting intensity normalized LOD is reported in the LOD column and the PC normalized LOD is reported in the PCNormalizedLOD column. +The fixed LOD method uses fixed LOD values that have been calculated on +negative controls used in Olink reference runs using the method +described above for negative control LOD. These values are specific to +the Data Analysis Reference ID, which can be found in your dataset. The +fixed LOD data is available in an external CSV file which can be +downloaded from the Document Download Center at +[olink.com](https://olink.com/knowledge/documents). The fixed LOD values reported in this +CSV file are the PC normalized LODs. + +The fixed LOD file is read into the `olink_lod()` function to be +integrated into an Explore dataset. In the event that the Explore +dataset is intensity normalized, an intensity normalization adjustment +factor is applied and the resulting intensity normalized LOD is reported +in the LOD column and the PC normalized LOD is reported in the +PCNormalizedLOD column. ```{r FixedLOD, eval = FALSE, message=FALSE, warning=FALSE} # Reading in Fixed LOD file path into R environment @@ -124,37 +210,88 @@ olink_lod(explore_npx, lod_file_path = fixedLOD_filepath, lod_method = "FixedLOD ``` ## When to use Fixed LOD vs NC LOD -For smaller sized studies (<10 NCs) we recommend using fixed LOD to integrate LOD values into your NPX dataset, as LOD calculations on fewer NCs may provide non-accurate values. However, it is important to keep in mind that fixed LOD values are not specific to your project, rather these values are generated by Olink when a new lot of reagents is released. -For larger projects we recommend calculating LOD from NC to obtain LOD values that are specific to your project. However, this requires that the dataset has at least 10 NCs with passing SampleQC. +For smaller sized studies (\<10 NCs) we recommend using fixed LOD to +integrate LOD values into your NPX dataset, as LOD calculations on fewer +NCs may provide non-accurate values. However, it is important to keep in +mind that fixed LOD values are not specific to your project, rather +these values are generated by Olink when a new lot of reagents is +released. + +For larger projects we recommend calculating LOD from NC to obtain LOD +values that are specific to your project. However, this requires that +the dataset has at least 10 NCs with passing SampleQC. -## Adjusting LOD for Intensity Normalized Data +## Integrating Both NC LOD and Fixed LOD -If an Olink Explore dataset is intensity normalized, a normalization adjustment factor is applied to the PC normalized LOD within the `olink_lod()` function. +There is also the option to calculate both NC LOD and Fixed LOD for a data file by setting lod_method to “Both”. The resulting data will have 4 additional columns, starting with NC or Fixed to indicate the method used to calculate LOD, followed by LOD or PCNormalizedLOD as explained above. An example of the file format is shown below. Note that these columns will not automatically be recognized by other functions within Olink Analyze that use LOD (for example `olink_bridgeselector()`). To use these functions, the LOD value to be used should have "LOD" as the column name. -For each assay, this adjustment factor is calculated as the median NPX of all samples (excluding Olink's external controls) within each plate. For Olink Explore 3072, overlapping assays are assessed separately, within their respective panels. The intensity normalized negative control LOD is calculated by subtracting this adjustment factor from the PC normalized negative control LOD. +```{r, echo=FALSE} +table1 |> + dplyr::mutate(Normalization = "Intensity") |> + dplyr::mutate(PCNormalizedNPX = round(NPX,digits = 2)) |> + dplyr::mutate(PCNormalizedLOD = round(LOD, digits = 2)) |> + dplyr::mutate(NPX = round(NPX + 4.16, digits = 2))|> + dplyr::mutate(LOD = round(LOD + 4.16, digits = 2)) |> + dplyr::rename(FixedLOD = LOD, + FixedPCNormalizedLOD = PCNormalizedLOD) |> + dplyr::mutate(NCLOD = FixedLOD - 2.34, + NCPCNormalizedLOD = FixedPCNormalizedLOD - 2.34) |> + dplyr::select(SampleID, SampleType, OlinkID, UniProt, Assay, Count, NPX, Normalization, PCNormalizedNPX, FixedLOD, FixedPCNormalizedLOD, NCLOD, NCPCNormalizedLOD) |> + knitr::kable(caption = "Example results using both LOD calculation methods") |> + kableExtra::kable_styling(font_size = 10) +``` + +## Adjusting LOD for Intensity Normalized Data + +If an Olink Explore dataset is intensity normalized, a normalization +adjustment factor is applied to the PC normalized LOD within the +`olink_lod()` function. -The intensity normalization LOD adjustment is applied to both the negative control and fixed LOD methods. +For each assay, this adjustment factor is calculated as the median NPX +of all samples (excluding Olink's external controls) within each plate. +For Olink Explore 3072, overlapping assays are assessed separately, +within their respective panels. The intensity normalized negative +control LOD is calculated by subtracting this adjustment factor from the +PC normalized negative control LOD. + +The intensity normalization LOD adjustment is applied to both the +negative control and fixed LOD methods. ## Export Olink Explore Data with LOD -Olink Explore data with LOD data can be exported using arrow::write_parquet to export Olink Explore data as a parquet file in long format. + +Olink Explore data with LOD data can be exported using +arrow::write_parquet to export Olink Explore data as a parquet file in +long format. ```{r explore_npx_export, eval = FALSE, message=FALSE, warning=FALSE} # Exporting Olink Explore data with LOD information as a parquet file explore_npx <- read_NPX("Path_to/Explore_NPX_file.parquet") explore_npx_NC_LOD <- explore_npx %>% - olink_lod(lod_method = "NCLOD") %>% - arrow::write_parquet(, file = "NPX_data_NC_LOD.parquet") -``` + olink_lod(lod_method = "NCLOD") + +# Add metadata for export +df <- explore_npx_NC_LOD |> + arrow::as_arrow_table() +df$metadata$FileVersion <- "NA" +df$metadata$ExploreVersion <- "NA" +df$metadata$ProjectName <- "NA" +df$metadata$SampleMatrix <- "NA" +df$metadata$DataFileType <- "Olink Analyze Export File" +df$metadata$ProductType <- "ExploreHT" # "ExploreHT" or "Explore3072" +df$metadata$Product <- "ExploreHT" # "ExploreHT" or "Explore3072" + +arrow::write_parquet(x = df, sink = "path_to_output.parquet") +``` ## Contact Us We are always happy to help. Email us with any questions: -- biostat\@olink.com for statistical services and general stats questions - +- biostat\@olink.com for statistical services and general stats + questions - support\@olink.com for Olink lab product and technical support @@ -164,12 +301,19 @@ We are always happy to help. Email us with any questions: © 2024 Olink Proteomics AB. -Olink products and services are For Research Use Only and not for Use in Diagnostic Procedures. +Olink products and services are For Research Use Only and not for Use in +Diagnostic Procedures. -All information in this document is subject to change without notice. This document is not intended to convey any warranties, representations and/or recommendations of any kind, unless such warranties, representations and/or recommendations are explicitly stated. +All information in this document is subject to change without notice. +This document is not intended to convey any warranties, representations +and/or recommendations of any kind, unless such warranties, +representations and/or recommendations are explicitly stated. -Olink assumes no liability arising from a prospective reader’s actions based on this document. +Olink assumes no liability arising from a prospective reader’s actions +based on this document. -OLINK, NPX, PEA, PROXIMITY EXTENSION, INSIGHT and the Olink logotype are trademarks registered, or pending registration, by Olink Proteomics AB. All third-party trademarks are the property of their respective owners. +OLINK, NPX, PEA, PROXIMITY EXTENSION, INSIGHT and the Olink logotype are +trademarks registered, or pending registration, by Olink Proteomics AB. +All third-party trademarks are the property of their respective owners. -Olink products and assay methods are covered by several patents and patent applications https://olink.com/patents/. +Olink products and assay methods are covered by several patents and patent applications https://olink.com/legal/patents. diff --git a/OlinkAnalyze/vignettes/Vignett.Rmd b/OlinkAnalyze/vignettes/Vignett.Rmd index 72a44034..cde12f31 100644 --- a/OlinkAnalyze/vignettes/Vignett.Rmd +++ b/OlinkAnalyze/vignettes/Vignett.Rmd @@ -99,12 +99,12 @@ The package contains two test data files named npx_data1 and npx_data2. These ar * **UniProt** _\_: UniProt ID. * **Assay** _\_: Common gene name for the assay. * **MissingFreq** _\_: Missing frequency for the OlinkID, i.e. frequency of samples with NPX value below limit of detection (LOD). -* **Panel** _\_: Olink Panel that samples ran on. Read more about Olink Panels here: [https://olink.com/products-services/](https://olink.com/products-services/). +* **Panel** _\_: Olink Panel that samples ran on. Read more about Olink Panels here: [https://olink.com/products/compare](https://olink.com/products/compare). * **Panel_Version** _\_: Version of the panel. A new panel version might include some different or improved assays. * **PlateID** _\_: Name of the plate. -* **QC_Warning** _\_: Indication whether the sample passed Olink QC. Read more here: [https://olink.com/faq/how-is-quality-control-of-the-data-performed/](https://olink.com/faq/how-is-quality-control-of-the-data-performed/). +* **QC_Warning** _\_: Indication whether the sample passed Olink QC. More information about Olink quality control metrics can be found in our [FAQ](https://olink.com/knowledge/faq) by searching "Quality control". * **LOD** _\_: Limit of detection (LOD) is the minimum level of an individual protein that can be measured. LOD is defined as 3 times the standard deviation over background. -* **NPX** _\_: Normalized Protein eXpression, is Olink®’s unit of protein expression level in a log2 scale. The majority of the functions of this package use NPX values for calculations. Read more about NPX here: [https://olink.com/faq/what-is-npx/](https://olink.com/faq/what-is-npx/). +* **NPX** _\_: Normalized Protein eXpression, is Olink®’s unit of protein expression level in a log2 scale. The majority of the functions of this package use NPX values for calculations. Read more about NPX in the Olink [FAQ](https://olink.com/knowledge/faq) (Search "What is NPX?") or in Olink's Data normalization and standardization [white paper](https://7074596.fs1.hubspotusercontent-na1.net/hubfs/7074596/05-white%20paper%20for%20website/1096-olink-data-normalization-white-paper.pdf). **Note:** There are 5 additional variables in the sample datasets npx_data1 and npx_data2 that include clinical or other information, namely: Subject _\_, Treatment _\_, Site _\_, Time _\_, Project _\_. @@ -132,12 +132,46 @@ A tibble in long format containing: * UniProt: UniProt ID. * Assay: Common gene name for the assay. * MissingFreq: Missing frequency for the OlinkID, i.e. frequency of samples with NPX value below limit of detection (LOD). -* Panel: Olink Panel that samples ran on. Read more about Olink Panels here: [https://olink.com/products-services/](https://olink.com/products-services/). +* Panel: Olink Panel that samples ran on. Read more about Olink Panels here: [https://olink.com/products/compare](https://olink.com/products/compare) * Panel_Version: Version of the panel. A new panel version might include some different or improved assays. * PlateID: Name of the plate. -* QC_Warning: Indication whether the sample passed Olink QC. Read more here: [https://olink.com/faq/how-is-quality-control-of-the-data-performed/](https://olink.com/faq/how-is-quality-control-of-the-data-performed/). +* QC_Warning: Indication whether the sample passed Olink QC. More information about Olink quality control metrics can be found in our [FAQ](https://olink.com/knowledge/faq) (Search "Quality control"). * LOD: Limit of detection (LOD) is the minimum level of an individual protein that can be measured. LOD is defined as 3 times the standard deviation over background. -* NPX: Normalized Protein eXpression, is Olink’s unit of protein expression level in a log2 scale. The majority of the functions of this package use NPX values for calculations. Read more about NPX here: [https://olink.com/faq/what-is-npx/](https://olink.com/faq/what-is-npx/). +* NPX: Normalized Protein eXpression, is Olink’s unit of protein expression level in a log2 scale. The majority of the functions of this package use NPX values for calculations. Read more about NPX in the Olink [FAQ](https://olink.com/knowledge/faq)(Search "What is NPX?") or in Olink's Data normalization and standardization [white paper](https://7074596.fs1.hubspotusercontent-na1.net/hubfs/7074596/05-white%20paper%20for%20website/1096-olink-data-normalization-white-paper.pdf). + + + +## Read multiple NPX data files (read_NPX) + +In order to import multiple NPX data files at once, the read_NPX function can be used in combination with the list.files, lapply and dplyr::bind_rows functions, as seen below. The *pattern* argument of the list.files function specifies the NPX file format (*.csv*, *.parquet*, or either). This method requires that all NPX files are stored in the same folder and have identical column names. No prior alterations to the NPX output file should be made for this method to work as expected. + + +```{r message=FALSE, eval=FALSE} +# Read in multiple NPX files in .csv format +data <- list.files(path = "path/to/dir/with/NPX/files", + pattern = "csv$", + full.names = TRUE) |> + lapply(FUN = function(x){ + OlinkAnalyze::read_NPX(x) |> + dplyr::mutate(File = x) # Optionally add additional columns to add file identifiers + } |> + dplyr::bind_rows() # optional to return a single data frame of all files instead of a list of data frames + +# Read in multiple NPX files in .parquet format +data <- list.files(path = "path/to/dir/with/NPX/files", + pattern = "parquet$", + full.names = TRUE) |> + lapply(OlinkAnalyze::read_NPX) |> + dplyr::bind_rows() + +# Read in multiple NPX files in either format +data <- list.files(path = "path/to/dir/with/NPX/files", + pattern = "parquet$|csv$", + full.names = TRUE) |> + lapply(OlinkAnalyze::read_NPX) |> + dplyr::bind_rows() +``` + ## Randomize samples on plate (olink_plate_randomizer) @@ -253,18 +287,18 @@ A tibble of NPX data in long format containing normalized NPX values, including * UniProt: UniProt ID. * Assay: Common gene name for the assay. * MissingFreq: Missing frequency for the OlinkID, i.e. frequency of samples with NPX value below limit of detection (LOD). -* Panel: Olink Panel that samples ran on. Read more about Olink Panels here: [https://olink.com/products-services/](https://olink.com/products-services/). +* Panel: Olink Panel that samples ran on.Read more about Olink Panels here: [https://olink.com/products/compare](https://olink.com/products/compare). * Panel_Version: Version of the panel. A new panel version might include some different or improved assays. * PlateID: Name of the plate. -* QC_Warning: Indication whether the sample passed Olink QC. Read more here: [https://olink.com/faq/how-is-quality-control-of-the-data-performed/](https://olink.com/faq/how-is-quality-control-of-the-data-performed/). +* QC_Warning: Indication whether the sample passed Olink QC. More information about Olink quality control metrics can be found in our [FAQ](https://olink.com/knowledge/faq) (Search "Quality control") * LOD: Limit of detection (LOD) is the minimum level of an individual protein that can be measured. LOD is defined as 3 times the standard deviation over background. -* NPX: Normalized Protein eXpression, is Olink®’s unit of protein expression level in a log2 scale. The majority of the functions of this package use NPX values for calculations. Read more about NPX here: [https://olink.com/faq/what-is-npx/](https://olink.com/faq/what-is-npx/). +* NPX: Normalized Protein eXpression, is Olink®’s unit of protein expression level in a log2 scale. The majority of the functions of this package use NPX values for calculations. Read more about NPX in the Olink [FAQ](https://olink.com/knowledge/faq) (Search "What is NPX?") or in Olink's Data normalization and standardization [white paper](https://7074596.fs1.hubspotusercontent-na1.net/hubfs/7074596/05-white%20paper%20for%20website/1096-olink-data-normalization-white-paper.pdf). * Project: Name given from the dataframe of origin. * Adj_factor: Adjustment factor, i.e. how much was added to or subtracted from the original NPX value. ## Integrating Explore NPX LOD (olink_lod) -The olink_lod function adds LOD information to an Explore HT or Explore 3072 NPX dataframe. This function can incorporate LOD based on either an Explore dataset's negative controls or using predetermined fixed LOD values, which can be downloaded from the Document Download Center at [olink.com](https://olink.com). The default LOD calculation method is based off of the negative controls. If an NPX file is intensity normalized, both intensity normalized and PC normalized LODs are provided. +The olink_lod function adds LOD information to an Explore HT or Explore 3072 NPX dataframe. This function can incorporate LOD based on either an Explore dataset's negative controls or using predetermined fixed LOD values, which can be downloaded from the Document Download Center at [olink.com](https://olink.com/knowledge/documents), or using both methods. The default LOD calculation method is based off of the negative controls. If an NPX file is intensity normalized, both intensity normalized and PC normalized LODs are provided. ### Function arguments @@ -296,13 +330,13 @@ A tibble with the following columns: + **UniProt** _\_: UniProt ID. + **Assay** _\_: Common gene name for the assay. + **AssayType** _\_: Assay type. Indicates whether an assay is a panel assay or an Olink control assay. -+ **Panel** _\_: Olink Panel that samples ran on. Read more about Olink Panels here: https://olink.com/products-services/. ++ **Panel** _\_: Olink Panel that samples ran on. Read more about Olink Panels here: [https://olink.com/products/compare](https://olink.com/products/compare). + **Block** _\_: Olink Block that samples ran on. + **Count** **: Raw counts generated during sequencing. -+ **ExtNPX** **: Extension normalized NPX value that is used in NPX calculation. Read more about ExtNPX here: https://olink.com/faq/how-is-the-npx-value-calculated-in-explore/ -+ **NPX** **: Normalized Protein eXpression, is Olink’s unit of protein expression level in a log2 scale. The majority of the functions of this package use NPX values for calculations. If Explore data is PC normalized, NPX reflects the PC normalized value. If Explore data is intensity normalized NPX, NPX reflects the intensity normalized value. Read more about NPX here: https://olink.com/faq/what-is-npx/. ++ **ExtNPX** **: Extension normalized NPX value that is used in NPX calculation. Read more about ExtNPX in the [Olink FAQ]((https://olink.com/knowledge/faq)). (Search "extension normalized") ++ **NPX** **: Normalized Protein eXpression, is Olink’s unit of protein expression level in a log2 scale. The majority of the functions of this package use NPX values for calculations. If Explore data is PC normalized, NPX reflects the PC normalized value. If Explore data is intensity normalized NPX, NPX reflects the intensity normalized value. Read more about NPX in the Olink [FAQ](https://olink.com/knowledge/faq) (Search "What is NPX?") or in Olink's Data normalization and standardization [white paper](https://7074596.fs1.hubspotusercontent-na1.net/hubfs/7074596/05-white%20paper%20for%20website/1096-olink-data-normalization-white-paper.pdf). + **Normalization** _\_: The normalization method used. -+ **PCNormalizedNPX** **: Normalized Protein eXpression, is Olink’s unit of protein expression level in a log2 scale. The majority of the functions of this package use NPX values for calculations. Regardless of normalization method, this column always reflects PC normalized NPX values. Read more about NPX here: https://olink.com/faq/what-is-npx/. ++ **PCNormalizedNPX** **: Normalized Protein eXpression, is Olink’s unit of protein expression level in a log2 scale. The majority of the functions of this package use NPX values for calculations. Regardless of normalization method, this column always reflects PC normalized NPX values. Read more about NPX in the Olink [FAQ](https://olink.com/knowledge/faq) (Search "plate control normalization") or in Olink's Data normalization and standardization [white paper](https://7074596.fs1.hubspotusercontent-na1.net/hubfs/7074596/05-white%20paper%20for%20website/1096-olink-data-normalization-white-paper.pdf). + **AssayQC** _\_:Indicates whether an assay contains a QC warning. + **SampleQC** _\_: Indicates whether a sample contains a QC warning within a block. + **ExploreVersion** _\_: The version of the Explore software library that was used to produce the data file. @@ -384,6 +418,8 @@ The olink_anova is a wrapper function that performs an ANOVA F-test for each ass Samples with missing variable information or factor levels are excluded from the analysis. Character columns in the input data frame are converted to factors. The automatic handling of the data from above is announced by a message if the flag *verbose=TRUE*. +Control samples and control assays (AssayType is not "assay", or Assay contains "control" or "ctrl") should be removed before using this function. + Crossed/interaction analysis, i.e. A*B formula notation, is inferred from the variable argument in the following cases: * c('A','B') @@ -406,14 +442,23 @@ Adjusted p-values are calculated using the function *p.adjust* from the R librar * verbose: Logical. Default: True. If information about removed samples, factor conversion and final model formula is to be printed to the console. ```{r message=FALSE, eval=FALSE} +# Remove control samples and assays +npx_data1_no_controls <- npx_data1 |> + filter(!str_detect(SampleID, + regex("control|ctrl", + ignore_case = TRUE))) |> + filter(!str_detect(Assay, + regex("control|ctrl", + ignore_case = TRUE))) + # One-way ANOVA, no covariates -anova_results_oneway <- olink_anova(df = npx_data1, +anova_results_oneway <- olink_anova(df = npx_data1_no_controls, variable = 'Site') # Two-way ANOVA, no covariates -anova_results_twoway <- olink_anova(df = npx_data1, +anova_results_twoway <- olink_anova(df = npx_data1_no_controls, variable = c('Site', 'Time')) # One-way ANOVA, Treatment as covariates -anova_results_oneway <- olink_anova(df = npx_data1, +anova_results_oneway <- olink_anova(df = npx_data1_no_controls, variable = 'Site', covariates = 'Treatment') ``` @@ -441,6 +486,8 @@ olink_anova_posthoc performs a post-hoc ANOVA test using the function *emmeans* The function handles both factor and numerical variables and/or covariates. The post-hoc test for a numerical variable compares the difference in means of the outcome variable (default: NPX) for 1 standard deviation (SD) difference in the numerical variable, e.g. mean NPX at mean (numerical variable) versus mean NPX at mean (numerical variable) + 1*SD (numerical variable). +Control samples and control assays (AssayType is not "assay", or Assay contains "control" or "ctrl") should be removed before using this function. + ### Function arguments * df: NPX data frame in long format should minimally contain protein name (Assay), OlinkID, UniProt, Panel and an outcome factor with at least 3 levels. @@ -454,13 +501,13 @@ The function handles both factor and numerical variables and/or covariates. The ```{r message=FALSE, eval=FALSE} # calculate the p-value for the ANOVA -anova_results_oneway <- olink_anova(df = npx_data1, +anova_results_oneway <- olink_anova(df = npx_data1_no_controls, variable = 'Site') # extracting the significant proteins anova_results_oneway_significant <- anova_results_oneway %>% filter(Threshold == 'Significant') %>% pull(OlinkID) -anova_posthoc_oneway_results <- olink_anova_posthoc(df = npx_data1, +anova_posthoc_oneway_results <- olink_anova_posthoc(df = npx_data1_no_controls, olinkid_list = anova_results_oneway_significant, variable = 'Site', effect = 'Site') @@ -948,6 +995,9 @@ A list of objects of class *ggplot* (silently returned). Plots are also printed The olink_boxplot function is used to generate boxplots of NPX values stratified on a variable for a given list of proteins. olink_boxplot uses the functions *ggplot* and *geom_boxplot* of the R library *ggplot2*. + +In order to annotate the plot with ANOVA posthoc analysis results, control samples and control assays (AssayType is not "assay", or Assay contains "control" or "ctrl") should be removed. + ### Function arguments * df: NPX data frame in long format should minimally contain protein name (Assay), OlinkID, UniProt and a grouping variable. @@ -959,19 +1009,28 @@ The olink_boxplot function is used to generate boxplots of NPX values stratified * number_of_proteins_per_plot: Number of boxplots to include in the facets plot. Default 6. ```{r message=FALSE} -plot <- npx_data1 %>% +# Remove control samples and assays +npx_data1_no_controls <- npx_data1 |> + filter(!str_detect(SampleID, + regex("control|ctrl", + ignore_case = TRUE))) |> + filter(!str_detect(Assay, + regex("control|ctrl", + ignore_case = TRUE))) + +plot <- npx_data1_no_controls %>% na.omit() %>% # removing missing values which exists for Site olink_boxplot(variable = "Site", olinkid_list = c("OID00488", "OID01276"), number_of_proteins_per_plot = 2) plot[[1]] -anova_posthoc_results<-npx_data1 %>% +anova_posthoc_results<-npx_data1_no_controls %>% olink_anova_posthoc(olinkid_list = c("OID00488", "OID01276"), variable = 'Site', effect = 'Site') -plot2 <- npx_data1 %>% +plot2 <- npx_data1_no_controls %>% na.omit() %>% # removing missing values which exists for Site olink_boxplot(variable = "Site", olinkid_list = c("OID00488", "OID01276"), diff --git a/OlinkAnalyze/vignettes/bridging_introduction.Rmd b/OlinkAnalyze/vignettes/bridging_introduction.Rmd index 48aa936b..0403914f 100644 --- a/OlinkAnalyze/vignettes/bridging_introduction.Rmd +++ b/OlinkAnalyze/vignettes/bridging_introduction.Rmd @@ -69,10 +69,12 @@ library(kableExtra) ```{r brnrtab, message=FALSE, echo=FALSE} data.frame(Platform = c("Target 96", "Explore 384 Cardiometabolic, Inflammation, Neurology, and Oncology", - "Explore 384 Cardiometabolic II, Inflammation II, Neurology II, and Oncology II"), + "Explore 384 Cardiometabolic II, Inflammation II, Neurology II, and Oncology II", + "Explore HT"), BridgingSamples = c("8-16", "8-16", - "16-24")) %>% + "16-24", + "24-32")) %>% kbl(booktabs = TRUE, digits = 2, caption = "Table 1. Recommended number of bridging samples for Olink platforms") %>% @@ -412,10 +414,8 @@ npx_data1 %>% Another way to determine if bridging decreased variability between projects is to calculate the CV of the control samples across both projects before and after bridging, as shown in Figure 7. The CV after normalization is expected to be smaller than the CV prior to normalization, as shown in Figure 7. -Note that the CV calculation formula differs for [Target -96](https://olink.com/faq/how-is-cv-calculated/) and -[Explore](https://olink.com/faq/how-is-cv-in-the-analysis-report-calculated-for-olink-explore/) -projects. +Note that the CV calculation formula differs for Olink Target +96 and Olink Explore products. More information of CV calculation can be found in the [Olink FAQ](https://olink.com/knowledge/faq) ```{r CV_calculation, fig.cap= f7}