Merge pull request #35 from FredHutch/filter_qc_ki3

Start with adding parameters that allow the user to select which column(s) are used in various filtering or filtering-related visualization steps
FredHutch · Jun 28, 2024 · 1c12148 · 1c12148
2 parents abd411b + 076c3ac
commit 1c12148
Show file tree

Hide file tree

Showing 6 changed files with 103 additions and 16 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -32,6 +32,7 @@ Imports:
     kableExtra, 
     pheatmap,
     purrr,
+    janitor,
 Suggests:
     testthat (>= 3.0.0),
     roxygen2,

diff --git a/NAMESPACE b/NAMESPACE
@@ -10,12 +10,16 @@ export(run_qc)
 export(setup_data)
 import(ggplot2)
 import(kableExtra)
+importFrom(dplyr,across)
+importFrom(dplyr,if_any)
 importFrom(dplyr,mutate)
 importFrom(ggplot2,ggplot)
 importFrom(ggplot2,labs)
+importFrom(janitor,clean_names)
 importFrom(magrittr,"%>%")
 importFrom(pheatmap,pheatmap)
 importFrom(purrr,map)
 importFrom(purrr,reduce)
 importFrom(tidyr,pivot_longer)
+importFrom(tidyr,pivot_wider)
 importFrom(tidyr,unite)
diff --git a/R/01-qc.R b/R/01-qc.R
@@ -4,6 +4,9 @@
 #' @param plots_dir default is `./qc_plots`; directory to save plots created with this function, if it doesn't exist already it will be created
 #' @param overwrite default is FALSE; whether to overwrite the QC Report file
 #' @param output_file default is `QC_Report`; name of the output QC report file
+#' @param filter_zerocount_target_col default is NULL; Which sample column(s) should be used to check for counts of 0? If NULL and not specified, downstream analysis will select all sample columns
+#' @param filter_plasmid_target_col default is NULL; Which sample columns(s) should be used to look at log2 CPM expression for plasmid pgRNA constructs? If NULL and not specified, downstream analysis will select the first sample column only
+#' @param filter_replicates_target_col default is NULL; Which sample columns are replicates whose variation you'd like to analyze; If NULL, the last 3 sample columns are used
 #' @param ... additional parameters are sent to `rmarkdown::render()`
 #' @returns a QC report saved locally
 #' @export
@@ -19,6 +22,9 @@ run_qc <- function(gimap_dataset,
                    output_file = "./gimap_QC_Report.Rmd",
                    plots_dir = "./qc_plots",
                    overwrite = FALSE,
+                   filter_zerocount_target_col = NULL,
+                   filter_plasmid_target_col = NULL,
+                   filter_replicates_target_col = NULL,
                    ...) {
   if (!("gimap_dataset" %in% class(gimap_dataset))) stop("This function only works with gimap_dataset objects which can be made with the setup_data() function.")
 
@@ -56,7 +62,10 @@ run_qc <- function(gimap_dataset,
   rmarkdown::render(output_file,
     params = list(
       dataset = gimap_dataset,
-      plots_dir = plots_dir
+      plots_dir = plots_dir,
+      filter_zerocount_target_col = filter_zerocount_target_col,
+      filter_plasmid_target_col = filter_zerocount_target_col,
+      filter_replicates_target_col = filter_zerocount_target_col
     ),
     ...
   )

diff --git a/R/02-filter.R b/R/02-filter.R
@@ -45,6 +45,7 @@ gimap_filter <- function(.data = NULL,
 #' @importFrom purrr reduce map
 #' @return a named list with the filter `filter` specifying which pgRNA have a count zero for at least one sample/time point and a report df `reportdf` for the number and percent of pgRNA which have a count zero for at least one sample/time point
 #' @examples \dontrun{
+#'   gimap_dataset <- get_example_data("gimap")
 #'   qc_filter_zerocounts(gimap_dataset)
 #' }
 #'
@@ -61,30 +62,69 @@ qc_filter_zerocounts <- function(gimap_dataset){
 }
 
 #' Create a filter for pgRNAs which have a low log2 CPM value for the plasmid/Day 0 sample/time point
-#' @description This function flags and reports which and how many pgRNAs have low log2 CPM values for the plasmid/Day 0 sample/time point
+#' @description This function flags and reports which and how many pgRNAs have low log2 CPM values for the plasmid/Day 0 sample/time point. If more than one column is specified as the plasmid sample, 
+#' we pool all the replicate samples to find the lower outlier and flag constructs for which any plasmid replicate has a log2 CPM value below the cutoff
 #' @param gimap_dataset The special gimap_dataset from the `setup_data` function which contains the log2 CPM transformed data
+#' @param filter_plasmid_target_col default is NULL, and if NULL, will select the first column only; this parameter specifically should be used to specify the plasmid column(s) that will be selected
 #' @importFrom magrittr %>%
-#' @importFrom dplyr mutate
-#' @return a named list with the filter `plasmid_filter` specifying which pgRNAs have low plasmid log2 CPM and a report df `plasmid_filter_report` for the number and percent of pgRNA which have a low plasmid log2 CPM
+#' @importFrom dplyr mutate across if_any
+#' @importFrom tidyr pivot_wider pivot_longer 
+#' @importFrom janitor clean_names
+#' @return a named list with the filter `plasmid_filter` specifying which pgRNAs have low plasmid log2 CPM (column of interest is `plasmid_cpm_filter`) and a report df `plasmid_filter_report` for the number and percent of pgRNA which have a low plasmid log2 CPM
 #' @examples \dontrun{
+#'   gimap_dataset <- get_example_data("gimap")
+#' 
 #'   qc_filter_plasmid(gimap_dataset)
 #'   
+#'   #or to specify a cutoff value to be used in the filter rather than the lower outlier default
 #'   qc_filter_plasmid(gimap_dataset, cutoff=2)
+#'   
+#'   #or to specify a different column (or set of columns to select)
+#'   qc_filter_plasmid(gimap_dataset, filter_plasmid_target_col = 1:2)
+#'
+#'   # or to specify a cutoff value that will be used in the filter rather than the lower outlier default as well as to specify a different column (or set of columns) to select
+#'   qc_filter_plasmid(gimap_dataset, cutoff=1.75, filter_plasmid_target_col=c(1,2))
+#' 
 #' }
 #'
 
-qc_filter_plasmid <- function(gimap_dataset, cutoff = NULL){
-  plasmid_data <- data.frame(gimap_dataset$transformed_data$log2_cpm[, 1]) %>% `colnames<-`(c("log2_cpm"))
+qc_filter_plasmid <- function(gimap_dataset, cutoff = NULL, filter_plasmid_target_col = NULL){
+
+  if (is.null(filter_plasmid_target_col)) {filter_plasmid_target_col <- c(1)}
+
+  plasmid_data <- data.frame(gimap_dataset$transformed_data$log2_cpm[, filter_plasmid_target_col]) %>% `colnames<-`(rep(c("plasmid_log2_cpm"), length(filter_plasmid_target_col))) %>% clean_names()
+
+  if (length(filter_plasmid_target_col >1)){ #if more than one column was selected, collapse all of the columns into the same vector using pivot_longer to store in a df with the name of the rep and number for row/construct
+    plasmid_data <- plasmid_data %>% mutate(construct = rownames(plasmid_data)) %>%
+      pivot_longer(starts_with("plasmid_log2_cpm"), 
+                   values_to = "plasmid_log2_cpm", 
+                   names_to = "rep")
+  }
 
   if (is.null(cutoff)) {
     # if cutoff is null, use lower outlier 
-    quantile_info <- quantile(plasmid_data$log2_cpm)
+    quantile_info <- quantile(plasmid_data$plasmid_log2_cpm)
 
     cutoff <- quantile_info["25%"] - (1.5 * (quantile_info["75%"] - quantile_info["25%"])) #later step make a function for this in utils since it's used more than once
   }
 
-  plasmid_cpm_filter <- unlist(lapply(1:nrow(plasmid_data), function(x) plasmid_data$log2_cpm[x] < cutoff))
+  if (length(filter_plasmid_target_col >1)){ #if more than one column was selected, take collapsed/pooled data and compare it to the cutoff
+                                            #then pivot_wider so that the constructs are in the same row and we can use if_any to report if any of the replicates were flagged by the cutoff
+                                            #return just that summary column (reporting if any are TRUE) as the filter
+    plasmid_data <- plasmid_data %>% 
+      mutate(filterFlag = plasmid_log2_cpm < cutoff) %>%
+      pivot_wider(id_cols = construct, names_from = rep, values_from = filterFlag)
+    plasmid_cpm_filter <- plasmid_data %>% 
+      mutate(plasmid_cpm_filter=  if_any(.cols = starts_with('plasmid_log2_cpm'))) %>%
+      select(plasmid_cpm_filter)
+
+  } else {
+
+    plasmid_cpm_filter <- as.data.frame(plasmid_data$plasmid_log2_cpm < cutoff) %>%`colnames<-`("plasmid_cpm_filter")
 
+  }
+
+
   plasmid_filter_df <- data.frame("Plasmid_log2cpmBelowCutoff" = c(FALSE, TRUE), n = c(sum(!plasmid_cpm_filter), sum(plasmid_cpm_filter))) %>%
     mutate(percent = round(((n / sum(n)) * 100), 2)) #later step make a function for this in utils since it's used more than once
 

diff --git a/R/plots-qc.R b/R/plots-qc.R
@@ -7,6 +7,9 @@
 #' @importFrom ggplot2 ggplot labs
 #' @return counts_cdf a ggplot
 #' @examples \dontrun{
+#' 
+#' gimap_dataset <- get_example_data("gimap")
+#' qc_cdf(gimap_dataset)
 #'
 #' }
 #'
@@ -40,7 +43,8 @@ qc_cdf <- function(gimap_dataset, wide_ar = 0.75) {
 #' @import ggplot2
 #' @return sample_cpm_histogram a ggplot
 #' @examples \dontrun{
-#'
+#' gimap_dataset <- get_example_data("gimap")
+#' qc_sample_hist(gimap_dataset)
 #' }
 #'
 qc_sample_hist <- function(gimap_dataset, wide_ar = 0.75) {
@@ -146,7 +150,8 @@ qc_constructs_countzero_bar <- function(gimap_dataset, wide_ar = 0.75){
 #' @importFrom pheatmap pheatmap
 #' @return `sample_cor_heatmap` a pheatmap
 #' @examples \dontrun{
-#'
+#'   gimap_dataset <- get_example_data("gimap")
+#'   qc_cor_heatmap(gimap_dataset)
 #' }
 #'
 qc_cor_heatmap <- function(gimap_dataset) {
@@ -173,21 +178,46 @@ qc_cor_heatmap <- function(gimap_dataset) {
 #' method to tell, especially if there are reps?
 #' @param gimap_dataset The special gimap_dataset from the `setup_data` function which contains the transformed data
 #' @param cutoff default is NULL, the cutoff for low log2 CPM values for the plasmid time period; if not specified, The lower outlier (defined by taking the difference of the lower quartile and 1.5 * interquartile range) is used
+#' @param filter_plasmid_target_col default is NULL, and if NULL, will select the first column only; this parameter specifically should be used to specify the plasmid column(s) that will be selected
 #' @param wide_ar aspect ratio, default is 0.75
 #' @importFrom magrittr %>%
+#' @importFrom janitor clean_names
 #' @import ggplot2
 #' @return a ggplot histogram
+#' @examples \dontrun{
+#' 
+#' gimap_dataset <- get_example_data("gimap")
+#' 
+#' qc_plasmid_histogram(gimap_dataset)
+#' 
+#' # or to specify a "cutoff" value that will be displayed as a dashed vertical line
+#' qc_plasmid_histogram(gimap_dataset, cutoff=1.75)
+#' 
+#' # or to specify a different column (or set of columns) to select
+#' qc_plasmid_histogram(gimap_dataset, filter_plasmid_target_col=c(1,2))
+#'
+#' # or to specify a "cutoff" value that will be displayed as a dashed vertical line as well as to specify a different column (or set of columns) to select
+#' qc_plasmid_histogram(gimap_dataset, cutoff=2, filter_plasmid_target_col=c(1,2))
+#' }
+#'
 
-qc_plasmid_histogram <- function(gimap_dataset, cutoff = NULL, wide_ar = 0.75) {
-  to_plot <- data.frame(gimap_dataset$transformed_data$log2_cpm[, 1]) %>% `colnames<-`(c("log2_cpm"))
+qc_plasmid_histogram <- function(gimap_dataset, cutoff = NULL, filter_plasmid_target_col = NULL, wide_ar = 0.75) {
+
+  if (is.null(filter_plasmid_target_col)) {filter_plasmid_target_col <- c(1)}
+
+  to_plot <- data.frame(gimap_dataset$transformed_data$log2_cpm[, filter_plasmid_target_col]) %>% `colnames<-`(rep(c("plasmid_log2_cpm"), length(filter_plasmid_target_col))) %>% clean_names()
+
+  if (length(filter_plasmid_target_col >1)){ #if more than one column was selected, collapse all of the columns into the same vector and store in a df to plot 
+    to_plot <- data.frame(unlist(to_plot %>% select(starts_with("plasmid_log2_cpm")), use.names = FALSE)) %>% `colnames<-`(c("plasmid_log2_cpm"))
+  }
 
-    quantile_info <- quantile(to_plot$log2_cpm)
+    quantile_info <- quantile(to_plot$plasmid_log2_cpm)
 
     if (is.null(cutoff)) { cutoff <- quantile_info["25%"] - (1.5 * (quantile_info["75%"] - quantile_info["25%"]))}
       # if cutoff is null, suggest a cutoff and plot with suggested
 
   return(   
-    ggplot(to_plot, aes(x = log2_cpm)) +
+    ggplot(to_plot, aes(x = plasmid_log2_cpm)) +
       geom_histogram(binwidth = 0.2, color = "black", fill = "gray60") +
       plot_options() +
       plot_theme() +

diff --git a/inst/rmd/gimapQCTemplate.Rmd b/inst/rmd/gimapQCTemplate.Rmd
@@ -5,6 +5,9 @@ author: "`r paste0('gimap (v', utils::packageVersion('gimap'), ')')`"
 params:
   dataset: NULL
   plots_dir: ./plots
+  filter_zerocount_target_col: NULL
+  filter_plasmid_target_col: NULL
+  filter_replicates_target_col: NULL
 output: 
   html_document:
     theme: spacelab
@@ -62,7 +65,7 @@ qc_variance_hist(gimap_dataset)
 ## Plasmid expression (log 2 cpm values for Day 0 sample/time point)
 
 ```{r}
-qc_plasmid_histogram(gimap_dataset)
+qc_plasmid_histogram(gimap_dataset, filter_plasmid_target_col = filter_plasmid_target_col)
 ```
 
 # Applying potential filters
@@ -86,7 +89,7 @@ qc_filter_zerocounts(gimap_dataset)$reportdf
 If this filter is applied, this is the number of pgRNAs that would be filtered out
 
 ```{r}
-qc_filter_plasmid(gimap_dataset)$plasmid_filter_report
+qc_filter_plasmid(gimap_dataset, filter_plasmid_target_col = filter_plasmid_target_col)$plasmid_filter_report
 ```
 
 # Session Info