Merge pull request #38 from FredHutch/filter_qc_ki4

Add ability to select columns for the zerocount filter
FredHutch · Jul 1, 2024 · 6f837ee · 6f837ee
2 parents c1951c7 + e7946af
commit 6f837ee
Show file tree

Hide file tree

Showing 3 changed files with 22 additions and 4 deletions.
diff --git a/R/02-filter.R b/R/02-filter.R
@@ -40,19 +40,29 @@ gimap_filter <- function(.data = NULL,
 #' Create a filter for pgRNAs which have a raw count of 0 for any sample/time point
 #' @description This function flags and reports which and how many pgRNAs have a raw count of 0 for any sample/time point
 #' @param gimap_dataset The special gimap_dataset from the `setup_data` function which contains the raw count data
+#' @param filter_zerocount_target_col default is NULL; the index(es) of the column(s) in `gimap_dataset$raw_counts` that should be checked for counts of 0. If NULL, all sample columns are checked
 #' @importFrom magrittr %>%
 #' @importFrom dplyr mutate
 #' @importFrom purrr reduce map
 #' @return a named list with the filter `filter` specifying which pgRNAs have a count of zero for at least one sample/time point and a report df `reportdf` for the number and percent of pgRNAs which have a count of zero for at least one sample/time point
 #' @examples \dontrun{
 #'   gimap_dataset <- get_example_data("gimap")
 #'   qc_filter_zerocounts(gimap_dataset)
+#'   
+#'   # or to specify a different column (or set of columns) to select
+#'   qc_filter_zerocounts(gimap_dataset, filter_zerocount_target_col = c(1,2))
 #' }
 #'
 
-qc_filter_zerocounts <- function(gimap_dataset){
+qc_filter_zerocounts <- function(gimap_dataset, filter_zerocount_target_col = NULL){
+
+  if (is.null(filter_zerocount_target_col)) {filter_zerocount_target_col <- c(1:ncol(gimap_dataset$raw_counts))}
 
-  counts_filter <- data.frame(gimap_dataset$raw_counts) %>% map(~.x %in% c(0)) %>% reduce(`|`)
+  if (!all(filter_zerocount_target_col %in% 1:ncol(gimap_dataset$raw_counts))) {
+    stop("The columns selected do not exist. `filter_zerocount_target_col` needs to correspond to the index of the columns in `gimap_dataset$raw_counts` that you need to filter by") 
+   }
+
+  counts_filter <- data.frame(gimap_dataset$raw_counts[,filter_zerocount_target_col]) %>% map(~.x %in% c(0)) %>% reduce(`|`)
 
   zerocount_df <- data.frame("RawCount0" = c(FALSE, TRUE), n = c(sum(!counts_filter), sum(counts_filter))) %>%
     mutate(percent = round(((n/sum(n))*100),2))
@@ -92,6 +102,10 @@ qc_filter_plasmid <- function(gimap_dataset, cutoff = NULL, filter_plasmid_targe
 
   if (is.null(filter_plasmid_target_col)) {filter_plasmid_target_col <- c(1)}
 
+  if (!all(filter_plasmid_target_col %in% 1:ncol(gimap_dataset$transformed_data$log2_cpm))) {
+    stop("The columns selected do not exist. `filter_plasmid_target_col` needs to correspond to the index of the columns in `gimap_dataset$transformed_data$log2_cpm` that you need to filter by") 
+  }
+
   plasmid_data <- data.frame(gimap_dataset$transformed_data$log2_cpm[, filter_plasmid_target_col]) %>% `colnames<-`(rep(c("plasmid_log2_cpm"), length(filter_plasmid_target_col))) %>% clean_names()
 
   if (length(filter_plasmid_target_col >1)){ #if more than one column was selected, collapse all of the columns into the same vector using pivot_longer to store in a df with the name of the rep and number for row/construct

diff --git a/R/plots-qc.R b/R/plots-qc.R
@@ -121,7 +121,7 @@ qc_constructs_countzero_bar <- function(gimap_dataset, wide_ar = 0.75){
   qc_filter_output <- qc_filter_zerocounts(gimap_dataset)
 
   return(
-    example_counts[qc_filter_output$filter, c(3:5)] %>%
+    gimap_dataset$raw_counts[qc_filter_output$filter, c(3:5)] %>%
       as.data.frame() %>%
       mutate(row = row_number()) %>%
       tidyr::pivot_longer(tidyr::unite(gimap_dataset$metadata$sample_metadata[c(3:5), c("day", "rep")], "colName")$colName,
@@ -205,6 +205,10 @@ qc_plasmid_histogram <- function(gimap_dataset, cutoff = NULL, filter_plasmid_ta
 
   if (is.null(filter_plasmid_target_col)) {filter_plasmid_target_col <- c(1)}
 
+  if (!all(filter_plasmid_target_col %in% 1:ncol(gimap_dataset$transformed_data$log2_cpm))) {
+    stop("The columns selected do not exist. `filter_plasmid_target_col` needs to correspond to the index of the columns in `gimap_dataset$transformed_data$log2_cpm` that you need to filter by") 
+  }
+
   to_plot <- data.frame(gimap_dataset$transformed_data$log2_cpm[, filter_plasmid_target_col]) %>% `colnames<-`(rep(c("plasmid_log2_cpm"), length(filter_plasmid_target_col))) %>% clean_names()
 
   if (length(filter_plasmid_target_col >1)){ #if more than one column was selected, collapse all of the columns into the same vector and store in a df to plot 

diff --git a/inst/rmd/gimapQCTemplate.Rmd b/inst/rmd/gimapQCTemplate.Rmd
@@ -81,7 +81,7 @@ qc_constructs_countzero_bar(gimap_dataset)
 If this filter is applied, this is the number of pgRNAs that would be filtered out
 
 ```{r}
-qc_filter_zerocounts(gimap_dataset)$reportdf
+qc_filter_zerocounts(gimap_dataset, filter_zerocount_target_col = filter_zerocount_target_col)$reportdf
 ```
 
 ### Filter pgRNAs where there is a low log2 CPM value for the plasmid sample/time point