FredHutch · cansavvy · Jul 3, 2024 · Jun 25, 2024 · Jun 26, 2024 · Jul 1, 2024
diff --git a/R/02-filter.R b/R/02-filter.R
@@ -1,9 +1,14 @@
 #' A function to run filtering
-#' @description This is a function here's where we describe what it does
+#' @description This function applies filters to the gimap data. By default it runs both the zero count (across all samples) and the low plasmid cpm filters, but users can select a subset of these filters or even adjust the behavior of each filter
 #' @param .data Data can be piped in with %>% or |> from function to function. But the data must still be a gimap_dataset
 #' @param gimap_dataset A special dataset structure that is setup using the `setup_data()` function.
-#' @param filter_type Can be one of the following: `zero_count_only`, `low_plasmid_cpm_only` or `rep_variation`, `zero_in_last_time_point` or a vector that includes multiple of these filters.
+#' @param filter_type Can be one of the following: `zero_count_only`, `low_plasmid_cpm_only` or `both`. Potentially in the future also `rep_variation`, `zero_in_last_time_point` or a vector that includes multiple of these filters.
+#' @param filter_zerocount_target_col default is NULL; Which sample column(s) should be used to check for counts of 0? If NULL and not specified, downstream analysis will select all sample columns
+#' @param filter_plasmid_target_col default is NULL, and if NULL, will select the first column only; this parameter specifically should be used to specify the plasmid column(s) that will be selected
+#' @param cutoff default is NULL, relates to the low_plasmid_cpm filter; the cutoff for low log2 CPM values for the plasmid time period; if not specified, the lower outlier (defined by taking the difference of the lower quartile and 1.5 * interquartile range) is used
+#' @param min_n_filters default is 1; the minimum number of independent filters that must flag a pgRNA construct before the construct is filtered when using a combination of filters
 #' You should decide on the appropriate filter based on the results of your QC report.
+#' @importFrom purrr reduce
 #' @returns a filtered version of the gimap_dataset returned in the $filtered_data section
 #' @export
 #' @examples \dontrun{
@@ -18,18 +23,57 @@
 #'
 #' # To see filtered data
 #' gimap_dataset$filtered_data
+#' 
+#' # If you want to only use a single filter or some subset, specify which using the filter_type parameter
+#' gimap_dataset <- gimap_filter(gimap_dataset, filter_type = "zero_count_only") 
+#' #or 
+#' gimap_dataset <- gimap_filter(gimap_dataset, filter_type = "low_plasmid_cpm_only")
+#' 
+#' # 
 #'
 #' }
 #'
 
 gimap_filter <- function(.data = NULL,
                          gimap_dataset,
-                         filter_type = "both") {
+                         filter_type = "both",
+                         cutoff = NULL,
+                         filter_zerocount_target_col = NULL,
+                         filter_plasmid_target_col = NULL,
+                         min_n_filters = 1) {
 
   if (!is.null(.data)) gimap_dataset <- .data
 
   if (!("gimap_dataset" %in% class(gimap_dataset))) stop("This function only works with gimap_dataset objects which can be made with the setup_data() function.")
-
+
+  #check filter type input to make sure that it is a supportable input
+  if (!(filter_type %in% c("both", "zero_count_only", "low_plasmid_cpm_only"))) stop("Specification for `filter_type` not understood; Need to use 'both', 'zero_count_only', or 'low_plasmid_cpm_only'")
+
+  zc_filter <- NULL
+  p_filter <- NULL
+  #*ADD any new filters here* assigning it a NULL value
+
+  #This section calls the appropriate filtering functions and assigns results to the filter variables assigned NULL earlier (they will stay NULL if their filter wasn't selected to be run according to the input to the function)
+  if (filter_type == "both"){
+    zc_filter <- qc_filter_zerocounts(gimap_dataset, filter_zerocount_target_col = filter_zerocount_target_col)$filter
+    p_filter <- qc_filter_plasmid(gimap_dataset, cutoff = cutoff, filter_plasmid_target_col = filter_plasmid_target_col)$plasmid_filter
+  } else if (filter_type == "zero_count_only"){
+    zc_filter <- qc_filter_zerocounts(gimap_dataset, filter_zerocount_target_col = filter_zerocount_target_col)$filter
+  } else if(filter_type == "low_plasmid_cpm_only"){
+    p_filter <- qc_filter_plasmid(gimap_dataset, cutoff = cutoff, filter_plasmid_target_col = filter_plasmid_target_col)$plasmid_filter
+  }
+
+
+  possible_filters <- list(zc_filter, p_filter)
+  #*ADD any new filters here* within the list of `possible_filters`
+
+  #this first cbinds each filter enumerated in possible_filters together (no matter how many there are, and ignores the NULLs) using the reduce function
+  #then it finds the row sum (how many filters flagged each construct, e.g., the number of TRUE values in each row),
+  #and finally compares the row sum to the `min_n_filters` parameter to report TRUEs and FALSEs according to whether each construct is flagged by the minimum number of required filters
+  #TRUE means it should be filtered, FALSE means it shouldn't be filtered
+  combined_filter <- rowSums(reduce(possible_filters, cbind)) >= min_n_filters 
+
+
   gimap_dataset$filtered <- NULL #TODO: Filtered version of the data can be stored here
 
   return(gimap_dataset)
@@ -75,6 +119,7 @@ qc_filter_zerocounts <- function(gimap_dataset, filter_zerocount_target_col = NU
 #' @description This function flags and reports which and how many pgRNAs have low log2 CPM values for the plasmid/Day 0 sample/time point. If more than one column is specified as the plasmid sample, 
 #' we pool all the replicate samples to find the lower outlier and flag constructs for which any plasmid replicate has a log2 CPM value below the cutoff
 #' @param gimap_dataset The special gimap_dataset from the `setup_data` function which contains the log2 CPM transformed data
+#' @param cutoff default is NULL, the cutoff for low log2 CPM values for the plasmid time period; if not specified, the lower outlier (defined by taking the difference of the lower quartile and 1.5 * interquartile range) is used
 #' @param filter_plasmid_target_col default is NULL, and if NULL, will select the first column only; this parameter specifically should be used to specify the plasmid column(s) that will be selected
 #' @importFrom magrittr %>%
 #' @importFrom dplyr mutate across if_any
@@ -147,5 +192,4 @@ qc_filter_plasmid <- function(gimap_dataset, cutoff = NULL, filter_plasmid_targe
     plasmid_filter_report = plasmid_filter_df
   ))
 
-}
-
+}
diff --git a/R/utils.R b/R/utils.R
@@ -40,7 +40,7 @@ get_example_data <- function(which_data) {
     )
     return(readr::read_tsv(file, show_col_types = FALSE))
   } else {
-    stop("Specification for `which_data` not understood; Need to use 'gimap', count', 'meta', or 'annotation' ")
+    stop("Specification for `which_data` not understood; Need to use 'gimap', 'count', 'meta', or 'annotation' ")
   }
 }