Skip to content

Commit

Permalink
Merge pull request #38 from FredHutch/filter_qc_ki4
Browse files Browse the repository at this point in the history
Add ability to select columns for the zerocount filter
  • Loading branch information
kweav authored Jul 1, 2024
2 parents c1951c7 + e7946af commit 6f837ee
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 4 deletions.
18 changes: 16 additions & 2 deletions R/02-filter.R
Original file line number Diff line number Diff line change
Expand Up @@ -40,19 +40,29 @@ gimap_filter <- function(.data = NULL,
#' Create a filter for pgRNAs which have a raw count of 0 for any sample/time point
#' @description This function flags and reports which and how many pgRNAs have a raw count of 0 for any sample/time point
#' @param gimap_dataset The special gimap_dataset from the `setup_data` function which contains the raw count data
#' @param filter_zerocount_target_col default is NULL; the index (or vector of indices) of the column(s) in `gimap_dataset$raw_counts` that should be checked for counts of 0. If NULL (not specified), all columns of `gimap_dataset$raw_counts` are checked
#' @importFrom magrittr %>%
#' @importFrom dplyr mutate
#' @importFrom purrr reduce map
#' @return a named list with the filter `filter` specifying which pgRNAs have a count of zero for at least one sample/time point and a report df `reportdf` with the number and percent of pgRNAs which have a count of zero for at least one sample/time point
#' @examples \dontrun{
#' gimap_dataset <- get_example_data("gimap")
#' qc_filter_zerocounts(gimap_dataset)
#'
#' # or to specify a different column (or set of columns) to check
#' qc_filter_zerocounts(gimap_dataset, filter_zerocount_target_col = c(1,2))
#' }
#'

qc_filter_zerocounts <- function(gimap_dataset){
qc_filter_zerocounts <- function(gimap_dataset, filter_zerocount_target_col = NULL){

if (is.null(filter_zerocount_target_col)) {filter_zerocount_target_col <- c(1:ncol(gimap_dataset$raw_counts))}

counts_filter <- data.frame(gimap_dataset$raw_counts) %>% map(~.x %in% c(0)) %>% reduce(`|`)
if (!all(filter_zerocount_target_col %in% 1:ncol(gimap_dataset$raw_counts))) {
stop("The columns selected do not exist. `filter_zerocount_target_col` needs to correspond to the index of the columns in `gimap_dataset$raw_counts` that you need to filter by")
}

counts_filter <- data.frame(gimap_dataset$raw_counts[,filter_zerocount_target_col]) %>% map(~.x %in% c(0)) %>% reduce(`|`)

zerocount_df <- data.frame("RawCount0" = c(FALSE, TRUE), n = c(sum(!counts_filter), sum(counts_filter))) %>%
mutate(percent = round(((n/sum(n))*100),2))
Expand Down Expand Up @@ -92,6 +102,10 @@ qc_filter_plasmid <- function(gimap_dataset, cutoff = NULL, filter_plasmid_targe

if (is.null(filter_plasmid_target_col)) {filter_plasmid_target_col <- c(1)}

if (!all(filter_plasmid_target_col %in% 1:ncol(gimap_dataset$transformed_data$log2_cpm))) {
stop("The columns selected do not exist. `filter_plasmid_target_col` needs to correspond to the index of the columns in `gimap_dataset$transformed_data$log2_cpm` that you need to filter by")
}

plasmid_data <- data.frame(gimap_dataset$transformed_data$log2_cpm[, filter_plasmid_target_col]) %>% `colnames<-`(rep(c("plasmid_log2_cpm"), length(filter_plasmid_target_col))) %>% clean_names()

if (length(filter_plasmid_target_col >1)){ #if more than one column was selected, collapse all of the columns into the same vector using pivot_longer to store in a df with the name of the rep and number for row/construct
Expand Down
6 changes: 5 additions & 1 deletion R/plots-qc.R
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ qc_constructs_countzero_bar <- function(gimap_dataset, wide_ar = 0.75){
qc_filter_output <- qc_filter_zerocounts(gimap_dataset)

return(
example_counts[qc_filter_output$filter, c(3:5)] %>%
gimap_dataset$raw_counts[qc_filter_output$filter, c(3:5)] %>%
as.data.frame() %>%
mutate(row = row_number()) %>%
tidyr::pivot_longer(tidyr::unite(gimap_dataset$metadata$sample_metadata[c(3:5), c("day", "rep")], "colName")$colName,
Expand Down Expand Up @@ -205,6 +205,10 @@ qc_plasmid_histogram <- function(gimap_dataset, cutoff = NULL, filter_plasmid_ta

if (is.null(filter_plasmid_target_col)) {filter_plasmid_target_col <- c(1)}

if (!all(filter_plasmid_target_col %in% 1:ncol(gimap_dataset$transformed_data$log2_cpm))) {
stop("The columns selected do not exist. `filter_plasmid_target_col` needs to correspond to the index of the columns in `gimap_dataset$transformed_data$log2_cpm` that you need to filter by")
}

to_plot <- data.frame(gimap_dataset$transformed_data$log2_cpm[, filter_plasmid_target_col]) %>% `colnames<-`(rep(c("plasmid_log2_cpm"), length(filter_plasmid_target_col))) %>% clean_names()

if (length(filter_plasmid_target_col >1)){ #if more than one column was selected, collapse all of the columns into the same vector and store in a df to plot
Expand Down
2 changes: 1 addition & 1 deletion inst/rmd/gimapQCTemplate.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ qc_constructs_countzero_bar(gimap_dataset)
If this filter is applied, this is the number of pgRNAs that would be filtered out

```{r}
qc_filter_zerocounts(gimap_dataset)$reportdf
qc_filter_zerocounts(gimap_dataset, filter_zerocount_target_col = filter_zerocount_target_col)$reportdf
```

### Filter pgRNAs where there is a low log2 CPM value for the plasmid sample/time point
Expand Down

0 comments on commit 6f837ee

Please sign in to comment.