Skip to content

Commit

Permalink
Merge pull request #35 from FredHutch/filter_qc_ki3
Browse files Browse the repository at this point in the history
Start with adding parameters that allow the user to select which column(s) are used in various filtering or filtering-related visualization steps
  • Loading branch information
kweav authored Jun 28, 2024
2 parents abd411b + 076c3ac commit 1c12148
Show file tree
Hide file tree
Showing 6 changed files with 103 additions and 16 deletions.
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ Imports:
kableExtra,
pheatmap,
purrr,
janitor,
Suggests:
testthat (>= 3.0.0),
roxygen2,
Expand Down
4 changes: 4 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,16 @@ export(run_qc)
export(setup_data)
import(ggplot2)
import(kableExtra)
importFrom(dplyr,across)
importFrom(dplyr,if_any)
importFrom(dplyr,mutate)
importFrom(ggplot2,ggplot)
importFrom(ggplot2,labs)
importFrom(janitor,clean_names)
importFrom(magrittr,"%>%")
importFrom(pheatmap,pheatmap)
importFrom(purrr,map)
importFrom(purrr,reduce)
importFrom(tidyr,pivot_longer)
importFrom(tidyr,pivot_wider)
importFrom(tidyr,unite)
11 changes: 10 additions & 1 deletion R/01-qc.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
#' @param plots_dir default is `./qc_plots`; directory to save plots created with this function, if it doesn't exist already it will be created
#' @param overwrite default is FALSE; whether to overwrite the QC Report file
#' @param output_file default is `QC_Report`; name of the output QC report file
#' @param filter_zerocount_target_col default is NULL; Which sample column(s) should be used to check for counts of 0? If NULL and not specified, downstream analysis will select all sample columns
#' @param filter_plasmid_target_col default is NULL; Which sample columns(s) should be used to look at log2 CPM expression for plasmid pgRNA constructs? If NULL and not specified, downstream analysis will select the first sample column only
#' @param filter_replicates_target_col default is NULL; Which sample columns are replicates whose variation you'd like to analyze; If NULL, the last 3 sample columns are used
#' @param ... additional parameters are sent to `rmarkdown::render()`
#' @returns a QC report saved locally
#' @export
Expand All @@ -19,6 +22,9 @@ run_qc <- function(gimap_dataset,
output_file = "./gimap_QC_Report.Rmd",
plots_dir = "./qc_plots",
overwrite = FALSE,
filter_zerocount_target_col = NULL,
filter_plasmid_target_col = NULL,
filter_replicates_target_col = NULL,
...) {
if (!("gimap_dataset" %in% class(gimap_dataset))) stop("This function only works with gimap_dataset objects which can be made with the setup_data() function.")

Expand Down Expand Up @@ -56,7 +62,10 @@ run_qc <- function(gimap_dataset,
rmarkdown::render(output_file,
params = list(
dataset = gimap_dataset,
plots_dir = plots_dir
plots_dir = plots_dir,
filter_zerocount_target_col = filter_zerocount_target_col,
filter_plasmid_target_col = filter_zerocount_target_col,
filter_replicates_target_col = filter_zerocount_target_col
),
...
)
Expand Down
54 changes: 47 additions & 7 deletions R/02-filter.R
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ gimap_filter <- function(.data = NULL,
#' @importFrom purrr reduce map
#' @return a named list with the filter `filter` specifying which pgRNA have a count zero for at least one sample/time point and a report df `reportdf` for the number and percent of pgRNA which have a count zero for at least one sample/time point
#' @examples \dontrun{
#' gimap_dataset <- get_example_data("gimap")
#' qc_filter_zerocounts(gimap_dataset)
#' }
#'
Expand All @@ -61,30 +62,69 @@ qc_filter_zerocounts <- function(gimap_dataset){
}

#' Create a filter for pgRNAs which have a low log2 CPM value for the plasmid/Day 0 sample/time point
#' @description This function flags and reports which and how many pgRNAs have low log2 CPM values for the plasmid/Day 0 sample/time point
#' @description This function flags and reports which and how many pgRNAs have low log2 CPM values for the plasmid/Day 0 sample/time point. If more than one column is specified as the plasmid sample,
#' we pool all the replicate samples to find the lower outlier and flag constructs for which any plasmid replicate has a log2 CPM value below the cutoff
#' @param gimap_dataset The special gimap_dataset from the `setup_data` function which contains the log2 CPM transformed data
#' @param filter_plasmid_target_col default is NULL, and if NULL, will select the first column only; this parameter specifically should be used to specify the plasmid column(s) that will be selected
#' @importFrom magrittr %>%
#' @importFrom dplyr mutate
#' @return a named list with the filter `plasmid_filter` specifying which pgRNAs have low plasmid log2 CPM and a report df `plasmid_filter_report` for the number and percent of pgRNA which have a low plasmid log2 CPM
#' @importFrom dplyr mutate across if_any
#' @importFrom tidyr pivot_wider pivot_longer
#' @importFrom janitor clean_names
#' @return a named list with the filter `plasmid_filter` specifying which pgRNAs have low plasmid log2 CPM (column of interest is `plasmid_cpm_filter`) and a report df `plasmid_filter_report` for the number and percent of pgRNA which have a low plasmid log2 CPM
#' @examples \dontrun{
#' gimap_dataset <- get_example_data("gimap")
#'
#' qc_filter_plasmid(gimap_dataset)
#'
#' #or to specify a cutoff value to be used in the filter rather than the lower outlier default
#' qc_filter_plasmid(gimap_dataset, cutoff=2)
#'
#' #or to specify a different column (or set of columns to select)
#' qc_filter_plasmid(gimap_dataset, filter_plasmid_target_col = 1:2)
#'
#' # or to specify a cutoff value that will be used in the filter rather than the lower outlier default as well as to specify a different column (or set of columns) to select
#' qc_filter_plasmid(gimap_dataset, cutoff=1.75, filter_plasmid_target_col=c(1,2))
#'
#' }
#'

qc_filter_plasmid <- function(gimap_dataset, cutoff = NULL){
plasmid_data <- data.frame(gimap_dataset$transformed_data$log2_cpm[, 1]) %>% `colnames<-`(c("log2_cpm"))
qc_filter_plasmid <- function(gimap_dataset, cutoff = NULL, filter_plasmid_target_col = NULL){

if (is.null(filter_plasmid_target_col)) {filter_plasmid_target_col <- c(1)}

plasmid_data <- data.frame(gimap_dataset$transformed_data$log2_cpm[, filter_plasmid_target_col]) %>% `colnames<-`(rep(c("plasmid_log2_cpm"), length(filter_plasmid_target_col))) %>% clean_names()

if (length(filter_plasmid_target_col >1)){ #if more than one column was selected, collapse all of the columns into the same vector using pivot_longer to store in a df with the name of the rep and number for row/construct
plasmid_data <- plasmid_data %>% mutate(construct = rownames(plasmid_data)) %>%
pivot_longer(starts_with("plasmid_log2_cpm"),
values_to = "plasmid_log2_cpm",
names_to = "rep")
}

if (is.null(cutoff)) {
# if cutoff is null, use lower outlier
quantile_info <- quantile(plasmid_data$log2_cpm)
quantile_info <- quantile(plasmid_data$plasmid_log2_cpm)

cutoff <- quantile_info["25%"] - (1.5 * (quantile_info["75%"] - quantile_info["25%"])) #later step make a function for this in utils since it's used more than once
}

plasmid_cpm_filter <- unlist(lapply(1:nrow(plasmid_data), function(x) plasmid_data$log2_cpm[x] < cutoff))
if (length(filter_plasmid_target_col >1)){ #if more than one column was selected, take collapsed/pooled data and compare it to the cutoff
#then pivot_wider so that the constructs are in the same row and we can use if_any to report if any of the replicates were flagged by the cutoff
#return just that summary column (reporting if any are TRUE) as the filter
plasmid_data <- plasmid_data %>%
mutate(filterFlag = plasmid_log2_cpm < cutoff) %>%
pivot_wider(id_cols = construct, names_from = rep, values_from = filterFlag)
plasmid_cpm_filter <- plasmid_data %>%
mutate(plasmid_cpm_filter= if_any(.cols = starts_with('plasmid_log2_cpm'))) %>%
select(plasmid_cpm_filter)

} else {

plasmid_cpm_filter <- as.data.frame(plasmid_data$plasmid_log2_cpm < cutoff) %>%`colnames<-`("plasmid_cpm_filter")

}


plasmid_filter_df <- data.frame("Plasmid_log2cpmBelowCutoff" = c(FALSE, TRUE), n = c(sum(!plasmid_cpm_filter), sum(plasmid_cpm_filter))) %>%
mutate(percent = round(((n / sum(n)) * 100), 2)) #later step make a function for this in utils since it's used more than once

Expand Down
42 changes: 36 additions & 6 deletions R/plots-qc.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
#' @importFrom ggplot2 ggplot labs
#' @return counts_cdf a ggplot
#' @examples \dontrun{
#'
#' gimap_dataset <- get_example_data("gimap")
#' qc_cdf(gimap_dataset)
#'
#' }
#'
Expand Down Expand Up @@ -40,7 +43,8 @@ qc_cdf <- function(gimap_dataset, wide_ar = 0.75) {
#' @import ggplot2
#' @return sample_cpm_histogram a ggplot
#' @examples \dontrun{
#'
#' gimap_dataset <- get_example_data("gimap")
#' qc_sample_hist(gimap_dataset)
#' }
#'
qc_sample_hist <- function(gimap_dataset, wide_ar = 0.75) {
Expand Down Expand Up @@ -146,7 +150,8 @@ qc_constructs_countzero_bar <- function(gimap_dataset, wide_ar = 0.75){
#' @importFrom pheatmap pheatmap
#' @return `sample_cor_heatmap` a pheatmap
#' @examples \dontrun{
#'
#' gimap_dataset <- get_example_data("gimap")
#' qc_cor_heatmap(gimap_dataset)
#' }
#'
qc_cor_heatmap <- function(gimap_dataset) {
Expand All @@ -173,21 +178,46 @@ qc_cor_heatmap <- function(gimap_dataset) {
#' method to tell, especially if there are reps?
#' @param gimap_dataset The special gimap_dataset from the `setup_data` function which contains the transformed data
#' @param cutoff default is NULL, the cutoff for low log2 CPM values for the plasmid time period; if not specified, The lower outlier (defined by taking the difference of the lower quartile and 1.5 * interquartile range) is used
#' @param filter_plasmid_target_col default is NULL, and if NULL, will select the first column only; this parameter specifically should be used to specify the plasmid column(s) that will be selected
#' @param wide_ar aspect ratio, default is 0.75
#' @importFrom magrittr %>%
#' @importFrom janitor clean_names
#' @import ggplot2
#' @return a ggplot histogram
#' @examples \dontrun{
#'
#' gimap_dataset <- get_example_data("gimap")
#'
#' qc_plasmid_histogram(gimap_dataset)
#'
#' # or to specify a "cutoff" value that will be displayed as a dashed vertical line
#' qc_plasmid_histogram(gimap_dataset, cutoff=1.75)
#'
#' # or to specify a different column (or set of columns) to select
#' qc_plasmid_histogram(gimap_dataset, filter_plasmid_target_col=c(1,2))
#'
#' # or to specify a "cutoff" value that will be displayed as a dashed vertical line as well as to specify a different column (or set of columns) to select
#' qc_plasmid_histogram(gimap_dataset, cutoff=2, filter_plasmid_target_col=c(1,2))
#' }
#'

qc_plasmid_histogram <- function(gimap_dataset, cutoff = NULL, wide_ar = 0.75) {
to_plot <- data.frame(gimap_dataset$transformed_data$log2_cpm[, 1]) %>% `colnames<-`(c("log2_cpm"))
qc_plasmid_histogram <- function(gimap_dataset, cutoff = NULL, filter_plasmid_target_col = NULL, wide_ar = 0.75) {

if (is.null(filter_plasmid_target_col)) {filter_plasmid_target_col <- c(1)}

to_plot <- data.frame(gimap_dataset$transformed_data$log2_cpm[, filter_plasmid_target_col]) %>% `colnames<-`(rep(c("plasmid_log2_cpm"), length(filter_plasmid_target_col))) %>% clean_names()

if (length(filter_plasmid_target_col >1)){ #if more than one column was selected, collapse all of the columns into the same vector and store in a df to plot
to_plot <- data.frame(unlist(to_plot %>% select(starts_with("plasmid_log2_cpm")), use.names = FALSE)) %>% `colnames<-`(c("plasmid_log2_cpm"))
}

quantile_info <- quantile(to_plot$log2_cpm)
quantile_info <- quantile(to_plot$plasmid_log2_cpm)

if (is.null(cutoff)) { cutoff <- quantile_info["25%"] - (1.5 * (quantile_info["75%"] - quantile_info["25%"]))}
# if cutoff is null, suggest a cutoff and plot with suggested

return(
ggplot(to_plot, aes(x = log2_cpm)) +
ggplot(to_plot, aes(x = plasmid_log2_cpm)) +
geom_histogram(binwidth = 0.2, color = "black", fill = "gray60") +
plot_options() +
plot_theme() +
Expand Down
7 changes: 5 additions & 2 deletions inst/rmd/gimapQCTemplate.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ author: "`r paste0('gimap (v', utils::packageVersion('gimap'), ')')`"
params:
dataset: NULL
plots_dir: ./plots
filter_zerocount_target_col: NULL
filter_plasmid_target_col: NULL
filter_replicates_target_col: NULL
output:
html_document:
theme: spacelab
Expand Down Expand Up @@ -62,7 +65,7 @@ qc_variance_hist(gimap_dataset)
## Plasmid expression (log 2 cpm values for Day 0 sample/time point)

```{r}
qc_plasmid_histogram(gimap_dataset)
qc_plasmid_histogram(gimap_dataset, filter_plasmid_target_col = filter_plasmid_target_col)
```

# Applying potential filters
Expand All @@ -86,7 +89,7 @@ qc_filter_zerocounts(gimap_dataset)$reportdf
If this filter is applied, this is the number of pgRNAs that would be filtered out

```{r}
qc_filter_plasmid(gimap_dataset)$plasmid_filter_report
qc_filter_plasmid(gimap_dataset, filter_plasmid_target_col = filter_plasmid_target_col)$plasmid_filter_report
```

# Session Info
Expand Down

0 comments on commit 1c12148

Please sign in to comment.