diff --git a/inst/rmd/scratch_gimap_replicateVariation.Rmd b/inst/rmd/scratch_gimap_replicateVariation.Rmd new file mode 100644 index 0000000..33e03d5 --- /dev/null +++ b/inst/rmd/scratch_gimap_replicateVariation.Rmd @@ -0,0 +1,184 @@ +--- +title: "Scratch gimap ReplicateVariation" +author: "Kate Isaac" +date: "`r Sys.Date()`" +output: html_document +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +```{r message=FALSE} +library(gimap) +library(tidyverse) + +example_data <- get_example_data("count") + +example_pg_metadata <- get_example_data("meta") + +example_counts <- example_data %>% + select(c("Day00_RepA", "Day05_RepA", "Day22_RepA", "Day22_RepB", "Day22_RepC")) %>% + as.matrix() + +example_pg_id <- example_data %>% + dplyr::select("id") + +example_sample_metadata <- data.frame( + id = 1:5, + day = as.factor(c("Day00", "Day05", "Day22", "Day22", "Day22")), + rep = as.factor(c("RepA", "RepA", "RepA", "RepB", "RepC")) +) + +gimap_dataset <- setup_data(counts = example_counts, + pg_ids = example_pg_id, + sample_metadata = example_sample_metadata) + +``` +```{r warning=FALSE} +qc_filter_zerocounts <- function(gimap_dataset){ + + counts_filter <- unlist(lapply(1:nrow(gimap_dataset$raw_counts), + function(x) 0 %in% gimap_dataset$raw_counts[x,] + ) + ) + + zerocount_df <- data.frame("RawCount0" = c(FALSE, TRUE), + n = c(sum(!counts_filter), sum(counts_filter)) + ) %>% + mutate(percent = round(((n/sum(n))*100),2)) + + return(list(filter = counts_filter, reportdf = zerocount_df)) + +} + +qc_filter_output <- qc_filter_zerocounts(gimap_dataset) + + +eda <- example_counts[qc_filter_output$filter,c(3:5)] %>% + cbind(example_pg_id[qc_filter_output$filter,]) %>% + pivot_longer(starts_with("Day22"), values_to = "counts") %>% + group_by(id) %>% + summarize(numzero = sum(counts == 0), + max_diff = max(counts) - min(counts), + sec_diff = min(counts[counts > 0]) - min(counts) + ) +``` +### How many day 22 pgRNAs have counts of 0 across x number of replicates + +```{r} +countdf <- eda %>% + group_by(numzero) %>% + summarize(count = n()) + +countdf %>% + ggplot(aes(x=numzero, y = count)) + + geom_bar(stat="identity") + + theme_classic() + + ylab("Number of pgRNAs") + + xlab("Number of replicates with a zero") + + geom_text(aes(label = count, group = numzero), vjust = -0.5, size=2) + +``` + +Those nearly `r countdf[which(countdf$numzero == 3),"count"]` with all 3 replicates having a count of 0 could be real biological signal that we don't want to remove or at least want to take note of for special analysis. While those with only 1 or 2 replicates with a count of 0 may have high variation and may be worth filtering. + + +```{r} + +example_counts[which(example_pg_id == unlist(eda[which(eda$numzero == 0)[1],"id"])),] + +example_counts[which(example_pg_id == unlist(eda[which(eda$numzero == 0)[2],"id"])),] + +``` + +The two with 0 replicates have a zero count on day 0 (`r unlist(eda[which(eda$numzero == 0)[1],"id"])`) and a zero count on day 5 (`r unlist(eda[which(eda$numzero == 0)[2],"id"])`) + +### What's the observed differences among the replicates + +```{r warning=FALSE} +eda %>% + filter(numzero > 0) %>% + pivot_longer(c("max_diff", "sec_diff"), values_to = "diff") %>% + ggplot(aes(y = diff)) + + geom_boxplot() + + facet_wrap(~name) + + theme_bw() + + theme(panel.background = element_blank(), + panel.grid = element_blank(), + axis.title.x=element_blank(), + axis.text.x=element_blank(), + axis.ticks.x=element_blank()) + + scale_y_log10() +``` + +```{r warning=FALSE} +eda %>% + filter(numzero > 0) %>% + pivot_longer(c("max_diff", "sec_diff"), values_to = "diff") %>% + ggplot(aes(y = diff)) + + geom_boxplot() + + facet_wrap(~name) + + theme_bw() + + theme(panel.background = element_blank(), + panel.grid = element_blank(), + axis.title.x=element_blank(), + axis.text.x=element_blank(), + axis.ticks.x=element_blank()) +``` + +```{r} +vardf <- gimap_dataset$transformed_data$log2_cpm[,3:5] %>% + as.data.frame() %>% + mutate(row = row_number()) %>% + tidyr::pivot_longer(-row) %>% + group_by(row) %>% + summarize(mean = mean(value), + var = var(value)) + +``` + +```{r} +vardf %>% + ggplot(aes(x=var)) + + geom_histogram(binwidth=0.1) + + theme(panel.background = element_blank(), + panel.grid = element_blank()) + + xlab("variance") + + ylab("pgRNA construct count") +``` + +```{r} +vardf %>% + ggplot(aes(y=var)) + + geom_boxplot() + + theme(panel.background = element_blank(), + panel.grid = element_blank()) + + ylab("variance") + + +``` + +Taking a similar approach to the low plasmid count filter, we construct a distribution of the variation within replicates for each pgRNA and find the quantiles as well as the upper outlier and then use the upper outlier as a potential filter value, filtering out contstructs who have a higher variance than the upper outlier. + +```{r} +statdf <- vardf %>% + summarize(median = median(var), + Q1 = quantile(var, probs = 0.25), + Q3 = quantile(var, probs = 0.75), + upper_outlier = (Q3 + 1.5*(Q3 - Q1))) + +statdf +``` + +```{r} +sum(vardf$var > statdf$upper_outlier) +``` + +The upper outlier variance is pretty low, `r statdf$upper_outlier` and `r sum(vardf$var > statdf$upper_outlier)` (number of pgRNA constructs that would be filtered) is nearly three times the number of pgRNAs that are flagged by the zero count filter (`r qc_filter_output$reportdf[which(qc_filter_output$reportdf$RawCount0 == TRUE),"n"]`). I want to compare how this potential filter overlaps with the original, and then look into setting a more stringent (higher variance filter) with this approach. Can we decrease the number of filtered pgRNAs while still respecting the intent of the original filter? + +```{r} +sessionInfo() +``` + + \ No newline at end of file diff --git a/inst/rmd/scratch_gimap_replicateVariation.html b/inst/rmd/scratch_gimap_replicateVariation.html new file mode 100644 index 0000000..f0fca2e --- /dev/null +++ b/inst/rmd/scratch_gimap_replicateVariation.html @@ -0,0 +1,603 @@ + + + + + + + + + + + + + + + +Scratch gimap ReplicateVariation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
library(gimap)
+
## Warning: replacing previous import 'dplyr::group_rows' by
+## 'kableExtra::group_rows' when loading 'gimap'
+
library(tidyverse)
+
+example_data <- get_example_data("count")
+
+example_pg_metadata <- get_example_data("meta")
+
+example_counts <- example_data %>%
+  select(c("Day00_RepA", "Day05_RepA", "Day22_RepA", "Day22_RepB", "Day22_RepC")) %>%
+  as.matrix()
+
+example_pg_id <- example_data %>%
+  dplyr::select("id")
+
+example_sample_metadata <- data.frame(
+  id = 1:5,
+  day = as.factor(c("Day00", "Day05", "Day22", "Day22", "Day22")),
+  rep = as.factor(c("RepA", "RepA", "RepA", "RepB", "RepC"))
+)
+
+gimap_dataset <- setup_data(counts = example_counts,
+                            pg_ids = example_pg_id,
+                            sample_metadata = example_sample_metadata)
+
qc_filter_zerocounts <- function(gimap_dataset){
+  
+  counts_filter <- unlist(lapply(1:nrow(gimap_dataset$raw_counts), 
+                                 function(x) 0 %in% gimap_dataset$raw_counts[x,]
+                                 )
+                          )
+  
+  zerocount_df <- data.frame("RawCount0" = c(FALSE, TRUE), 
+                             n = c(sum(!counts_filter), sum(counts_filter))
+                             ) %>%
+    mutate(percent = round(((n/sum(n))*100),2))
+  
+  return(list(filter = counts_filter, reportdf = zerocount_df))
+  
+}
+
+qc_filter_output <- qc_filter_zerocounts(gimap_dataset)
+
+
+eda <- example_counts[qc_filter_output$filter,c(3:5)] %>% 
+  cbind(example_pg_id[qc_filter_output$filter,]) %>% 
+  pivot_longer(starts_with("Day22"), values_to = "counts") %>% 
+  group_by(id) %>% 
+  summarize(numzero = sum(counts == 0), 
+            max_diff = max(counts) - min(counts), 
+            sec_diff = min(counts[counts > 0]) - min(counts)
+            )
+
+

How many day 22 pgRNAs have counts of 0 across x number of +replicates

+
countdf <- eda %>% 
+  group_by(numzero) %>% 
+  summarize(count = n())
+
+countdf %>%
+  ggplot(aes(x=numzero, y = count)) + 
+  geom_bar(stat="identity") + 
+  theme_classic() + 
+  ylab("Number of pgRNAs") + 
+  xlab("Number of replicates with a zero") + 
+  geom_text(aes(label = count, group = numzero), vjust = -0.5, size=2)
+

+

Those nearly 159 with all 3 replicates having a count of 0 could be +real biological signal that we don’t want to remove or at least want to +take note of for special analysis. While those with only 1 or 2 +replicates with a count of 0 may have high variation and may be worth +filtering.

+
example_counts[which(example_pg_id == unlist(eda[which(eda$numzero == 0)[1],"id"])),]
+
## Day00_RepA Day05_RepA Day22_RepA Day22_RepB Day22_RepC 
+##          0         43         16         50        124
+
example_counts[which(example_pg_id == unlist(eda[which(eda$numzero == 0)[2],"id"])),]
+
## Day00_RepA Day05_RepA Day22_RepA Day22_RepB Day22_RepC 
+##         32          0         28         42         57
+

The two with 0 replicates have a zero count on day 0 (KHDRBS3_nt2) +and a zero count on day 5 (NXT2_nt6)

+
+
+

What’s the observed differences among the replicates

+
eda %>% 
+  filter(numzero > 0) %>% 
+  pivot_longer(c("max_diff", "sec_diff"), values_to = "diff") %>% 
+  ggplot(aes(y = diff)) + 
+  geom_boxplot() + 
+  facet_wrap(~name) + 
+  theme_bw() + 
+  theme(panel.background = element_blank(), 
+        panel.grid = element_blank(), 
+        axis.title.x=element_blank(),
+        axis.text.x=element_blank(),
+        axis.ticks.x=element_blank()) +
+  scale_y_log10()
+

+
eda %>% 
+  filter(numzero > 0) %>% 
+  pivot_longer(c("max_diff", "sec_diff"), values_to = "diff") %>% 
+  ggplot(aes(y = diff)) + 
+  geom_boxplot() + 
+  facet_wrap(~name) + 
+  theme_bw() + 
+  theme(panel.background = element_blank(), 
+        panel.grid = element_blank(), 
+        axis.title.x=element_blank(),
+        axis.text.x=element_blank(),
+        axis.ticks.x=element_blank())
+

+
vardf <- gimap_dataset$transformed_data$log2_cpm[,3:5] %>% 
+  as.data.frame() %>% 
+  mutate(row = row_number()) %>%
+    tidyr::pivot_longer(-row) %>%
+    group_by(row) %>%
+    summarize(mean = mean(value),
+              var = var(value))
+
vardf %>% 
+  ggplot(aes(x=var)) + 
+  geom_histogram(binwidth=0.1) + 
+  theme(panel.background = element_blank(), 
+        panel.grid = element_blank()) + 
+  xlab("variance") +
+  ylab("pgRNA construct count")
+

+
vardf %>% 
+  ggplot(aes(y=var)) + 
+  geom_boxplot() + 
+  theme(panel.background = element_blank(), 
+        panel.grid = element_blank()) + 
+  ylab("variance")
+

+

Taking a similar approach to the low plasmid count filter, we +construct a distribution of the variation within replicates for each +pgRNA and find the quantiles as well as the upper outlier and then use +the upper outlier as a potential filter value, filtering out contstructs +who have a higher variance than the upper outlier.

+
statdf <- vardf %>% 
+  summarize(median = median(var),
+            Q1 = quantile(var, probs = 0.25),
+            Q3 = quantile(var, probs = 0.75),
+            upper_outlier = (Q3 + 1.5*(Q3 - Q1)))
+
+statdf
+
## # A tibble: 1 × 4
+##   median     Q1    Q3 upper_outlier
+##    <dbl>  <dbl> <dbl>         <dbl>
+## 1  0.155 0.0565 0.379         0.862
+
sum(vardf$var > statdf$upper_outlier)
+
## [1] 2907
+

The upper outlier variance is pretty low, 0.8618247 and 2907 (number +of pgRNA constructs that would be filtered) is nearly three times the +number of pgRNAs that are flagged by the zero count filter (1193. I want +to compare how this potential filter overlaps with the original, and +then look into setting a more stringent (higher variance filter) with +this approach. Can we decrease the number of filtered pgRNAs while still +respecting the intent of the original filter?

+
sessionInfo()
+
## R version 4.4.0 (2024-04-24)
+## Platform: aarch64-apple-darwin20
+## Running under: macOS Sonoma 14.4.1
+## 
+## Matrix products: default
+## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
+## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
+## 
+## locale:
+## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
+## 
+## time zone: America/Indiana/Indianapolis
+## tzcode source: internal
+## 
+## attached base packages:
+## [1] stats     graphics  grDevices utils     datasets  methods   base     
+## 
+## other attached packages:
+##  [1] lubridate_1.9.3 forcats_1.0.0   stringr_1.5.1   dplyr_1.1.4    
+##  [5] purrr_1.0.2     readr_2.1.5     tidyr_1.3.1     tibble_3.2.1   
+##  [9] ggplot2_3.5.1   tidyverse_2.0.0 gimap_0.1.0    
+## 
+## loaded via a namespace (and not attached):
+##  [1] sass_0.4.9         utf8_1.2.4         generics_0.1.3     xml2_1.3.6        
+##  [5] stringi_1.8.3      hms_1.1.3          digest_0.6.35      magrittr_2.0.3    
+##  [9] timechange_0.3.0   evaluate_0.23      grid_4.4.0         RColorBrewer_1.1-3
+## [13] fastmap_1.1.1      jsonlite_1.8.8     fansi_1.0.6        viridisLite_0.4.2 
+## [17] scales_1.3.0       jquerylib_0.1.4    cli_3.6.2          crayon_1.5.2      
+## [21] rlang_1.1.3        bit64_4.0.5        munsell_0.5.1      withr_3.0.0       
+## [25] cachem_1.0.8       yaml_2.3.8         parallel_4.4.0     tools_4.4.0       
+## [29] tzdb_0.4.0         colorspace_2.1-0   kableExtra_1.4.0   vctrs_0.6.5       
+## [33] R6_2.5.1           lifecycle_1.0.4    bit_4.0.5          vroom_1.6.5       
+## [37] pkgconfig_2.0.3    pillar_1.9.0       bslib_0.7.0        gtable_0.3.5      
+## [41] glue_1.7.0         systemfonts_1.0.6  highr_0.10         xfun_0.43         
+## [45] tidyselect_1.2.1   rstudioapi_0.16.0  knitr_1.46         farver_2.1.1      
+## [49] htmltools_0.5.8.1  labeling_0.4.3     rmarkdown_2.26     svglite_2.1.3     
+## [53] pheatmap_1.0.12    compiler_4.4.0
+
+ + + + +
+ + + + + + + + + + + + + + +