From f7a7bc751050c36f853824831ff65e30ea6aef43 Mon Sep 17 00:00:00 2001
From: Kate Isaac <41767733+kweav@users.noreply.github.com>
Date: Wed, 31 Jul 2024 15:59:35 -0400
Subject: [PATCH 1/2] try to add expected crispr score

---
 R/06-calculate_gi.R | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/R/06-calculate_gi.R b/R/06-calculate_gi.R
index fdb1414..c6ec04c 100644
--- a/R/06-calculate_gi.R
+++ b/R/06-calculate_gi.R
@@ -99,7 +99,8 @@ calc_gi <- function(.data = NULL,
       rep,
       double_target_gi_score,
       single_target_gi_score_1,
-      single_target_gi_score_2
+      single_target_gi_score_2,
+      expected_crispr
     )
 
   # Store this

From d88f9166400176e9738b8ac3a5281cd4ce416908 Mon Sep 17 00:00:00 2001
From: Kate Isaac <41767733+kweav@users.noreply.github.com>
Date: Wed, 14 Aug 2024 18:59:39 -0400
Subject: [PATCH 2/2] make cell_line a required argument and remove pg_metadata
 argument as not used

---
 R/00-setup_data.R                |  5 ++---
 R/03-annotate.R                  |  8 ++++----
 R/04-normalize.R                 |  2 +-
 R/05-crispr-calc.R               |  2 +-
 R/06-calculate_gi.R              |  2 +-
 tests/testthat/test-setup_data.R |  2 --
 vignettes/getting_started.Rmd    | 21 ++++++++++-----------
 7 files changed, 19 insertions(+), 23 deletions(-)

diff --git a/R/00-setup_data.R b/R/00-setup_data.R
index 8ddfb36..15165af 100644
--- a/R/00-setup_data.R
+++ b/R/00-setup_data.R
@@ -2,7 +2,6 @@
 #' @description This function allows people to have their data ready to be processed by gimap
 #' @param counts a matrix of data that contains the counts where rows are each paired_guide target and columns are each sample
 #' @param pg_ids the pgRNA IDs: metadata associated with the pgRNA constructs that correspond to the rows of the counts data
-#' @param pg_metadata construct metadata
 #' @param sample_metadata metadata associated with the samples of the dataset that correspond to the columns of the counts data.
 #' Should include a column that has replicate information as well as a column that contains timepoint information respectively (this will be used for log fold calculations). These columns should be factors.
 #' @return A special gimap_dataset to be used with the other functions in this package.
@@ -20,7 +19,6 @@
 #' }
 setup_data <- function(counts = NULL,
                        pg_ids = NULL,
-                       pg_metadata = NULL,
                        sample_metadata = NULL) {
   new_data <- list(
     raw_counts = NULL,
@@ -52,8 +50,9 @@ setup_data <- function(counts = NULL,
   if (is.null(counts)) stop("counts cannot be NULL")
   if (!is.matrix(counts)) stop("counts can only be in the form of a matrix")
 
+  
   # If they don't give sample metadata, then we will make up a row id
-  if (is.null(sample_metadata)) sample_metadata <- data.frame(id = 1:ncol(counts))
+  if (is.null(sample_metadata)) stop("sample metadata will be required in later steps. Please provide it")
   if (!is.data.frame(sample_metadata)) stop("metadata can only be in the form of a data.frame")
   if (nrow(sample_metadata) != ncol(counts)) stop("the number of rows in the sample metadata is not equal to the number of columns in the counts")
 
diff --git a/R/03-annotate.R b/R/03-annotate.R
index 0508da3..e6fc489 100644
--- a/R/03-annotate.R
+++ b/R/03-annotate.R
@@ -2,7 +2,7 @@
 #' @description In this function, a `gimap_dataset` is annotated as far as which genes should be used as controls.
 #' @param .data Data can be piped in with tidyverse pipes from function to function. But the data must still be a gimap_dataset
 #' @param gimap_dataset A special dataset structure that is setup using the `setup_data()` function.
-#' @param cell_line which cell line are you using? Default is "HELA"
+#' @param cell_line which cell line are you using? (e.g., HELA, PC9). Required argument. The name is upper-cased before being matched against the DepMap `stripped_cell_line_name` column.
 #' @param cn_annotate TRUE or FALSE you'd also like to have Copy number annotation from DepMap. These data are optional
 #' @param annotation_file If no file is given, will attempt to use the design file from https://media.addgene.org/cms/filer_public/a9/9a/a99a9328-324b-42ff-8ccc-30c544b899e4/pgrna_library.xlsx
 #' @param control_genes A vector of gene symbols (e.g. AAMP) that should be labeled as control genes. These will be used for log fold change calculations. If no list is given then DepMap Public 23Q4 Achilles_common_essentials.csv is used https://depmap.org/portal/download/all/
@@ -17,14 +17,14 @@
 #'
 #' gimap_dataset <- gimap_dataset %>%
 #'   gimap_filter() %>%
-#'   gimap_annotate()
+#'   gimap_annotate(cell_line = "HELA")
 #'
 #' # To see anotations
 #' gimap_dataset$annotation
 #' }
 gimap_annotate <- function(.data = NULL,
                            gimap_dataset,
-                           cell_line = "HELA",
+                           cell_line,
                            control_genes = NULL,
                            cn_annotate = TRUE,
                            annotation_file = NULL) {
@@ -59,7 +59,7 @@ gimap_annotate <- function(.data = NULL,
   depmap_metadata <- readr::read_csv("https://figshare.com/ndownloader/files/35020903", show_col_types = FALSE)
 
   my_depmap_id <- depmap_metadata %>%
-    dplyr::filter(stripped_cell_line_name == cell_line) %>%
+    dplyr::filter(stripped_cell_line_name == toupper(cell_line)) %>%
     dplyr::pull(DepMap_ID)
 
   tpm_file <- file.path(system.file("extdata", package = "gimap"), "CCLE_expression.csv")
diff --git a/R/04-normalize.R b/R/04-normalize.R
index e0578a2..1f09c05 100644
--- a/R/04-normalize.R
+++ b/R/04-normalize.R
@@ -19,7 +19,7 @@
 #'
 #' gimap_dataset <- gimap_dataset %>%
 #'   gimap_filter() %>%
-#'   gimap_annotate() %>%
+#'   gimap_annotate(cell_line = "HELA") %>%
 #'   gimap_normalize(
 #'     timepoints = "day",
 #'     replicates = "rep"
diff --git a/R/05-crispr-calc.R b/R/05-crispr-calc.R
index d20f88c..d360fd2 100644
--- a/R/05-crispr-calc.R
+++ b/R/05-crispr-calc.R
@@ -13,7 +13,7 @@
 #'
 #' gimap_dataset <- gimap_dataset %>%
 #'   gimap_filter() %>%
-#'   gimap_annotate() %>%
+#'   gimap_annotate(cell_line = "HELA") %>%
 #'   gimap_normalize(
 #'     timepoints = "day",
 #'     replicates = "rep"
diff --git a/R/06-calculate_gi.R b/R/06-calculate_gi.R
index c6ec04c..209f55c 100644
--- a/R/06-calculate_gi.R
+++ b/R/06-calculate_gi.R
@@ -11,7 +11,7 @@
 #'
 #'   gimap_dataset <- gimap_dataset %>%
 #'     gimap_filter() %>%
-#'     gimap_annotate() %>%
+#'     gimap_annotate(cell_line = "HELA") %>%
 #'     gimap_normalize(
 #'       timepoints = "day",
 #'       replicates = "rep"
diff --git a/tests/testthat/test-setup_data.R b/tests/testthat/test-setup_data.R
index fcc9a8f..08ee8b5 100644
--- a/tests/testthat/test-setup_data.R
+++ b/tests/testthat/test-setup_data.R
@@ -1,14 +1,12 @@
 # Example data for testing
 example_counts <- matrix(1:20, nrow = 4, ncol = 5)
 example_pg_ids <- data.frame(id = 1:4)
-example_pg_metadata <- data.frame(info = c("A", "B", "C", "D"))
 example_sample_metadata <- data.frame(id = 1:5, replicate = factor(c(1, 1, 2, 2, 3)), timepoint = factor(c("T0", "T0", "T1", "T1", "T2")))
 
 # Test elements inside output list
 test_that("setup_data() works correctly", {
   result <- setup_data(counts = example_counts,
                        pg_ids = example_pg_ids,
-                       pg_metadata = example_pg_metadata,
                        sample_metadata = example_sample_metadata)
 
   expect_s3_class(result, "gimap_dataset")
diff --git a/vignettes/getting_started.Rmd b/vignettes/getting_started.Rmd
index 7cf0b38..cf4f2a0 100644
--- a/vignettes/getting_started.Rmd
+++ b/vignettes/getting_started.Rmd
@@ -85,20 +85,13 @@ The next datasets are metadata that describe the dimensions of the count data.
 - These both need to be data frames.
 - The sizes of these metadata must correspond to the dimensions of the counts data.
 
-`pg_id` are just the unique IDs listed in the same order/sorted the same way as the count data and can be used for mapping between the count data and the metadata.
+`pg_id` are the unique IDs, listed in the same order as the rows of the count data, and are used for mapping between the count data and the metadata. They are required because the IDs are needed to map each row to its pgRNA construct and its counts.
 
 ```{r}
 example_pg_id <- example_data %>%
   dplyr::select("id")
 ```
 
-One of these (`example_pg_metadata`) is required because it is necessary to know the IDs and be able to map them to pgRNA constructs.
-
-```{r}
-example_pg_metadata <- example_data %>%
-  select(c("id", "seq_1", "seq_2"))
-```
-
 Sample metadata is the information that describes the samples and is sorted the same order as the columns in the count data.
 
 You need to have two columns in the metadata you provide (they must be named exactly this).
@@ -115,14 +108,13 @@ example_sample_metadata <- data.frame(
 )
 ```
 
-We'll need to provide `example_counts`, `pg_ids` and `pg_metadata` to `setup_data()`. We can provide `sample_metadata`, but it is not required at the moment.
+We'll need to provide `counts`, `pg_ids` and `sample_metadata` to `setup_data()`.
 
 Now let's setup our data using `setup_data()`. Optionally we can provide the metadata in this function as well so that it is stored with the data.
 
 ```{r}
 gimap_dataset <- setup_data(counts = example_counts,
                             pg_ids = example_pg_id,
-                            pg_metadata = example_pg_metadata,
                             sample_metadata = example_sample_metadata)
 ```
 
@@ -206,9 +198,16 @@ The filtering step also stores two tables of information that you may want to us
 - `$filtered_data$all_reps_zerocount_ids` is a table that lists the IDs of pgRNA constructs which had a count of 0 for all final timepoint replicates. These pgRNA constructs are NOT necessarily filtered out
 
 
+Now that you've performed QC and filtering, the rest of the pipeline can be run:
+
+- First, annotate the data set (expression levels, copy number, etc.) with DepMap data using the `gimap_annotate()` function. For this step, you *must* specify which `cell_line` your data uses so that the corresponding DepMap data is used for annotation.
+- Then the data is normalized with the `gimap_normalize()` function. `timepoints` and `replicates` need to be specified, pointing to the correct column names in the `sample_metadata` passed to the `setup_data()` function earlier.
+- CRISPR scores are calculated with the `calc_crispr()` function.
+- Genetic interaction scores are computed with the `calc_gi()` function.
+
 ```{r}
 gimap_dataset <- gimap_dataset %>%
-  gimap_annotate() %>%
+  gimap_annotate(cell_line = "HELA") %>%
   gimap_normalize(
     timepoints = "day",
     replicates = "rep") %>%