FredHutch · cansavvy · Aug 23, 2024 · Jul 31, 2024 · Aug 14, 2024 · Aug 14, 2024
diff --git a/R/00-setup_data.R b/R/00-setup_data.R
@@ -2,7 +2,6 @@
 #' @description This function allows people to have their data ready to be processed by gimap
 #' @param counts a matrix of data that contains the counts where rows are each paired_guide target and columns are each sample
 #' @param pg_ids the pgRNA IDs: metadata associated with the pgRNA constructs that correspond to the rows of the counts data
-#' @param pg_metadata construct metadata
 #' @param sample_metadata metadata associated with the samples of the dataset that correspond to the columns of the counts data.
 #' Should include a column that has replicate information as well as a column that contains timepoint information respectively (this will be used for log fold calculations). These columns should be factors.
 #' @return A special gimap_dataset to be used with the other functions in this package.
@@ -20,7 +19,6 @@
 #' }
 setup_data <- function(counts = NULL,
                        pg_ids = NULL,
-                       pg_metadata = NULL,
                        sample_metadata = NULL) {
   new_data <- list(
     raw_counts = NULL,
@@ -52,8 +50,9 @@ setup_data <- function(counts = NULL,
   if (is.null(counts)) stop("counts cannot be NULL")
   if (!is.matrix(counts)) stop("counts can only be in the form of a matrix")
 
+
   # If they don't give sample metadata, then we will make up a row id
-  if (is.null(sample_metadata)) sample_metadata <- data.frame(id = 1:ncol(counts))
+  if (is.null(sample_metadata)) stop("sample metadata will be required in later steps. Please provide it")
   if (!is.data.frame(sample_metadata)) stop("metadata can only be in the form of a data.frame")
   if (nrow(sample_metadata) != ncol(counts)) stop("the number of rows in the sample metadata is not equal to the number of columns in the counts")
 

diff --git a/R/03-annotate.R b/R/03-annotate.R
@@ -2,7 +2,7 @@
 #' @description In this function, a `gimap_dataset` is annotated as far as which genes should be used as controls.
 #' @param .data Data can be piped in with tidyverse pipes from function to function. But the data must still be a gimap_dataset
 #' @param gimap_dataset A special dataset structure that is setup using the `setup_data()` function.
-#' @param cell_line which cell line are you using? Default is "HELA"
+#' @param cell_line Which cell line are you using (e.g., "HELA", "PC9")? Required argument. The value is converted to upper case before it is matched against the DepMap cell line names.
 #' @param cn_annotate TRUE or FALSE you'd also like to have Copy number annotation from DepMap. These data are optional
 #' @param annotation_file If no file is given, will attempt to use the design file from https://media.addgene.org/cms/filer_public/a9/9a/a99a9328-324b-42ff-8ccc-30c544b899e4/pgrna_library.xlsx
 #' @param control_genes A vector of gene symbols (e.g. AAMP) that should be labeled as control genes. These will be used for log fold change calculations. If no list is given then DepMap Public 23Q4 Achilles_common_essentials.csv is used https://depmap.org/portal/download/all/
@@ -17,14 +17,14 @@
 #'
 #' gimap_dataset <- gimap_dataset %>%
 #'   gimap_filter() %>%
-#'   gimap_annotate()
+#'   gimap_annotate(cell_line = "HELA")
 #'
 #' # To see anotations
 #' gimap_dataset$annotation
 #' }
 gimap_annotate <- function(.data = NULL,
                            gimap_dataset,
-                           cell_line = "HELA",
+                           cell_line,
                            control_genes = NULL,
                            cn_annotate = TRUE,
                            annotation_file = NULL) {
@@ -59,7 +59,7 @@ gimap_annotate <- function(.data = NULL,
   depmap_metadata <- readr::read_csv("https://figshare.com/ndownloader/files/35020903", show_col_types = FALSE)
 
   my_depmap_id <- depmap_metadata %>%
-    dplyr::filter(stripped_cell_line_name == cell_line) %>%
+    dplyr::filter(stripped_cell_line_name == toupper(cell_line)) %>%
     dplyr::pull(DepMap_ID)
 
   tpm_file <- file.path(system.file("extdata", package = "gimap"), "CCLE_expression.csv")

diff --git a/R/04-normalize.R b/R/04-normalize.R
@@ -19,7 +19,7 @@
 #'
 #' gimap_dataset <- gimap_dataset %>%
 #'   gimap_filter() %>%
-#'   gimap_annotate() %>%
+#'   gimap_annotate(cell_line = "HELA") %>%
 #'   gimap_normalize(
 #'     timepoints = "day",
 #'     replicates = "rep"

diff --git a/R/05-crispr-calc.R b/R/05-crispr-calc.R
@@ -13,7 +13,7 @@
 #'
 #' gimap_dataset <- gimap_dataset %>%
 #'   gimap_filter() %>%
-#'   gimap_annotate() %>%
+#'   gimap_annotate(cell_line = "HELA") %>%
 #'   gimap_normalize(
 #'     timepoints = "day",
 #'     replicates = "rep"

diff --git a/R/06-calculate_gi.R b/R/06-calculate_gi.R
@@ -11,7 +11,7 @@
 #'
 #'   gimap_dataset <- gimap_dataset %>%
 #'     gimap_filter() %>%
-#'     gimap_annotate() %>%
+#'     gimap_annotate(cell_line = "HELA") %>%
 #'     gimap_normalize(
 #'       timepoints = "day",
 #'       replicates = "rep"

diff --git a/tests/testthat/test-setup_data.R b/tests/testthat/test-setup_data.R
@@ -1,14 +1,12 @@
 # Example data for testing
 example_counts <- matrix(1:20, nrow = 4, ncol = 5)
 example_pg_ids <- data.frame(id = 1:4)
-example_pg_metadata <- data.frame(info = c("A", "B", "C", "D"))
 example_sample_metadata <- data.frame(id = 1:5, replicate = factor(c(1, 1, 2, 2, 3)), timepoint = factor(c("T0", "T0", "T1", "T1", "T2")))
 
 # Test elements inside output list
 test_that("setup_data() works correctly", {
   result <- setup_data(counts = example_counts,
                        pg_ids = example_pg_ids,
-                       pg_metadata = example_pg_metadata,
                        sample_metadata = example_sample_metadata)
 
   expect_s3_class(result, "gimap_dataset")

diff --git a/vignettes/getting_started.Rmd b/vignettes/getting_started.Rmd
@@ -85,20 +85,13 @@ The next datasets are metadata that describe the dimensions of the count data.
 - These both need to be data frames.
 - The sizes of these metadata must correspond to the dimensions of the counts data.
 
-`pg_id` are just the unique IDs listed in the same order/sorted the same way as the count data and can be used for mapping between the count data and the metadata.
+`pg_id` are just the unique IDs, listed in the same order/sorted the same way as the count data, and can be used for mapping between the count data and the metadata. These IDs are required and very important because they are what allows the pgRNA constructs to be mapped to the counts data.
 
 ```{r}
 example_pg_id <- example_data %>%
   dplyr::select("id")
 ```
 
-One of these (`example_pg_metadata`) is required because it is necessary to know the IDs and be able to map them to pgRNA constructs.
-
-```{r}
-example_pg_metadata <- example_data %>%
-  select(c("id", "seq_1", "seq_2"))
-```
-
 Sample metadata is the information that describes the samples and is sorted the same order as the columns in the count data.
 
 You need to have two columns in the metadata you provide (they must be named exactly this).
@@ -115,14 +108,13 @@ example_sample_metadata <- data.frame(
 )
 ```
 
-We'll need to provide `example_counts`, `pg_ids` and `pg_metadata` to `setup_data()`. We can provide `sample_metadata`, but it is not required at the moment.
+We'll need to provide `counts`, `pg_ids` and `sample_metadata` to `setup_data()`.
 
 Now let's setup our data using `setup_data()`. Optionally we can provide the metadata in this function as well so that it is stored with the data.
 
 ```{r}
 gimap_dataset <- setup_data(counts = example_counts,
                             pg_ids = example_pg_id,
-                            pg_metadata = example_pg_metadata,
                             sample_metadata = example_sample_metadata)
 ```
 
@@ -206,9 +198,16 @@ The filtering step also stores two tables of information that you may want to us
 - `$filtered_data$all_reps_zerocount_ids` is a table that lists the IDs of pgRNA constructs which had a count of 0 for all final timepoint replicates. These pgRNA constructs are NOT necessarily filtered out
 
 
+Now that you've performed QC and filtering, the rest of the pipeline can be run:
+
+- First, the data set is annotated (expression levels, copy number, etc.) with DepMap data using the `gimap_annotate()` function. For this step, you *must* specify which `cell_line` your data uses so that the correct corresponding DepMap data is used for annotation.
+- Then the data is normalized with the `gimap_normalize()` function. `timepoints` and `replicates` need to be specified, pointing to the correct column names from the `sample_metadata` passed to the `setup_data()` function earlier.
+- CRISPR scores are calculated with the `calc_crispr()` function.
+- Genetic interaction scores are computed with the `calc_gi()` function.
+
 ```{r}
 gimap_dataset <- gimap_dataset %>%
-  gimap_annotate() %>%
+  gimap_annotate(cell_line = "HELA") %>%
   gimap_normalize(
     timepoints = "day",
     replicates = "rep") %>%