Skip to content

Commit

Permalink
Merge pull request #25 from FredHutch/cansavvy/annotation-schema
Browse files Browse the repository at this point in the history
Annotation schema
  • Loading branch information
cansavvy authored Jun 21, 2024
2 parents e2ccc8a + efe8b20 commit 15affcd
Show file tree
Hide file tree
Showing 24 changed files with 1,158 additions and 2,267 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,5 @@ inst/doc
git_token.txt
/doc/
/Meta/
inst/extdata/CCLE_gene_cn.csv
inst/extdata/CCLE_expression.csv
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,6 @@ Suggests:
roxygen2,
Config/testthat/edition: 3
Encoding: UTF-8
RoxygenNote: 7.2.3
RoxygenNote: 7.3.1
LazyData: true
VignetteBuilder: knitr
210 changes: 181 additions & 29 deletions R/03-annotate.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@
#' @description In this function, a `gimap_dataset` is annotated as far as which genes should be used as controls.
#' @param .data Data can be piped in with %>% or |> from function to function. But the data must still be a gimap_dataset
#' @param gimap_dataset A special dataset structure that is setup using the `setup_data()` function.
#' @param gene_id_type Specify what kind of gene IDs are specified in the `pg_ids`. By default will assume gene symbol.
#' @param control_genes A list of genes that should be labeled as control genes. These will be used for log fold change calculations.
#' @param cell_line which cell line are you using? Default is "HELA"
#' @param cn_annotate TRUE or FALSE you'd also like to have Copy number annotation from DepMap. These data are optional
#' @param annotation_file If no file is given, will attempt to use the design file from https://media.addgene.org/cms/filer_public/a9/9a/a99a9328-324b-42ff-8ccc-30c544b899e4/pgrna_library.xlsx
#' @param control_genes A vector of gene symbols (e.g. AAMP) that should be labeled as control genes. These will be used for log fold change calculations. If no list is given then DepMap Public 23Q4 Achilles_common_essentials.csv is used https://depmap.org/portal/download/all/
#' @export
#' @examples \dontrun{
#'
Expand All @@ -17,46 +19,196 @@
#' gimap_annotate()
#'
#' # To see anotations
#' gimap_dataset$annotations
#' gimap_dataset$annotation
#' }
gimap_annotate <- function(.data = NULL,
gimap_dataset,
cell_line = "HELA",
control_genes = NULL,
cn_annotate = TRUE,
annotation_file = NULL) {

if (!is.null(.data)) gimap_dataset <- .data

if (!("gimap_dataset" %in% class(gimap_dataset))) stop("This function only works with gimap_dataset objects which can be made with the setup_data() function.")

# Get the annotation data based on the pg construct design
if (!is.null(annotation_file)) {
if (!file.exists(annotation_file)) stop("The annotation_file specified cannot be found. Please double check the file path")
annotation_df <- read_table(annotation_file)
} else {
annotation_df <- get_example_data("annotation")
}

# We'll take a look at the gimap_dataset$pg_ids and see what kinds of gene ids are there
# If we need to do gene conversion we'd do something like:

# biocLite('org.Hs.eg.db')
# mapIds(org.Hs.eg.db, <column of gene IDs>, 'ENTREZID', 'SYMBOL')
# https://github.com/FredHutch/GI_mapping/blob/main/workflow/scripts/02-get_pgRNA_annotations.Rmd

# This file is from https://depmap.org/portal/download/all/ and from DepMap Public 19Q3 All Files
# Essential gene labeling is from inst/extdata/Achilles_common_essentials.csv
# ctrl vs gene labels are from inst/extdata/pgPEN_annotations.txt

# This is the core of the code we'll need but we'll need to refactor
#d.annot <- d.annot %>%
#mutate(norm_ctrl_flag = case_when(
# target_type == "gene_gene" ~ "double_targeting",
# target_type == "gene_ctrl" & gene1_essential_flag == TRUE ~ "positive_control",
# target_type == "ctrl_gene" & gene2_essential_flag == TRUE ~ "positive_control",
# target_type == "gene_ctrl" & gene1_essential_flag != TRUE ~ "single_targeting",
# target_type == "ctrl_gene" & gene2_essential_flag != TRUE ~ "single_targeting",
# target_type == "ctrl_ctrl" ~ "negative_control")) %>%
#mutate(norm_ctrl_flag = factor(norm_ctrl_flag, levels = c("negative_control",
# "positive_control",
# "single_targeting",
# "double_targeting")))

gimap_dataset$annotation <- NULL #TODO: Final step is annotations that line up to the same order as the pg gene data should be stored here.
############################ CONTROL GENE ANNOTATION #########################
# If control genes aren't provided then we get some from DepMap
if (!is.null(control_genes)) {
if (!file.exists(control_genes)) stop("The annotation_file specified cannot be found. Please double check the file path")
control_genes <- read_table(control_genes)[, 1]
} else {
# This file is from https://depmap.org/portal/download/all/ and from DepMap Public 19Q3 All Files
# Essential gene labeling is from inst/extdata/Achilles_common_essentials.csv
control_genes <- readr::read_tsv("https://figshare.com/ndownloader/files/40448429", show_col_types = FALSE)
control_genes <- control_genes %>%
tidyr::separate(col = Gene, into = c("gene_symbol", "entrez_id"), remove = FALSE, extra = "drop") %>%
dplyr::pull(gene_symbol)
}

############################ Get TPM data ####################################
# This is not optional because its used to flag things
## get TPM and CN information (w/ option for user to upload their own info)
depmap_metadata <- readr::read_csv("https://figshare.com/ndownloader/files/35020903", show_col_types = FALSE)

my_depmap_id <- depmap_metadata %>%
dplyr::filter(stripped_cell_line_name == cell_line) %>%
dplyr::pull(DepMap_ID)

tpm_file <- file.path(system.file("extdata", package = "gimap"), "CCLE_expression.csv")

if (!file.exists(tpm_file)) tpm_setup()

depmap_tpm <- readr::read_csv(tpm_file,
show_col_types = FALSE,
col_select = c("genes", dplyr::all_of(my_depmap_id))
) %>%
dplyr::rename(log2_tpm = my_depmap_id) %>%
dplyr::mutate(expressed_flag = dplyr::case_when(
log2_tpm < 1 ~ FALSE,
log2_tpm >= 1 ~ TRUE,
is.na(log2_tpm) ~ NA
))

############################ COPY NUMBER ANNOTATION ##########################
if (cn_annotate) {

cn_file <- file.path(system.file("extdata", package = "gimap"), "CCLE_gene_cn.csv")
if (!file.exists(cn_file)) cn_setup()

# Read in the CN data
depmap_cn <- readr::read_csv(cn_file,
show_col_types = FALSE,
col_select = c("genes", my_depmap_id)
) %>%
dplyr::rename(log2_cn = my_depmap_id)

annotation_df <- annotation_df %>%
dplyr::left_join(depmap_cn, by = c("gene1_symbol" = "genes")) %>%
dplyr::left_join(depmap_cn, by = c("gene2_symbol" = "genes"), suffix = c("_gene1", "_gene2"))
}

############################ ANNOTATION COMBINING ############################
# This set up is more or less the same as the original
# https://github.com/FredHutch/GI_mapping/blob/41ac7d5ed7025252343e2c823fba22f8a363e25c/workflow/scripts/02-get_pgRNA_annotations.Rmd#L435
annotation_df <- annotation_df %>%
dplyr::mutate(
gene1_essential_flag = gene1_symbol %in% control_genes,
gene2_essential_flag = gene2_symbol %in% control_genes) %>%
dplyr::left_join(depmap_tpm, by = c("gene1_symbol" = "genes")) %>%
dplyr::rename(gene1_expressed_flag = expressed_flag) %>%
dplyr::left_join(depmap_tpm, by = c("gene2_symbol" = "genes"), suffix = c("_gene1", "_gene2")) %>%
dplyr::rename(gene2_expressed_flag = expressed_flag) %>%
dplyr::mutate(norm_ctrl_flag = dplyr::case_when(
target_type == "gene_gene" ~ "double_targeting",
target_type == "gene_ctrl" & gene1_essential_flag == TRUE ~ "positive_control",
target_type == "ctrl_gene" & gene2_essential_flag == TRUE ~ "positive_control",
target_type == "gene_ctrl" & gene1_essential_flag != TRUE ~ "single_targeting",
target_type == "ctrl_gene" & gene2_essential_flag != TRUE ~ "single_targeting",
target_type == "ctrl_ctrl" ~ "negative_control"
)) %>%
dplyr::mutate(norm_ctrl_flag = factor(norm_ctrl_flag, levels = c(
"negative_control",
"positive_control",
"single_targeting",
"double_targeting"
)))

################################ STORE IT ####################################
gimap_dataset$annotation <- annotation_df

return(gimap_dataset)
}


# This function sets up the tpm data from DepMap is called by the `gimap_annotate()` function
tpm_setup <- function() {
tpm_file <- file.path(
system.file("extdata", package = "gimap"),
"CCLE_expression.csv"
)

download.file("https://figshare.com/ndownloader/files/34989919",
destfile = tpm_file,
method = "wget"
)

data_df <- readr::read_csv(tpm_file,
show_col_types = FALSE,
name_repair = make.names
)

cell_line_ids <- data_df$X

genes <- stringr::word(colnames(data_df)[-1], sep = "\\.\\.", 1)

colnames(data_df) <- c("cell_line_ids", genes)

data_df <- as.data.frame(t(data_df[, -1]))
colnames(data_df) <- cell_line_ids
data_df$genes <- genes

data_df %>%
dplyr::select(genes, dplyr::everything()) %>%
readr::write_csv(tpm_file)

return(tpm_file)
}

# This function sets up the tpm data from DepMap is called by the `gimap_annotate()` function if the cn_annotate = TRUE
cn_setup <- function() {
cn_file <- file.path(
system.file("extdata", package = "gimap"),
"CCLE_gene_cn.csv"
)

download.file("https://figshare.com/ndownloader/files/34989937",
destfile = cn_file,
method = "wget"
)

data_df <- readr::read_csv(cn_file,
show_col_types = FALSE,
name_repair = make.names
)

cell_line_ids <- data_df$X

genes <- stringr::word(colnames(data_df)[-1], sep = "\\.\\.", 1)

colnames(data_df) <- c("cell_line_ids", genes)

data_df <- as.data.frame(t(data_df[, -1]))
colnames(data_df) <- cell_line_ids
data_df$genes <- genes

data_df %>%
dplyr::select(genes, dplyr::everything()) %>%
readr::write_csv(cn_file)

return(cn_file)
}

# This function sets up the control genes file from DepMap is called by the `gimap_annotate()`
crtl_genes <- function() {
crtl_genes_file <- file.path(
system.file("extdata", package = "gimap"),
"Achilles_common_essentials.csv"
)

download.file("https://figshare.com/ndownloader/files/34989871",
destfile = crtl_genes_file,
method = "wget"
)

return(crtl_genes_file)
}

11 changes: 10 additions & 1 deletion R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,16 @@ get_example_data <- function(which_data) {
full.names = TRUE
)
return(readr::read_rds(file))
} else if (which_data == "annotation") {
file <- list.files(
pattern = "pgPEN_annotations.txt",
recursive = TRUE,
system.file("extdata", package = "gimap"),
full.names = TRUE
)
return(readr::read_tsv(file, show_col_types = FALSE))
} else {
stop("Specification for `which_data` not understood; Need to use 'gimap', count', or 'meta'")
stop("Specification for `which_data` not understood; Need to use 'gimap', count', 'meta', or 'annotation' ")
}
}

Expand Down Expand Up @@ -116,6 +124,7 @@ example_data_folder <- function() {
dirname(file)
}

# This function sets up the example count data
save_example_data <- function() {
example_data <- get_example_data("count")

Expand Down
Loading

0 comments on commit 15affcd

Please sign in to comment.