move to github

abcwcm · Mar 3, 2022 · 1d8f25e · 1d8f25e
1 parent 369f267
commit 1d8f25e
Show file tree

Hide file tree

Showing 65 changed files with 6,578 additions and 2 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,26 @@
+Package: Klebanoff0606T1
+Type: Package
+Title: Single-Cell-Sequencing Data Generated by The Klebanoff Lab for Sample 0606T1
+Version: 0.1.3
+Authors@R: c(person("Friederike", "Dündar", email = "frd2007@med.cornell.edu", role = c("aut","cre")),
+    person("Paul","Zumbo", email="paz2005@med.cornell.edu", role = c("aut","ctb")))
+Description: SingleCellExperiment object and list of differentially expressed genes
+    as determined using 5'-DGE and V(D)J sequencing of tumor-antigen-specific T cells
+    and corresponding control cells; all obtained from donor 0606T1. The tumor-
+    specific antigen is a mutant form of PIK3CA.
+Depends:
+    R (>= 3.5.0)
+Imports:
+    data.table,
+    EnsDb.Hsapiens.v86,
+    ggplot2,
+    magrittr,
+    scater,
+    SingleCellExperiment
+Suggests:
+    stringr,
+    usethis
+License: MIT
+Encoding: UTF-8
+LazyData: true
+RoxygenNote: 7.1.1
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,20 @@
+# Generated by roxygen2: do not edit by hand
+
+export(check_columns)
+export(extract_markers)
+export(load_0606T1filt)
+export(load_0606T1merged)
+export(load_0606T1shared)
+export(load_DE_results)
+export(load_RDSdata_from_Box)
+export(load_data_from_Box)
+export(load_sce)
+export(make_long_dt)
+export(my_table)
+export(plot_tgram)
+export(prep_data4tgram)
+export(run_DE)
+import(data.table)
+import(kableExtra)
+import(magrittr)
+import(scater)
diff --git a/NEWS b/NEWS
@@ -0,0 +1,25 @@
+Klebanoff0606T1 v.0.1.3 (2022-02-09)
+====================================
+
+* removed dependencies on `ABCUtilities` and `scABC2`
+
+Klebanoff0606T1 v.0.1.2 (2021-07-14)
+====================================
+
+* updated use of `scABC2::make_long_dt()` in `tgrams.R` to reflect that function's change of parameter names (genes --> features)
+
+Klebanoff0606T1 v.0.1.1 (2020-04-15)
+====================================
+
+* added list of clonotypes of interest (`cdrs0606T1`)
+* re-worked the addition of the clonotype ID and clonotype frequency calculation
+because I had formerly neglected to take into account the doublet removal
+from sce.shared, i.e. the clonotype counts weren't reflecting the cell numbers
+after that filtering step
+* added additional entry to colData: `fit_for_test` that assesses whether a
+given clonotype has sufficient cells in *both* conditions to be used for a
+t-test
+* `add_frequencies()` now allows specifying what type of feature should be counted
+(sensibly, cdr3s_aa or id)
+* `abbreviations_of_clonotypes.Rmd` has been removed as it is now part of the first
+vignette (01_FilteringAndProcessing.Rmd)
diff --git a/R/data_DEresults.R b/R/data_DEresults.R
@@ -0,0 +1,43 @@
+#' Load the list of DE test results (any direction) for all clonotypes present
+#' in both conditions ('shared' clonotypes)
+#'
+#' @details The p- and q-values here represent a two-tailed test for any direction
+#' of the logFC.
+#' For details on how the DE analysis was done, see the vignette "DE_genes"
+#' and the wrapper function \code{\link{run_DE}}.
+#'
+#' The function reads the first column of the file \code{extdata/delist.both}
+#' of the installed package \code{Klebanoff<sample>} and passes it on to
+#' \code{\link{load_data_from_Box}} with \code{load_rds = FALSE}.
+#'
+#' @param sample Sample name that is appended to "Klebanoff" to form the name
+#' of the package whose \code{extdata} folder is searched. Default: "0606T1".
+#'
+#' @return The result of \code{load_data_from_Box()} for the stored entry; see
+#' the Format section for the expected structure.
+#'
+#' @format Nested list where names correspond to the abbreviated clonotype IDs.
+#' For every clonotype, there's a list that contains:
+#' \describe{
+#' \item{findMarkers_results:}{A SimpleDataFrameList, i.e. the original result
+#' of \code{scran::findMarkers()}, but only for the MUT comparison}
+#' \item{marker_IDs:}{a data.table with the genes that passed the FDR threshold; if that is NULL, this implies that there were no DEG for that particular clonotype comparing MUT vs WT}
+#' }
+#' @usage load_DE_results("0606T1")
+#' @examples \dontrun{
+#' library(Klebanoff0606T1)
+#'
+#' sce.shared <- load_0606T1shared()
+#' sce.shared$antigen <-  factor(gsub("\\..*","",sce.shared$Sample),
+#'     levels = c("WT", "MUT"), ordered = TRUE)
+#'
+#' delist.both <- lapply( unique(sce.shared$id), function(x){
+#' run_DE(
+#'    sce.shared[, sce.shared$id == x],
+#'    group_identifier = "antigen",
+#'    direction = "any",
+#'    FDR = 0.05, rank = Inf,
+#'    comp_name = paste0(x, "_"))
+#'    })
+#' names(delist.both) <- unique(sce.shared$id)
+#' }
+#'
+#' @export
+#'
+load_DE_results <- function(sample = "0606T1"){
+    ## mustWork = TRUE makes system.file() fail right away if the file that
+    ## holds the storage information is not part of the installed package
+    fn <- system.file("extdata", "delist.both",
+        package = paste0("Klebanoff", sample), mustWork = TRUE)
+
+    ## argument name spelled out in full instead of relying on partial
+    ## matching ("stringsAsFactor"); only the first column is used
+    fin <- read.table(fn, stringsAsFactors = FALSE)[[1]]
+    load_data_from_Box(fin, load_rds = FALSE)
+}
diff --git a/R/data_clonotypelist.R b/R/data_clonotypelist.R
@@ -0,0 +1,7 @@
+#' List of the winning and unspecifically reactive clonotypes for 0606T1
+#'
+#' @format Each level of the list contains the CDR3s_aa entry for the clonotype
+#' that showed the highest reactivity towards the MUT antigen ("winner"), for
+#' a non-reactive clonotype, and for a clonotype with stronger IFNG
+#' expression in the WT compared to the MUT situation.
+"cdrs0606T1"
diff --git a/R/data_sce.R b/R/data_sce.R
@@ -0,0 +1,177 @@
+#' Load the filtered and processed SCE for sample 0606T1
+#'
+#' @description Load the SingleCellExperiment object that holds the batch-corrected
+#' reduced dimensionality results (MUT/WT = batch). This is a wrapper around
+#' \code{\link{load_sce}} that requests all assays of the sample "0606T1".
+#'
+#' @usage sce.filt <- load_0606T1filt()
+#'
+#' @seealso \code{\link{load_sce}}, \code{\link{load_0606T1shared}},
+#' \code{\link{load_0606T1merged}}
+#' @return an SCE object that needs to be assigned to an object in the environment
+#'
+#' @export
+#'
+load_0606T1filt <- function(){
+    ## hand the object back directly: ending the function on an assignment
+    ## would return it invisibly, unlike load_0606T1merged() and
+    ## load_0606T1shared()
+    load_sce(which_assays = "all", sample = "0606T1")
+}
+
+#' Load the SCE for the batch-corrected merged data set
+#'
+#' @description Convenience wrapper around \code{\link{load_sce}} that retrieves
+#' the SingleCellExperiment object holding the batch-corrected reduced
+#' dimensionality results (MUT/WT = batch). Only the "reconstructed" assay of
+#' the sample "0606T1Merged" is requested.
+#'
+#' @usage sce.merged <- load_0606T1merged()
+#' @return an SCE object that needs to be assigned to an object in the environment
+#'
+#' @export
+#'
+load_0606T1merged <- function(){
+    ## the value of load_sce() is the (visible) return value of this wrapper
+    load_sce(sample = "0606T1Merged", which_assays = "reconstructed")
+}
+
+#' Load the SCE for the cells with clonotypes shared between both conditions
+#'
+#' @description Convenience wrapper around \code{\link{load_sce}} that retrieves
+#' the SingleCellExperiment object representing cells with clonotypes that are
+#' present in both conditions. The UMAP coordinates were re-calculated on the
+#' reduced subset after removal of suspected doublets (see the vignette about
+#' the filtering). Only the "logcounts" assay of the sample "0606T1Shared" is
+#' requested.
+#'
+#' @usage sce.shared <- load_0606T1shared()
+#'
+#' @return an SCE object that needs to be assigned to an object in the environment
+#'
+#' @export
+load_0606T1shared <- function(){
+    ## the value of load_sce() is the (visible) return value of this wrapper
+    load_sce(sample = "0606T1Shared", which_assays = "logcounts")
+}
+
+
+#' Filtered cells
+#'
+#' @format Named list of cell numbers following the filtering steps
+#' described in this vignette.
+"cell_filt"
+
+#' Filtered genes
+#'
+#' @format Named list of gene numbers following the filtering steps
+#' described in this vignette.
+"gene_filt"
+
+
+#' Load the filtered and processed SingleCellExperiment object
+#'
+#' @description Use this function to load the processed and filtered gene expression
+#' data plus the clonotype information stored within one SingleCellExperiment
+#' object.
+#'
+#'
+#' @param which_assays can be "all" (default) or individual assays, e.g. c("logcounts",
+#' "counts") etc. If space and memory are problematic, definitely limit the selection here!
+#' assays that are available are: "counts", "logcounts"
+#' @param sample Name of the sample. It determines which file
+#' (\code{sce_storage_<sample>.txt} in the \code{extdata} folder of the package
+#' 'Klebanoff0606T1') is read, it must match the third column of that file, and
+#' it is appended to the \code{data_name} under which every component is
+#' requested.
+#' @param ... Additional parameters passed on to \code{load_RDSdata_from_Box}, e.g.
+#' \code{check_for_updates = TRUE}
+#'
+#' @details
+#' For the entire code of the filtering and processing, see the vignette
+#' \code{01_processing.Rmd}.
+#'
+#' The function stops if the storage file for \code{sample} cannot be found or
+#' if its third column does not hold \code{sample} in every row. A warning is
+#' issued if none of the requested assays is listed in the storage file.
+#'
+#' The resulting SCE object contains the usual content: colData with information
+#' about individual cells, rowData with info about individual genes, reducedDims,
+#' etc.
+#'
+#' The \code{colData} includes:
+#'
+#' \describe{
+#' \item{Barcode:}{Each cell's barcode used for keeping track of its identity during sequencing.}
+#' \item{Sample:}{'WT' or 'MUT'}
+#' \item{raw_clonotype_id:}{e.g. 'clonotype94'}
+#' \item{cdr3s_aa:}{The amino acid sequence of the CDR3 portion, e.g. "TRA:CIARGGGGADGLTF;TRA:CGADRNGNEKLTF;TRB:CASSLTTDREPYEQYF"}
+#' \item{multiTRA:}{TRUE/FALSE entries based on whether \code{cdr3s_aa} contained more than one entry for TRA}
+#' \item{multiTRB:}{TRUE/FALSE entries based on whether \code{cdr3s_aa} contained more than one entry for TRB}
+#' \item{numTRA:}{Number of TRA sequences within \code{cdr3s_aa}}
+#' \item{numTRB:}{Number of TRB sequences within \code{cdr3s_aa}}
+#' \item{cluster:}{clustering results of all cells}
+#' }
+#'
+#' The object also contains the coordinates from dimensionality reductions (see
+#' examples for more details).
+#'
+#'
+#' @usage sce.filt <- load_sce(which_assays = "logcounts", sample = "0606T1")
+#'
+#' @return A SingleCellExperiment object with cells from 'mutant' samples
+#'  (stimulation with tumor antigen) and from the 'wt' sample (stimulation
+#'  with an irrelevant antigen).
+#'
+#'
+#' @examples \dontrun{
+#'
+#' library(SingleCellExperiment)
+#' sce.0606T1 <- load_sce(which_assays = "all", sample = "0606T1")
+#'
+#' reducedDimNames(sce.0606T1)
+#' ## e.g.: "corrected" "TSNE"      "UMAP"
+#'
+#' assayNames(sce.0606T1)
+#' ## e.g.: "counts"                "logcounts"
+#' }
+#'
+#'
+#' @export
+#'
+load_sce <- function(which_assays = "all", sample = "Sample", ...){
+
+    ## the Box links are noted in the text file
+    fl <- system.file("extdata", paste0("sce_storage_", sample, ".txt"),
+        package = "Klebanoff0606T1")
+    ## paste0() so that the file name in the message is not split up by blanks
+    if(fl == ""){stop(paste0("sce_storage_", sample, ".txt does not exist in package 'Klebanoff0606T1'."))}
+
+    inf <- read.table(fl,stringsAsFactors = FALSE)
+
+    ## The condition must be a single TRUE/FALSE even if the third column is
+    ## missing (NULL), contains NA or holds more than one distinct value;
+    ## otherwise `if` itself fails before the message below can be shown.
+    if(ncol(inf) < 3 || !isTRUE(all(inf[["V3"]] == sample))){stop("The sce_storage.txt file must contain a third column holding the sample name. Which should be the same as the one specified via sample = .")}
+
+    ## DOWNLOAD AND CACHE THE FILES FROM THE BOX ==============================
+    ## note: using the default cache of BioC here, we may want to change that
+    ## to something more specific via the `cache_path` option of `load_RDSdata_fromBox()`
+
+    ## load colData
+    cold <- load_RDSdata_from_Box(
+        shared_link = inf[inf$V1 == "colData",]$V2, data_name = paste0("KlebColData",sample), ...)
+
+    ## load rowData
+    rowd <- load_RDSdata_from_Box(
+        shared_link = inf[inf$V1 == "rowData",]$V2, data_name = paste0("KlebRowData", sample) , ...)
+
+    ## get reducedDims
+    rdms <- load_RDSdata_from_Box(
+        shared_link = inf[inf$V1 == "reducedDims",]$V2, data_name = paste0("KlebRedDims", sample), ... )
+
+    ## metadata
+    metd <- load_RDSdata_from_Box(
+        shared_link = inf[inf$V1 == "metadata",]$V2, data_name = paste0("KlebMetadata", sample), ... )
+
+    ## get assayData
+    ## which_assays may hold several names, hence the membership test rather
+    ## than `==`, which would hand a vector to `if`
+    if("all" %in% which_assays){
+        ## extract corresponding assay entry from the text file
+        asss <- grep("^assay:", unique(inf$V1), value = TRUE)
+    }else{
+        asss <- unlist(lapply(which_assays, function(x) grep(paste0(":",x,"$"), unique(inf$V1), ignore.case = TRUE, value=TRUE)))
+        ## check the matches that were found, not the request itself
+        if(length(asss) == 0){
+            warning("None of the assays you specified are part of the file stored in inst/extdata, i.e. we can't find the links.")
+        }
+    }
+
+    ## one list entry per assay, named by the part following "assay:"
+    assl <- list()
+    for(i in asss){
+        j <- gsub("^assay:","", i)
+        assl[[j]] <-  load_RDSdata_from_Box(
+            shared_link = inf[inf$V1 == i,]$V2, data_name = paste0("Kleb",j, sample), ...)
+    }
+
+    ## construct the SCE object =============================================
+    return(SingleCellExperiment::SingleCellExperiment(assays = assl,
+        colData = cold, rowData = rowd,
+        metadata = metd, reducedDims = rdms))
+}
+
diff --git a/R/data_sharedClonotypes.R b/R/data_sharedClonotypes.R
@@ -0,0 +1,46 @@
+#' Shared clonotypes
+#'
+#' @description \code{data.table} of the clonotypes that are found in both
+#' conditions MUT and WT in the 0606T1 data set.
+#'
+#' @usage data(shared_clonotypes)
+#'
+#' @examples \dontrun{
+#'
+#' ## count TRA/TRB
+#' clono_freq <- colData(sce.filt)[, c("Sample" ,"cdr3s_aa")] %>%
+#'    as.data.frame %>% data.table(., keep.rownames = TRUE) %>%
+#'     .[!is.na(cdr3s_aa), .N, c("cdr3s_aa","Sample")]
+#' setorder(clono_freq, N)
+#'
+#' ## formatting the TRA/TRB notations
+#' ## will only work if there's just one TRA
+#' ct <- dcast(clono_freq, cdr3s_aa ~ Sample, value.var = "N") %>%
+#'  .[!is.na(MUT.0606T1) & !is.na(WT.0606T1)]
+#'
+#' ct[, TRA := gsub(";*TRB:[A-Z]+", "", cdr3s_aa)]
+#' ct[, TRA := ifelse(TRA == "", NA, TRA)]
+#' ct[, TRB := gsub(".*(TRB:[A-Z]+)", "\\1", cdr3s_aa)]
+#' ct[, TRB := ifelse(grepl("^TRA", TRB), NA, TRB)] # if only TRB was present,
+#' # I need to fill in the NA
+#'  setorder(ct, -MUT.0606T1, -WT.0606T1 )
+#'  shared_clonotypes <- copy(ct)
+#' }
+#'
+#' @seealso \code{clonotype_ids}
+"shared_clonotypes"
+
+
+
+#' Table of customized clonotype IDs for sample 0606T1
+#'
+#' @description \code{data.table} with our customized clonotype IDs for ease of
+#' visualization and comparison. I.e., the CDR3s amino acid sequences are
+#' replaced with arbitrary IDs. Note that these clonotypes are those that are
+#' found in both conditions of patient 0606T1, i.e. MUT and WT.
+#'
+#' @details See the section "Adding the clonotype IDs" in the vignette "Filtering and Processing" 
+#' about how the consolidation and clean up of the TRA/TRB sequences was done.
+#'
+#' @seealso \code{shared_clonotypes}
+"clonotype_ids"