Merge pull request #179 from pachterlab/devel

Version 0.30.0
pachterlab · Jun 6, 2018 · e84554f · e84554f
2 parents 8308dfd + bc0fddc
commit e84554f
Show file tree

Hide file tree

Showing 64 changed files with 2,144 additions and 578 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,35 @@
+# version 0.30.0
+
+This version integrates [p-value aggregation](https://github.com/pachterlab/sleuth/pull/148) as described in [Yi et al.](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-018-1419-z).
+The behavior of gene-level differential expression testing now follows this procedure:
+
+1. Isoform-level testing.
+2. P-value aggregation at the gene level (using `target_mapping`) by the Lancaster method.
+
+Thank you to [Lynn Yi](https://github.com/lynnyi) for implementing p-value aggregation.
+Please see [pull request #148](https://github.com/pachterlab/sleuth/pull/148) for details.
+
+The API has also slightly changed. Particularly, for `sleuth_prep`, several options have been moved to optional arguments via `...`. See [pull request #168](https://github.com/pachterlab/sleuth/pull/168) for more information or `?sleuth_prep` in R.
+
+A fair number of speed-ups and bug fixes have also been implemented.
+
+- [Patch: bugs in sleuth_results & other miscellaneous fixes](https://github.com/pachterlab/sleuth/pull/163)
+- [Fix behavior of sleuth_results when gene_mode is TRUE (and error reporting)](https://github.com/pachterlab/sleuth/pull/160)
+- [Shiny and Plot Fixes / Enhancements](https://github.com/pachterlab/sleuth/pull/159)
+- [Quick Patch: UseMethod typo](https://github.com/pachterlab/sleuth/pull/157)
+- [Update `write_kallisto_hdf5` function and add ability to subset kallisto object (address #131)](https://github.com/pachterlab/sleuth/pull/150)
+- [extend sleuth to model TPMs](https://github.com/pachterlab/sleuth/pull/145)
+- [Fixes to various miscellaneous issues (#73, #84, #97, #122, #135, #142)](https://github.com/pachterlab/sleuth/pull/144)
+- [Improvements to shiny and plot functions (solving several open issues)](https://github.com/pachterlab/sleuth/pull/143)
+- [Possible solution to NAs in sleuth_lrt, addressing #68](https://github.com/pachterlab/sleuth/pull/118)
+- [bug fix patches](https://github.com/pachterlab/sleuth/pull/117)
+- [address #113 - patch bug where TPM bootstrap summary target_ids are moved](https://github.com/pachterlab/sleuth/pull/116)
+- [New tests for ".N" target mappings](https://github.com/pachterlab/sleuth/pull/115)
+- [Misc bug fixes + Allow sleuth_prep to process just one sample](https://github.com/pachterlab/sleuth/pull/114)
+
+A major thanks to [Warren McGee](https://github.com/warrenmcg) for doing the majority of the heavy lifting on all of the bug fixes.
+
+
 # version 0.29.0
 
 This version has numerous bug fixes and several performance upgrades.

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,31 +1,36 @@
 Package: sleuth
 Title: Tools for investigating RNA-Seq
-Version: 0.29.0
-Authors@R: c(person("Harold", "Pimentel", , "haroldpimentel@gmail.com", role = c("aut", "cre")))
+Version: 0.30.0
+Authors@R: c(
+  person("Harold", "Pimentel", , "haroldpimentel@gmail.com", role = c("aut", "cre")),
+  person("Warren", "McGee", , "warren-mcgee@fsm.northwestern.edu", role = "aut"))
 Description: Investigate transcript abundance from "kallisto" and differential
     expression analysis from RNA-Seq data.
 License: GPL-3
+Encoding: UTF-8
 LazyData: true
 URL: https://github.com/pachterlab/sleuth
 BugReports: https://github.com/pachterlab/sleuth/issues
 Depends:
     R (>= 3.2.1),
-    methods,
-    ggplot2,
-    dplyr
+    methods
 Imports:
+    ggplot2,
+    dplyr,
     data.table,
     tidyr,
     reshape2,
     rhdf5,
     parallel,
     lazyeval,
     matrixStats,
-    shiny
+    pheatmap,
+    shiny,
+    aggregation
 Suggests:
     MASS,
     lintr,
     testthat,
     knitr
 VignetteBuilder: knitr
-RoxygenNote: 5.0.1
+RoxygenNote: 6.0.1
diff --git a/NAMESPACE b/NAMESPACE
@@ -6,27 +6,35 @@ S3method(bias_table,sleuth)
 S3method(get_bootstraps,kallisto)
 S3method(get_bootstraps,sleuth)
 S3method(head,kallisto)
+S3method(is_kallisto_subset,kallisto)
+S3method(is_kallisto_subset,sleuth)
 S3method(models,sleuth)
 S3method(models,sleuth_model)
 S3method(plot_fld,kallisto)
 S3method(plot_fld,sleuth)
 S3method(print,kallisto)
 S3method(print,sleuth)
 S3method(print,sleuth_model)
+S3method(subset_kallisto,kallisto)
+S3method(subset_kallisto,sleuth)
 S3method(summary,sleuth)
 S3method(tests,sleuth)
-export("transform_fun<-")
+S3method(transform_status,sleuth)
+S3method(transform_status,sleuth_model)
+export("transform_fun_counts<-")
+export("transform_fun_tpm<-")
 export(basic_filter)
 export(bias_table)
-export(bs_sigma_summary)
 export(counts_to_fpkm)
 export(counts_to_tpm)
 export(design_matrix)
 export(enclosed_brush)
+export(excluded_ids)
 export(extract_model)
 export(get_bootstrap_summary)
 export(get_bootstraps)
 export(get_quantile)
+export(is_kallisto_subset)
 export(kallisto_table)
 export(log_transform)
 export(melt_bootstrap_sleuth)
@@ -64,13 +72,17 @@ export(sleuth_save)
 export(sleuth_to_matrix)
 export(sleuth_wt)
 export(sliding_window_grouping)
+export(subset_kallisto)
 export(tests)
 export(tpm_to_alpha)
 export(transcripts_from_gene)
 export(transform_status)
-export(transform_status.sleuth)
-export(transform_status.sleuth_model)
+export(write_kallisto_hdf5)
 import(dplyr)
+import(ggplot2)
 importFrom(data.table,fread)
+importFrom(dplyr,"%>%")
 importFrom(lazyeval,interp)
 importFrom(lazyeval,lazy)
+importFrom(rhdf5,h5write)
+importFrom(rhdf5,h5write.default)
diff --git a/R/bootstrap.R b/R/bootstrap.R
@@ -101,10 +101,12 @@ get_bootstraps.kallisto <- function(kal, transcript, max_bs = 30) {
 # @param kal a kallisto object
 # @param column the column to pull out of the kallisto results (default = "tpm")
 # @return a molten data.frame with columns "target_id", "sample" and the selected variable
+# @importFrom dplyr %>%
 # @export
 melt_bootstrap <- function(kal, column = "tpm", transform = identity) {
     stopifnot(is(kal, "kallisto"))
   stopifnot(length(kal$bootstrap) > 0)
+    `%>%` <- dplyr::`%>%`
 
     all_boot <- kal$bootstrap
     boot <- data.frame(lapply(all_boot, select_, .dots = list(column)))
@@ -129,11 +131,13 @@ melt_bootstrap <- function(kal, column = "tpm", transform = identity) {
 # @param aggregate_fun a function to aggregate
 # @return a data.frame with nrow(mapping) rows that has been aggregated
 # groupwise using \code{aggregate_fun}
+# @importFrom dplyr %>%
 # @export
 aggregate_bootstrap <- function(kal, mapping, split_by = "gene_id",
   column = "tpm", aggregate_fun = sum) {
 
   stopifnot( is(kal, "kallisto") )
+  `%>%` <- dplyr::`%>%`
 
   if ( !(column %in% c("tpm", "est_counts")) ) {
     stop("Unit must be 'tpm' or 'est_counts'")
@@ -177,9 +181,12 @@ aggregate_bootstrap <- function(kal, mapping, split_by = "gene_id",
 # @param kal a kallisto object with a non-null bootstrap list
 # @param column the column to select (rho, tpm, est_counts)
 # @return a summarized data.frame
+# @importFrom dplyr %>%
 # @export
 summarize_bootstrap <- function(kal, column = "tpm", transform = identity) {
     stopifnot(is(kal, "kallisto"))
+    `%>%` <- dplyr::`%>%`
+
     bs <- melt_bootstrap(kal, column, transform)
 
     mean_col <- paste0("bs_mean_", column)
@@ -256,11 +263,12 @@ get_bootstrap_summary <- function(obj, target_id, units = 'est_counts') {
     stop(paste0("'", units, "' is invalid for 'units'. please see documentation"))
   }
 
-  if (is.null(obj$bs_quants)) {
-    if (units == 'est_counts') {
-      stop("bootstrap summary missing. rerun sleuth_prep() with argument 'extra_bootstrap_summary = TRUE'")
+  if (is.null(obj$bs_quants) | is.null(obj$bs_quants[[1]][[units]])) {
+    if (units %in% c('est_counts', 'scaled_reads_per_base')) {
+      stop("bootstrap summary appears to be missing. rerun sleuth_prep() with argument 'extra_bootstrap_summary = TRUE'")
     } else {
-      stop("bootstrap summary missing. rerun sleuth_prep() with argument 'extra_bootstrap_summary = TRUE' and 'read_bootstrap_tpm = TRUE'")
+      stop("bootstrap summary appears to be missing. rerun sleuth_prep() with argument 'extra_bootstrap_summary = TRUE' ",
+           "and 'read_bootstrap_tpm = TRUE'")
     }
   }
 
@@ -312,7 +320,7 @@ sample_bootstrap <- function(obj, n_samples = 100L) {
       mat <- matrix(NA_real_, nrow = nrow(obj$kal[[1]]$abundance),
         ncol = nrow(which_samp))
       rownames(mat) <- obj$kal[[1]]$abundance$target_id
-      colnames(mat) <- obj$sample_to_condition$sample
+      colnames(mat) <- obj$sample_to_covariates$sample
       mat
     })
 
@@ -376,13 +384,15 @@ process_bootstrap <- function(i, samp_name, kal_path,
                               read_bootstrap_tpm, gene_mode,
                               extra_bootstrap_summary,
                               target_id, mappings, which_ids,
-                              aggregation_column, transform_fun)
+                              aggregation_column, transform_fun_counts,
+                              transform_fun_tpm, max_bootstrap)
 {
   dot(i)
   bs_quants <- list()
 
   num_bootstrap <- as.integer(rhdf5::h5read(kal_path$path,
                                             "aux/num_bootstrap"))
+  num_bootstrap <- min(num_bootstrap, max_bootstrap)
   if (num_bootstrap == 0) {
     stop(paste0("File ", kal_path, " has no bootstraps.",
                 "Please generate bootstraps using \"kallisto quant -b\"."))
@@ -396,17 +406,16 @@ process_bootstrap <- function(i, samp_name, kal_path,
                                est_count_sf = est_count_sf)
 
   if (read_bootstrap_tpm) {
-    bs_quant_tpm <- aperm(apply(bs_mat, 1, counts_to_tpm,
+    bs_tpm <- aperm(apply(bs_mat, 1, counts_to_tpm,
                                 eff_len))
-    colnames(bs_quant_tpm) <- colnames(bs_mat)
+    colnames(bs_tpm) <- colnames(bs_mat)
 
     # gene level code is analogous here to below code
     if (gene_mode) {
-      colnames(bs_quant_tpm) <- target_id
       # Make bootstrap_num an explicit column; each is treated as a "sample"
       bs_tpm_df <- data.frame(bootstrap_num = c(1:num_bootstrap),
-                              bs_quant_tpm, check.names = F)
-      rm(bs_quant_tpm)
+                              bs_tpm, check.names = F)
+      rm(bs_tpm)
       # Make long tidy table; this step is much faster
       # using data.table melt rather than tidyr gather
       tidy_tpm <- data.table::melt(bs_tpm_df, id.vars = "bootstrap_num",
@@ -423,13 +432,14 @@ process_bootstrap <- function(i, samp_name, kal_path,
       # see: http://stackoverflow.com/a/31295592
       quant_tpm_formula <- paste("bootstrap_num ~",
                                  aggregation_column)
-      bs_quant_tpm <- data.table::dcast(tidy_tpm,
+      bs_tpm <- data.table::dcast(tidy_tpm,
                                         quant_tpm_formula, value.var = "tpm",
                                         fun.aggregate = sum)
-      bs_quant_tpm <- as.matrix(bs_quant_tpm[, -1])
+      bs_tpm <- as.matrix(bs_tpm[, -1])
       rm(tidy_tpm) # these tables are very large
     }
-    bs_quant_tpm <- aperm(apply(bs_quant_tpm, 2,
+    bs_tpm <- transform_fun_tpm(bs_tpm[, which_ids])
+    bs_quant_tpm <- aperm(apply(bs_tpm, 2,
                                 quantile))
     colnames(bs_quant_tpm) <- c("min", "lower", "mid",
                                 "upper", "max")
@@ -483,6 +493,7 @@ process_bootstrap <- function(i, samp_name, kal_path,
     rm(tidy_bs, scaled_bs)
   }
 
+  bs_mat <- transform_fun_counts(bs_mat[, which_ids])
   if (extra_bootstrap_summary) {
     bs_quant_est_counts <- aperm(apply(bs_mat, 2,
                                        quantile))
@@ -491,20 +502,24 @@ process_bootstrap <- function(i, samp_name, kal_path,
     bs_quants$est_counts <- bs_quant_est_counts
   }
 
-  bs_mat <- transform_fun(bs_mat)
   # If bs_mat was made at gene-level, already has column names
   # If at transcript-level, need to add target_ids
-  if(!gene_mode) {
-    colnames(bs_mat) <- target_id
-  } else {
+  if(gene_mode & extra_bootstrap_summary) {
     # rename est_counts to scaled_reads_per_base
     bs_quants$scaled_reads_per_base <- bs_quants$est_counts
     bs_quants$est_counts <- NULL
   }
   # all_sample_bootstrap[, i] bootstrap point estimate of the inferential
   # variability in sample i
   # NOTE: we are only keeping the ones that pass the filter
-  bootstrap_result <- matrixStats::colVars(bs_mat[, which_ids])
+  bootstrap_result <- matrixStats::colVars(bs_mat)
 
-  list(index = i, bs_quants = bs_quants, bootstrap_result = bootstrap_result)
+  if(read_bootstrap_tpm) {
+    tpm_result <- matrixStats::colVars(bs_tpm)
+    list(index = i, bs_quants = bs_quants, bootstrap_result = bootstrap_result,
+         bootstrap_tpm_result = tpm_result)
+  } else {
+    list(index = i, bs_quants = bs_quants, bootstrap_result = bootstrap_result,
+         bootstrap_tpm_result = NULL)
+  }
 }