Option in scDissim method to not use the step function.

VCCRI · Dec 6, 2016 · 63fb614 · 63fb614
1 parent d884b0c
commit 63fb614
Show file tree

Hide file tree

Showing 10 changed files with 269 additions and 8 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -22,3 +22,4 @@ LinkingTo: Rcpp, RcppParallel, RcppEigen
 RoxygenNote: 5.0.1
 NeedsCompilation: yes
 SystemRequirements: GNU make
+Suggests: testthat
diff --git a/R/CIDR.R b/R/CIDR.R
@@ -63,6 +63,8 @@ setClass("scData", representation(tags="matrix",
                                   priorTPM="numeric",
                                   dThreshold="vector",
                                   wThreshold="numeric",
+                                  pDropoutCoefA="numeric",
+                                  pDropoutCoefB="numeric",
                                   dropoutCandidates="matrix",
                                   PC="matrix",
                                   variation="vector",
@@ -188,21 +190,29 @@ setGeneric("wThreshold", function(object, cutoff=0.5, plotTornado=FALSE){
 #' @title Imputation Weighting Threshold
 #'
 #' @description
-#' determines the imputation weighting threshold.
+#' Determines the imputation weighting threshold.
 #'
 #' @rdname wThreshold
 #' @name wThreshold
 #'
 #' @param object an scData class object.
-#' @param cutoff parameter in the range (0,1), used in the calculation of imputation weighting threshold.
+#' @param cutoff parameter in the range (0,1), used in the calculation of imputation weighting threshold. Default is 0.5.
 #' @param plotTornado Boolean; if \code{TRUE}, the \emph{Tornado Plot} is produced.
 #'
+#' @details
+#' This method finds a function P(u) that maps the average expression level of a gene to the
+#' probability of a dropout occurring. It does this by fitting a negative logistic function
+#' to the empirical dropouts vs average expression data. The imputation weighting threshold
+#' is calculated as the value of u at which P(u) = \code{cutoff}.
+#'
 #' @importFrom minpack.lm nlsLM
 #' @export
 #'
 #' @return an updated scData class object with the following attribute updated
 #'
 #' \item{wThreshold}{imputation weighting threshold.}
+#' \item{pDropoutCoefA}{the steepness parameter of the negative logistic function that fits the data.}
+#' \item{pDropoutCoefB}{the midpoint parameter of the negative logistic function that fits the data.}
 #'
 #' @examples
 #' example(cidr)
@@ -238,10 +248,12 @@ setMethod("wThreshold", "scData", function(object, cutoff, plotTornado){
     }
 
     object@wThreshold <- threshold
+    object@pDropoutCoefA <- a
+    object@pDropoutCoefB <- b
     return(object)
 })
 
-setGeneric("scDissim", function(object, correction=FALSE, threads=0) {
+setGeneric("scDissim", function(object, correction=FALSE, threads=0, useStepFunction=TRUE) {
     standardGeneric("scDissim")
 })
 
@@ -256,6 +268,9 @@ setGeneric("scDissim", function(object, correction=FALSE, threads=0) {
 #' @param object an scData class object.
 #' @param correction Boolean; if \code{TRUE}, Cailliez correction is applied; by default \code{FALSE}.
 #' @param threads integer; number of threads to be used; by default \code{0}, which uses all available threads.
+#' @param useStepFunction Boolean; if \code{TRUE}, uses the step function as the imputation weighting function.
+#'                                 If \code{FALSE}, uses the negative logistic function that was fit by the \code{\link{wThreshold}} function.
+#'                                 Default is \code{TRUE}.
 #'
 #' @importFrom Rcpp evalCpp
 #' @importFrom ade4 cailliez
@@ -268,7 +283,7 @@ setGeneric("scDissim", function(object, correction=FALSE, threads=0) {
 #'
 #' @examples
 #' example(cidr)
-setMethod("scDissim", "scData", function(object, correction, threads){
+setMethod("scDissim", "scData", function(object, correction, threads, useStepFunction){
       ## the user can choose the number of threads
       threads_int <- as.integer(threads)
       if(!is.na(threads_int) && (threads_int > 0) && (threads_int < defaultNumThreads())) {
@@ -282,7 +297,11 @@ setMethod("scDissim", "scData", function(object, correction, threads){
       }
       N <- ncol(object@nData)
       Dist <- array(0, dim=c(N, N))
-      D <- cpp_dist(Dist, object@dropoutCandidates, object@nData, N, object@wThreshold)
+      if (useStepFunction) {
+        D <- cpp_dist(Dist, object@dropoutCandidates, object@nData, N, object@wThreshold)
+      } else {
+        D <- cpp_dist_weighted(Dist, object@dropoutCandidates, object@nData, N, object@pDropoutCoefA, object@pDropoutCoefB)
+      }
       D <- sqrt(D)
       D <- D+t(D)
       if(correction){

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -13,3 +13,7 @@ cpp_dist <- function(dist, truth, counts, ncol, threshold) {
     .Call('cidr_cpp_dist', PACKAGE = 'cidr', dist, truth, counts, ncol, threshold)
 }
 
+cpp_dist_weighted <- function(dist, truth, counts, ncol, a, b) {
+    .Call('cidr_cpp_dist_weighted', PACKAGE = 'cidr', dist, truth, counts, ncol, a, b)
+}
+
diff --git a/man/scDissim.Rd b/man/scDissim.Rd
diff --git a/man/wThreshold.Rd b/man/wThreshold.Rd
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
@@ -44,3 +44,19 @@ BEGIN_RCPP
     return rcpp_result_gen;
 END_RCPP
 }
+// cpp_dist_weighted
+NumericMatrix cpp_dist_weighted(NumericMatrix dist, IntegerMatrix truth, NumericMatrix counts, int ncol, double a, double b);
+RcppExport SEXP cidr_cpp_dist_weighted(SEXP distSEXP, SEXP truthSEXP, SEXP countsSEXP, SEXP ncolSEXP, SEXP aSEXP, SEXP bSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< NumericMatrix >::type dist(distSEXP);
+    Rcpp::traits::input_parameter< IntegerMatrix >::type truth(truthSEXP);
+    Rcpp::traits::input_parameter< NumericMatrix >::type counts(countsSEXP);
+    Rcpp::traits::input_parameter< int >::type ncol(ncolSEXP);
+    Rcpp::traits::input_parameter< double >::type a(aSEXP);
+    Rcpp::traits::input_parameter< double >::type b(bSEXP);
+    rcpp_result_gen = Rcpp::wrap(cpp_dist_weighted(dist, truth, counts, ncol, a, b));
+    return rcpp_result_gen;
+END_RCPP
+}
diff --git a/src/scPCA.cpp b/src/scPCA.cpp
@@ -74,3 +74,99 @@ NumericMatrix cpp_dist(NumericMatrix dist, IntegerMatrix truth, NumericMatrix co
 
     return dist;
 }
+
+
+
+struct ProbDropoutFunc {
+  double a;
+  double b;
+
+  inline double getProb(double x) {
+    return 1 / ( 1 + exp(a*(x - b)) );
+  }
+
+  inline double distance(double count1, int isDropoutCandidate1, double count2, int isDropoutCandidate2) {
+    if (isDropoutCandidate1 && isDropoutCandidate2) {
+      return 0;
+    } else {
+      if (isDropoutCandidate1 /*&& !isDropoutCandidate2*/) {
+        double p = getProb(count2);
+        count1 = p*count2 + (1 - p)*count1;
+      }
+      if (isDropoutCandidate2 /*&& !isDropoutCandidate1*/) {
+        double p = getProb(count1);
+        count2 = p*count1 + (1 - p)*count2;
+      }
+      double diff = count1 - count2;
+      return diff*diff;
+    }
+  }
+};
+
+// helper inline code to calculate individual dissimilarity entries - given
+// particular count and truth "column" vectors
+template <typename InputIterator1, typename InputIterator2, typename InputIterator3, typename InputIterator4>
+inline double
+  weighted_accumulate(InputIterator1 count_col1_it,
+                      InputIterator1 count_col1_end,
+                      InputIterator2 count_col2_it,
+                      InputIterator3 truth_col1_it,
+                      InputIterator4 truth_col2_it,
+                      ProbDropoutFunc func) {
+    double part_dist=0;
+    for (; count_col1_it != count_col1_end ; ++count_col1_it) {
+      part_dist += func.distance(*count_col1_it, *truth_col1_it, *count_col2_it, *truth_col2_it);
+      ++count_col2_it;
+      ++truth_col1_it;
+      ++truth_col2_it;
+    }
+    return part_dist;
+  }
+
+// worker object for use in parallel code - calculates dissimilarity matrix entries
+// for a given begin/end range
+struct WeightedCidrDistance : public Worker {
+
+  // Distance matrix - input & output
+  RMatrix<double> dist;
+
+  // truth matrix - input
+  const RMatrix<int> truth;
+
+  // Counts matrix - input
+  const RMatrix<double> counts;
+
+  const ProbDropoutFunc coefs;
+
+  // cidrDistance constructor - convert input matrices to RMatrix type
+  WeightedCidrDistance(NumericMatrix dist, const IntegerMatrix truth, const NumericMatrix counts, const ProbDropoutFunc coefs) 
+    : dist(dist), truth(truth), counts(counts), coefs(coefs) {}
+
+  // TODO - only expecting begin & end  - as parallelFor only works with begin/end 
+  // maybe don't need all the others - just counts_begin & counts_end
+  void operator()(std::size_t begin, std::size_t end) {
+    for (std::size_t i = begin; i<end; ++i) {
+      // hold first columns in memory
+      RMatrix<double>::Column count_col_1 = counts.column(i);
+      RMatrix<int>::Column truth_col_1 = truth.column(i);
+      for (std::size_t j=0; j<i; ++j) {
+        RMatrix<double>::Column count_col_2 = counts.column(j);
+        RMatrix<int>::Column truth_col_2 = truth.column(j);
+        // this needs changing to new namenclature
+        dist(j,i) = weighted_accumulate(count_col_1.begin(), count_col_1.end(), count_col_2.begin(), truth_col_1.begin(), truth_col_2.begin(), coefs);
+      }
+    }
+  }
+};
+
+// [[Rcpp::export]]
+NumericMatrix cpp_dist_weighted(NumericMatrix dist, IntegerMatrix truth, NumericMatrix counts, int ncol, double a, double b) {
+  // create worker object for parallel code
+  ProbDropoutFunc coefs = { a, b }; 
+  WeightedCidrDistance cidrDistance(dist, truth, counts, coefs); 
+
+  // do the work in parallel
+  parallelFor(0, ncol, cidrDistance);
+
+  return dist;
+}
diff --git a/tests/testthat.R b/tests/testthat.R
@@ -0,0 +1,4 @@
+library(testthat)
+library(cidr)
+
+test_check("cidr")
diff --git a/tests/testthat/test-cpp_dist.R b/tests/testthat/test-cpp_dist.R
@@ -0,0 +1,55 @@
+context("cpp_dist - C++ function to create dissimilarity matrix")
+
+test_that("distance calculation works", {
+  N <- 2
+
+  # Both dropout candidates
+  dist <- array(0, dim=c(N, N))
+  dropouts <- array(1, dim=c(1, N))
+  data <- array(c(1.2, 0.9), dim=c(1,N))
+  threshold <- 5.0
+  cpp_dist(dist, dropouts, data, N, threshold)
+  expect_equal(dist, array(c(c(0, 0), c(0, 0)), dim=c(2,2)))
+
+  # Neither dropouts, same counts
+  dist <- array(0, dim=c(N, N))
+  dropouts <- array(0, dim=c(1, N))
+  data <- array(c(3.5, 3.5), dim=c(1,N))
+  threshold <- 5.0
+  cpp_dist(dist, dropouts, data, N, threshold)
+  expect_equal(dist, array(c(c(0, 0), c(0, 0)), dim=c(2,2)))
+
+  # Neither dropouts, different counts
+  dist <- array(0, dim=c(N, N))
+  dropouts <- array(0, dim=c(1, N))
+  data <- array(c(3.5, 3.6), dim=c(1,N))
+  threshold <- 5.0
+  cpp_dist(dist, dropouts, data, N, threshold)
+  expect_equal(dist, array(c(c(0, 0), c(0.01, 0)), dim=c(2,2)))
+
+  # #1 is dropout, #2 below threshold (imputation occurs)
+  dist <- array(0, dim=c(N, N))
+  dropouts <- array(c(1, 0), dim=c(1, N))
+  data <- array(c(1.2, 4.5), dim=c(1,N))
+  threshold <- 5.0
+  cpp_dist(dist, dropouts, data, N, threshold)
+  expect_equal(dist, array(c(c(0, 0), c(0, 0)), dim=c(2,2)))
+
+  # #1 is dropout, #2 above threshold
+  dist <- array(0, dim=c(N, N))
+  dropouts <- array(c(1, 0), dim=c(1, N))
+  data <- array(c(1.2, 5.2), dim=c(1,N))
+  threshold <- 5.0
+  cpp_dist(dist, dropouts, data, N, threshold)
+  expect_equal(dist, array(c(c(0, 0), c(16, 0)), dim=c(2,2)))
+
+  # 3 features
+  dist <- array(0, dim=c(N, N))
+  dropouts <- array(c(c(0, 0, 0),
+                      c(1, 1, 0)), dim=c(3, N))
+  data <- array(c(c(4.2, 5.7, 4.5),
+                  c(1.2, 0.7, 2.5)), dim=c(3,N))
+  threshold <- 5.0
+  cpp_dist(dist, dropouts, data, N, threshold)
+  expect_equal(dist, array(c(c(0, 0), c(29, 0)), dim=c(2,2)))
+})
diff --git a/tests/testthat/test-cpp_dist_weighted.R b/tests/testthat/test-cpp_dist_weighted.R
@@ -0,0 +1,53 @@
+context("cpp_dist_weighted - dissimilarity matrix using weighted imputation")
+
+test_that("distance calculation works", {
+  N <- 2
+  a <- 1.5
+  b <- 5.0
+
+  # Both dropout candidates
+  dist <- array(0, dim=c(N, N))
+  dropouts <- array(1, dim=c(1, N))
+  data <- array(c(1.2, 0.9), dim=c(1,N))
+  cpp_dist_weighted(dist, dropouts, data, N, a, b)
+  expect_equal(dist, array(c(c(0, 0), c(0, 0)), dim=c(2,2)))
+
+  # Neither dropouts, same counts
+  dist <- array(0, dim=c(N, N))
+  dropouts <- array(0, dim=c(1, N))
+  data <- array(c(3.5, 3.5), dim=c(1,N))
+  cpp_dist_weighted(dist, dropouts, data, N, a, b)
+  expect_equal(dist, array(c(c(0, 0), c(0, 0)), dim=c(2,2)))
+
+  # Neither dropouts, different counts
+  dist <- array(0, dim=c(N, N))
+  dropouts <- array(0, dim=c(1, N))
+  data <- array(c(3.5, 3.6), dim=c(1,N))
+  cpp_dist_weighted(dist, dropouts, data, N, a, b)
+  expect_equal(dist, array(c(c(0, 0), c(0.01, 0)), dim=c(2,2)))
+
+  # #1 dropout, #2 not dropout (imputation occurs)
+  dist <- array(0, dim=c(N, N))
+  dropouts <- array(c(1, 0), dim=c(1, N))
+  data <- array(c(0.5, 5.0), dim=c(1,N))
+  # P(5.0) = 0.5, imputed value = 2.75
+  cpp_dist_weighted(dist, dropouts, data, N, a, b)
+  expect_equal(dist, array(c(c(0, 0), c(5.0625, 0)), dim=c(2,2)))
+
+  # #1 not dropout, #2 dropout (imputation occurs)
+  dist <- array(0, dim=c(N, N))
+  dropouts <- array(c(0, 1), dim=c(1, N))
+  data <- array(c(3.5, 0.5), dim=c(1,N))
+  # P(3.5) = 0.9046505351, imputed value = 3.2139516053
+  cpp_dist_weighted(dist, dropouts, data, N, a, b)
+  expect_equal(dist, array(c(c(0, 0), c(0.081823684110447, 0)), dim=c(2,2)))
+
+  # 3 features
+  dist <- array(0, dim=c(N, N))
+  dropouts <- array(c(c(0, 0, 0),
+                      c(1, 1, 0)), dim=c(3, N))
+  data <- array(c(c(5.0, 5.0, 4.5),
+                  c(0.5, 1.0, 2.5)), dim=c(3,N))
+  cpp_dist_weighted(dist, dropouts, data, N, a, b)
+  expect_equal(dist, array(c(c(0, 0), c(13.0625, 0)), dim=c(2,2)))
+})