
Commit d7bf90c

Merge pull request #40 from mayer79/explicit_args
added explicit args for hybrid_degree and m
2 parents f1c879a + 73099b8 commit d7bf90c

File tree: 6 files changed, +71 −67 lines

NEWS.md

Lines changed: 2 additions & 2 deletions
```diff
@@ -33,9 +33,9 @@ Kernel SHAP in the Python implementation "shap" uses a quite similar hybrid stra
 
 ## User visible changes
 
-- The default value of `m` (`NULL`) was reduced from $8p$ to $2p$ except when `hybrid_degree = 0` (pure sampling).
+- The default value of `m` is reduced from $8p$ to $2p$ except when `hybrid_degree = 0` (pure sampling).
 - The default value of `exact` is now `TRUE` for $p \le 8$ instead of $p \le 5$.
-- A new argument `hybrid_degree` is introduced to control the exact part of the hybrid algorithm. The default, `NULL`, ensures hybrid degree 2 up to $p \le 16$ and degree 1 for $p > 16$. Set to 0 to force a pure sampling strategy (not recommended but useful to demonstrate superiority of hybrid approaches).
+- A new argument `hybrid_degree` is introduced to control the exact part of the hybrid algorithm. The default is 2 for $4 \le p \le 16$ and degree 1 otherwise. Set to 0 to force a pure sampling strategy (not recommended but useful to demonstrate superiority of hybrid approaches).
 - The default value of `tol` was reduced from 0.01 to 0.005.
 - The default of `max_iter` was reduced from 250 to 100.
 - The order of some of the arguments behind the first four has been changed.
```
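The two new defaults interact: `m` depends on the chosen `hybrid_degree`. As an illustrative sketch of the changelog formulas (Python for clarity; the helper names are hypothetical and not part of the package):

```python
def default_hybrid_degree(p: int) -> int:
    # Mirrors the new R default `1L + ncol(X) %in% 4:16`:
    # degree 2 for 4 <= p <= 16, degree 1 otherwise.
    return 2 if 4 <= p <= 16 else 1

def default_m(p: int, hybrid_degree: int) -> int:
    # Mirrors `2L * ncol(X) * (1L + 3L * (hybrid_degree == 0L))`:
    # 2p sampled vectors per iteration, bumped to 8p for pure sampling.
    return 2 * p * (1 + 3 * (hybrid_degree == 0))

print(default_hybrid_degree(10), default_m(10, 2))  # 2 20
print(default_hybrid_degree(20), default_m(20, 0))  # 1 160
```

So for a typical p = 10 model, each sampling iteration now draws only 20 vectors instead of 80, while pure sampling (`hybrid_degree = 0`) keeps the old budget of 8p.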

R/kernelshap.R

Lines changed: 24 additions & 28 deletions
```diff
@@ -3,7 +3,7 @@
 #' Multidimensional refinement of the Kernel SHAP Algorithm described in Covert and Lee (2021),
 #' in the following abbreviated by "CL21".
 #' The function allows to calculate Kernel SHAP values in an exact way, by iterative sampling
-#' as in CL21, or by a hybrid of the two. As soon as sampling is involved,
+#' as in CL21, or by a hybrid of these two options. As soon as sampling is involved,
 #' the algorithm iterates until convergence, and standard errors are provided.
 #' The default behaviour depends on the number of features p:
 #' \itemize{
@@ -17,7 +17,7 @@
 #' m on-off vectors z so that their sum follows the SHAP Kernel weight distribution
 #' (renormalized to the range from 1 to p-1). Based on these vectors, many predictions
 #' are formed. Then, Kernel SHAP values are derived as the solution of a constrained
-#' linear regression, see CL21 for details. This is done multiple times until convergence.
+#' linear regression. This is done multiple times until convergence, see CL21 for details.
 #'
 #' A drawback of this strategy is that many (at least 75%) of the z vectors will have
 #' sum(z) equal to 1 or p-1, producing many duplicates. Similarly, at least 92% of
@@ -69,12 +69,12 @@
 #' with respect to the background data. In this case, the arguments \code{hybrid_degree},
 #' \code{m}, \code{paired_sampling}, \code{tol}, and \code{max_iter} are ignored.
 #' The default is \code{TRUE} up to eight features, and \code{FALSE} otherwise.
-#' @param hybrid_degree Integer controlling the exactness of the hybrid strategy. The
-#' default, \code{NULL}, equals 2 for p <= 16 and 1 otherwise. Ignored if \code{exact = TRUE}.
+#' @param hybrid_degree Integer controlling the exactness of the hybrid strategy. For
+#' 4 <= p <= 16, the default is 2, otherwise it is 1. Ignored if \code{exact = TRUE}.
 #' \itemize{
 #'   \item \code{0}: Pure sampling strategy not involving any exact part. It is strictly
 #'     worse than the hybrid strategy and should therefore only be used for
-#'     studying properties of Kernel SHAP algorithms.
+#'     studying properties of the Kernel SHAP algorithm.
 #'   \item \code{1}: Uses all 2p on-off vectors z with sum(z) equal to 1 or p-1 for the exact
 #'     part, which covers at least 75% of the mass of the Kernel weight distribution.
 #'     The remaining mass is covered by sampling.
@@ -89,8 +89,8 @@
 #' CL21 shows its superiority compared to standard sampling, therefore the
 #' default (\code{TRUE}) should usually not be changed except for studying properties
 #' of Kernel SHAP algorithms. Ignored if \code{exact = TRUE}.
-#' @param m Even number of on-off vectors sampled during one iteration. The default,
-#' \code{NULL}, equals 8p for \code{hybrid_degree == 0} and 2p otherwise.
+#' @param m Even number of on-off vectors sampled during one iteration.
+#' The default is 2p, except when \code{hybrid_degree == 0}. Then it is set to 8p.
 #' Ignored if \code{exact = TRUE}.
 #' @param tol Tolerance determining when to stop. The algorithm keeps iterating until
 #' max(sigma_n)/diff(range(beta_n)) < tol, where the beta_n are the SHAP values
@@ -172,8 +172,10 @@ kernelshap <- function(object, ...){
 #' @describeIn kernelshap Default Kernel SHAP method.
 #' @export
 kernelshap.default <- function(object, X, bg_X, pred_fun = stats::predict, bg_w = NULL,
-                               exact = (ncol(X) <= 8L) && is.null(hybrid_degree),
-                               hybrid_degree = NULL, paired_sampling = TRUE, m = NULL,
+                               exact = ncol(X) <= 8L,
+                               hybrid_degree = 1L + ncol(X) %in% 4:16,
+                               paired_sampling = TRUE,
+                               m = 2L * ncol(X) * (1L + 3L * (hybrid_degree == 0L)),
                                tol = 0.005, max_iter = 100L, parallel = FALSE,
                                parallel_args = NULL, verbose = TRUE, ...) {
   stopifnot(
@@ -189,9 +191,9 @@ kernelshap.default <- function(object, X, bg_X, pred_fun = stats::predict, bg_w
     all(nms %in% colnames(bg_X)),
     is.function(pred_fun),
     exact %in% c(TRUE, FALSE),
-    is.null(hybrid_degree) || hybrid_degree %in% 0:(p / 2),
+    p == 1L || hybrid_degree %in% 0:(p / 2),
     paired_sampling %in% c(TRUE, FALSE),
-    "m must be even or NULL" = is.null(m) || trunc(m / 2) == m / 2
+    "m must be even" = trunc(m / 2) == m / 2
   )
   if (!is.null(bg_w)) {
     stopifnot(length(bg_w) == bg_n, all(bg_w >= 0), !all(bg_w == 0))
@@ -208,15 +210,7 @@ kernelshap.default <- function(object, X, bg_X, pred_fun = stats::predict, bg_w
     return(case_p1(n = n, nms = nms, v0 = v0, v1 = v1, X = X, verbose = verbose))
   }
 
-  # Set hybrid_degree and sampling m (both are ignored if exact = TRUE)
-  if (is.null(hybrid_degree)) {
-    hybrid_degree <- 1L + (p <= 16L)
-  }
-  if (is.null(m)) {
-    m <- 2L*p + 6L*p*(hybrid_degree == 0L)
-  }
-
-  # Precalculations
+  # Precalculations for the real Kernel SHAP
   if (exact || hybrid_degree >= 1L) {
     precalc <- if (exact) input_exact(p) else input_partly_exact(p, hybrid_degree)
     m_exact <- nrow(precalc[["Z"]])
@@ -237,7 +231,7 @@ kernelshap.default <- function(object, X, bg_X, pred_fun = stats::predict, bg_w
     message(txt)
   }
   if (verbose && max(m, m_exact) * bg_n > 2e5) {
-    warning("Predictions on large data sets with ", max(m, m_exact), "x", bg_n,
+    warning("\nPredictions on large data sets with ", max(m, m_exact), "x", bg_n,
             " observations are being done. Consider reducing the computational burden ",
             "(e.g. exact = FALSE, low hybrid_degree, smaller background data, smaller m)")
   }
@@ -304,7 +298,7 @@ kernelshap.default <- function(object, X, bg_X, pred_fun = stats::predict, bg_w
     m = m,
     m_exact = m_exact,
     prop_exact = prop_exact,
-    exact = exact || p %in% (0:1 + (2L * hybrid_degree)),
+    exact = exact || trunc(p / 2) == hybrid_degree,
     txt = txt
   )
   class(out) <- "kernelshap"
@@ -315,9 +309,10 @@ kernelshap.default <- function(object, X, bg_X, pred_fun = stats::predict, bg_w
 #' @export
 kernelshap.ranger <- function(object, X, bg_X,
                               pred_fun = function(m, X, ...) stats::predict(m, X, ...)$predictions,
-                              bg_w = NULL,
-                              exact = (ncol(X) <= 8L) && is.null(hybrid_degree),
-                              hybrid_degree = NULL, paired_sampling = TRUE, m = NULL,
+                              bg_w = NULL, exact = ncol(X) <= 8L,
+                              hybrid_degree = 1L + ncol(X) %in% 4:16,
+                              paired_sampling = TRUE,
+                              m = 2L * ncol(X) * (1L + 3L * (hybrid_degree == 0L)),
                               tol = 0.005, max_iter = 100L, parallel = FALSE,
                               parallel_args = NULL, verbose = TRUE, ...) {
   kernelshap.default(
@@ -343,9 +338,10 @@ kernelshap.ranger <- function(object, X, bg_X,
 #' @export
 kernelshap.Learner <- function(object, X, bg_X,
                               pred_fun = function(m, X) m$predict_newdata(X)$response,
-                              bg_w = NULL,
-                              exact = (ncol(X) <= 8L) && is.null(hybrid_degree),
-                              hybrid_degree = NULL, paired_sampling = TRUE, m = NULL,
+                              bg_w = NULL, exact = ncol(X) <= 8L,
+                              hybrid_degree = 1L + ncol(X) %in% 4:16,
+                              paired_sampling = TRUE,
+                              m = 2L * ncol(X) * (1L + 3L * (hybrid_degree == 0L)),
                               tol = 0.005, max_iter = 100L, parallel = FALSE,
                               parallel_args = NULL, verbose = TRUE, ...) {
   kernelshap.default(
```
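The new exactness flag `exact || trunc(p / 2) == hybrid_degree` encodes a simple fact: a degree-d hybrid handles the d smallest and d largest coalition sizes exactly, so once d reaches trunc(p/2) there is nothing left to sample. A small Python sketch of this reasoning (function names are illustrative only, not package API):

```python
def covered_sizes(p: int, degree: int) -> set:
    # Coalition sizes sum(z) handled by the exact part of a
    # degree-`degree` hybrid: the smallest and largest sizes in 1..p-1.
    return set(range(1, degree + 1)) | set(range(p - degree, p))

def hybrid_is_exact(p: int, degree: int) -> bool:
    # True when the exact part covers every coalition size 1..p-1.
    return covered_sizes(p, degree) >= set(range(1, p))

# Over the admissible range 0 <= degree <= trunc(p / 2), full coverage
# happens exactly when degree == trunc(p / 2):
for p in range(2, 20):
    for degree in range(0, p // 2 + 1):
        assert hybrid_is_exact(p, degree) == (degree == p // 2)
```

This also explains why the old check `p %in% (0:1 + 2L * hybrid_degree)` (i.e. p equal to 2d or 2d+1) was equivalent: both describe d = trunc(p/2).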

README.md

Lines changed: 24 additions & 5 deletions
````diff
@@ -313,20 +313,31 @@ y <- rnorm(10000L)
 fit <- lm(y ~ ., data = cbind(y = y, X))
 
 s <- kernelshap(fit, X[1L, ], bg_X = X)
+summary(s)
 s$S[1:5]
-# Kernel SHAP values by the iterative hybrid strategy of degree 2
-# (m_exact = 110, m/iter = 80)
+# Kernel SHAP values by the hybrid strategy of degree 2
+# - SHAP matrix of dim 1 x 10
+# - baseline: -0.005390948
+# - average number of iterations: 2
+# - rows not converged: 0
+# - proportion exact: 0.9487952
+# - m/iter: 20
+# - m_exact: 110
 # 0.0101998581 0.0027579289 -0.0002294437 0.0005337086 0.0001179876
 ```
 
-The algorithm converged in the minimal possible number of two iterations and used $110 + 2\cdot 80 = 270$ on-off vectors $z$. For each $z$, predictions on a data set with the same size as the background data are done. Three calls to `predict()` were necessary (one for the exact part and one per sampling iteration).
+The algorithm converged in the minimal possible number of two iterations and used $110 + 2\cdot 20 = 150$ on-off vectors $z$. For each $z$, predictions on a data set with the same size as the background data are done. Three calls to `predict()` were necessary (one for the exact part and one per sampling iteration).
 
 Since $p$ is not very large in this case, we can also force the algorithm to use exact calculations:
 
 ```r
 s <- kernelshap(fit, X[1L, ], bg_X = X, exact = TRUE)
+summary(s)
 s$S[1:5]
-# Exact Kernel SHAP values (m_exact = 1022)
+# Exact Kernel SHAP values
+# - SHAP matrix of dim 1 x 10
+# - baseline: -0.005390948
+# - m_exact: 1022
 # 0.0101998581 0.0027579289 -0.0002294437 0.0005337086 0.0001179876
 ```
 
@@ -336,8 +347,16 @@ Pure sampling can be enforced by setting the hybrid degree to 0:
 
 ```r
 s <- kernelshap(fit, X[1L, ], bg_X = X, hybrid_degree = 0)
+summary(s)
 s$S[1:5]
-# Kernel SHAP values by iterative sampling (m/iter = 80)
+# Kernel SHAP values by iterative sampling
+# - SHAP matrix of dim 1 x 10
+# - baseline: -0.005390948
+# - average number of iterations: 2
+# - rows not converged: 0
+# - proportion exact: 0
+# - m/iter: 80
+# - m_exact: 0
 # 0.0101998581 0.0027579289 -0.0002294437 0.0005337086 0.0001179876
 ```
````
compare_with_python.R

Lines changed: 2 additions & 2 deletions
```diff
@@ -27,7 +27,7 @@ ks
 
 # Pure sampling version takes a bit longer (13 seconds)
 system.time(
-  ks2 <- kernelshap(fit, X_small, bg_X = bg_X, hybrid_degree = 0)
+  ks2 <- kernelshap(fit, X_small, bg_X = bg_X, exact = FALSE, hybrid_degree = 0)
 )
 ks2
 
@@ -65,7 +65,7 @@ fit <- lm(
 X_small <- diamonds[seq(1, nrow(diamonds), 53), setdiff(names(diamonds), "price")]
 
 # Exact KernelSHAP on X_small, using X_small as background data
-# (71/59 seconds for exact, 25/17 for hybrid deg 2, 16/9 for hybrid deg 1,
+# (71/59 seconds for exact, 27/17 for hybrid deg 2, 17/9 for hybrid deg 1,
 # 26/15 for pure sampling; second number with 2 parallel sessions on Windows)
 system.time(
   ks <- kernelshap(fit, X_small, bg_X = bg_X)
```

man/kernelshap.Rd

Lines changed: 16 additions & 16 deletions
Some generated files are not rendered by default.

tests/testthat/test-kernelshap.R

Lines changed: 3 additions & 14 deletions
```diff
@@ -15,12 +15,9 @@ test_that("SHAP + baseline = prediction", {
 })
 
 test_that("Exact hybrid calculation is similar to exact (non-hybrid)", {
-  s1 <- kernelshap(fit, iris[c(1, 51, 101), x], bg_X = iris, hybrid_degree = 1)
-  expect_equal(s$S, s1$S)
-})
-
-test_that("Pure sampling is very similar to exact", {
-  s1 <- kernelshap(fit, iris[c(1, 51, 101), x], bg_X = iris, hybrid_degree = 0)
+  s1 <- kernelshap(
+    fit, iris[c(1, 51, 101), x], bg_X = iris, exact = FALSE, hybrid_degree = 1
+  )
   expect_equal(s$S, s1$S)
 })
 
@@ -126,12 +123,4 @@ test_that("kernelshap works for large p (hybrid case)", {
   expect_equal(rowSums(s$S) + s$baseline, unname(stats::predict(fit, X[1L, ])))
 })
 
-# Pure sampling case does not converge in 100 iterations, but result matches
-# test_that("Hybrid large p case matches approximately the pure sampler", {
-#   s1 <- kernelshap(fit, X[1L, ], bg_X = X, hybrid_degree = 0)
-#
-#   expect_equal(s$S[1, 1], s1$S[1, 1])
-#   expect_equal(rowSums(s$S) + s$baseline, unname(stats::predict(fit, X[1L, ])))
-# })
-
```
