Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions R/X_cup_from_X.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,28 @@ X_cup_from_X <- function(X,J){

return(X_cup)
}

X_cup_from_X_fast <- function(X, J) {
n <- nrow(X)
p <- ncol(X)

# Total number of rows in final matrix
total_rows <- n * J

# Construct i index (row indices)
i_coords <- rep(1:total_rows, each = p)

# Construct j index (column indices)
j_block <- rep(0:(J - 1), each = p) * p
j_coords <- rep(j_block, times = n) + rep(1:p, times = total_rows)

# Construct values
X_rep <- X[rep(1:n, each = J), ]
x_vals <- as.vector(t(X_rep)) # row-wise unrolling

X_cup <- Matrix::sparseMatrix(i = i_coords,
j = j_coords,
x = x_vals)

return(X_cup)
}
24 changes: 20 additions & 4 deletions R/emuFit.R
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,10 @@
#' @param tolerance tolerance for stopping criterion in full model fitting; once
#' no element of B is updated by more than this value in a single step, we exit
#' optimization. Defaults to 1e-3.
#' @param max_abs_B maximum allowed value for elements of B (in absolute value) in full model fitting. In
#' most cases this is not needed as Firth penalty will prevent infinite estimates
#' under separation. However, such a threshold may be helpful in very poorly conditioned problems (e.g., with many
#' nearly collinear regressors). Default is 250.
#' @param rho_init numeric: value at which to initiate rho parameter in augmented Lagrangian
#' algorithm. Default is 1.
#' @param tau numeric: value to scale rho by in each iteration of augmented Lagrangian
Expand Down Expand Up @@ -104,6 +108,8 @@
#' If a value between 0 and 1, all zero-comparison p-values below the value will be set to NA.
#' Default is \code{0.01}.
#' @param unobserved_taxon_error logical: should an error be thrown if Y includes taxa that have 0 counts for all samples? Default is TRUE.
#' @param est_par logical: when possible should computation be parallelized for estimation under the alternative. This is suggested for large
#' \code{n}, especially if when running with \code{verbose = "development"}, the augmentation steps appear to be taking a long time.
#'
#' @return A list containing elements 'coef', 'B', 'penalized', 'Y_augmented',
#' 'z_hat', 'I', 'Dy', and 'score_test_hyperparams' if score tests are run.
Expand Down Expand Up @@ -179,6 +185,7 @@ emuFit <- function(Y,
constraint_param = 0.1,
verbose = FALSE,
tolerance = 1e-4,
max_abs_B = 250,
B_null_tol = 1e-3,
rho_init = 1,
inner_tol = 1,
Expand All @@ -195,7 +202,8 @@ emuFit <- function(Y,
return_score_components = FALSE,
return_both_score_pvals = FALSE,
remove_zero_comparison_pvals = 0.01,
unobserved_taxon_error = TRUE) {
unobserved_taxon_error = TRUE,
est_par = FALSE) {

# Record call
call <- match.call(expand.dots = FALSE)
Expand Down Expand Up @@ -234,7 +242,7 @@ emuFit <- function(Y,
# check for zero-comparison parameters
zero_comparison_res <- zero_comparison_check(X = X, Y = Y)

X_cup <- X_cup_from_X(X,J)
X_cup <- X_cup_from_X_fast(X,J)



Expand Down Expand Up @@ -263,7 +271,9 @@ emuFit <- function(Y,
max_step = max_step,
tolerance = tolerance,
verbose = (verbose == "development"),
j_ref = j_ref)
max_abs_B = max_abs_B,
j_ref = j_ref,
par = est_par)
Y_test <- fitted_model$Y_augmented
fitted_B <- fitted_model$B
converged_estimates <- fitted_model$convergence
Expand All @@ -281,11 +291,17 @@ emuFit <- function(Y,
max_stepsize = max_step,
tolerance = tolerance,
j_ref = j_ref,
max_abs_B = max_abs_B,
verbose = (verbose == "development"))
fitted_B <- fitted_model
Y_test <- Y
}

max_est_B <- max(abs(fitted_B))
if (max_est_B >= 0.9 * max_abs_B) {
warning("At least one estimated B value is within 10% of your `max_abs_B` boundary. We suggest that you rerun estimation with a larger `max_abs_B` value.")
}

} else {
if (is.null(B) & is.null(fitted_model)) {
stop("If refit is FALSE, B or fitted_model must be provided")
Expand All @@ -307,7 +323,7 @@ emuFit <- function(Y,
} else {
fitted_B <- B
if (penalize) {
G <- get_G_for_augmentations(X, J, n, X_cup)
G <- get_G_for_augmentations_fast(X, J, n, X_cup)
Y_test <- Y_augmented <- Y +
get_augmentations(X = X, G = G, Y = Y, B = fitted_B)
} else {
Expand Down
24 changes: 15 additions & 9 deletions R/emuFit_micro_penalized.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,18 @@
#' @param max_abs_B numeric: maximum allowed value for elements of B (in absolute value). In
#' most cases this is not needed as Firth penalty will prevent infinite estimates
#' under separation. However, such a threshold may be helpful in very poorly conditioned problems (e.g., with many
#' nearly collinear regressors). Default is 50.
#' nearly collinear regressors). Default is 250.
#' @param use_legacy_augmentation logical: should an older (slower) implementation of
#' data augmentation be used? Only used for testing - there is no advantage to using
#' the older implementation in applied settings.
#' @param j_ref which column of B to set to zero as a convenience identifiability
#' during optimization. Default is NULL, in which case this column is chosen based
#' on characteristics of Y (i.e., j_ref chosen to maximize number of entries of
#' Y_j_ref greater than zero).
#' @param par ogical: when possible should computation be parallelized for computing augmentation values.
#' This is suggested for large \code{n}, especially if when running with \code{verbose = "development"},
#' the augmentation steps appear to be taking a long time.
#'
#' @return A p x J matrix containing regression coefficients (under constraint
#' g(B_k) = 0)
#'
Expand All @@ -43,14 +47,15 @@ emuFit_micro_penalized <-
verbose = TRUE,
max_abs_B = 250,
use_legacy_augmentation = FALSE,
j_ref = NULL
j_ref = NULL,
par = FALSE
){

J <- ncol(Y)
p <- ncol(X)
n <- nrow(Y)
if(use_legacy_augmentation){
X_tilde <- X_cup_from_X(X,J)
X_tilde <- X_cup_from_X_fast(X,J)
}
Y_augmented <- Y
if (is.null(B)) {
Expand All @@ -67,9 +72,9 @@ emuFit_micro_penalized <-
may take a moment.")
}
if(is.null(X_cup)){
X_cup <- X_cup_from_X(X,J)
X_cup <- X_cup_from_X_fast(X,J)
}
G <- get_G_for_augmentations(X,J,n,X_cup)
G <- get_G_for_augmentations_fast(X,J,n,X_cup)
}
while(!converged){
# print(counter)
Expand Down Expand Up @@ -99,10 +104,11 @@ maintained only for testing purposes.")
J = J,
verbose = verbose)
} else{
augmentations <- get_augmentations(X = X,
G = G,
Y = Y,
B = fitted_model)
augmentations <- get_augmentations_par(X = X,
G = G,
Y = Y,
B = fitted_model,
par = par)
Y_augmented <- Y + augmentations
}
}
Expand Down
2 changes: 1 addition & 1 deletion R/fit_null.R
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ fit_null <- function(B,

#get X_cup for later use
if (is.null(X_cup)) {
X_cup = X_cup_from_X(X,J)
X_cup = X_cup_from_X_fast(X,J)
}

#set iteration to zero
Expand Down
19 changes: 18 additions & 1 deletion R/get_G_for_augmentations.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ get_G_for_augmentations <- function(X,
p <- ncol(X)
#calculate indices of columns to remove from X_cup (aka X_tilde, the expanded design matrix)
to_delete <- p*(J - 1) + 1:p #col indices for elements of B^J in long B format
# X_tilde_J <- X_cup_from_X(X,J)
X_tilde_J <- X_cup
for(i in 1:n){
X_tilde_J[(i - 1)*J + J,] <- 0
Expand All @@ -22,3 +21,21 @@ get_G_for_augmentations <- function(X,
G <- cbind(X_tilde_J,Z)
return(G)
}

get_G_for_augmentations_fast <- function(X, J, n, X_cup) {
p <- ncol(X)
#calculate indices of columns to remove from X_cup (aka X_tilde, the expanded design matrix)
to_delete <- p*(J - 1) + 1:p #col indices for elements of B^J in long B format
last_class_rows <- seq(from = J, to = n * J, by = J)
X_cup[last_class_rows, ] <- 0

X_cup <- X_cup[, -to_delete]

#design matrix in z
Z <- Matrix::sparseMatrix(i = 1:(n*J),
j = rep(1:n, each = J),
x = 1,
dims = c(n * J, n))
G <- cbind(X_cup,Z)
return(G)
}
71 changes: 71 additions & 0 deletions R/get_augmentations.R
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,74 @@ get_augmentations <- function(X,


}

# this actually isn't faster in tests
get_augmentations_par <- function(X,
G, #design matrix in beta_vec and z
Y,
B,
par = FALSE){


#dimensions
n <- nrow(Y)
p <- nrow(B)
J <- ncol(Y)

#parametrize so last column of B is / can be dropped
for(k in 1:p){
B[k, ] <- B[k, ] - B[k, J]
}
B_tilde_J <- B_cup_from_B(B)
to_delete <- p*(J - 1) + 1:p
B_tilde_J <- B_tilde_J[-to_delete, , drop = FALSE]


Y_rowsums <- rowSums(Y)
z_tilde <- numeric(n)
for(i in 1:n){
z_tilde[i] <- log(Y_rowsums[i]) - sum_of_logs(X[i, , drop= FALSE] %*% B)
}

theta_tilde <- rbind(B_tilde_J, matrix(z_tilde, ncol = 1))

log_means <- G %*% theta_tilde

W <- Matrix::Diagonal(x = exp(log_means@x))

#information matrix (we're using tricky equivalency to poisson model)
info <- Matrix::crossprod(G, W) %*% G
# info <- methods::as(info, "symmetricMatrix")
info <- Matrix::forceSymmetric(info)
#cholesky decomposition
info_chol <- Matrix::chol(info, pivot = FALSE)
#info^(-0.5) by inverting cholesky decomp
info_half_inv <- Matrix::solve(info_chol)



if (.Platform$OS.type == "unix" & par) {

cores <- parallel::detectCores()
aug_res <- parallel::mclapply(1:n, function(i) {
G_i <- G[1:J + (i - 1)*J, , drop = FALSE]
G_i <- G_i %*% info_half_inv
Matrix::rowSums(Matrix::Diagonal(x = exp(log_means[(i - 1)*J + 1:J, ])) %*% (G_i^2) )/2
}, mc.cores = cores - 1)
augmentations <- do.call(rbind, aug_res)

} else {
augmentations <- matrix(0, nrow = n, ncol = J)
for(i in 1:n){
G_i <- G[1:J + (i - 1)*J, , drop = FALSE]
G_i <- G_i %*% info_half_inv

augmentations[i, ] <-
Matrix::rowSums(Matrix::Diagonal(x = exp(log_means[(i - 1)*J + 1:J, ])) %*% (G_i^2) )/2

}
}

return(augmentations)

}
12 changes: 11 additions & 1 deletion man/emuFit.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 7 additions & 2 deletions man/emuFit_micro_penalized.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 13 additions & 0 deletions tests/testthat/test-X_cup_from_X.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
test_that("X_cup_from_X_fast works and is faster than X_cup_from_X", {
X <- cbind(1, rnorm(100), runif(100))
J <- 200
start1 <- proc.time()
Xcup1 <- X_cup_from_X(X, J)
end1 <- proc.time() - start1
start2 <- proc.time()
Xcup2 <- X_cup_from_X_fast(X, J)
end2 <- proc.time() - start2

expect_true(all.equal(Xcup1, Xcup2))
expect_true(end2[3] < end1[3])
})
Loading