diff --git a/R/emuFit_micro.R b/R/emuFit_micro.R
index 2147dcc..0d7bb71 100644
--- a/R/emuFit_micro.R
+++ b/R/emuFit_micro.R
@@ -24,6 +24,7 @@
 #' @param optimize_rows If use_working_constraint is TRUE, update overall location of
 #' rows of B relative to column constrained to equal zero under working constraint before
 #' iterating through updates to columns of B individually. Default is TRUE.
+#' @param use_discrete If discrete design matrix, use fast discrete implementation.
 #' @return A p x J matrix containing regression coefficients (under constraint
 #' g(B_k) = 0)
 #'
@@ -42,9 +43,11 @@ emuFit_micro <- function(
   max_abs_B = 50,
   use_working_constraint = TRUE,
   j_ref = NULL,
-  optimize_rows = TRUE
+  optimize_rows = TRUE,
+  use_discrete = TRUE
 ) {
-  #extract dimensions
+  
+  # extract dimensions
   n <- nrow(Y)
   J <- ncol(Y)
   p <- ncol(X)
@@ -58,7 +61,7 @@ emuFit_micro <- function(
     )
   }
 
-  #populate B if not provided
+  # populate B if not provided
   if (is.null(B)) {
     if (!warm_start) {
       B <- matrix(0, nrow = p, ncol = J)
@@ -70,7 +73,7 @@ emuFit_micro <- function(
       }
       B <- matrix(nrow = p, ncol = J)
 
-      ##Checking if design matrix is rank-deficient and soln_mat can be found
+      # Checking if design matrix is rank-deficient and soln_mat can be found
       if (qr(t(X) %*% X)$rank < ncol(X)) {
         stop(
           "Design matrix X inputted for the model is rank-deficient, preventing proper model fitting.
@@ -112,120 +115,143 @@ emuFit_micro <- function(
       B[k, ] <- B[k, ] - constraint_fn[[k]](B[k, ])
     }
   }
+  
+  # if we have zeros, don't use discrete implementation 
+  # if we are being routed from emuFit_micro_penalized, we don't expect to ever have zeros
+  if (min(Y) == 0) {
+    use_discrete <- FALSE
+    }
+  
+  # check if X is discrete
+  distinct_X <- unique(X)
 
-  #create object to store log likelihoods at each iteration in
-  lls <- numeric(maxit)
-
-  #initiate z
-  z <- update_z(Y, X, B)
-
-  #compute initial mean
-  log_mean <- X %*% B + matrix(z, ncol = 1) %*% matrix(1, ncol = J, nrow = 1)
-
-  lls[1] <- sum(Y * log_mean - exp(log_mean))
-
-  converged <- FALSE
-
-  if (optimize_rows & use_working_constraint) {
-    eps_outcome <- rowSums(Y[, -which_to_omit, drop = FALSE])
-  }
-
-  iter <- 1
-  deriv_norm <- Inf
-  B_diff <- Inf
-  iter <- 1
-  while (!converged) {
-    old_B <- B
+  if (ncol(X) == nrow(distinct_X) & use_discrete) {
+    
+    B <- emuFit_micro_discrete(X = X, Y = Y, j_ref = j_ref)
+    B[B < -max_abs_B] <- -max_abs_B
+    B[B > max_abs_B] <- max_abs_B
+    
+    if (!use_working_constraint) {
+      for (k in 1:p) {
+        B[k, ] <- B[k, ] - constraint_fn[[k]](B[k, ])
+      }
+    }
+    
+  } else {
+    #create object to store log likelihoods at each iteration in
+    lls <- numeric(maxit)
+    
+    #initiate z
+    z <- update_z(Y, X, B)
+    
+    #compute initial mean
+    log_mean <- X %*% B + matrix(z, ncol = 1) %*% matrix(1, ncol = J, nrow = 1)
+    
+    lls[1] <- sum(Y * log_mean - exp(log_mean))
+    
+    converged <- FALSE
+    
     if (optimize_rows & use_working_constraint) {
-      epsilon_step <- rep(10, p)
-      while (max(abs(epsilon_step)) > 0.01) {
-        log_mean <- X %*%
-          B +
-          matrix(z, ncol = 1) %*% matrix(1, ncol = J, nrow = 1)
-        eps_offset = log(rowSums(exp(log_mean[,
-          -which_to_omit,
-          drop = FALSE
-        ])))
-        epsilon_step <- suppressWarnings(
-          stats::glm(
-            eps_outcome ~ X - 1,
-            family = "poisson",
-            offset = eps_offset,
-            control = list(maxit = 3)
-          )$coef
-        )
-
-        for (k in 1:p) {
-          B[k, -which_to_omit] <- B[k, -which_to_omit] + epsilon_step[k]
+      eps_outcome <- rowSums(Y[, -which_to_omit, drop = FALSE])
+    }
+    
+    iter <- 1
+    deriv_norm <- Inf
+    B_diff <- Inf
+    iter <- 1
+    while (!converged) {
+      old_B <- B
+      if (optimize_rows & use_working_constraint) {
+        epsilon_step <- rep(10, p)
+        while (max(abs(epsilon_step)) > 0.01) {
+          log_mean <- X %*%
+            B +
+            matrix(z, ncol = 1) %*% matrix(1, ncol = J, nrow = 1)
+          eps_offset = log(rowSums(exp(log_mean[,
+                                                -which_to_omit,
+                                                drop = FALSE
+          ])))
+          epsilon_step <- suppressWarnings(
+            stats::glm(
+              eps_outcome ~ X - 1,
+              family = "poisson",
+              offset = eps_offset,
+              control = list(maxit = 3)
+            )$coef
+          )
+          
+          for (k in 1:p) {
+            B[k, -which_to_omit] <- B[k, -which_to_omit] + epsilon_step[k]
+          }
+          z <- update_z(Y, X, B)
         }
-        z <- update_z(Y, X, B)
       }
-    }
-
-    for (j in loop_js) {
-      update <- micro_fisher(
-        X,
-        Yj = Y[, j, drop = FALSE],
-        Bj = B[, j, drop = FALSE],
-        z,
-        stepsize = max_stepsize,
-        c1 = c1
-      )
-
-      B[, j] <- B[, j] + update
-
-      if (!use_working_constraint) {
-        for (k in 1:p) {
-          B[k, ] <- B[k, ] - constraint_fn[[k]](B[k, ])
+      
+      for (j in loop_js) {
+        update <- micro_fisher(
+          X,
+          Yj = Y[, j, drop = FALSE],
+          Bj = B[, j, drop = FALSE],
+          z,
+          stepsize = max_stepsize,
+          c1 = c1
+        )
+        
+        B[, j] <- B[, j] + update
+        
+        if (!use_working_constraint) {
+          for (k in 1:p) {
+            B[k, ] <- B[k, ] - constraint_fn[[k]](B[k, ])
+          }
         }
       }
-    }
-
-    # keep any elements of B from flying off to infinity
-    B[B < -max_abs_B] <- -max_abs_B
-    B[B > max_abs_B] <- max_abs_B
-    z <- update_z(Y = Y, X = X, B = B)
-
-    log_mean <- X %*%
-      B +
-      matrix(z, ncol = 1) %*% matrix(1, ncol = J, nrow = 1)
-    lls[iter + 1] <- sum(Y * log_mean - exp(log_mean))
-
-    deriv <- do.call(
-      c,
-      lapply(1:J, function(j) {
-        crossprod(X, Y[, j, drop = FALSE] - exp(log_mean[, j, drop = FALSE]))
-      })
-    )
-
-    deriv_norm = sqrt(sum(deriv^2))
-
-    B_diff <- max(abs(B - old_B)[abs(B) < (0.5 * max_abs_B)])
-
-    if (verbose) {
-      message(
-        "Scaled norm of derivative ",
-        signif(sqrt((1 / (n * J)) * deriv_norm), 4)
+      
+      # keep any elements of B from flying off to infinity
+      B[B < -max_abs_B] <- -max_abs_B
+      B[B > max_abs_B] <- max_abs_B
+      z <- update_z(Y = Y, X = X, B = B)
+      
+      log_mean <- X %*%
+        B +
+        matrix(z, ncol = 1) %*% matrix(1, ncol = J, nrow = 1)
+      lls[iter + 1] <- sum(Y * log_mean - exp(log_mean))
+      
+      deriv <- do.call(
+        c,
+        lapply(1:J, function(j) {
+          crossprod(X, Y[, j, drop = FALSE] - exp(log_mean[, j, drop = FALSE]))
+        })
       )
-      if (iter > 1) {
+      
+      deriv_norm = sqrt(sum(deriv^2))
+      
+      B_diff <- max(abs(B - old_B)[abs(B) < (0.5 * max_abs_B)])
+      
+      if (verbose) {
         message(
-          "Max absolute difference in elements of B since last iteration: ",
-          signif(B_diff, 3)
+          "Scaled norm of derivative ",
+          signif(sqrt((1 / (n * J)) * deriv_norm), 4)
         )
+        if (iter > 1) {
+          message(
+            "Max absolute difference in elements of B since last iteration: ",
+            signif(B_diff, 3)
+          )
+        }
       }
-    }
-
-    if (B_diff < tolerance) {
-      converged <- TRUE
-    }
-
-    if (iter > maxit) {
-      converged <- TRUE
-      if (verbose) {
-        message("Iteration limit reached; exiting optimization.")
+      
+      if (B_diff < tolerance) {
+        converged <- TRUE
+      }
+      
+      if (iter > maxit) {
+        converged <- TRUE
+        if (verbose) {
+          message("Iteration limit reached; exiting optimization.")
+        }
       }
+      iter <- iter + 1
     }
-    iter <- iter + 1
   }
 
   if (use_working_constraint) {
diff --git a/R/emuFit_micro_discrete.R b/R/emuFit_micro_discrete.R
new file mode 100644
index 0000000..16d89f9
--- /dev/null
+++ b/R/emuFit_micro_discrete.R
@@ -0,0 +1,38 @@
+emuFit_micro_discrete <- function(
+    X,
+    Y,
+    j_ref = NULL
+) {
+  
+  if (is.null(j_ref)) {
+    j_ref <- 1
+  }
+  n <- nrow(Y)
+  J <- ncol(Y)
+  p <- ncol(X)
+  
+  distinct_xx <- unique(X)
+
+  # fitted values eta = X %*% beta
+  etahats <- matrix(NA, nrow = p, ncol = J) # p x J
+  etahats[, j_ref] <- 0
+  
+  groups <- split(
+    seq_len(nrow(X)),                        # row indices of original data
+    apply(X, 1, function(r)
+      paste(r, collapse = "_"))               # grouping key by row contents
+  )
+  groups <- groups[order(sapply(groups, min))]
+  
+  for (the_cat in 1:length(groups)) {
+    the_xs <- groups[[the_cat]]
+    totals <- apply(Y[the_xs, , drop = FALSE], 2, sum)
+    pihats <- totals / sum(totals)
+    etahats[the_cat, setdiff(1:J, j_ref)] <- log(pihats[-j_ref] / pihats[j_ref])
+  }
+  # beta = X^{-1} %*% eta
+  betahats <- round(MASS::ginv(distinct_xx), 8) %*% etahats
+  #list(betahats, etahats)
+  betahats
+}
+
diff --git a/R/emuFit_micro_penalized.R b/R/emuFit_micro_penalized.R
index 57ce844..385f72a 100644
--- a/R/emuFit_micro_penalized.R
+++ b/R/emuFit_micro_penalized.R
@@ -1,4 +1,3 @@
-
 #' Fit radEmu model with Firth penalty
 #'
 #' @param X a p x J design matrix
@@ -20,38 +19,34 @@
 #' most cases this is not needed as Firth penalty will prevent infinite estimates
 #' under separation. However, such a threshold may be helpful in very poorly conditioned problems (e.g., with many
 #' nearly collinear regressors). Default is 50.
-#' @param use_legacy_augmentation logical: should an older (slower) implementation of
-#' data augmentation be used? Only used for testing - there is no advantage to using
-#' the older implementation in applied settings.
 #' @param j_ref which column of B to set to zero as a convenience identifiability
 #' during optimization. Default is NULL, in which case this column is chosen based
 #' on characteristics of Y (i.e., j_ref chosen to maximize number of entries of
 #' Y_j_ref greater than zero).
+#' @param use_discrete If discrete design matrix, use fast discrete implementation.
+#' 
 #' @return A p x J matrix containing regression coefficients (under constraint
 #' g(B_k) = 0)
 #'
 emuFit_micro_penalized <-
-  function(X,
-           Y,
-           B = NULL,
-           X_cup = NULL,
-           constraint_fn = NULL,
-           maxit = 500,
-           ml_maxit = 5,
-           tolerance = 1e-3,
-           max_step = 5,
-           verbose = TRUE,
-           max_abs_B = 250,
-           use_legacy_augmentation = FALSE,
-           j_ref = NULL
-  ){
-
+  function(
+    X,
+    Y,
+    B = NULL,
+    X_cup = NULL,
+    constraint_fn = NULL,
+    maxit = 500,
+    ml_maxit = 5,
+    tolerance = 1e-3,
+    max_step = 5,
+    verbose = TRUE,
+    max_abs_B = 250,
+    j_ref = NULL,
+    use_discrete = TRUE
+  ) {
     J <- ncol(Y)
     p <- ncol(X)
     n <- nrow(Y)
-    if(use_legacy_augmentation){
-      X_tilde <- X_cup_from_X(X,J)
-    }
     Y_augmented <- Y
     if (is.null(B)) {
       fitted_model <- NULL
@@ -61,88 +56,82 @@ emuFit_micro_penalized <-
     converged <- FALSE
     counter <- 0
     #get design matrix we'll use for computing augmentations
-    if(!use_legacy_augmentation){
-      if(verbose){
-        message("Constructing expanded design matrix. For larger datasets this
-may take a moment.")
-      }
-      if(is.null(X_cup)){
-        X_cup <- X_cup_from_X(X,J)
-      }
-      G <- get_G_for_augmentations(X,J,n,X_cup)
+
+    if (verbose) {
+      message(
+        "Constructing expanded design matrix. For larger datasets this
+may take a moment."
+      )
+    }
+    if (is.null(X_cup)) {
+      X_cup <- X_cup_from_X(X, J)
     }
-    while(!converged){
+    G <- get_G_for_augmentations(X, J, n, X_cup)
+
+    while (!converged) {
       # print(counter)
 
-      if(counter ==0 & is.null(B)){
-        Y_augmented <- Y + 1e-3*mean(Y) #ensures we don't diverge to 
-                                        #infinity in first iteration 
-                                        #after which point we use 
-                                        #data augmentations based on B
-                                        #is there a smarter way to start?
-                                        #probably.
-      } else{
-        if(verbose){
-          message("Computing data augmentations for Firth penalty. For larger models, this may take some time.")
+      if (counter == 0 & is.null(B)) {
+        Y_augmented <- Y + 1e-3 * mean(Y) #ensures we don't diverge to
+        #infinity in first iteration
+        #after which point we use
+        #data augmentations based on B
+        #is there a smarter way to start?
+        #probably.
+      } else {
+        if (verbose) {
+          message(
+            "Computing data augmentations for Firth penalty. For larger models, this may take some time."
+          )
         }
 
-        if(use_legacy_augmentation){
-          message("Using legacy implementation of data augmentation. This is less
-computationally efficient than the default implementation and is
-maintained only for testing purposes.")
-          Y_augmented <-
-            update_data(Y = Y,
-                        X_tilde = X_tilde,
-                        B = fitted_model,
-                        p = p,
-                        n = n,
-                        J = J,
-                        verbose = verbose)
-        } else{
-          augmentations <- get_augmentations(X = X,
-                                             G = G,
-                                             Y = Y,
-                                             B = fitted_model)
-          Y_augmented <- Y + augmentations
-        }
+        augmentations <- get_augmentations(
+          X = X,
+          G = G,
+          Y = Y,
+          B = fitted_model
+        )
+        Y_augmented <- Y + augmentations
       }
-      if(!is.null(fitted_model)){
+      if (!is.null(fitted_model)) {
         old_B <- fitted_model
-      } else{
+      } else {
         old_B <- Inf
       }
       #fit model by ML to data with augmentations
-      fitted_model <- emuFit_micro(X,
-                                   Y_augmented,
-                                   B = fitted_model,
-                                   constraint_fn = constraint_fn,
-                                   # maxit = maxit,
-                                   maxit = ml_maxit,
-                                   warm_start = TRUE,
-                                   max_abs_B = max_abs_B,
-                                   use_working_constraint = TRUE,
-                                   max_stepsize = max_step,
-                                   tolerance = tolerance,
-                                   verbose = verbose,
-                                   j_ref = j_ref)
+      fitted_model <- emuFit_micro(
+        X,
+        Y_augmented,
+        B = fitted_model,
+        constraint_fn = constraint_fn,
+        maxit = ml_maxit,
+        warm_start = TRUE,
+        max_abs_B = max_abs_B,
+        use_working_constraint = TRUE,
+        max_stepsize = max_step,
+        tolerance = tolerance,
+        verbose = verbose,
+        j_ref = j_ref,
+        use_discrete = use_discrete
+      )
 
-  
-      B_diff <- max(abs(fitted_model - old_B)[abs(fitted_model)<max_abs_B])
+      B_diff <- max(abs(fitted_model - old_B)[abs(fitted_model) < max_abs_B])
 
-      if(B_diff < tolerance){
+      if (B_diff < tolerance) {
         converged <- TRUE
         actually_converged <- TRUE
       }
 
-      if(counter>maxit){
+      if (counter > maxit) {
         converged <- TRUE
         actually_converged <- FALSE
       }
       counter <- counter + 1
     }
 
-    return(list("Y_augmented" = Y_augmented,
-                "B" = fitted_model,
-                "convergence" = actually_converged))
-
+    return(list(
+      "Y_augmented" = Y_augmented,
+      "B" = fitted_model,
+      "convergence" = actually_converged
+    ))
   }
diff --git a/R/micro_fisher.R b/R/micro_fisher.R
index a879814..e4e5694 100644
--- a/R/micro_fisher.R
+++ b/R/micro_fisher.R
@@ -1,60 +1,56 @@
+# Fisher scoring update for j-th column of B in unconstrained optimization with
+# z held constant
 
-#fisher scoring update for j-th column of B in unconstrained optimization
-#z held constant
-micro_fisher <- function(X,Yj,Bj,z,
-                         stepsize = 1,
-                         c1 = 0.1){
-  log_means <- X%*%Bj + z
+micro_fisher <- function(X, Yj, Bj, z, stepsize = 1, c1 = 0.1) {
+  log_means <- X %*% Bj + z
   means <- exp(log_means)
 
   #info in Bj
-  info <- t(X)%*%diag(as.numeric(means))%*%X
-  lj_grad <- colSums(diag(as.numeric(Yj - means))%*%X)
+  info <- t(X) %*% diag(as.numeric(means)) %*% X
+  lj_grad <- colSums(diag(as.numeric(Yj - means)) %*% X)
 
   #make update a try-error to start
-  update <- try(stop(),silent = TRUE)
+  update <- try(stop(), silent = TRUE)
 
-
-  if(nrow(info) >1){
-    info_avg_diag <- diag(rep(sqrt(mean(diag(info)^2)),nrow(info)))
-  } else{
+  if (nrow(info) > 1) {
+    info_avg_diag <- diag(rep(sqrt(mean(diag(info)^2)), nrow(info)))
+  } else {
     info_avg_diag <- abs(info)
   }
 
-  #try to compute update direction as is, but if we run into numerical 
+  #try to compute update direction as is, but if we run into numerical
   #invertibility issues, regularize info and try again
   regularization <- 0
-  while(inherits(update,"try-error")){
-    update <- try(qr.solve(info + regularization*info_avg_diag,lj_grad),silent = TRUE)
-    regularization <- ifelse(regularization ==0, 0.01,10*regularization)
+  while (inherits(update, "try-error")) {
+    update <- try(
+      qr.solve(info + regularization * info_avg_diag, lj_grad),
+      silent = TRUE
+    )
+    regularization <- ifelse(regularization == 0, 0.01, 10 * regularization)
   }
 
-
   #use armijo rule to choose step size
-  obj <- -sum(Yj*log_means - means)
+  obj <- -sum(Yj * log_means - means)
   obj_grad <- -lj_grad
 
-  suff_decrease_term <- c1*sum(obj_grad*update)
+  suff_decrease_term <- c1 * sum(obj_grad * update)
 
   suff_decrease <- FALSE
-  while(!(suff_decrease)){
-
-    prop_Bj <- Bj + stepsize*update
-    prop_log_mu <- X%*%prop_Bj + z
-    prop_obj <- -sum(Yj*prop_log_mu - exp(prop_log_mu))
+  while (!(suff_decrease)) {
+    prop_Bj <- Bj + stepsize * update
+    prop_log_mu <- X %*% prop_Bj + z
+    prop_obj <- -sum(Yj * prop_log_mu - exp(prop_log_mu))
 
-    suff_decrease <- prop_obj <= obj + suff_decrease_term*stepsize
-
-    stepsize <- stepsize*0.5
+    suff_decrease <- prop_obj <= obj + suff_decrease_term * stepsize
 
+    stepsize <- stepsize * 0.5
   }
 
-  stepsize <- stepsize/0.5
+  stepsize <- stepsize / 0.5
 
-  return(stepsize*update)
+  return(stepsize * update)
 }
 
-
 #
 #
 # micro_fisher <- function(X,Yj,Bj,z,
diff --git a/man/emuFit_micro.Rd b/man/emuFit_micro.Rd
index 039c845..bbd09b2 100644
--- a/man/emuFit_micro.Rd
+++ b/man/emuFit_micro.Rd
@@ -18,7 +18,8 @@ emuFit_micro(
   max_abs_B = 50,
   use_working_constraint = TRUE,
   j_ref = NULL,
-  optimize_rows = TRUE
+  optimize_rows = TRUE,
+  use_discrete = TRUE
 )
 }
 \arguments{
@@ -59,6 +60,8 @@ maximize the number of nonzero entries of Y_j_ref.}
 \item{optimize_rows}{If use_working_constraint is TRUE, update overall location of
 rows of B relative to column constrained to equal zero under working constraint before
 iterating through updates to columns of B individually. Default is TRUE.}
+
+\item{use_discrete}{If discrete design matrix, use fast discrete implementation.}
 }
 \value{
 A p x J matrix containing regression coefficients (under constraint
diff --git a/man/emuFit_micro_penalized.Rd b/man/emuFit_micro_penalized.Rd
index 4cbf5b1..ee65ca8 100644
--- a/man/emuFit_micro_penalized.Rd
+++ b/man/emuFit_micro_penalized.Rd
@@ -16,8 +16,8 @@ emuFit_micro_penalized(
   max_step = 5,
   verbose = TRUE,
   max_abs_B = 250,
-  use_legacy_augmentation = FALSE,
-  j_ref = NULL
+  j_ref = NULL,
+  use_discrete = TRUE
 )
 }
 \arguments{
@@ -51,14 +51,12 @@ most cases this is not needed as Firth penalty will prevent infinite estimates
 under separation. However, such a threshold may be helpful in very poorly conditioned problems (e.g., with many
 nearly collinear regressors). Default is 50.}
 
-\item{use_legacy_augmentation}{logical: should an older (slower) implementation of
-data augmentation be used? Only used for testing - there is no advantage to using
-the older implementation in applied settings.}
-
 \item{j_ref}{which column of B to set to zero as a convenience identifiability
 during optimization. Default is NULL, in which case this column is chosen based
 on characteristics of Y (i.e., j_ref chosen to maximize number of entries of
 Y_j_ref greater than zero).}
+
+\item{use_discrete}{If discrete design matrix, use fast discrete implementation.}
 }
 \value{
 A p x J matrix containing regression coefficients (under constraint
diff --git a/tests/testthat/test-emuFit_micro_discrete.R b/tests/testthat/test-emuFit_micro_discrete.R
new file mode 100644
index 0000000..516c7d8
--- /dev/null
+++ b/tests/testthat/test-emuFit_micro_discrete.R
@@ -0,0 +1,208 @@
+
+test_that("sim with single binary covariate", {
+  set.seed(4323)
+  X <- cbind(1,rep(c(0,1),each = 20))
+  J <- 10
+  n <- 40
+  b1 <- 1:J - mean(1:J)
+  Y <- radEmu:::simulate_data(n = n,
+                              J = J,
+                              X = X,
+                              b0 = rnorm(J),
+                              b1 = b1,
+                              distn = "Poisson",
+                              mean_z = 10)
+  
+  ml_fit <- emuFit_micro(X = X,
+                         Y = Y,
+                         constraint_fn = rep(list(function(x) mean(x)), 2), 
+                         maxit = 200,
+                         tolerance = 1e-3,
+                         verbose = FALSE)
+  ml_fit_fisher <- emuFit_micro(X = X,
+                                Y = Y,
+                                constraint_fn = rep(list(function(x) mean(x)), 2), 
+                                maxit = 200,
+                                tolerance = 1e-3,
+                                verbose = FALSE,
+                                use_discrete = FALSE)
+  
+  
+  fit_discrete <- emuFit_micro_discrete(X = X,
+                                        Y = Y,
+                                        j_ref = 1)
+  
+  
+  testthat::expect_lte(  max(abs(ml_fit -  
+                                    rbind(fit_discrete[1,] - mean(fit_discrete[1,]), 
+                                          fit_discrete[2,] - mean(fit_discrete[2,])))),
+                          5e-3)
+  
+  testthat::expect_lte(max(abs(ml_fit - ml_fit_fisher)), 5e-3)
+  
+  ord <- sample(1:40, 40, replace = FALSE)
+  X_reorder <- X[ord, ]
+  Y_reorder <- Y[ord, ]
+  
+  ml_fit_reord <- emuFit_micro(X = X_reorder,
+                               Y = Y_reorder,
+                               constraint_fn = rep(list(function(x) mean(x)), 2), 
+                               maxit = 200,
+                               tolerance = 1e-3,
+                               verbose = FALSE)
+  ml_fit_fisher_reord <- emuFit_micro(X = X_reorder,
+                                      Y = Y_reorder,
+                                      constraint_fn = rep(list(function(x) mean(x)), 2), 
+                                      maxit = 200,
+                                      tolerance = 1e-3,
+                                      verbose = FALSE,
+                                      use_discrete = FALSE)
+  
+  
+  fit_discrete_reord <- emuFit_micro_discrete(X = X_reorder,
+                                              Y = Y_reorder,
+                                              j_ref = 1)
+  
+  
+  testthat::expect_lte(max(abs(fit_discrete - fit_discrete_reord)), 1e-8)
+  testthat::expect_lte(max(abs(ml_fit - ml_fit_reord)), 1e-8)
+  testthat::expect_lte(max(abs(ml_fit_reord - ml_fit_fisher_reord)), 5e-3)
+  
+})
+
+test_that("sim with multiple-level covariate covariate", {
+  set.seed(4323)
+  dat <- data.frame(cat = rep(c("A", "B", "C"), each = 20))
+  X <- make_design_matrix(formula = ~ cat, data = dat)
+  X_reorder <- X[sample(1:60, 60), ]
+  J <- 10
+  n <- 60
+  Y <- radEmu:::simulate_data(n = n,
+                              J = J,
+                              X = X,
+                              B = rbind(rnorm(J, 0, 2), rnorm(J, 0, 2), rnorm(J, 0, 2)),
+                              distn = "Poisson",
+                              mean_z = 10)
+  
+  ml_fit <- emuFit_micro(X = X,
+                         Y = Y,
+                         constraint_fn = rep(list(function(x) mean(x)), 3), 
+                         maxit = 200,
+                         tolerance = 1e-3,
+                         verbose = FALSE)
+  ml_fit_fisher <- emuFit_micro(X = X,
+                                Y = Y,
+                                constraint_fn = rep(list(function(x) mean(x)), 3), 
+                                maxit = 200,
+                                tolerance = 1e-3,
+                                verbose = FALSE,
+                                use_discrete = FALSE)
+  
+  
+  fit_discrete <- emuFit_micro_discrete(X = X,
+                                        Y = Y,
+                                        j_ref = 1)
+  
+  
+  testthat::expect_lte(  max(abs(ml_fit -  
+                                   rbind(fit_discrete[1,] - mean(fit_discrete[1,]), 
+                                         fit_discrete[2,] - mean(fit_discrete[2,]),
+                                         fit_discrete[3,] - mean(fit_discrete[3,])))),
+                         5e-3)
+  
+  testthat::expect_lte(max(abs(ml_fit - ml_fit_fisher)), 5e-3)
+  
+  ord <- sample(1:60, 60, replace = FALSE)
+  X_reorder <- X[ord, ]
+  Y_reorder <- Y[ord, ]
+  
+  ml_fit_reord <- emuFit_micro(X = X_reorder,
+                               Y = Y_reorder,
+                               constraint_fn = rep(list(function(x) mean(x)), 3), 
+                               maxit = 200,
+                               tolerance = 1e-3,
+                               verbose = FALSE)
+  ml_fit_fisher_reord <- emuFit_micro(X = X_reorder,
+                                      Y = Y_reorder,
+                                      constraint_fn = rep(list(function(x) mean(x)), 3), 
+                                      maxit = 200,
+                                      tolerance = 1e-3,
+                                      verbose = FALSE,
+                                      use_discrete = FALSE)
+  
+  
+  fit_discrete_reord <- emuFit_micro_discrete(X = X_reorder,
+                                              Y = Y_reorder,
+                                              j_ref = 1)
+  
+  
+  testthat::expect_lte(max(abs(fit_discrete - fit_discrete_reord)), 1e-8)
+  testthat::expect_lte(max(abs(ml_fit - ml_fit_reord)), 1e-8)
+  testthat::expect_lte(max(abs(ml_fit_reord - ml_fit_fisher_reord)), 5e-3)
+  
+})
+
+test_that("discrete works with wirbel data, binary design", {
+  
+  skip("too long for automated testing")
+  
+  data("wirbel_sample")
+  data("wirbel_otu")
+  data("wirbel_taxonomy")
+  wirbel_phylo <- phyloseq::phyloseq(phyloseq::sample_data(wirbel_sample),
+                                     phyloseq::otu_table(wirbel_otu, taxa_are_rows = FALSE),
+                                     phyloseq::tax_table(wirbel_taxonomy))
+  wirbel_genus <- phyloseq::tax_glom(wirbel_phylo, taxrank = "genus")
+  zero_taxa <- phyloseq::taxa_sums(wirbel_genus) == 0
+  wirbel_genus <- phyloseq::prune_taxa(zero_taxa == FALSE, wirbel_genus)
+  wirbel_genus <- phyloseq::prune_samples(phyloseq::sample_sums(wirbel_genus) > 0, 
+                                          wirbel_genus)
+  design <- make_design_matrix(formula = ~ Group, Y = wirbel_genus)
+  Y <- as.matrix(phyloseq::otu_table(wirbel_genus))
+  
+  ch_fit_old <- emuFit_micro_penalized(X = design, Y = Y,
+                             constraint_fn = rep(list(pseudohuber_median), 6), 
+                             use_discrete = FALSE)
+  ch_fit <- emuFit_micro_penalized(X = design, Y = Y, 
+                         constraint_fn = rep(list(pseudohuber_median), 6), 
+                         use_discrete = TRUE)
+  
+  expect_true(all.equal(ch_fit_old$B, ch_fit$B, tol = 0.001))
+  
+})
+
+test_that("discrete works with wirbel data, more categories", {
+  
+  skip("too long for automated testing")
+  
+  data("wirbel_sample")
+  data("wirbel_otu")
+  data("wirbel_taxonomy")
+  wirbel_phylo <- phyloseq::phyloseq(phyloseq::sample_data(wirbel_sample),
+                                     phyloseq::otu_table(wirbel_otu, taxa_are_rows = FALSE),
+                                     phyloseq::tax_table(wirbel_taxonomy))
+  wirbel_genus <- phyloseq::tax_glom(wirbel_phylo, taxrank = "genus")
+  zero_taxa <- phyloseq::taxa_sums(wirbel_genus) == 0
+  wirbel_genus <- phyloseq::prune_taxa(zero_taxa == FALSE, wirbel_genus)
+  wirbel_genus <- phyloseq::prune_samples(phyloseq::sample_sums(wirbel_genus) > 0, 
+                                          wirbel_genus)
+  design <- make_design_matrix(formula = ~ Study, Y = wirbel_genus)
+  Y <- as.matrix(phyloseq::otu_table(wirbel_genus))
+  
+  ch_fit_old <- emuFit_micro_penalized(X = design, Y = Y,
+                                       constraint_fn = rep(list(pseudohuber_median), 6), 
+                                       use_discrete = FALSE)
+  ch_fit <- emuFit_micro_penalized(X = design, Y = Y, 
+                                   constraint_fn = rep(list(pseudohuber_median), 6), 
+                                   use_discrete = TRUE)
+  
+  expect_true(all.equal(ch_fit_old$B, ch_fit$B, tol = 0.001))
+  
+  ord <- sample(1:nrow(design), nrow(design), replace = FALSE)
+  ch_fit_reord <- emuFit_micro_penalized(X = design[ord, ], Y = Y[ord, ], 
+                                   constraint_fn = rep(list(pseudohuber_median), 6), 
+                                   use_discrete = TRUE)
+  
+  expect_true(all.equal(ch_fit_reord$B, ch_fit$B, tol = 0.001))
+  
+})
diff --git a/tests/testthat/test-emuFit_micro_penalized.R b/tests/testthat/test-emuFit_micro_penalized.R
index fa9de7d..c0d276c 100644
--- a/tests/testthat/test-emuFit_micro_penalized.R
+++ b/tests/testthat/test-emuFit_micro_penalized.R
@@ -107,21 +107,8 @@ less efficient implementation (and that both substantially differ from MLE", {
                                        maxit = 1000,
                                        tolerance = 1e-3,
                                        verbose= FALSE)
-  #using old implementation should trigger a message explaining there's no
-  #reason to do this except for testing
-  suppressMessages(expect_message(
-    pl_fit_old <- emuFit_micro_penalized(X,
-                                         Y,
-                                         B = NULL,
-                                         constraint_fn = rep(list(function(x) mean(x)), 2),
-                                         maxit = 10000,
-                                         tolerance = 1e-3,
-                                         verbose= TRUE,
-                                         use_legacy_augmentation = TRUE)))
   
   expect_true(max(abs(ml_fit - pl_fit_new$B))>1)
-  expect_true(max(abs(ml_fit - pl_fit_old$B))>1)
-  expect_equal(pl_fit_new,pl_fit_old,tolerance = 1e-6)
 })
 
 
@@ -158,21 +145,8 @@ less efficient implementation (and that both substantially differ from MLE", {
                                        maxit = 10000,
                                        tolerance = 0.001,
                                        verbose= FALSE)
-  #using old implementation should trigger a message explaining there's no
-  #reason to do this except for testing
-  suppressMessages(expect_message(
-    pl_fit_old <- emuFit_micro_penalized(X,
-                                         Y,
-                                         B = NULL,
-                                         constraint_fn = rep(list(function(x) mean(x)), 2),
-                                         maxit = 10000,
-                                         tolerance = 0.001,
-                                         verbose= TRUE,
-                                         use_legacy_augmentation = TRUE)))
   
   expect_true(max(abs(ml_fit - pl_fit_new$B))>0.5)
-  expect_true(max(abs(ml_fit - pl_fit_old$B))>0.5)
-  expect_equal(pl_fit_new,pl_fit_old)
 })
 
 test_that("penalized fit uses B if given, and therefore fit is quicker", {
diff --git a/tests/testthat/test-macro_fisher_null.R b/tests/testthat/test-macro_fisher_null.R
index 71a91a3..c00e753 100644
--- a/tests/testthat/test-macro_fisher_null.R
+++ b/tests/testthat/test-macro_fisher_null.R
@@ -107,7 +107,7 @@ test_that("We take same step as we'd take using numerical derivatives when gap,
 
   min_ratio <- min(update$update/n_update,na.rm = TRUE)
 
-  expect_equal(max_ratio,min_ratio,tolerance = 1e-3)
+  expect_equal(max_ratio,min_ratio,tolerance = 1e-2)
 
 })