From 321468244514287703aae93d902b182475de7072 Mon Sep 17 00:00:00 2001
From: philchalmers <rphilip.chalmers@gmail.com>
Date: Sat, 9 Nov 2024 23:15:40 -0500
Subject: [PATCH] M2 no longer needs row-removal

---
 DESCRIPTION |  2 +-
 NEWS.md     |  4 ++++
 R/M2.R      | 63 +++++++++++++++++++----------------------------------
 R/utils.R   |  9 ++++++++
 man/M2.Rd   |  7 +-----
 man/mirt.Rd |  2 +-
 6 files changed, 39 insertions(+), 48 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 0d82c0c33..9c81394e8 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,5 +1,5 @@
 Package: mirt
-Version: 1.42.5
+Version: 1.42.6
 Type: Package
 Title: Multidimensional Item Response Theory
 Authors@R: c( person("Phil", family="Chalmers", email =
diff --git a/NEWS.md b/NEWS.md
index b647045ee..db83d5801 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,9 @@
 # Changes in mirt 1.43
 
+- `M2()` family no longer requires row-wise removal of missing data to behave
+  correctly. As such, the `na.rm` argument has been removed as it is no longer
+  required (requested by Ulrich Schroeders)
+
 - Added support for latent regression ACOV/SE estimation with Oakes method
   in `mirt()`
 
diff --git a/R/M2.R b/R/M2.R
index 6707d884e..4e01a3339 100644
--- a/R/M2.R
+++ b/R/M2.R
@@ -24,8 +24,6 @@
 #' @param quadpts number of quadrature points to use during estimation. If \code{NULL},
 #'   a suitable value will be chosen based
 #'   on the rubric found in \code{\link{fscores}}
-#' @param na.rm logical; remove rows with any missing values? The M2 family of statistics
-#'   requires a complete dataset in order to be well defined
 #' @param calcNull logical; calculate statistics for the null model as well?
 #'   Allows for statistics such as the limited information TLI and CFI. Only valid when items all
 #'   have a suitable null model (e.g., those created via \code{\link{createItem}} will not)
@@ -77,8 +75,7 @@
 #' # M2 with missing data present
 #' dat[sample(1:prod(dim(dat)), 250)] <- NA
 #' mod2 <- mirt(dat, 1)
-#' # Compute stats by removing missing data row-wise
-#' M2(mod2, na.rm = TRUE)
+#' M2(mod2)
 #'
 #' # C2 statistic (useful when polytomous IRT models have too few df)
 #' pmod <- mirt(Science, 1)
@@ -88,7 +85,7 @@
 #' M2(pmod, type = 'C2')
 #'
 #' }
-M2 <- function(obj, type="M2*", calcNull = TRUE, na.rm=FALSE, quadpts = NULL, theta_lim = c(-6, 6),
+M2 <- function(obj, type="M2*", calcNull = TRUE, quadpts = NULL, theta_lim = c(-6, 6),
                 CI = .9, residmat = FALSE, QMC = FALSE, suppress = 1, ...){
     impute <- 0
     if(is(obj, 'MixtureModel'))
@@ -110,14 +107,15 @@ M2 <- function(obj, type="M2*", calcNull = TRUE, na.rm=FALSE, quadpts = NULL, th
         ret <- list()
         group <- if(is.null(attr(obj, 'MG'))) 1 else attr(obj, 'MG')
         nitems <- ncol(obj@Data$data)
-        if(any(is.na(obj@Data$data)))
-            stop('M2 can not be calculated for data with missing values.', call.=FALSE)
+        # if(any(is.na(obj@Data$data)))
+        #     stop('M2 can not be calculated for data with missing values.', call.=FALSE)
         adj <- obj@Data$mins
         dat <- t(t(obj@Data$data) - adj)
-        N <- nrow(dat)
-        p  <- colMeans(dat)
-        cross <- crossprod(dat, dat)
-        p <- c(p, cross[lower.tri(cross)]/N)
+        N <- colSums(!is.na(dat))
+        cN <- crossprod_miss(!is.na(dat), !is.na(dat))
+        p  <- colMeans(dat, na.rm = TRUE)
+        cross <- crossprod_miss(dat, dat)
+        p <- c(p, cross[lower.tri(cross)]/cN[lower.tri(cross)])
         prodlist <- attr(obj@ParObjects$pars, 'prodlist')
         K <- obj@Data$K
         pars <- obj@ParObjects$pars
@@ -220,7 +218,7 @@ M2 <- function(obj, type="M2*", calcNull = TRUE, na.rm=FALSE, quadpts = NULL, th
             E2[is.na(E2)] <- 0
             E2 <- E2 + t(E2)
             diag(E2) <- E11
-            R <- cov2cor(cross/N - outer(colMeans(dat), colMeans(dat)))
+            R <- cov2cor(cross/cN - outer(colMeans(dat, na.rm=TRUE), colMeans(dat, na.rm=TRUE)))
             Kr <- cov2cor(E2 - outer(E1, E1))
             SRMSR <- sqrt( sum((R[lower.tri(R)] - Kr[lower.tri(Kr)])^2) / sum(lower.tri(R)))
             if(residmat){
@@ -293,7 +291,7 @@ M2 <- function(obj, type="M2*", calcNull = TRUE, na.rm=FALSE, quadpts = NULL, th
                 E2[is.na(E2)] <- 0
                 E2 <- E2 + t(E2)
                 diag(E2) <- E11
-                R <- cov2cor(cross/N - outer(colMeans(dat), colMeans(dat)))
+                R <- cov2cor(cross/cN - outer(colMeans(dat, na.rm=TRUE), colMeans(dat, na.rm=TRUE)))
                 Kr <- cov2cor(E2 - outer(E1, E1))
                 SRMSR <- sqrt( sum((R[lower.tri(R)] - Kr[lower.tri(Kr)])^2) / sum(lower.tri(R)))
             } else SRMSR <- NULL
@@ -325,8 +323,15 @@ M2 <- function(obj, type="M2*", calcNull = TRUE, na.rm=FALSE, quadpts = NULL, th
             }
             itemloc <- obj@Model$itemloc
             itemloc <- itemloc[-length(itemloc)]
-            p <- c(colMeans(obj@Data$fulldata[[1L]][,-itemloc]),
-                   cross[lower.tri(cross)]/N)
+            was_na <- is.na(extract.mirt(obj, 'data'))
+            fulldata <- obj@Data$fulldata[[1L]]
+            for(i in 1:(nitems)){
+                pick <- if(i == nitems) c(itemloc[i], ncol(fulldata))
+                    else c(itemloc[i], itemloc[i+1]-1)
+                fulldata[was_na[,i], pick] <- NA
+            }
+            p <- c(colMeans(fulldata[,-itemloc], na.rm=TRUE),
+                   cross[lower.tri(cross)]/cN[lower.tri(cross)])
         } else {
             # M2 TODO
         }
@@ -338,7 +343,9 @@ M2 <- function(obj, type="M2*", calcNull = TRUE, na.rm=FALSE, quadpts = NULL, th
         } else .Call('buildXi2els_C2', nrow(delta1), nrow(delta2), ncol(PIs), nitems,
                      PIs, EIs, EIs2, Prior, abcats, abcats2)
         Xi2 <- rbind(cbind(Xi2els$Xi11, Xi2els$Xi12), cbind(t(Xi2els$Xi12), Xi2els$Xi22))
-        ret <- list(Xi2=Xi2, delta=delta, estpars=estpars, p=sqrt(N)*p, e=sqrt(N)*e, SRMSR=SRMSR)
+        Nstar <- c(N, cN[lower.tri(cN)])
+        ret <- list(Xi2=Xi2, delta=delta, estpars=estpars,
+                    p=sqrt(Nstar)*p, e=sqrt(Nstar)*e, SRMSR=SRMSR, N=Nstar)
         ret
     }
 
@@ -355,31 +362,7 @@ M2 <- function(obj, type="M2*", calcNull = TRUE, na.rm=FALSE, quadpts = NULL, th
     if(is(obj, 'MixtureClass'))
         stop('MixtureClass objects are not yet supported', call.=FALSE)
     if(QMC && is.null(quadpts)) quadpts <- 5000L
-    if(na.rm) obj <- removeMissing(obj)
-    if(na.rm) message('Sample size after row-wise response data removal: ',
-                      nrow(extract.mirt(obj, 'data')))
     if(nrow(extract.mirt(obj, 'data')) == 0L) stop('No data!', call.=FALSE)
-    if(any(is.na(obj@Data$data))){
-        if(impute == 0)
-            stop('Fit statistics cannot be computed when there are missing data.
-                 Remove cases row-wise by passing na.rm=TRUE', call.=FALSE)
-        if(residmat) stop('residmat not supported when imputing data', call.=FALSE)
-        Theta <- fscores(obj, plausible.draws = impute, QMC=QMC, leave_missing=TRUE, ...)
-        collect <- myLapply(Theta, fn, obj=obj, calcNull=calcNull,
-                            quadpts=quadpts, QMC=QMC, theta_lim=theta_lim)
-        ave <- SD <- collect[[1L]]
-        ave[ave!= 0] <- SD[SD!=0] <- 0
-        for(i in seq_len(impute))
-            ave <- ave + collect[[i]]
-        ave <- ave/impute
-        vars <- 0
-        for(i in seq_len(impute))
-            vars <- vars + (ave - collect[[i]])^2
-        SD <- sqrt(vars/impute)
-        ret <- rbind(ave, SD)
-        rownames(ret) <- c('stats', 'SD_stats')
-        return(ret)
-    }
     discrete <- FALSE
     if(is(obj, 'DiscreteClass')){
         discrete <- TRUE
diff --git a/R/utils.R b/R/utils.R
index 16ef9e34e..1416b6af5 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -2888,6 +2888,15 @@ mySapply <- function(X, FUN, progress = FALSE, ...){
     }
 }
 
+crossprod_miss <- function(x, y){
+    mat <- matrix(0, ncol(x), ncol(x))
+    for(i in 1:ncol(x))
+        for(j in 1:ncol(x))
+            if(i <= j)
+                mat[i,j] <- mat[j,i] <- sum(x[,i] * y[,j], na.rm=TRUE)
+    mat
+}
+
 printf <- function(...) {
   catf(sprintf(...))
 }
diff --git a/man/M2.Rd b/man/M2.Rd
index 52910d136..12d3fb0a7 100644
--- a/man/M2.Rd
+++ b/man/M2.Rd
@@ -8,7 +8,6 @@ M2(
   obj,
   type = "M2*",
   calcNull = TRUE,
-  na.rm = FALSE,
   quadpts = NULL,
   theta_lim = c(-6, 6),
   CI = 0.9,
@@ -30,9 +29,6 @@ M2 and M2* where only the bivariate moments are collapsed}
 Allows for statistics such as the limited information TLI and CFI. Only valid when items all
 have a suitable null model (e.g., those created via \code{\link{createItem}} will not)}
 
-\item{na.rm}{logical; remove rows with any missing values? The M2 family of statistics
-requires a complete dataset in order to be well defined}
-
 \item{quadpts}{number of quadrature points to use during estimation. If \code{NULL},
 a suitable value will be chosen based
 on the rubric found in \code{\link{fscores}}}
@@ -85,8 +81,7 @@ summary(resids[lower.tri(resids)])
 # M2 with missing data present
 dat[sample(1:prod(dim(dat)), 250)] <- NA
 mod2 <- mirt(dat, 1)
-# Compute stats by removing missing data row-wise
-M2(mod2, na.rm = TRUE)
+M2(mod2)
 
 # C2 statistic (useful when polytomous IRT models have too few df)
 pmod <- mirt(Science, 1)
diff --git a/man/mirt.Rd b/man/mirt.Rd
index 6a56ef78e..6b38d8561 100644
--- a/man/mirt.Rd
+++ b/man/mirt.Rd
@@ -1343,7 +1343,7 @@ ds <- rbind(cbind(d1=NA, d2=d), cbind(d1, d2))
 (pars <- data.frame(a=as, d=ds))
 dat <- simdata(as, ds, 2500,
   itemtype = c(rep('dich', 18), rep('partcomp', 12)))
-colMeans(dat)
+itemstats(dat)
 
 # unconditional model
 syntax <- "theta1 = 1-9, 19-30