Upgrade to v3 of testthat, prepare for covariates
kolesarm committed Mar 7, 2024
1 parent 96fba02 commit f3ee09a
Showing 15 changed files with 311 additions and 297 deletions.
11 changes: 6 additions & 5 deletions DESCRIPTION
@@ -1,6 +1,6 @@
Package: RDHonest
Title: Honest Inference in Regression Discontinuity Designs
Version: 0.4.1
Version: 0.5.0
Authors@R:
c(person(given = "Michal",
family = "Kolesár",
@@ -14,7 +14,7 @@ Authors@R:
Description: Honest and nearly-optimal confidence intervals in fuzzy and sharp
regression discontinuity designs and for inference at a point based on local
linear regression.
Depends: R (>= 4.1.0)
Depends: R (>= 4.3.0)
License: GPL-3
Encoding: UTF-8
LazyData: true
@@ -28,8 +28,9 @@ Suggests:
knitr,
rmarkdown,
formatR
RoxygenNote: 7.3.1
URL: https://github.com/kolesarm/RDHonest
VignetteBuilder: knitr
Config/testthat/edition: 3
Language: en-US
URL: https://github.com/kolesarm/RDHonest
BugReports: https://github.com/kolesarm/RDHonest/issues
RoxygenNote: 7.3.1
VignetteBuilder: knitr
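
The DESCRIPTION changes above opt the package into testthat's third edition via Config/testthat/edition: 3. A minimal sketch of how this field is typically added and what it implies for the test files changed below (assumes the usethis package; not part of the commit):

## Run once inside the package directory to record the edition in DESCRIPTION
usethis::use_testthat(edition = 3)
## Under edition 3, context() calls are deprecated (they are deleted in the
## test files below); testthat::local_edition(2) can restore the old behaviour
## inside a single test if ever needed.
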
4 changes: 2 additions & 2 deletions R/NPRfunctions.R
@@ -12,8 +12,8 @@ LPReg <- function(X, Y, h, K, order=1, se.method=NULL, sigma2, J=3,
Gamma <- crossprod(R, W*R)

if (h==0 || inherits(try(solve(Gamma), silent=TRUE), "try-error"))
return(list(theta=0, sigma2=NA, var=NA, w=0, eff.obs=0))

return(list(theta=0, sigma2=NA*Y, res=NA*Y,
var=NA*stats::var(Y), est_w=W*NA, eff.obs=0))
## weights if we think of the estimator as linear estimator
wgt <- (W*R %*% solve(Gamma))[, 1]
beta <- solve(Gamma, crossprod(R, W*Y))
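
The new early return in LPReg hands back NA placeholders with the same shapes as a successful fit (per-observation sigma2, residuals, and estimation weights) instead of scalars, so callers that index into these components keep working when the local design matrix is singular. A standalone sketch of the pattern, illustrative only and not package code:

## Fall back to NA objects shaped like the data when the weighted Gram matrix
## cannot be inverted (e.g. bandwidth zero or collinear regressors).
safe_lpreg_fallback <- function(R, W, Y) {
  Gamma <- crossprod(R, W * R)
  if (inherits(try(solve(Gamma), silent = TRUE), "try-error"))
    return(list(theta = 0, sigma2 = NA * Y, res = NA * Y,
                var = NA * stats::var(Y), est_w = NA * W, eff.obs = 0))
  beta <- solve(Gamma, crossprod(R, W * Y))
  list(theta = beta[1], eff.obs = sum(W > 0))
}
## Example: a zero column makes Gamma singular, so the fallback is returned
safe_lpreg_fallback(cbind(1, rep(0, 5)), rep(1, 5), rnorm(5))$theta
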
73 changes: 47 additions & 26 deletions R/RDHonest.R
@@ -50,8 +50,8 @@
#'
#' }
#'
#' The methods use conditional variance given by \code{sigmaY2}, if supplied.
#' Otherwise, for the purpose of estimating the optimal bandwidth,
#' The methods use conditional variance given by \code{sigmaY2}, if
#' supplied. Otherwise, for the purpose of estimating the optimal bandwidth,
#' conditional variance is estimated using the method specified by
#' \code{se.initial}.
#' @param beta Determines quantile of excess length to optimize, if bandwidth
@@ -60,19 +60,19 @@
#' @param alpha determines confidence level, \code{1-alpha} for
#' constructing/optimizing confidence intervals.
#' @param M Bound on second derivative of the conditional mean function.
#' @param sclass Smoothness class, either \code{"T"} for Taylor or
#' \code{"H"} for Hölder class.
#' @param sclass Smoothness class, either \code{"T"} for Taylor or \code{"H"}
#' for Hölder class.
#' @param h bandwidth, a scalar parameter. If not supplied, optimal bandwidth is
#' computed according to criterion given by \code{opt.criterion}.
#' @param weights Optional vector of weights to weight the observations
#' (useful for aggregated data). Disregarded if optimal kernel is used.
#' @param weights Optional vector of weights to weight the observations (useful
#' for aggregated data). Disregarded if optimal kernel is used.
#' @param point.inference Do inference at a point determined by \code{cutoff}
#' instead of RD.
#' @param T0 Initial estimate of the treatment effect for calculating the
#' optimal bandwidth. Only relevant for Fuzzy RD.
#' @param sigmaY2 Supply variance of outcome. Ignored when kernel is optimal.
#' @param sigmaD2 Supply variance of treatment (fuzzy RD only).
#' @param sigmaYD Supply covariante of treatment and outcome (fuzzy RD only).
#' @param sigmaYD Supply covariance of treatment and outcome (fuzzy RD only).
#' @param clusterid Cluster id for cluster-robust standard errors
#' @return Returns an object of class \code{"RDResults"}. The function
#' \code{print} can be used to obtain and print a summary of the results. An
@@ -140,7 +140,8 @@
RDHonest <- function(formula, data, subset, weights, cutoff=0, M,
kern="triangular", na.action, opt.criterion="MSE", h,
se.method="nn", alpha=0.05, beta=0.8, J=3, sclass="H",
T0=0, point.inference=FALSE, sigmaY2, sigmaD2, sigmaYD, clusterid) {
T0=0, point.inference=FALSE, sigmaY2, sigmaD2, sigmaYD,
clusterid) {
## construct model frame
cl <- mf <- match.call(expand.dots = FALSE)
m <- match(c("formula", "data", "subset", "weights", "na.action", "sigmaY2",
@@ -156,46 +156,66 @@ RDHonest <- function(formula, data, subset, weights, cutoff=0, M,
mf <- eval(mf, parent.frame())

## TODO: what are the supported se methods, weighting + optimal kernel etc
if (!(se.method %in% c("nn", "EHW", "supplied.var")))
stop("Unsupported se.method")

if (point.inference) {
method <- "IP"
method <- c("Value of conditional mean"="IP")
} else if (length(formula)[2]==2) {
method <- "FRD"
method <- c("Fuzzy RD parameter"="FRD")
} else {
method <- "SRD"
method <- c("Sharp RD parameter"="SRD")
}

d <- NPRData(mf, cutoff, method)
process_options(M, se.method, method, d, kern)

if (NCOL(d$W)>0 && (missing(M) || missing(h))) {
ret <- NPRHonest(d, MROT(d), kern, h, opt.criterion=opt.criterion,
alpha=alpha, beta=beta, se.method=se.method, J=J,
sclass=sclass, T0=T0)
return(0) # TODO
}

if (missing(M)) {
M <- MROT(d)
message("Using Armstrong & Kolesar (2020) ROT for smoothness constant M")
}
if (kern=="optimal" && method=="SRD") {

if (kern=="optimal") {
ret <- RDTOpt(d, M, opt.criterion, alpha, beta, se.method, J)
} else if (!missing(h)) {
ret <- NPRHonest(d, M, kern, h, alpha=alpha, se.method=se.method, J=J,
sclass=sclass, T0=T0)
} else {
ret <- NPRHonest(d, M, kern, opt.criterion=opt.criterion, alpha=alpha,
beta=beta, se.method=se.method, J=J, sclass=sclass,
T0=T0)
ret <- NPRHonest(d, M, kern, h, opt.criterion=opt.criterion,
alpha=alpha, beta=beta, se.method=se.method, J=J,
sclass=sclass, T0=T0)
}

ret$call <- cl
ret$na.action <- attr(mf, "na.action")

if (is.nan(ret$coefficients$leverage) || ret$coefficients$leverage>0.1)
if (!is.finite(ret$coefficients$leverage) || ret$coefficients$leverage>0.1)
message(paste0("Maximal leverage is large: ",
round(ret$coefficients$leverage, 2),
".\nInference may be inaccurate. ",
"Consider using bigger bandwidth."))
ret$coefficients$term <- switch(method, IP="Value of conditional mean",
SRD="Sharp RD parameter",
"Fuzzy RD parameter")
ret$coefficients$term <- names(method)
ret
}

process_options <- function(M, se.method, method, d, kern) {

[GitHub Actions / lint warning on line 203 in R/RDHonest.R] cyclocomp_linter: Functions should have cyclomatic complexity of less than 15, this has 19.
if (kern=="optimal" && method!="SRD")
stop("Optimal kernel requires sharp RD design.")

if (!missing(M)) {
if (method=="FRD" && (length(M)!=2 || !is.numeric(M)))
stop("M must be a numeric vector of length 2")
if (method!="FRD" && (length(M)!=1 || !is.numeric(M)))
stop("M must be a numeric vector of length 1.")
}
if (!(se.method %in% c("nn", "EHW", "supplied.var"))) {
stop("Unsupported se.method")
}
if (NCOL(d$W)>0 && method=="IP")
stop("Covariates not allowed when method is 'IP'.")
}


## @param d object of class \code{"RDData"}, \code{"FRDData"}, or
## \code{"LPPData"}
@@ -233,7 +254,7 @@ NPRHonest <- function(d, M, kern="triangular", h, opt.criterion, alpha=0.05,
}

## Determine bias
if (nobs==0) {
if (nobs==0 || is.na(nobs)) {
## If bandwidths too small, big bias / sd
bias <- r1$se <- sqrt(.Machine$double.xmax/10)
} else if (sclass=="T") {
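
Taken together, the refactor moves the argument validation into process_options() and routes both the fixed-h and optimal-bandwidth paths through a single NPRHonest() call. A usage sketch consistent with those checks (lee08 and rcp ship with the package; the fuzzy-RD M values below are purely illustrative):

library(RDHonest)
## Sharp RD: omitting M triggers the rule-of-thumb message for the smoothness constant
RDHonest(voteshare ~ margin, data = lee08)
## Fuzzy RD: M must now be a numeric vector of length 2 (one bound per conditional mean)
RDHonest(c ~ retired | elig_year, data = rcp, M = c(0.1, 0.01))
## Rejected by process_options(): the optimal kernel requires a sharp design
## RDHonest(c ~ retired | elig_year, data = rcp, kern = "optimal")
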
17 changes: 10 additions & 7 deletions R/utils.R
@@ -8,20 +8,23 @@ unAsIs <- function(X) {

## Class Constructor
NPRData <- function(d, cutoff, class) {
Xindex <- if (class=="FRD") 3 else 2
Xidx <- if (class=="FRD") 3 else 2
## Unclass in case generated by I()
if (is.unsorted(d[[Xindex]]))
d <- d[sort(unAsIs(d[[Xindex]]), index.return=TRUE)$ix, ]
Y <- if (class=="FRD") cbind(d[[1]], d[[2]]) else d[[1]]
if (is.unsorted(d[[Xidx]]))
d <- d[sort(unAsIs(d[[Xidx]]), index.return=TRUE)$ix, ]
Y <- drop(as.matrix(d[, seq_len(Xidx-1)]))
W <- stats::model.matrix(attr(d, "terms"), d)[, -seq_len(Xidx)]

df <- structure(list(Y=Y, X=d[[Xindex]] - cutoff, orig.cutoff=cutoff,
var.names=names(d)[1:Xindex]), class=class)
df <- structure(list(Y=Y, X=d[[Xidx]] - cutoff, orig.cutoff=cutoff, W=W,
var.names=c(names(d)[1:Xidx], colnames(W))),
class=class)
df$p <- df$X>=0
df$m <- df$X<0
if (class!="FRD") {
df$sigma2 <- d$"(sigmaY2)"
} else if (!is.null(d$"(sigmaY2)")) {
df$sigma2 <- cbind(d$"(sigmaY2)", d$"(sigmaYD)", d$"(sigmaYD)", d$"(sigmaD2)")
df$sigma2 <- cbind(d$"(sigmaY2)", d$"(sigmaYD)", d$"(sigmaYD)",
d$"(sigmaD2)")
}

df$clusterid <- d$"(clusterid)"
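
NPRData now also builds a covariate matrix W from the model-matrix columns that come after the running variable, in preparation for covariate adjustment. A rough illustration of that indexing on made-up data (this is not the package's exact model-frame handling):

d <- data.frame(y = rnorm(20), x = runif(20, -1, 1),
                z1 = rnorm(20), z2 = rnorm(20))
mf <- stats::model.frame(y ~ x + z1 + z2, d)
Xidx <- 2   # outcome in column 1, running variable in column 2 (sharp RD case)
W <- stats::model.matrix(attr(mf, "terms"), mf)[, -seq_len(Xidx)]
colnames(W) # "z1" "z2": the intercept and running-variable columns are dropped
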
14 changes: 7 additions & 7 deletions man/RDHonest.Rd

Some generated files are not rendered by default.

17 changes: 8 additions & 9 deletions tests/testthat/test_cbound.R
@@ -1,22 +1,21 @@
context("Test estimation of bound on C")

test_that("Test estimation of bound on M in Lee data", {
r <- RDHonest(voteshare~margin, data=lee08)
ma <- function(a) max(abs(a))

expect_message(r <- RDHonest(voteshare~margin, data=lee08))
r1 <- RDSmoothnessBound(r, s=100, separate=TRUE, multiple=TRUE, sclass="T")
expect_equal(r1$estimate, c(0.00147446, 0.00016410))
expect_lt(ma(r1$estimate-c(0.00147446, 0.00016410)), 3e-9)
r2 <- RDSmoothnessBound(r, s=100, separate=FALSE, multiple=TRUE, sclass="H")
expect_equal(r2$conf.low, 0.017285255)
expect_lt(ma(r2$conf.low- 0.017285255), 1e-9)
r3 <- RDSmoothnessBound(r, s=100, separate=FALSE, multiple=FALSE,
sclass="H")
expect_equal(r3$conf.low, 0.59509966)
sclass="H")$conf.low
expect_equal(r3, 0.59509966)
## Should fail, s too big
expect_error(RDSmoothnessBound(r, s=10000))
r <- RDHonest(log(earnings)~yearat14, cutoff=1947, data=cghs, M=0.023,
h=3.4)
expect_error(RDSmoothnessBound(r, s=20, separate=TRUE, multiple=FALSE))
r4 <- RDSmoothnessBound(r, s=2, separate=TRUE, multiple=FALSE)
expect_equal(r4$estimate, c(0.015359658, 0L))

expect_lt(ma(r4$estimate- c(0.015359658, 0L)), 1e-9)

## Old implementation
## r2 <- capture.output(print(r, digits=5))
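
The switch from expect_equal() to expect_lt(ma(...), tol) reflects edition 3, where expect_equal() compares doubles through waldo with a default tolerance of roughly sqrt(.Machine$double.eps); the ma() helper makes the intended tolerance explicit instead. The pattern in isolation, with illustrative numbers:

library(testthat)
ma <- function(a) max(abs(a))
est <- c(0.00147446, 0.00016410) + 1e-10  # pretend estimate with tiny numeric noise
expect_lt(ma(est - c(0.00147446, 0.00016410)), 3e-9)  # explicit 3e-9 bound
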
66 changes: 35 additions & 31 deletions tests/testthat/test_clustering.R
@@ -1,27 +1,32 @@
context("Test RD")

test_that("Test clustering formulas", {
## Everyone in their own cluster
s0 <- RDHonest(voteshare~margin, data=lee08, se.method="EHW")
s0c <- RDHonest(voteshare~margin, data=lee08, se.method="EHW",
clusterid=seq_along(lee08$margin))
expect_message(s0 <- RDHonest(voteshare~margin, data=lee08,
se.method="EHW"))
expect_message(s0c <- RDHonest(voteshare~margin, data=lee08,
se.method="EHW",
clusterid=seq_along(lee08$margin)))
expect_equal(s0$coefficients, s0c$coefficients)
rr <- rcp[1:1000, ]
f0 <- RDHonest(c~retired | elig_year, data=rr, se.method="EHW")
f0c <- RDHonest(c~retired | elig_year, data=rr, clusterid=seq_along(rr$c))
expect_message(f0 <- RDHonest(c~retired | elig_year, data=rr,
se.method="EHW"))
expect_message(f0c <- RDHonest(c~retired | elig_year, data=rr,
clusterid=seq_along(rr$c)))
expect_equal(f0$coefficients, f0c$coefficients)
p0 <- RDHonest(c~elig_year, data=rr, se.method="EHW", point.inference=TRUE)
p0c <- RDHonest(c~elig_year, data=rr, clusterid=seq_along(rr$c),
point.inference=TRUE)
expect_message(p0 <- RDHonest(c~elig_year, data=rr,
se.method="EHW", point.inference=TRUE))
expect_message(p0c <- RDHonest(c~elig_year, data=rr,
clusterid=seq_along(rr$c),
point.inference=TRUE))
expect_equal(p0$coefficients, p0c$coefficients)

## Check we match sandwich::vcovCL: uniform + triangular
set.seed(42)
clusterid <- sample(1:50, NROW(lee08), replace=TRUE)
s1h <- RDHonest(clusterid~margin, data=lee08, se.method="EHW", h=5,
kern="uniform")
s1c <- RDHonest(clusterid~margin, data=lee08, se.method="EHW", h=5,
kern="uniform", clusterid=clusterid)
expect_message(s1h <- RDHonest(clusterid~margin, data=lee08,
se.method="EHW", h=5, kern="uniform"))
expect_message(s1c <- RDHonest(clusterid~margin, data=lee08,
se.method="EHW", h=5,
kern="uniform", clusterid=clusterid))
m1 <- lm(clusterid~margin*I(margin>0), data=lee08, subset=abs(margin)<=5)
## Should equal
## sqrt(sum(s1c$data$est_w[s1c$data$est_w!=0]^2*m1$residuals^2))
@@ -30,8 +35,8 @@ test_that("Test clustering formulas", {
## sandwich::vcovCL(m1, type="HC0", cluster=clusterid[abs(lee08$margin)<=5],
## cadjust=FALSE)[3, 3]
expect_equal(as.numeric(s1c$coefficients[3])^2, 6.523026427)
s2c <- RDHonest(clusterid~margin, data=lee08, se.method="EHW",
clusterid=clusterid)
expect_message(s2c <- RDHonest(clusterid~margin, data=lee08,
se.method="EHW", clusterid=clusterid))
h <- s2c$coefficients$bandwidth
m2 <- lm(clusterid~margin*I(margin>0), data=lee08,
subset=abs(margin)<=h,
Expand All @@ -44,10 +49,10 @@ test_that("Test clustering formulas", {

set.seed(42)
clusterid <- sample(1:20, NROW(rr), replace=TRUE)
p1c <- RDHonest(c~elig_year, data=rr, clusterid=clusterid,
point.inference=TRUE, kern="uniform")
p1h <- RDHonest(c~elig_year, data=rr, se.method="EHW", point.inference=TRUE,
kern="uniform") # same bw
expect_message(p1c <- RDHonest(c~elig_year, data=rr, clusterid=clusterid,
point.inference=TRUE, kern="uniform"))
expect_message(p1h <- RDHonest(c~elig_year, data=rr, se.method="EHW",
point.inference=TRUE, kern="uniform"))
m2 <- lm(c~elig_year, data=rr, subset=abs(elig_year)<=5)
expect_equal(p1c$coefficients$estimate, unname(m2$coefficients[1]))
## sqrt(sandwich::vcovHC(m2, type="HC0")[1, 1]) 792.2299439
@@ -56,8 +61,8 @@ test_that("Test clustering formulas", {
expect_equal(p1c$coefficients$std.error^2, 723178.2655)
expect_equal(p1h$coefficients$std.error, 792.2299439)

p2c <- RDHonest(c~elig_year, data=rr, clusterid=clusterid,
point.inference=TRUE, h=8.27778063886)
expect_message(p2c <- RDHonest(c~elig_year, data=rr, clusterid=clusterid,
point.inference=TRUE, h=8.27778063886))
h <- p2c$coefficients$bandwidth
m3 <- lm(c~elig_year, data=rr, subset=abs(elig_year)<=h,
weights = (1 - abs(elig_year/h)))
@@ -67,10 +72,12 @@
expect_equal(as.numeric(p2c$coefficients[2:3]),
c(unname(m3$coefficients[1]), 919.162944))

f1c <- RDHonest(c~retired | elig_year, data=rr, clusterid=clusterid,
kern="uniform")
f2c <- RDHonest(cn~retired | elig_year, data=rr, clusterid=clusterid,
kern="uniform", h=4)
expect_message(f1c <- RDHonest(c~retired | elig_year, data=rr,
clusterid=clusterid,
kern="uniform"))
expect_message(f2c <- RDHonest(cn~retired | elig_year, data=rr,
clusterid=clusterid,
kern="uniform", h=4))
## m3 <- AER::ivreg(c~retired+elig_year+elig_year:I(elig_year>0) |
## elig_year*I(elig_year>0),
## data=rr, subset=abs(elig_year)<=5)
@@ -85,9 +92,6 @@
## Triangular kernel doesn't work with sandwich + ivreg

## Check preliminary bw calculations and rho
dd <- data.frame(y=log(rr$c), x=rr$elig_year, "(clusterid)"=clusterid)
names(dd)[3]<- "(clusterid)"
d <- NPRData(dd, cutoff=0, class="SRD")
r0 <- PrelimVar(d)
expect_equal(r0$rho, -0.0008690793197)
r0 <- RDHonest(log(c)~elig_year, h=Inf, M=1, data=rr, clusterid=clusterid)
expect_equal(PrelimVar(r0$d)$rho, -0.0008690793197)
})
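
For reference, the sandwich::vcovCL cross-check described in the comments above can be run on its own (requires the sandwich package; lee08 ships with RDHonest):

library(RDHonest)
library(sandwich)
set.seed(42)
clusterid <- sample(1:50, NROW(lee08), replace = TRUE)
m1 <- lm(clusterid ~ margin * I(margin > 0), data = lee08,
         subset = abs(margin) <= 5)
## Cluster-robust variance of the jump coefficient; the test expects the squared
## RDHonest standard error to match this value (about 6.523)
vcovCL(m1, type = "HC0", cluster = clusterid[abs(lee08$margin) <= 5],
       cadjust = FALSE)[3, 3]
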