NN uses covariate-adjusted outcome

kolesarm · Dec 12, 2024 · 06acbf2 · 06acbf2
1 parent 38a6cba
commit 06acbf2
Show file tree

Hide file tree

Showing 8 changed files with 32 additions and 31 deletions.
diff --git a/R/NPRfunctions.R b/R/NPRfunctions.R
@@ -27,7 +27,7 @@ NPReg <- function(d, h, kern="triangular", order=1, se.method="nn", J=3) {
     be <- as.matrix(r0$coefficients)
     if (any(is.na(be[1:Lz, ]))) {
         return(list(estimate=0, se=NA, est_w=W*0, sigma2=NA*d$Y, eff.obs=0,
-                    fs=NA, lm=r0, Z=Z))
+                    fs=NA, lm=r0, Yadj=d$Y))
     }
     ## If the collinearity comes from covariates, drop them
     if (any(is.na(be[-(1:Lz), ]))) {
@@ -38,6 +38,11 @@ NPReg <- function(d, h, kern="triangular", order=1, se.method="nn", J=3) {
                       collapse=", "))
         r0 <- stats::lm.wfit(x=Z, y=d$Y, w=W)
     }
+    Yadj <- d$Y
+    if (NCOL(Z)> Lz) {
+        Yadj <- d$Y - Z[, -(1:Lz), drop=FALSE] %*%
+            as.matrix(r0$coefficients)[-(1:Lz), , drop=FALSE]
+    }
 
     class(r0) <- c(if (ncol(d$Y)>1) "mlm", "lm")
     wgt <- W*0
@@ -54,14 +59,14 @@ NPReg <- function(d, h, kern="triangular", order=1, se.method="nn", J=3) {
     HC <- function(r) r[, rep(seq_len(ny), each=ny)] * r[, rep(seq_len(ny), ny)]
 
     ## For RD, compute variance separately on either side of cutoff
-    ## TODO: better implement
+    ## using covariate-adjusted outcome
     NN <- function(X) {
         res <- matrix(0, nrow=length(X), ncol=ny^2)
         res[ok] <-
             if (!inherits(d, "IP"))
-                rbind(as.matrix(sigmaNN(X[d$m & ok], d$Y[d$m & ok, ], J,
+                rbind(as.matrix(sigmaNN(X[d$m & ok], Yadj[d$m & ok, ], J,
                                         d$w[d$m & ok])),
-                      as.matrix(sigmaNN(X[d$p & ok], d$Y[d$p & ok, ], J,
+                      as.matrix(sigmaNN(X[d$p & ok], Yadj[d$p & ok, ], J,
                                         d$w[d$p & ok])))
             else
                 sigmaNN(X[ok], d$Y[ok, ], J, d$w[ok])
@@ -83,7 +88,7 @@ NPReg <- function(d, h, kern="triangular", order=1, se.method="nn", J=3) {
         V <- as.vector(crossprod(us))
     }
     ret <- list(estimate=r0$coefficients[1], se=sqrt(V[1]), est_w=wgt,
-                sigma2=hsigma2, eff.obs=eff.obs, fs=NA, lm=r0, Z=Z)
+                sigma2=hsigma2, eff.obs=eff.obs, fs=NA, lm=r0, Yadj=Yadj)
 
     if (inherits(d, "FRD")) {
         ret$fs <- r0$coefficients[1, 2]

diff --git a/R/RDHonest.R b/R/RDHonest.R
@@ -210,7 +210,8 @@ RDHonest <- function(formula, data, subset, weights, cutoff=0, M,
 
     if (missing(M)) {
         M <- MROT(d)
-        message("Using Armstong & Kolesar (2020) ROT for smoothness constant M")
+        message(paste0("Using Armstrong & Kolesar (2020) ROT ",
+                       "for smoothness constant M"))
     }
 
     if (kernel_type(kern)=="optimal") {
@@ -234,17 +235,9 @@ RDHonest <- function(formula, data, subset, weights, cutoff=0, M,
 }
 
 covariate_adjust <- function(d, kern, h) {
-    if (is.null(d$Y_unadj)) d$Y_unadj <- d$Y
-    d0 <- d
-    d0$Y <- d0$Y_unadj
-    r <- NPReg(d0, h, kern, order=1, se.method="EHW")
-    be <- as.matrix(r$lm$coefficients)
-    if (r$eff.obs==0)
-        stop("Too few effective observations to perform covariate adjustment")
-    ## Original
-    ## L <- NCOL(d$covs)
-    ## d$Y <-d$Y_unadj-d$covs %*% be[seq_len(L)+NROW(be)-L, , drop=FALSE]
-    d$Y <-d$Y_unadj-r$Z[, -(1:4)] %*% be[-(1:4), , drop=FALSE]
+    r <- NPReg(d, h, kern, order=1, se.method="EHW")
+    d$Y_unadj <- d$Y
+    d$Y <- r$Yadj
     d
 }
 

diff --git a/doc/RDHonest.R b/doc/RDHonest.R
@@ -90,7 +90,7 @@ RDHonest(log(cn) | retired ~ elig_year | education, data=rcp,
 f3 <- RDScatter(log(cn)~elig_year, data=rcp, cutoff=0, avg=Inf,
                 xlab="Years to eligibility",
                 ylab="Log consumption of non-durables", propdotsize=TRUE,
-                subset=abs(elig_year)<10)
+                subset=abs(elig_year)<15)
 ## Adjust size of dots if they are too big
 f3 + ggplot2::scale_size_area(max_size = 5)
 

diff --git a/doc/RDHonest.Rmd b/doc/RDHonest.Rmd
@@ -726,16 +726,16 @@ RDHonest(log(cn) | retired ~ elig_year | education, data=rcp,
 Relative to the previous estimate without covariates, the point estimate is now
 much larger. This is in part due to slightly smaller bandwidth used, and the
 regression function for the reduced form appears noisy below the cutoff,
-potentially due to measurement error: see Figure 3. The noise is also
-responsible for the rather large data-driven estimates of the curvature
-parameters.
+potentially due to measurement error: see Figure 3. As a result, the estimates
+are quite sensitive to the bandwidth used. The noise is also responsible for the
+rather large data-driven estimates of the curvature parameters.
 
 ```{r, fig.width=4.5, fig.height=3.5, fig.cap="Battistin et al (2009) data"}
 ## see Figure 3
 f3 <- RDScatter(log(cn)~elig_year, data=rcp, cutoff=0, avg=Inf,
                 xlab="Years to eligibility",
                 ylab="Log consumption of non-durables", propdotsize=TRUE,
-                subset=abs(elig_year)<10)
+                subset=abs(elig_year)<15)
 ## Adjust size of dots if they are too big
 f3 + ggplot2::scale_size_area(max_size = 5)
 ```

diff --git a/doc/RDHonest.pdf b/doc/RDHonest.pdf
diff --git a/doc/manual.pdf b/doc/manual.pdf
diff --git a/tests/testthat/test_covariates.R b/tests/testthat/test_covariates.R
@@ -60,21 +60,24 @@ test_that("Test collinear covariates", {
     df <- headst[complete.cases(headst), ]
     covlist <- "urban+black"
     fh1 <- as.formula(paste0("mortHS ~ povrate|", covlist))
-    r0 <- RDHonest(fh1, data=df)
+    expect_message(r0 <- RDHonest(fh1, data=df), "Using Armstrong")
     Z <- cbind(df$urban, df$black, df$urban)
-    expect_message(r1 <- RDHonest(df$mortHS~df$povrate|Z, M=r0$coefficients$M))
+    expect_message(expect_message(r1 <- RDHonest(df$mortHS~df$povrate|Z,
+                                                 M=r0$coefficients$M)))
     expect_lt(max(abs(r1$coefficients[2:13]-r0$coefficients[2:13])), 1e-15)
     Z <- cbind(df$urban, df$urban, df$black, df$black, df$urban+df$black)
-    expect_message(r2 <- RDHonest(df$mortHS~df$povrate|Z))
+    suppressMessages(expect_message(r2 <- RDHonest(df$mortHS~df$povrate|Z)))
     expect_lt(max(abs(r2$coefficients[2:13]-r0$coefficients[2:13])), 1e-15)
-    expect_error(RDHonest(df$mortHS~df$povrate|Z, h=0.05, kern="uniform"))
+    expect_message(expect_message(r <- RDHonest(df$mortHS~df$povrate|Z, h=0.05,
+                                                kern="uniform")), "large")
 
     ## Fuzzy
     df <- rcp[1:1000, ]
     Z <- cbind(df$food, df$food+df$family_size, df$family_size)
-    r0 <- RDHonest(log(c)|retired ~ elig_year|Z[, 1:2], data=df,
-                   weights=survey_year)
-    expect_message(r1 <- RDHonest(log(c)|retired ~ elig_year|Z, data=df,
-                                  weights=survey_year))
+    expect_message(r0 <- RDHonest(log(c)|retired ~ elig_year|Z[, 1:2], data=df,
+                                  weights=survey_year), "Using Armstrong")
+    suppressMessages(expect_message(r1 <- RDHonest(log(c)|retired ~ elig_year|Z,
+                                                   data=df,
+                                                   weights=survey_year)))
     expect_lt(max(abs(r1$coefficients[2:13]-r0$coefficients[2:13])), 1e-15)
 })
diff --git a/vignettes/RDHonest.Rmd b/vignettes/RDHonest.Rmd
@@ -700,7 +700,7 @@ rc
 We see that the inclusion of covariates leads to a reduction in the
 rule-of-thumb curvature and also smaller standard errors (this would be true
 even if the bandwidth was kept fixed). Correspondingly, the CIs are tighter by
-about 6 percentage points:
+about 9 percentage points:
 ```{r}
 ci_len <- c(rc$coefficients$conf.high-rc$coefficients$conf.low,
             rn$coefficients$conf.high-rn$coefficients$conf.low)