diff --git a/R/NPRfunctions.R b/R/NPRfunctions.R index 8576a5f..97d2534 100644 --- a/R/NPRfunctions.R +++ b/R/NPRfunctions.R @@ -27,7 +27,7 @@ NPReg <- function(d, h, kern="triangular", order=1, se.method="nn", J=3) { be <- as.matrix(r0$coefficients) if (any(is.na(be[1:Lz, ]))) { return(list(estimate=0, se=NA, est_w=W*0, sigma2=NA*d$Y, eff.obs=0, - fs=NA, lm=r0, Z=Z)) + fs=NA, lm=r0, Yadj=d$Y)) } ## If the collinearity comes from covariates, drop them if (any(is.na(be[-(1:Lz), ]))) { @@ -38,6 +38,11 @@ NPReg <- function(d, h, kern="triangular", order=1, se.method="nn", J=3) { collapse=", ")) r0 <- stats::lm.wfit(x=Z, y=d$Y, w=W) } + Yadj <- d$Y + if (NCOL(Z)> Lz) { + Yadj <- d$Y - Z[, -(1:Lz), drop=FALSE] %*% + as.matrix(r0$coefficients)[-(1:Lz), , drop=FALSE] + } class(r0) <- c(if (ncol(d$Y)>1) "mlm", "lm") wgt <- W*0 @@ -54,14 +59,14 @@ NPReg <- function(d, h, kern="triangular", order=1, se.method="nn", J=3) { HC <- function(r) r[, rep(seq_len(ny), each=ny)] * r[, rep(seq_len(ny), ny)] ## For RD, compute variance separately on either side of cutoff - ## TODO: better implement + ## using covariate-adjusted outcome NN <- function(X) { res <- matrix(0, nrow=length(X), ncol=ny^2) res[ok] <- if (!inherits(d, "IP")) - rbind(as.matrix(sigmaNN(X[d$m & ok], d$Y[d$m & ok, ], J, + rbind(as.matrix(sigmaNN(X[d$m & ok], Yadj[d$m & ok, ], J, d$w[d$m & ok])), - as.matrix(sigmaNN(X[d$p & ok], d$Y[d$p & ok, ], J, + as.matrix(sigmaNN(X[d$p & ok], Yadj[d$p & ok, ], J, d$w[d$p & ok]))) else sigmaNN(X[ok], d$Y[ok, ], J, d$w[ok]) @@ -83,7 +88,7 @@ NPReg <- function(d, h, kern="triangular", order=1, se.method="nn", J=3) { V <- as.vector(crossprod(us)) } ret <- list(estimate=r0$coefficients[1], se=sqrt(V[1]), est_w=wgt, - sigma2=hsigma2, eff.obs=eff.obs, fs=NA, lm=r0, Z=Z) + sigma2=hsigma2, eff.obs=eff.obs, fs=NA, lm=r0, Yadj=Yadj) if (inherits(d, "FRD")) { ret$fs <- r0$coefficients[1, 2] diff --git a/R/RDHonest.R b/R/RDHonest.R index ed3df90..0917d55 100644 --- a/R/RDHonest.R +++ b/R/RDHonest.R @@ -210,7 +210,8 @@ RDHonest <- function(formula, data, subset, weights, cutoff=0, M, if (missing(M)) { M <- MROT(d) - message("Using Armstong & Kolesar (2020) ROT for smoothness constant M") + message(paste0("Using Armstrong & Kolesar (2020) ROT ", + "for smoothness constant M")) } if (kernel_type(kern)=="optimal") { @@ -234,17 +235,9 @@ RDHonest <- function(formula, data, subset, weights, cutoff=0, M, } covariate_adjust <- function(d, kern, h) { - if (is.null(d$Y_unadj)) d$Y_unadj <- d$Y - d0 <- d - d0$Y <- d0$Y_unadj - r <- NPReg(d0, h, kern, order=1, se.method="EHW") - be <- as.matrix(r$lm$coefficients) - if (r$eff.obs==0) - stop("Too few effective observations to perform covariate adjustment") - ## Original - ## L <- NCOL(d$covs) - ## d$Y <-d$Y_unadj-d$covs %*% be[seq_len(L)+NROW(be)-L, , drop=FALSE] - d$Y <-d$Y_unadj-r$Z[, -(1:4)] %*% be[-(1:4), , drop=FALSE] + r <- NPReg(d, h, kern, order=1, se.method="EHW") + d$Y_unadj <- d$Y + d$Y <- r$Yadj d } diff --git a/doc/RDHonest.R b/doc/RDHonest.R index 9a1a9f2..2ed77e4 100644 --- a/doc/RDHonest.R +++ b/doc/RDHonest.R @@ -90,7 +90,7 @@ RDHonest(log(cn) | retired ~ elig_year | education, data=rcp, f3 <- RDScatter(log(cn)~elig_year, data=rcp, cutoff=0, avg=Inf, xlab="Years to eligibility", ylab="Log consumption of non-durables", propdotsize=TRUE, - subset=abs(elig_year)<10) + subset=abs(elig_year)<15) ## Adjust size of dots if they are too big f3 + ggplot2::scale_size_area(max_size = 5) diff --git a/doc/RDHonest.Rmd b/doc/RDHonest.Rmd index 60a0596..11bcbed 100644 --- a/doc/RDHonest.Rmd +++ b/doc/RDHonest.Rmd @@ -726,16 +726,16 @@ RDHonest(log(cn) | retired ~ elig_year | education, data=rcp, Relative to the previous estimate without covariates, the point estimate is now much larger. This is in part due to slightly smaller bandwidth used, and the regression function for the reduced form appears noisy below the cutoff, -potentially due to measurement error: see Figure 3. The noise is also -responsible for the rather large data-driven estimates of the curvature -parameters. +potentially due to measurement error: see Figure 3. As a result, the estimates +are quite sensitive to the bandwidth used. The noise is also responsible for the +rather large data-driven estimates of the curvature parameters. ```{r, fig.width=4.5, fig.height=3.5, fig.cap="Battistin et al (2009) data"} ## see Figure 3 f3 <- RDScatter(log(cn)~elig_year, data=rcp, cutoff=0, avg=Inf, xlab="Years to eligibility", ylab="Log consumption of non-durables", propdotsize=TRUE, - subset=abs(elig_year)<10) + subset=abs(elig_year)<15) ## Adjust size of dots if they are too big f3 + ggplot2::scale_size_area(max_size = 5) ``` diff --git a/doc/RDHonest.pdf b/doc/RDHonest.pdf index a62fb06..b9299aa 100644 Binary files a/doc/RDHonest.pdf and b/doc/RDHonest.pdf differ diff --git a/doc/manual.pdf b/doc/manual.pdf index 31cc9a0..68f7fcf 100644 Binary files a/doc/manual.pdf and b/doc/manual.pdf differ diff --git a/tests/testthat/test_covariates.R b/tests/testthat/test_covariates.R index 6d8c112..a9b0ad3 100644 --- a/tests/testthat/test_covariates.R +++ b/tests/testthat/test_covariates.R @@ -60,21 +60,24 @@ test_that("Test collinear covariates", { df <- headst[complete.cases(headst), ] covlist <- "urban+black" fh1 <- as.formula(paste0("mortHS ~ povrate|", covlist)) - r0 <- RDHonest(fh1, data=df) + expect_message(r0 <- RDHonest(fh1, data=df), "Using Armstrong") Z <- cbind(df$urban, df$black, df$urban) - expect_message(r1 <- RDHonest(df$mortHS~df$povrate|Z, M=r0$coefficients$M)) + expect_message(expect_message(r1 <- RDHonest(df$mortHS~df$povrate|Z, + M=r0$coefficients$M))) expect_lt(max(abs(r1$coefficients[2:13]-r0$coefficients[2:13])), 1e-15) Z <- cbind(df$urban, df$urban, df$black, df$black, df$urban+df$black) - expect_message(r2 <- RDHonest(df$mortHS~df$povrate|Z)) + suppressMessages(expect_message(r2 <- RDHonest(df$mortHS~df$povrate|Z))) expect_lt(max(abs(r2$coefficients[2:13]-r0$coefficients[2:13])), 1e-15) - expect_error(RDHonest(df$mortHS~df$povrate|Z, h=0.05, kern="uniform")) + expect_message(expect_message(r <- RDHonest(df$mortHS~df$povrate|Z, h=0.05, + kern="uniform")), "large") ## Fuzzy df <- rcp[1:1000, ] Z <- cbind(df$food, df$food+df$family_size, df$family_size) - r0 <- RDHonest(log(c)|retired ~ elig_year|Z[, 1:2], data=df, - weights=survey_year) - expect_message(r1 <- RDHonest(log(c)|retired ~ elig_year|Z, data=df, - weights=survey_year)) + expect_message(r0 <- RDHonest(log(c)|retired ~ elig_year|Z[, 1:2], data=df, + weights=survey_year), "Using Armstrong") + suppressMessages(expect_message(r1 <- RDHonest(log(c)|retired ~ elig_year|Z, + data=df, + weights=survey_year))) expect_lt(max(abs(r1$coefficients[2:13]-r0$coefficients[2:13])), 1e-15) }) diff --git a/vignettes/RDHonest.Rmd b/vignettes/RDHonest.Rmd index 11bcbed..80b6796 100644 --- a/vignettes/RDHonest.Rmd +++ b/vignettes/RDHonest.Rmd @@ -700,7 +700,7 @@ rc We see that the inclusion of covariates leads to a reduction in the rule-of-thumb curvature and also smaller standard errors (this would be true even if the bandwidth was kept fixed). Correspondingly, the CIs are tighter by -about 6 percentage points: +about 9 percentage points: ```{r} ci_len <- c(rc$coefficients$conf.high-rc$coefficients$conf.low, rn$coefficients$conf.high-rn$coefficients$conf.low)