diff --git a/.lintr b/.lintr index c82dc6d..7c45b54 100644 --- a/.lintr +++ b/.lintr @@ -1 +1,7 @@ -linters: linters_with_defaults(line_length_linter(80), commented_code_linter=NULL, infix_spaces_linter=NULL, object_name_linter = NULL, spaces_left_parentheses_linter=NULL) +linters: linters_with_defaults( + line_length_linter(80), + infix_spaces_linter=NULL, + commented_code_linter=NULL, + object_name_linter(c("CamelCase", "camelCase", "dotted.case", "snake_case")), + indentation_linter(indent=4L) + ) diff --git a/NAMESPACE b/NAMESPACE index 386f1d9..a9ab6a2 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,6 +4,6 @@ S3method(print,RDResults) export(CVb) export(RDHonest) export(RDHonestBME) +export(RDScatter) export(RDSmoothnessBound) export(RDTEfficiencyBound) -export(plot_RDscatter) diff --git a/R/Cbound.R b/R/Cbound.R index 5683954..c0caa23 100644 --- a/R/Cbound.R +++ b/R/Cbound.R @@ -33,8 +33,9 @@ #' @references{ #' #' \cite{Michal Kolesár and Christoph Rothe. Inference in regression -#' discontinuity designs with a discrete running variable. American Economic -#' Review, 108(8):2277—-2304, August 2018. \doi{10.1257/aer.20160945}} +#' discontinuity designs with a discrete running variable. +#' American Economic Review, 108(8):2277--2304, +#' August 2018. \doi{10.1257/aer.20160945}} #' #' } #' @examples @@ -43,7 +44,7 @@ #' @export RDSmoothnessBound <- function(object, s, separate=FALSE, multiple=TRUE, alpha=0.05, sclass="H") { - d <- NPRPrelimVar.fit(object$data, se.initial="EHW") + d <- PrelimVar(object$data, se.initial="EHW") ## Curvature estimate based on jth set of three points closest to zero Dk <- function(Y, X, xu, s2, j) { @@ -52,16 +53,16 @@ RDSmoothnessBound <- function(object, s, separate=FALSE, multiple=TRUE, I3 <- X >= xu[3*j*s- s+1] & X <= xu[3*j*s] lam <- (mean(X[I3])-mean(X[I2])) / (mean(X[I3])-mean(X[I1])) - den <- if (sclass=="T") { - (1-lam)*mean(X[I3]^2) + lam*mean(X[I1]^2) + mean(X[I2]^2) - } else { - (1-lam)*mean(X[I3]^2) + lam*mean(X[I1]^2) - mean(X[I2]^2) - } + if (sclass=="T") { + den <- (1-lam)*mean(X[I3]^2) + lam*mean(X[I1]^2) + mean(X[I2]^2) + } else { + den <- (1-lam)*mean(X[I3]^2) + lam*mean(X[I1]^2) - mean(X[I2]^2) + } ## Delta is lower bound on M by lemma S2 in Kolesar and Rothe - Del <- 2*(lam*mean(Y[I1])+(1-lam)*mean(Y[I3])-mean(Y[I2])) / den + Del <- 2 * (lam*mean(Y[I1]) + (1-lam) * mean(Y[I3])-mean(Y[I2])) / den ## Variance of Delta - VD <- 4*(lam^2*mean(s2[I1])/sum(I1) + - (1-lam)^2*mean(s2[I3])/sum(I3) + mean(s2[I2])/sum(I2)) / den^2 + VD <- 4 * (lam^2*mean(s2[I1])/sum(I1) + (1-lam)^2*mean(s2[I3]) / + sum(I3) + mean(s2[I2])/sum(I2)) / den^2 c(Del, sqrt(VD), mean(Y[I1]), mean(Y[I2]), mean(Y[I3]), range(X[I1]), range(X[I2]), range(X[I3])) } @@ -72,8 +73,8 @@ RDSmoothnessBound <- function(object, s, separate=FALSE, multiple=TRUE, Dpj <- function(j) Dk(d$Y[d$p], d$X[d$p], xp, d$sigma2[d$p], j) Dmj <- function(j) Dk(d$Y[d$m], abs(d$X[d$m]), xm, d$sigma2[d$m], j) - Sp <- floor(length(xp)/(3*s)) - Sm <- floor(length(xm)/(3*s)) + Sp <- floor(length(xp) / (3*s)) + Sm <- floor(length(xm) / (3*s)) if (min(Sp, Sm) == 0) stop("Value of s is too big") @@ -111,8 +112,8 @@ RDSmoothnessBound <- function(object, s, separate=FALSE, multiple=TRUE, } list(estimate=hatM, conf.low=lower, diagnostics=c(Delta=maxt[1], sdDelta=maxt[2], - y1=maxt[3], y2=maxt[4], y3=maxt[5], - I1=maxt[6:7], I2=maxt[8:9], I3=maxt[10:11])) + y1=maxt[3], y2=maxt[4], y3=maxt[5], + I1=maxt[6:7], I2=maxt[8:9], I3=maxt[10:11])) } if (separate) { diff --git a/R/NPR_lp.R b/R/NPR_lp.R deleted file mode 
100644 index ddb176f..0000000 --- a/R/NPR_lp.R +++ /dev/null @@ -1,159 +0,0 @@ -## @param d object of class \code{"RDData"}, \code{"FRDData"}, or -## \code{"LPPData"} -## @param T0 Initial estimate of the treatment effect for calculating the -## optimal bandwidth. Only relevant for Fuzzy RD. -## @param T0bias When evaluating the maximum bias of the estimate, use the -## estimate itself (if \code{T0bias==FALSE}), or use the preliminary -## estimate \code{T0} (if \code{T0bias==TRUE}). Only relevant for Fuzzy RD. -NPRHonest.fit <- function(d, M, kern="triangular", h, opt.criterion, alpha=0.05, - beta=0.8, se.method="nn", J=3, sclass="H", T0=0, - T0bias=FALSE) { - if (missing(h)) - h <- NPROptBW.fit(d, M, kern, opt.criterion, alpha, beta, sclass, T0) - r1 <- NPRreg.fit(d, h, kern, order=1, se.method, J) - - wt <- r1$w[r1$w!=0] - xx <- d$X[r1$w!=0] - if (d$class=="IP") { - nobs <- length(wt) - ## Are we at a boundary? - bd <- length(unique(d$X>=0))==1 - } else { - nobs <- min(sum(xx>=0), sum(xx<0)) - bd <- TRUE - } - - if (T0bias && d$class=="FRD") { - ## multiply bias and sd by r1$fs to make if free of first stage - r1$se <- r1$se*abs(r1$fs) - M <- unname(c((M[1]+M[2]*abs(T0)), M)) - } else if (!T0bias && d$class=="FRD") { - M <- unname(c((M[1]+M[2]*abs(r1$estimate)) / abs(r1$fs), M)) - } - - ## Determine bias - if (nobs==0) { - ## If bandwidths too small, big bias / sd - bias <- r1$se <- sqrt(.Machine$double.xmax/10) - } else if (sclass=="T") { - bias <- M[1]/2 * (sum(abs(wt*xx^2))) - } else if (sclass=="H" && bd) { - ## At boundary we know form of least favorable function - bias <- M[1]/2 * - abs(sum(wt[xx<0]*xx[xx<0]^2)-sum(wt[xx>=0]*xx[xx>=0]^2)) - } else { - ## Else need to find numerically (same formula if order=2) - w2p <- function(s) abs(sum((wt*(xx-s))[xx>=s])) - w2m <- function(s) abs(sum((wt*(s-xx))[xx<=s])) - bp <- stats::integrate(function(s) vapply(s, w2p, numeric(1)), 0, h) - bm <- stats::integrate(function(s) vapply(s, w2m, numeric(1)), -h, 0) - bias <- M[1]*(bp$value+bm$value) - } - lower <- r1$estimate - bias - stats::qnorm(1-alpha)*r1$se - upper <- r1$estimate + bias + stats::qnorm(1-alpha)*r1$se - B <- bias/r1$se - cv <- CVb(B, alpha) - term <- switch(d$class, IP="Value of conditional mean", - SRD="Sharp RD Parameter", "Fuzzy RD Parameter") - method <- switch(sclass, H="Holder", "Tayor") - if (d$class!="FRD") M[2:3] <- c(NA, NA) - d$est_w <- r1$w - d$sigma2 <- r1$sigma2 - kernel <- if (!is.function(kern)) kern else "user-supplied" - coef <- data.frame(term=term, estimate=r1$estimate, std.error=r1$se, - maximum.bias=bias, conf.low=r1$estimate-cv*r1$se, - conf.high=r1$estimate+cv*r1$se, conf.low.onesided=lower, - conf.high.onesided=upper, bandwidth=h, - eff.obs=r1$eff.obs, leverage=max(r1$w^2)/sum(r1$w^2), - cv=cv, alpha=alpha, method=method, M=M[1], M.rf=M[2], - M.fs=M[3], first.stage=r1$fs, kernel=kernel, - p.value=stats::pnorm(B-abs(r1$estimate/r1$se))+ - stats::pnorm(-B-abs(r1$estimate/r1$se))) - structure(list(coefficients=coef, data=d), class="RDResults") -} - - -## Optimal bandwidth selection in nonparametric regression -NPROptBW.fit <- function(d, M, kern="triangular", opt.criterion, alpha=0.05, - beta=0.8, sclass="H", T0=0) { - - ## First check if sigma2 is supplied - if (is.null(d$sigma2)) - d <- NPRPrelimVar.fit(d, se.initial="EHW") - - ## Objective function for optimizing bandwidth - obj <- function(h) { - r <- NPRHonest.fit(d, M, kern, h, alpha=alpha, se.method="supplied.var", - sclass=sclass, T0=T0, T0bias=TRUE)$coefficients - switch(opt.criterion, - 
OCI=2*r$maximum.bias+ - r$std.error*(stats::qnorm(1-alpha)+stats::qnorm(beta)), - MSE=r$maximum.bias^2+r$std.error^2, - FLCI=r$conf.high-r$conf.low) - } - hmin <- if (d$class=="IP") { - sort(unique(abs(d$X)))[2] - } else { - max(unique(d$X[d$p])[2], sort(unique(abs(d$X[d$m])))[2]) - } - - ## Optimize piecewise constant function using modification of golden - ## search. In fact, the criterion may not be unimodal, so proceed with - ## caution. (For triangular kernel, it appears unimodal) - if (kern=="uniform") { - supp <- sort(unique(abs(d$X))) - h <- gss(obj, supp[supp>=hmin]) - } else { - h <- abs(stats::optimize(obj, interval=c(hmin, max(abs(d$X))), - tol=.Machine$double.eps^0.75)$minimum) - } - - h -} - - - -#' @export -print.RDResults <- function(x, digits = getOption("digits"), ...) { - if (!is.null(x$call)) - cat("Call:\n", deparse(x$call), "\n\n", sep = "", fill=TRUE) - fmt <- function(x) format(x, digits=digits, width=digits+1) - y <- x$coefficients - cat("Estimates (using ", y$method, " class):\n", sep="") - nm <- c("Parameter", "Estimate", "Std. Error", "Maximum Bias") - names(y)[1:4] <- nm - y$"Confidence Interval" <- paste0("(", fmt(y$conf.low), ", ", - fmt(y$conf.high), ")") - y$OCI <- paste0("(-Inf, ", fmt(y$conf.high.onesided), "), (", - fmt(y$conf.low.onesided), ", Inf)") - print.data.frame(y[, c(nm, "Confidence Interval"), ], - digits=digits, row.names=FALSE) - cat("\nOnesided CIs: ", y$OCI) - if(!is.null(y$bandwidth)) - cat("\nBandwidth: ", fmt(y$bandwidth), ", Kernel: ", y$kernel, sep="") - else - cat("\nSmoothing parameters below and above cutoff: ", - fmt(y$bandwidth.m), ", ", - fmt(y$bandwidth.m), sep="") - - cat("\nNumber of effective observations:", fmt(y$eff.obs)) - par <- paste0(tolower(substr(y$Parameter, 1, 1)), substring(y$Parameter, 2)) - cat("\nMaximal leverage for ", par, ": ", fmt(y$leverage), - sep="") - if(!is.null(y$first.stage) && !is.na(y$first.stage)) - cat("\nFirst stage estimate:", fmt(y$first.stage), - "\nFirst stage smoothness constant M:", fmt(y$M.fs), - "\nReduced form smoothness constant M:", fmt(y$M.rf), - "\n") - else - cat("\nSmoothness constant M:", fmt(y$M), - "\n") - cat("P-value:", fmt(y$p.value), "\n") - - if (inherits(x$na.action, "omit")) - cat(length(x$na.action), "observations with missing values dropped\n") - - - - invisible(x) -} diff --git a/R/NPRfunctions.R b/R/NPRfunctions.R index ce73535..ff1ddd2 100644 --- a/R/NPRfunctions.R +++ b/R/NPRfunctions.R @@ -25,8 +25,8 @@ LPReg <- function(X, Y, h, K, order=1, se.method=NULL, sigma2, J=3, ## Squared residuals, allowing for multivariate Y sig <- function(r) { - r[, rep(seq_len(ncol(r)), each=ncol(r))] * - r[, rep(seq_len(ncol(r)), ncol(r))] + r[, rep(seq_len(ncol(r)), each=ncol(r))] * r[, rep(seq_len(ncol(r)), + ncol(r))] } ## For RD, compute variance separately on either side of cutoff signn <- function(X) { @@ -45,15 +45,15 @@ LPReg <- function(X, Y, h, K, order=1, se.method=NULL, sigma2, J=3, wgt_unif <- ((weights)*R %*% solve(crossprod(R, weights*R)))[, 1] eff.obs <- length(X)*sum(wgt_unif^2)/sum(wgt^2) ## Variance - V <- if (is.null(clusterid)) { - colSums(as.matrix(wgt^2 * hsigma2)) - } else if (se.method == "supplied.var") { - colSums(as.matrix(wgt^2 * hsigma2))+ - rho*(sum(tapply(wgt, clusterid, sum)^2)-sum(wgt^2)) - } else { - us <- apply(wgt*res, 2, function(x) tapply(x, clusterid, sum)) - as.vector(crossprod(us)) - } + if (is.null(clusterid)) { + V <- colSums(as.matrix(wgt^2 * hsigma2)) + } else if (se.method == "supplied.var") { + V <- colSums(as.matrix(wgt^2 * hsigma2))+ 
+ rho * (sum(tapply(wgt, clusterid, sum)^2)-sum(wgt^2)) + } else { + us <- apply(wgt*res, 2, function(x) tapply(x, clusterid, sum)) + V <- as.vector(crossprod(us)) + } list(theta=beta[1, ], sigma2=hsigma2, res=res, var=V, w=wgt, eff.obs=eff.obs) @@ -67,14 +67,14 @@ LPReg <- function(X, Y, h, K, order=1, se.method=NULL, sigma2, J=3, ## Calculate fuzzy or sharp RD estimate, or estimate of a conditional mean at a ## point (depending on the class of \code{d}), and its variance using local ## polynomial regression of order \code{order}. -NPRreg.fit <- function(d, h, kern="triangular", order=1, se.method="nn", J=3) { +NPReg <- function(d, h, kern="triangular", order=1, se.method="nn", J=3) { if (!is.function(kern)) kern <- EqKern(kern, boundary=FALSE, order=0) ## Keep only positive kernel weights W <- if (h<=0) 0*d$X else kern(d$X/h) # kernel weights - if(!is.null(d$sigma2)) d$sigma2 <- as.matrix(d$sigma2) + if (!is.null(d$sigma2)) d$sigma2 <- as.matrix(d$sigma2) r <- LPReg(d$X[W>0], as.matrix(d$Y)[W>0, ], h, kern, order, se.method, - d$sigma2[W>0, ], J, weights=d$w[W>0], RD=(d$class!="IP"), + d$sigma2[W>0, ], J, weights=d$w[W>0], RD = (d$class!="IP"), d$rho, d$clusterid[W>0]) sigma2 <- matrix(NA, nrow=length(W), ncol=NCOL(d$Y)^2) sigma2[W>0, ] <- r$sigma2 @@ -88,8 +88,8 @@ NPRreg.fit <- function(d, h, kern="triangular", order=1, se.method="nn", J=3) { if (d$class=="FRD") { ret$fs <- r$theta[2] ret$estimate <- r$theta[1]/r$theta[2] - ret$se <- sqrt(sum(c(1, -ret$estimate, -ret$estimate, ret$estimate^2) * - r$var) / ret$fs^2) + ret$se <- sqrt(sum(c(1, -ret$estimate, -ret$estimate, + ret$estimate^2) * r$var) / ret$fs^2) } ret } @@ -100,26 +100,22 @@ NPRreg.fit <- function(d, h, kern="triangular", order=1, se.method="nn", J=3) { ## Use global quartic regression to estimate a bound on the second derivative ## for inference under under second order Hölder class. 
For RD, use a separate ## regression on either side of the cutoff -NPR_MROT.fit <- function(d) { +MROT <- function(d) { if (d$class=="SRD") { - max(NPR_MROT.fit(NPRData(data.frame(Y=d$Y[d$p], X=d$X[d$p]), 0, "IP")), - NPR_MROT.fit(NPRData(data.frame(Y=d$Y[d$m], X=d$X[d$m]), 0, "IP"))) + max(MROT(NPRData(data.frame(Y=d$Y[d$p], X=d$X[d$p]), 0, "IP")), + MROT(NPRData(data.frame(Y=d$Y[d$m], X=d$X[d$m]), 0, "IP"))) } else if (d$class=="FRD") { - c(M1=max(NPR_MROT.fit(NPRData(data.frame(Y=d$Y[d$p, 1], X=d$X[d$p]), 0, - "IP")), - NPR_MROT.fit(NPRData(data.frame(Y=d$Y[d$m, 1], X=d$X[d$m]), 0, - "IP"))), - M2=max(NPR_MROT.fit(NPRData(data.frame(Y=d$Y[d$p, 2], X=d$X[d$p]), 0, - "IP")), - NPR_MROT.fit(NPRData(data.frame(Y=d$Y[d$m, 2], X=d$X[d$m]), 0, - "IP")))) + c(M1=max(MROT(NPRData(data.frame(Y=d$Y[d$p, 1], X=d$X[d$p]), 0, "IP")), + MROT(NPRData(data.frame(Y=d$Y[d$m, 1], X=d$X[d$m]), 0, "IP"))), + M2=max(MROT(NPRData(data.frame(Y=d$Y[d$p, 2], X=d$X[d$p]), 0, "IP")), + MROT(NPRData(data.frame(Y=d$Y[d$m, 2], X=d$X[d$m]), 0, "IP")))) } else if (d$class=="IP") { ## STEP 1: Estimate global polynomial regression r1 <- unname(stats::lm(d$Y ~ 0 + outer(d$X, 0:4, "^"))$coefficients) f2 <- function(x) abs(2*r1[3]+6*x*r1[4]+12*x^2*r1[5]) ## maximum occurs either at endpoints, or else at the extremum, ## -r1[4]/(4*r1[5]), if the extremum is in the support - f2e <- if(abs(r1[5])<=1e-10) Inf else -r1[4]/(4*r1[5]) + f2e <- if (abs(r1[5])<=1e-10) Inf else -r1[4] / (4*r1[5]) M <- max(f2(min(d$X)), f2(max(d$X))) if (min(d$X) < f2e && max(d$X) > f2e) M <- max(f2(f2e), M) diff --git a/R/RDHonest.R b/R/RDHonest.R index 2b7580d..1140bed 100644 --- a/R/RDHonest.R +++ b/R/RDHonest.R @@ -157,37 +157,199 @@ RDHonest <- function(formula, data, subset, weights, cutoff=0, M, mf[[1L]] <- quote(stats::model.frame) mf <- eval(mf, parent.frame()) - d <- if (point.inference) { - NPRData(mf, cutoff, "IP") - } else if (length(formula)[2]==2) { - NPRData(mf, cutoff, "FRD") - } else { - NPRData(mf, cutoff, "SRD") - } + if (point.inference) { + d <- NPRData(mf, cutoff, "IP") + } else if (length(formula)[2]==2) { + d <- NPRData(mf, cutoff, "FRD") + } else { + d <- NPRData(mf, cutoff, "SRD") + } if (missing(M)) { - M <- NPR_MROT.fit(d) + M <- MROT(d) message("Using Armstong & Kolesar (2020) ROT for smoothness constant M") } if (kern=="optimal") { - ret <- RDTOpt.fit(d, M, opt.criterion, alpha, beta, se.method, J) + ret <- RDTOpt(d, M, opt.criterion, alpha, beta, se.method, J) } else if (!missing(h)) { - ret <- NPRHonest.fit(d, M, kern, h, alpha=alpha, se.method=se.method, - J=J, sclass=sclass, T0=T0) + ret <- NPRHonest(d, M, kern, h, alpha=alpha, se.method=se.method, J=J, + sclass=sclass, T0=T0) } else { - ret <- NPRHonest.fit(d, M, kern, opt.criterion=opt.criterion, - alpha=alpha, beta=beta, se.method=se.method, J=J, - sclass=sclass, T0=T0) + ret <- NPRHonest(d, M, kern, opt.criterion=opt.criterion, alpha=alpha, + beta=beta, se.method=se.method, J=J, sclass=sclass, + T0=T0) } ret$call <- cl ret$na.action <- attr(mf, "na.action") - if (is.nan(ret$coefficients$leverage) || - ret$coefficients$leverage>0.1) + if (is.nan(ret$coefficients$leverage) || ret$coefficients$leverage>0.1) message(paste0("Maximal leverage is large: ", - round(ret$coefficients$leverage, 2), - ".\nInference may be inaccurate. ", - "Consider using bigger bandwidth.")) + round(ret$coefficients$leverage, 2), + ".\nInference may be inaccurate. ", + "Consider using bigger bandwidth.")) ret } + + +## @param d object of class \code{"RDData"}, \code{"FRDData"}, or +## \code{"LPPData"} +## @param T0 Initial estimate of the treatment effect for calculating the +## optimal bandwidth. Only relevant for Fuzzy RD. +## @param T0bias When evaluating the maximum bias of the estimate, use the +## estimate itself (if \code{T0bias==FALSE}), or use the preliminary +## estimate \code{T0} (if \code{T0bias==TRUE}). Only relevant for Fuzzy RD. +NPRHonest <- function(d, M, kern="triangular", h, opt.criterion, alpha=0.05, + beta=0.8, se.method="nn", J=3, sclass="H", T0=0, + T0bias=FALSE) { + if (missing(h)) + h <- OptBW(d, M, kern, opt.criterion, alpha, beta, sclass, T0) + r1 <- NPReg(d, h, kern, order=1, se.method, J) + + wt <- r1$w[r1$w!=0] + xx <- d$X[r1$w!=0] + if (d$class=="IP") { + nobs <- length(wt) + ## Are we at a boundary? + bd <- length(unique(d$X>=0))==1 + } else { + nobs <- min(sum(xx>=0), sum(xx<0)) + bd <- TRUE + } + + if (T0bias && d$class=="FRD") { + ## multiply bias and sd by r1$fs to make it free of first stage + r1$se <- r1$se*abs(r1$fs) + M <- unname(c((M[1]+M[2]*abs(T0)), M)) + } else if (!T0bias && d$class=="FRD") { + M <- unname(c((M[1]+M[2]*abs(r1$estimate)) / abs(r1$fs), M)) + } else { + M[2:3] <- c(NA, NA) + } + + ## Determine bias + if (nobs==0) { + ## If bandwidths too small, big bias / sd + bias <- r1$se <- sqrt(.Machine$double.xmax/10) + } else if (sclass=="T") { + bias <- M[1]/2 * (sum(abs(wt*xx^2))) + } else if (sclass=="H" && bd) { + ## At boundary we know form of least favorable function + bias <- M[1]/2 * + abs(sum(wt[xx<0]*xx[xx<0]^2)-sum(wt[xx>=0]*xx[xx>=0]^2)) + } else { + ## Else need to find numerically (same formula if order=2) + w2p <- function(s) abs(sum((wt * (xx-s))[xx>=s])) + w2m <- function(s) abs(sum((wt * (s-xx))[xx<=s])) + bp <- stats::integrate(function(s) vapply(s, w2p, numeric(1)), 0, h) + bm <- stats::integrate(function(s) vapply(s, w2m, numeric(1)), -h, 0) + bias <- M[1] * (bp$value+bm$value) + } + lower <- r1$estimate - bias - stats::qnorm(1-alpha)*r1$se + upper <- r1$estimate + bias + stats::qnorm(1-alpha)*r1$se + B <- bias/r1$se + cv <- CVb(B, alpha) + term <- switch(d$class, IP="Value of conditional mean", + SRD="Sharp RD Parameter", "Fuzzy RD Parameter") + method <- switch(sclass, H="Holder", "Taylor") + + d$est_w <- r1$w + d$sigma2 <- r1$sigma2 + kernel <- if (!is.function(kern)) kern else "user-supplied" + coef <- data.frame(term=term, estimate=r1$estimate, std.error=r1$se, + maximum.bias=bias, conf.low=r1$estimate-cv*r1$se, + conf.high=r1$estimate+cv*r1$se, conf.low.onesided=lower, + conf.high.onesided=upper, bandwidth=h, + eff.obs=r1$eff.obs, leverage=max(r1$w^2)/sum(r1$w^2), + cv=cv, alpha=alpha, method=method, M=M[1], M.rf=M[2], + M.fs=M[3], first.stage=r1$fs, kernel=kernel, + p.value=stats::pnorm(B-abs(r1$estimate/r1$se))+ + stats::pnorm(-B-abs(r1$estimate/r1$se))) + structure(list(coefficients=coef, data=d), class="RDResults") +} + + +## Optimal bandwidth selection in nonparametric regression +OptBW <- function(d, M, kern="triangular", opt.criterion, alpha=0.05, beta=0.8, + sclass="H", T0=0) { + + ## First check if sigma2 is supplied + if (is.null(d$sigma2)) + d <- PrelimVar(d, se.initial="EHW") + + ## Objective function for optimizing bandwidth + obj <- function(h) { + r <- NPRHonest(d, M, kern, h, alpha=alpha, se.method="supplied.var", + sclass=sclass, T0=T0, T0bias=TRUE)$coefficients + switch(opt.criterion, + OCI=2*r$maximum.bias+ + r$std.error * 
(stats::qnorm(1-alpha)+stats::qnorm(beta)), + MSE=r$maximum.bias^2+r$std.error^2, + FLCI=r$conf.high-r$conf.low) + } + if (d$class=="IP") { + hmin <- sort(unique(abs(d$X)))[2] + } else { + hmin <- max(unique(d$X[d$p])[2], sort(unique(abs(d$X[d$m])))[2]) + } + + ## Optimize piecewise constant function using modification of golden + ## search. In fact, the criterion may not be unimodal, so proceed with + ## caution. (For triangular kernel, it appears unimodal) + if (kern=="uniform") { + supp <- sort(unique(abs(d$X))) + h <- gss(obj, supp[supp>=hmin]) + } else { + h <- abs(stats::optimize(obj, interval=c(hmin, max(abs(d$X))), + tol=.Machine$double.eps^0.75)$minimum) + } + + h +} + + + +#' @export +print.RDResults <- function(x, digits = getOption("digits"), ...) { + if (!is.null(x$call)) + cat("Call:\n", deparse(x$call), "\n\n", sep = "", fill=TRUE) + fmt <- function(x) format(x, digits=digits, width=digits+1) + y <- x$coefficients + cat("Estimates (using ", y$method, " class):\n", sep="") + nm <- c("Parameter", "Estimate", "Std. Error", "Maximum Bias") + names(y)[1:4] <- nm + y$"Confidence Interval" <- paste0("(", fmt(y$conf.low), ", ", + fmt(y$conf.high), ")") + y$OCI <- paste0("(-Inf, ", fmt(y$conf.high.onesided), "), (", + fmt(y$conf.low.onesided), ", Inf)") + print.data.frame(y[, c(nm, "Confidence Interval"), ], + digits=digits, row.names=FALSE) + cat("\nOnesided CIs: ", y$OCI) + if (!is.null(y$bandwidth)) + cat("\nBandwidth: ", fmt(y$bandwidth), ", Kernel: ", y$kernel, sep="") + else + cat("\nSmoothing parameters below and above cutoff: ", + fmt(y$bandwidth.m), ", ", + fmt(y$bandwidth.m), sep="") + + cat("\nNumber of effective observations:", fmt(y$eff.obs)) + par <- paste0(tolower(substr(y$Parameter, 1, 1)), substring(y$Parameter, 2)) + cat("\nMaximal leverage for ", par, ": ", fmt(y$leverage), + sep="") + if (!is.null(y$first.stage) && !is.na(y$first.stage)) + cat("\nFirst stage estimate:", fmt(y$first.stage), + "\nFirst stage smoothness constant M:", fmt(y$M.fs), + "\nReduced form smoothness constant M:", fmt(y$M.rf), + "\n") + else + cat("\nSmoothness constant M:", fmt(y$M), + "\n") + cat("P-value:", fmt(y$p.value), "\n") + + if (inherits(x$na.action, "omit")) + cat(length(x$na.action), "observations with missing values dropped\n") + + + + invisible(x) +} diff --git a/R/RD_bme.R b/R/RD_bme.R index ccb78ed..bf0a511 100644 --- a/R/RD_bme.R +++ b/R/RD_bme.R @@ -1,13 +1,13 @@ ## Formula for local polynomial regression RDlpformula <- function(order) { - f1 <- if (order>0) { - f <- vapply(seq_len(order), - function(p) paste0("I(x^", p, ")"), - character(1)) - paste0("(", paste(f, collapse="+"), ") * I(x>=0)") - } else { - paste0("I(x>=0)") - } + if (order>0) { + f <- vapply(seq_len(order), + function(p) paste0("I(x^", p, ")"), + character(1)) + f1 <- paste0("(", paste(f, collapse="+"), ") * I(x>=0)") + } else { + f1 <- paste0("I(x>=0)") + } paste0("y ~ ", f1) } @@ -73,7 +73,7 @@ RDHonestBME <- function(formula, data, subset, cutoff=0, na.action, mf[[1L]] <- quote(stats::model.frame) mf <- eval(mf, parent.frame()) - if(missing(regformula)) + if (missing(regformula)) regformula <- RDlpformula(order) regformula <- stats::as.formula(regformula) @@ -86,7 +86,7 @@ RDHonestBME <- function(formula, data, subset, cutoff=0, na.action, ## Count effective support points support <- sort(unique(x)) G <- length(support) - G.m <- length(support[support<0]) + Gm <- length(support[support<0]) ## Estimate actual and dummied out model, and calculate delta m1 <- stats::lm(regformula) @@ -97,20 +97,19 @@ 
RDHonestBME <- function(formula, data, subset, cutoff=0, na.action, ## Compute Q^{-1} manually so that sandwich package is not needed Q1inv <- chol2inv(qr(m1)$qr[1L:m1$rank, 1L:m1$rank, drop = FALSE]) Q2inv <- chol2inv(qr(m2)$qr[1L:m2$rank, 1L:m2$rank, drop = FALSE]) - v.m1m2 <- length(y) * stats::var(cbind( - (stats::model.matrix(m1)*stats::resid(m1)) %*% Q1inv, - (stats::model.matrix(m2)*stats::resid(m2)) %*% Q2inv)) + v.m1m2 <- length(y) * + stats::var(cbind((stats::model.matrix(m1)*stats::resid(m1)) %*% Q1inv, + (stats::model.matrix(m2)*stats::resid(m2)) %*% Q2inv)) df <- data.frame(x=support, y=rep(0, length(support))) e2 <- rep(0, G+length(stats::coef(m1))) e2[order + 2] <- 1 # inference on (p+2)th element aa <- rbind(cbind(-stats::model.matrix(regformula, data=df), diag(nrow=G)), e2) - vdt <- aa %*% v.m1m2 %*% t(aa) # V(W) in paper, except order of m1 and - # m2 swapped + vdt <- aa %*% v.m1m2 %*% t(aa) # V(W) in paper, but swap order of m1 and m2 ## All possible combinations of g_-, g_+, s_-, s_+ - gr <- as.matrix(expand.grid(1:G.m, (G.m+1):G, c(-1, 1), c(-1, 1))) + gr <- as.matrix(expand.grid(1:Gm, (Gm+1):G, c(-1, 1), c(-1, 1))) selvec <- matrix(0, nrow=nrow(gr), ncol=ncol(vdt)) selvec[cbind(seq_len(nrow(selvec)), gr[, 1])] <- gr[, 3] selvec[cbind(seq_len(nrow(selvec)), gr[, 2])] <- gr[, 4] @@ -120,21 +119,21 @@ RDHonestBME <- function(formula, data, subset, cutoff=0, na.action, dev <- drop(selvec[, -ncol(selvec)] %*% delta) ## Upper and lower CIs - CI.l <- stats::coef(m1)[order+2]+dev-stats::qnorm(1-alpha/2)*se - CI.u <- stats::coef(m1)[order+2]+dev+stats::qnorm(1-alpha/2)*se + ci_l <- stats::coef(m1)[order+2]+dev-stats::qnorm(1-alpha/2)*se + ci_u <- stats::coef(m1)[order+2]+dev+stats::qnorm(1-alpha/2)*se ## Onesided - OCI.l <- stats::coef(m1)[order+2]+dev-stats::qnorm(1-alpha)*se - OCI.u <- stats::coef(m1)[order+2]+dev+stats::qnorm(1-alpha)*se + oci_l <- stats::coef(m1)[order+2]+dev-stats::qnorm(1-alpha)*se # + oci_u <- stats::coef(m1)[order+2]+dev+stats::qnorm(1-alpha)*se - l <- which.min(CI.l) - u <- which.max(CI.u) + l <- which.min(ci_l) + u <- which.max(ci_u) coef <- data.frame(term="Sharp RD parameter", estimate=unname(m1$coefficients[order +2]), std.error=sqrt(vdt["e2", "e2"]), maximum.bias=max(abs(c(dev[u], dev[l]))), - conf.low=CI.l[l], conf.high=CI.u[u], - conf.low.onesided=min(OCI.l), - conf.high.onesided=max(OCI.u), bandwidth=h, + conf.low=ci_l[l], conf.high=ci_u[u], + conf.low.onesided=min(oci_l), + conf.high.onesided=max(oci_u), bandwidth=h, eff.obs=length(x), cv=NA, alpha=alpha, method="BME", kernel="uniform") ret <- list(coefficients=coef, call=cl, na.action=attr(mf, "na.action")) diff --git a/R/RD_opt.R b/R/RD_opt.R index e43fca4..e64c051 100644 --- a/R/RD_opt.R +++ b/R/RD_opt.R @@ -39,7 +39,7 @@ RDgbC <- function(d, b, C) { dp <- dstar(d$X[d$p], bp, C, d$sigma2[d$p]) dm <- dstar(d$X[d$m], bm, C, d$sigma2[d$m]) - function(x) SY(x, bp, dp, C)*(x>=0)-SY(x, bm, dm, C)*(x<0) + function(x) SY(x, bp, dp, C) * (x>=0)-SY(x, bm, dm, C) * (x<0) } @@ -78,7 +78,7 @@ RDTEstimator <- function(d, f, alpha, se.method, J) { upper <- Lhat + maxbias + stats::qnorm(1-alpha)*sd hl <- CVb(maxbias/sd, alpha) * sd # Half-length - r.u <- NPRreg.fit(d, max(abs(d$X[W!=0])), kern="uniform") + r.u <- NPReg(d, max(abs(d$X[W!=0])), kern="uniform") eff.obs <- r.u$eff.obs*sum(r.u$w^2)/sum(W^2) d$est_w <- W @@ -94,10 +94,10 @@ RDTEstimator <- function(d, f, alpha, se.method, J) { } ## Optimal inference in RD under Taylor class -RDTOpt.fit <- function(d, M, opt.criterion, alpha, beta, se.method, J) 
{ +RDTOpt <- function(d, M, opt.criterion, alpha, beta, se.method, J) { ## First check if sigma2 is supplied if (is.null(d$sigma2)) - d <- NPRPrelimVar.fit(d, se.initial="EHW") + d <- PrelimVar(d, se.initial="EHW") if (!is.null(d$clusterid)) warning(paste0("Optimal kernel can only be used with independent data.", "Ignoring clusterid")) @@ -124,7 +124,7 @@ RDTOpt.fit <- function(d, M, opt.criterion, alpha, beta, se.method, J) { CVb(maxbias/hse, alpha) * hse # Half-length } ## eq is convex, start around MSE optimal b - bs <- RDTOpt.fit(d, M, "MSE", alpha, beta, se.method, J)$omega/2 + bs <- RDTOpt(d, M, "MSE", alpha, beta, se.method, J)$omega/2 lff <- RDgbC(d, stats::optimize(eq, c(bs/2, 3*bs/2))$minimum, C) } @@ -166,7 +166,7 @@ RDTEfficiencyBound <- function(object, opt.criterion="FLCI", beta=0.5) { d <- object$data alpha <- object$coefficients$alpha C <- object$coefficients$M/2 - d <- NPRPrelimVar.fit(d, se.initial="EHW") + d <- PrelimVar(d, se.initial="EHW") if (opt.criterion=="OCI") { delta <- stats::qnorm(1-alpha)+stats::qnorm(beta) @@ -174,7 +174,7 @@ RDTEfficiencyBound <- function(object, opt.criterion="FLCI", beta=0.5) { se.method="supplied.var") r2 <- RDTEstimator(d, RDLFFunction(d, C, 2*delta), alpha, se.method="supplied.var") - return(r2$omega/(r1$delta*r1$coefficients$std.error+r1$omega)) + return(r2$omega / (r1$delta*r1$coefficients$std.error+r1$omega)) } else { ## From proof of Pratt result, it follows that the expected length is ## int pnorm(z_{1-alpha}-delta_t) dt, where delta_t is value of inverse @@ -187,9 +187,9 @@ RDTEfficiencyBound <- function(object, opt.criterion="FLCI", beta=0.5) { ## By symmetry, half-length is given by value of integral over R_+. The ## integrand equals 1-alpha at zero, need upper cutoff upper <- 10 - while(integrand(upper)>1e-10) upper <- 2*upper - den <- RDTOpt.fit(d, 2*C, opt.criterion="FLCI", alpha, beta, - se.method="supplied.var")$coefficients + while (integrand(upper)>1e-10) upper <- 2*upper + den <- RDTOpt(d, 2*C, opt.criterion="FLCI", alpha, beta, + se.method="supplied.var")$coefficients den <- (den$conf.high-den$conf.low)/2 return(stats::integrate(integrand, 1e-6, upper)$value / den) } diff --git a/R/documentation.R b/R/documentation.R index cb8a941..22213dc 100644 --- a/R/documentation.R +++ b/R/documentation.R @@ -1,3 +1,4 @@ +# nolint start #' Head Start data from Ludwig and Miller (2007) #' #' Subset of Ludwig-Miller (2007) data. 
Counties with missing poverty rate, or @@ -45,7 +46,7 @@ #' #' } "headst" - +# nolint end #' Austrian unemployment duration data from Lalive (2008) #' diff --git a/R/kernels.R b/R/kernels.R index e8a7545..780416e 100644 --- a/R/kernels.R +++ b/R/kernels.R @@ -16,7 +16,7 @@ EqKern <- function(kernel = "uniform", boundary = TRUE, order = 0) { ## support su <- function(u) (u <= 1) * (u >= -1 + boundary) ## Boundary and order type - if(is.function(kernel)) { + if (is.function(kernel)) { EqKernN(kernel, boundary = boundary, order = order) } else if (order > 2) { K <- EqKern(kernel = kernel, boundary=boundary, order = 0) @@ -33,7 +33,7 @@ EqKern <- function(kernel = "uniform", boundary = TRUE, order = 0) { "1FALSEtriangular" = function(u) (1 - abs(u)) * su(u), "1FALSEepanechnikov" = function(u) 3/4 * (1 - u^2) * su(u), "1TRUEuniform" = function(u) (4 - 6*u) * su(u), - "1TRUEtriangular" = function(u) 6*(1 - 2*u) * (1 - u) * su(u), + "1TRUEtriangular" = function(u) 6 * (1 - 2*u) * (1 - u) * su(u), "1TRUEepanechnikov" = function(u) 6/19 * (16-30*u) * (1-u^2) * su(u), "2FALSEuniform" = function(u) (9 - 15 * u^2) / 8 * su(u), diff --git a/R/plots.R b/R/plots.R index 17093a3..cb468c8 100644 --- a/R/plots.R +++ b/R/plots.R @@ -16,11 +16,11 @@ #' @return An object of class \code{"ggplot"}, a scatterplot the binned raw #' observations. #' @examples -#' plot_RDscatter(log(earnings)~yearat14, data=cghs, cutoff=1947, +#' RDScatter(log(earnings)~yearat14, data=cghs, cutoff=1947, #' avg=Inf, propdotsize=TRUE) #' @export -plot_RDscatter <- function(formula, data, subset, cutoff=0, na.action, avg=10, - xlab=NULL, ylab=NULL, vert=TRUE, propdotsize=FALSE) { +RDScatter <- function(formula, data, subset, cutoff=0, na.action, avg=10, + xlab=NULL, ylab=NULL, vert=TRUE, propdotsize=FALSE) { if (!requireNamespace("ggplot2", quietly = TRUE)) stop("This function requires the ggplot2 package", call. 
= FALSE) ## construct model frame @@ -42,11 +42,10 @@ plot_RDscatter <- function(formula, data, subset, cutoff=0, na.action, avg=10, ## don't recycle maxp <- (np %/% avg) * avg maxm <- (nm %/% avg) * avg - bd <- data.frame( - x=c(colMeans(matrix(d$X[d$m][1:maxm], nrow=avg)), - colMeans(matrix(d$X[d$p][1:maxp], nrow=avg))), - y=c(colMeans(matrix(d$Y[d$m][1:maxm], nrow=avg)), - colMeans(matrix(d$Y[d$p][1:maxp], nrow=avg)))) + bd <- data.frame(x=c(colMeans(matrix(d$X[d$m][1:maxm], nrow=avg)), + colMeans(matrix(d$X[d$p][1:maxp], nrow=avg))), + y=c(colMeans(matrix(d$Y[d$m][1:maxm], nrow=avg)), + colMeans(matrix(d$Y[d$p][1:maxp], nrow=avg)))) ## if there is a remainder, add it if (maxm+1<=nm) bd <- rbind(bd, data.frame(x=mean(d$X[d$m][(maxm+1):nm]), @@ -65,10 +64,9 @@ plot_RDscatter <- function(formula, data, subset, cutoff=0, na.action, avg=10, } p <- p + ggplot2::theme(legend.position = "none") - if(!is.null(xlab)) p <- p + ggplot2::xlab(xlab) - if(!is.null(ylab)) p <- p + ggplot2::ylab(ylab) - if(vert) p <- p + ggplot2::geom_vline(xintercept=d$orig.cutoff, - linetype="dotted") - + if (!is.null(xlab)) p <- p + ggplot2::xlab(xlab) + if (!is.null(ylab)) p <- p + ggplot2::ylab(ylab) + if (vert) p <- p + ggplot2::geom_vline(xintercept=d$orig.cutoff, + linetype="dotted") p } diff --git a/R/prelim_var.R b/R/prelim_var.R index 130f659..9281a4f 100644 --- a/R/prelim_var.R +++ b/R/prelim_var.R @@ -13,7 +13,7 @@ sigmaNN <- function(X, Y, J=3, weights=rep(1L, length(X))) { ind <- (abs(X-X[k])<=d) ind[k] <- FALSE # exclude oneself Jk <- sum(weights[ind]) - sigma2[k, ] <- Jk/(Jk+weights[k])* + sigma2[k, ] <- Jk / (Jk+weights[k])* if (NCOL(Y)>1) as.vector(outer(Y[k, ]-colSums(weights[ind]*Y[ind, ])/Jk, Y[k, ]-colSums(weights[ind]*Y[ind, ])/Jk)) @@ -28,7 +28,7 @@ sigmaNN <- function(X, Y, J=3, weights=rep(1L, length(X))) { ## ## Compute estimate of variance, which can then be used in optimal bandwidth ## calculations. These estimates are unweighted. 
-NPRPrelimVar.fit <- function(d, se.initial="EHW") { +PrelimVar <- function(d, se.initial="EHW") { ## Pilot bandwidth: either IK/ROT, or else Silverman (for actually computing ## IK) for uniform kernel making sure this results in enough distinct values ## on either side of threshold so we don't have perfect fit @@ -47,21 +47,21 @@ NPRPrelimVar.fit <- function(d, se.initial="EHW") { } if (se.initial == "EHW") { - h1 <- if (d$class=="IP") ROTBW.fit(drf) else IKBW.fit(drf) - if(is.nan(h1)) { + h1 <- if (d$class=="IP") ROTBW(drf) else IKBW(drf) + if (is.nan(h1)) { warning("Preliminary bandwidth is NaN, setting it to Inf") h1 <- Inf } - r1 <- NPRreg.fit(d, max(h1, hmin), se.method="EHW") + r1 <- NPReg(d, max(h1, hmin), se.method="EHW") } else if (d$class == "SRD" && se.initial == "Silverman") { ## Silverman only for RD/IK h1 <- max(1.84*stats::sd(d$X)/sum(length(d$X))^(1/5), hmin) - r1 <- NPRreg.fit(d, h1, "uniform", order=0, se.method="EHW") + r1 <- NPReg(d, h1, "uniform", order=0, se.method="EHW") ## Variance adjustment for backward compatibility lp <- length(r1$sigma2[d$p & r1$w != 0]) lm <- length(r1$sigma2[d$m & r1$w != 0]) - r1$sigma2[d$p] <- r1$sigma2[d$p]*lp/(lp-1) - r1$sigma2[d$m] <- r1$sigma2[d$m]*lm/(lm-1) + r1$sigma2[d$p] <- r1$sigma2[d$p]*lp / (lp-1) + r1$sigma2[d$m] <- r1$sigma2[d$m]*lm / (lm-1) } else { stop("This method for preliminary variance estimation not supported") } @@ -77,9 +77,9 @@ NPRPrelimVar.fit <- function(d, se.initial="EHW") { } else { d$sigma2 <- matrix(NA, nrow=length(d$X), ncol=4) d$sigma2[d$p, ] <- matrix(rep(colMeans(r1$sigma2[d$p & r1$w != 0, ]), - each=sum(d$p)), nrow=sum(d$p)) + each=sum(d$p)), nrow=sum(d$p)) d$sigma2[d$m, ] <- matrix(rep(colMeans(r1$sigma2[d$m & r1$w != 0, ]), - each=sum(d$m)), nrow=sum(d$m)) + each=sum(d$m)), nrow=sum(d$m)) } d } @@ -103,11 +103,11 @@ Moulton <- function(u, d) { ## Rule of thumb bandwidth for inference at a point. Only used by -## NPRPrelimVar.fit +## PrelimVar ## ## Calculate bandwidth for inference at a point with local linear regression ## using method in Fan and Gijbels (1996, Chapter 4.2). -ROTBW.fit <- function(d, kern="triangular") { +ROTBW <- function(d, kern="triangular") { X <- d$X boundary <- if ((min(X)>=0) || (max(X)<=0)) TRUE else FALSE N <- length(d$X) @@ -115,7 +115,7 @@ ROTBW.fit <- function(d, kern="triangular") { ## STEP 0: Estimate f_X(0) using Silverman h1 <- 1.843 * min(stats::sd(X), (stats::quantile(X, 0.75) - - stats::quantile(X, 0.25)) / 1.349) / N^(1/5) + stats::quantile(X, 0.25)) / 1.349) / N^(1/5) f0 <- sum(abs(X) <= h1) / (2*N*h1) ## STEP 1: Estimate (p+1)th derivative and sigma^2 using global polynomial @@ -126,9 +126,8 @@ ROTBW.fit <- function(d, kern="triangular") { sigma2 <- stats::sigma(r1)^2 ## STEP 2: Kernel constants - s <- kernC[kernC$kernel==kern & - kernC$order==order & - kernC$boundary==boundary, ] + s <- kernC[kernC$kernel==kern & kernC$order==order & + kernC$boundary==boundary, ] nu0 <- s$nu0 mup <- s[[paste0("mu", order+1)]] @@ -136,26 +135,24 @@ ROTBW.fit <- function(d, kern="triangular") { B <- deriv * mup V <- sigma2 * nu0 /f0 - (V/(B^2 * 2 * (order+1) * N))^(1/(2*order+3)) + (V / (B^2 * 2 * (order+1) * N))^(1 / (2*order+3)) } -## Imbens and Kalyanaraman bandwidth. Only used by NPRPrelimVar.fit +## Imbens and Kalyanaraman bandwidth. 
Only used by PrelimVar ## ## Reproduce bandwidth from Section 6.2 in Imbens and Kalyanaraman (2012) -IKBW.fit <- function(d, kern="triangular", verbose=FALSE) { +IKBW <- function(d, kern="triangular", verbose=FALSE) { Nm <- sum(d$m) Np <- sum(d$p) N <- Nm+Np ## STEP 0: Kernel constant - s <- kernC[kernC$order==1 & - kernC$boundary==TRUE & - kernC$kernel==kern, ] + s <- kernC[kernC$order==1 & kernC$boundary==TRUE & kernC$kernel==kern, ] const <- (s$nu0/s$mu2^2)^(1/5) ## STEP 1: Estimate f(0), sigma^2_(0) and sigma^2_+(0), using Silverman ## pilot bandwidth for uniform kernel - d <- NPRPrelimVar.fit(d, se.initial="Silverman") + d <- PrelimVar(d, se.initial="Silverman") h1 <- 1.84*stats::sd(d$X)/N^(1/5) f0 <- sum(abs(d$X) <= h1) / (2*N*h1) varm <- d$sigma2[d$m][1] @@ -169,21 +166,21 @@ IKBW.fit <- function(d, kern="triangular", verbose=FALSE) { ## Left and right bandwidths, Equation (15) and page 956. ## Optimal constant based on one-sided uniform Kernel, 7200^(1/7), - h2m <- 7200^(1/7) * (varm/(f0*m3^2))^(1/7) * Nm^(-1/7) - h2p <- 7200^(1/7) * (varp/(f0*m3^2))^(1/7) * Np^(-1/7) + h2m <- 7200^(1/7) * (varm / (f0*m3^2))^(1/7) * Nm^(-1/7) + h2p <- 7200^(1/7) * (varp / (f0*m3^2))^(1/7) * Np^(-1/7) ## estimate second derivatives by local quadratic m2m <- 2*stats::coef(stats::lm(d$Y ~ d$X + I(d$X^2), - subset=(d$X >= -h2m & d$X<0)))[3] + subset = (d$X >= -h2m & d$X<0)))[3] m2p <- 2*stats::coef(stats::lm(d$Y ~ d$X + I(d$X^2), - subset=(d$X <= h2p & d$X>=0)))[3] + subset = (d$X <= h2p & d$X>=0)))[3] ## STEP 3: Calculate regularization terms, Equation (16) rm <- 2160*varm / (sum(d$X >= -h2m & d$X<0) * h2m^4) rp <- 2160*varp / (sum(d$X <= h2p & d$X>=0) * h2p^4) - if(verbose) + if (verbose) cat("\n h1: ", h1, "\n N_{-}, N_{+}: ", Nm, Np, "\n f(0): ", f0, "\n sigma^2_{+}(0): ", sqrt(varp), "^2\n sigma^2_{+}(0):", sqrt(varm), "^2", "\n m3: ", m3, "\n h_{2, +}:", h2p, "h_{2, -}:", diff --git a/R/utils.R b/R/utils.R index 2f3ba7d..1cd4f1f 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,6 +1,6 @@ ## Get rid of AsIs class attribute for making sort work unAsIs <- function(X) { - if(inherits(X, "AsIs")) { + if (inherits(X, "AsIs")) { class(X) <- class(X)[-match("AsIs", class(X))] } X @@ -10,7 +10,7 @@ unAsIs <- function(X) { NPRData <- function(d, cutoff, class) { Xindex <- if (class=="FRD") 3 else 2 ## Unclass in case generated by I() - if(is.unsorted(d[[Xindex]])) + if (is.unsorted(d[[Xindex]])) d <- d[sort(unAsIs(d[[Xindex]]), index.return=TRUE)$ix, ] Y <- if (class=="FRD") cbind(d[[1]], d[[2]]) else d[[1]] @@ -20,7 +20,7 @@ NPRData <- function(d, cutoff, class) { df$m <- df$X<0 df$sigma2 <- d$"(sigma2)" df$clusterid <- d$"(clusterid)" - df$w <- if(is.null(d$"(weights)")) rep(1L, length(df$X)) else d$"(weights)" + df$w <- if (is.null(d$"(weights)")) rep(1L, length(df$X)) else d$"(weights)" df } @@ -35,8 +35,8 @@ NPRData <- function(d, cutoff, class) { FindZero <- function(f, ival=1.1, negative=TRUE) { minval <- function(ival) if (negative==TRUE) -ival else min(1/ival, 1e-3) - while(sign(f(ival))==sign(f(minval(ival)))) - ival <- 2*ival + while (sign(f(ival))==sign(f(minval(ival)))) + ival <- 2*ival stats::uniroot(f, c(minval(ival), ival), tol=.Machine$double.eps^0.75)$root } diff --git a/data-raw/data-prep.R b/data-raw/data-prep.R index 8fdd904..f48b48f 100644 --- a/data-raw/data-prep.R +++ b/data-raw/data-prep.R @@ -98,10 +98,9 @@ fips <- rbind(fips, statefp=c(2, 30, 51, 12, 2, 2), countyfp=c(280, 113, 780, 25, 231, 201), county=c("Wrangell-Petersburg", - "Yellowstone National Park", - "South Boston", "Date 
County", - "Skagway-Yakutat-Angoon", - "Prince of Wales-Outer Ketchikan"), + "Yellowstone National Park", "South Boston", + "Date County", "Skagway-Yakutat-Angoon", + "Prince of Wales-Outer Ketchikan"), cfips=c(2280, 30113, 51780, 12025, 2231, 2201))) fips <- fips[fips$statefp<57, ] diff --git a/data-raw/kernel-moments.R b/data-raw/kernel-moments.R index abf4f4f..9fda709 100644 --- a/data-raw/kernel-moments.R +++ b/data-raw/kernel-moments.R @@ -33,7 +33,7 @@ kernC$pi1 <- c(1/2, 1/3, 3/8, 16/27, 3/8, 702464/1603125, 1/2, 1/3, 3/8, 1/2, 1/3, 3/8, 0.4875, 0.3115591197, 0.3603316352) kernC$pi2 <- c(1/3, 1/6, 1/5, 59/162, 3/16, 16520549/72140625, - 558*sqrt(6)/5^5, 12*sqrt(5)/5^3, 0.26617935, + 558*sqrt(6) / 5^5, 12*sqrt(5) / 5^3, 0.26617935, 1/3, 1/6, 1/5, 1/3, 1/6, 1/5, 0.2788548019, 0.1398694518, 0.1717750173) kernC$pi3 <- c(1/4, 1/10, 1/8, 113/405, 1/8, 235792912/1514953125, diff --git a/doc/RDHonest.R b/doc/RDHonest.R index 3f668bf..7d7c75c 100644 --- a/doc/RDHonest.R +++ b/doc/RDHonest.R @@ -10,16 +10,15 @@ library("RDHonest") ## ----fig.width=4.5, fig.height=3.5, fig.cap="Lee (2008) data"----------------- ## plot 25-bin averages in for observations 50 at most points ## away from the cutoff. See Figure 1. -plot_RDscatter(voteshare~margin, data=lee08, subset=abs(lee08$margin)<=50, - avg=50, propdotsize=FALSE, - xlab="Margin of victory", - ylab="Vote share in next election") +RDScatter(voteshare~margin, data=lee08, subset=abs(lee08$margin)<=50, + avg=50, propdotsize=FALSE, xlab="Margin of victory", + ylab="Vote share in next election") ## ----fig.width=4.5, fig.height=3.5, fig.cap="Oreopoulos (2006) data"---------- ## see Figure 2 -f2 <- plot_RDscatter(I(log(earnings))~yearat14, data=cghs, cutoff=1947, - avg=Inf, xlab="Year aged 14", ylab="Log earnings", - propdotsize=TRUE) +f2 <- RDScatter(I(log(earnings))~yearat14, data=cghs, cutoff=1947, + avg=Inf, xlab="Year aged 14", ylab="Log earnings", + propdotsize=TRUE) ## Adjust size of dots if they are too big f2 + ggplot2::scale_size_area(max_size = 4) @@ -36,24 +35,24 @@ RDHonest(voteshare~margin, data=lee08, kern="uniform", M=0.1, h=10, sclass="H") ## ----------------------------------------------------------------------------- RDHonest(voteshare ~ margin, data=lee08, kern="triangular", - M=0.1, opt.criterion="MSE", sclass="H") + M=0.1, opt.criterion="MSE", sclass="H") ## Choose bws optimal for length of CI RDHonest(voteshare ~ margin, data=lee08, kern="triangular", M=0.1, - opt.criterion="FLCI", sclass="H") + opt.criterion="FLCI", sclass="H") ## ----------------------------------------------------------------------------- ## Replicate Table 2, column (10) RDHonest(log(earnings) ~ yearat14, cutoff=1947, - data=cghs, kern="uniform", M=0.04, opt.criterion="FLCI", sclass="H") + data=cghs, kern="uniform", M=0.04, opt.criterion="FLCI", sclass="H") ## Triangular kernel generally gives tigher CIs RDHonest(log(earnings) ~ yearat14, cutoff=1947, - data=cghs, kern="triangular", M=0.04, opt.criterion="FLCI", sclass="H") + data=cghs, kern="triangular", M=0.04, opt.criterion="FLCI", sclass="H") ## ----------------------------------------------------------------------------- ## Replicate Table 2, column (6), run local linear regression (order=1) ## with a uniform kernel (other kernels are not yet implemented) RDHonestBME(log(earnings) ~ yearat14, cutoff=1947, - data=cghs, h=3, order=1) + data=cghs, h=3, order=1) ## ----------------------------------------------------------------------------- ## Data-driven choice of M @@ -62,18 +61,18 @@ RDHonest(voteshare 
~ margin, data=lee08, kern="uniform", sclass="H", ## ----------------------------------------------------------------------------- r1 <- RDHonest(voteshare ~ margin, data=lee08, kern="optimal", M=0.1, - opt.criterion="FLCI", - se.method="nn")$coefficients + opt.criterion="FLCI", + se.method="nn")$coefficients r2 <- RDHonest(voteshare ~ margin, data=lee08, kern="triangular", M=0.1, - opt.criterion="FLCI", se.method="nn", - sclass="T")$coefficients + opt.criterion="FLCI", se.method="nn", + sclass="T")$coefficients r1$conf.high-r1$conf.low r2$conf.high-r2$conf.low ## ----------------------------------------------------------------------------- ## Add variance estimate to the Lee (2008) data so that the RDSmoothnessBound ## function doesn't have to compute them each time -## dl <- NPRPrelimVar.fit(dl, se.initial="nn") +## dl <- PrelimVar(dl, se.initial="nn") ### Only use three point-average for averages of a 100 points closest to cutoff, ### and report results separately for points above and below cutoff @@ -92,9 +91,9 @@ d$cell <- d$mod/100+d$yearat14 ## Data with cell averages dd <- data.frame() for (j in unique(d$cell)){ - dd <- rbind(dd, data.frame(y=mean(log(d$earnings)[d$cell==j]), - x=mean(d$yearat14[d$cell==j]), - weights=length(d$yearat14[d$cell==j]))) + dd <- rbind(dd, data.frame(y=mean(log(d$earnings)[d$cell==j]), + x=mean(d$yearat14[d$cell==j]), + weights=length(d$yearat14[d$cell==j]))) } ## ----------------------------------------------------------------------------- @@ -112,16 +111,16 @@ RDHonest(y~x, cutoff=1947, weights=weights, h=5, data=dd, M=0.1, ## ----------------------------------------------------------------------------- ## Initial estimate of treatment effect for optimal bandwidth calculations r <- RDHonest(log(cn) ~ retired | elig_year, data=rcp, kern="triangular", - M=c(0.001, 0.002), opt.criterion="MSE", sclass="H", T0=0) + M=c(0.001, 0.002), opt.criterion="MSE", sclass="H", T0=0) ## Use it to compute optimal bandwidth RDHonest(log(cn) ~ retired | elig_year, data=rcp, kern="triangular", - M=c(0.001, 0.002), opt.criterion="MSE", sclass="H", - T0=r$coefficients$estimate) + M=c(0.001, 0.002), opt.criterion="MSE", sclass="H", + T0=r$coefficients$estimate) ## ----------------------------------------------------------------------------- ## Data-driven choice of M RDHonest(log(cn) ~ retired | elig_year, data=rcp, kern="triangular", - opt.criterion="MSE", sclass="H", T0=r$coefficients$estimate) + opt.criterion="MSE", sclass="H", T0=r$coefficients$estimate) ## ----------------------------------------------------------------------------- ## Transform data, specify we're interested in inference at x0=20, diff --git a/doc/RDHonest.Rmd b/doc/RDHonest.Rmd index ccbb617..6e55f0f 100644 --- a/doc/RDHonest.Rmd +++ b/doc/RDHonest.Rmd @@ -72,17 +72,16 @@ library("RDHonest") ## Plots -The package provides a function `plot_RDscatter` to plot the raw data. To remove +The package provides a function `RDScatter` to plot the raw data. To remove some noise, the function plots averages over `avg` number of observations. The function takes an `RDData` object as an argument ```{r, fig.width=4.5, fig.height=3.5, fig.cap="Lee (2008) data"} ## plot 25-bin averages in for observations 50 at most points ## away from the cutoff. See Figure 1. 
-plot_RDscatter(voteshare~margin, data=lee08, subset=abs(lee08$margin)<=50, - avg=50, propdotsize=FALSE, - xlab="Margin of victory", - ylab="Vote share in next election") +RDScatter(voteshare~margin, data=lee08, subset=abs(lee08$margin)<=50, + avg=50, propdotsize=FALSE, xlab="Margin of victory", + ylab="Vote share in next election") ``` The running variable in the Oreopoulos dataset is discrete. It is therefore @@ -93,9 +92,9 @@ averages over. ```{r, fig.width=4.5, fig.height=3.5, fig.cap="Oreopoulos (2006) data"} ## see Figure 2 -f2 <- plot_RDscatter(I(log(earnings))~yearat14, data=cghs, cutoff=1947, - avg=Inf, xlab="Year aged 14", ylab="Log earnings", - propdotsize=TRUE) +f2 <- RDScatter(I(log(earnings))~yearat14, data=cghs, cutoff=1947, + avg=Inf, xlab="Year aged 14", ylab="Log earnings", + propdotsize=TRUE) ## Adjust size of dots if they are too big f2 + ggplot2::scale_size_area(max_size = 4) ``` @@ -211,10 +210,10 @@ optimality criterion: ```{r} RDHonest(voteshare ~ margin, data=lee08, kern="triangular", - M=0.1, opt.criterion="MSE", sclass="H") + M=0.1, opt.criterion="MSE", sclass="H") ## Choose bws optimal for length of CI RDHonest(voteshare ~ margin, data=lee08, kern="triangular", M=0.1, - opt.criterion="FLCI", sclass="H") + opt.criterion="FLCI", sclass="H") ``` ### Inference when running variable is discrete @@ -232,10 +231,10 @@ As an example, consider the @oreopoulos06 data, in which the running variable is ```{r} ## Replicate Table 2, column (10) RDHonest(log(earnings) ~ yearat14, cutoff=1947, - data=cghs, kern="uniform", M=0.04, opt.criterion="FLCI", sclass="H") + data=cghs, kern="uniform", M=0.04, opt.criterion="FLCI", sclass="H") ## Triangular kernel generally gives tigher CIs RDHonest(log(earnings) ~ yearat14, cutoff=1947, - data=cghs, kern="triangular", M=0.04, opt.criterion="FLCI", sclass="H") + data=cghs, kern="triangular", M=0.04, opt.criterion="FLCI", sclass="H") ``` In addition, the package provides function `RDHonestBME` that calculates honest @@ -246,11 +245,12 @@ no worse at the cutoff than away from the cutoff as in Section 5.2 in @KoRo16. ## Replicate Table 2, column (6), run local linear regression (order=1) ## with a uniform kernel (other kernels are not yet implemented) RDHonestBME(log(earnings) ~ yearat14, cutoff=1947, - data=cghs, h=3, order=1) + data=cghs, h=3, order=1) ``` -Let us describe the implementation of the variance estimator $\hat{V}(W)$ used to construct the CI as described in in Section 5.2 in @KoRo16. Suppose the point +Let us describe the implementation of the variance estimator $\hat{V}(W)$ used +to construct the CI as described in Section 5.2 in @KoRo16. Suppose the point estimate is given by the first element of the regression of the outcome $y_i$ on $m(x_i)$. For instance, local linear regression with uniform kernel and bandwidth $h$ corresponds to $m(x)=I(|x|\leq h)\cdot(I(x>c_0),1,x, x\cdot @@ -347,9 +347,10 @@ $M$ a priori, rather than attempting to use a data-driven method. Therefore, one should, whenever possible, use problem-specific knowledge to decide what choice of $M$ is reasonable a priori. 
-For cases in which this is difficult, the function `NPR_MROT.fit` implements the method -considered in Section 3.4.1 in @ArKo16honest based on a global polynomial -approximation: +For cases in which this is difficult, `RDHonest` implements the method +considered in Section 3.4.1 in @ArKo16honest based on a global polynomial +approximation whenever `M` is left unspecified: + ```{r} ## Data-driven choice of M RDHonest(voteshare ~ margin, data=lee08, kern="uniform", sclass="H", @@ -369,11 +370,11 @@ estimate used to compute optimal bandwidths: ```{r} r1 <- RDHonest(voteshare ~ margin, data=lee08, kern="optimal", M=0.1, - opt.criterion="FLCI", - se.method="nn")$coefficients + opt.criterion="FLCI", + se.method="nn")$coefficients r2 <- RDHonest(voteshare ~ margin, data=lee08, kern="triangular", M=0.1, - opt.criterion="FLCI", se.method="nn", - sclass="T")$coefficients + opt.criterion="FLCI", se.method="nn", + sclass="T")$coefficients r1$conf.high-r1$conf.low r2$conf.high-r2$conf.low ``` @@ -383,10 +384,13 @@ r2$conf.high-r2$conf.low The package also implements lower-bound estimates for the smoothness constant $M$ for the Taylor and Hölder smoothness class, as described in the supplements to @KoRo16 and @ArKo16optimal +TODO: `PrelimVar` is internal + + ```{r} ## Add variance estimate to the Lee (2008) data so that the RDSmoothnessBound ## function doesn't have to compute them each time -## dl <- NPRPrelimVar.fit(dl, se.initial="nn") +## dl <- PrelimVar(dl, se.initial="nn") ### Only use three point-average for averages of a 100 points closest to cutoff, ### and report results separately for points above and below cutoff @@ -412,9 +416,9 @@ d$cell <- d$mod/100+d$yearat14 ## Data with cell averages dd <- data.frame() for (j in unique(d$cell)){ - dd <- rbind(dd, data.frame(y=mean(log(d$earnings)[d$cell==j]), - x=mean(d$yearat14[d$cell==j]), - weights=length(d$yearat14[d$cell==j]))) + dd <- rbind(dd, data.frame(y=mean(log(d$earnings)[d$cell==j]), + x=mean(d$yearat14[d$cell==j]), + weights=length(d$yearat14[d$cell==j]))) } ``` @@ -491,11 +495,11 @@ estimator of the treatment effect ```{r} ## Initial estimate of treatment effect for optimal bandwidth calculations r <- RDHonest(log(cn) ~ retired | elig_year, data=rcp, kern="triangular", - M=c(0.001, 0.002), opt.criterion="MSE", sclass="H", T0=0) + M=c(0.001, 0.002), opt.criterion="MSE", sclass="H", T0=0) ## Use it to compute optimal bandwidth RDHonest(log(cn) ~ retired | elig_year, data=rcp, kern="triangular", - M=c(0.001, 0.002), opt.criterion="MSE", sclass="H", - T0=r$coefficients$estimate) + M=c(0.001, 0.002), opt.criterion="MSE", sclass="H", + T0=r$coefficients$estimate) ``` ## Data-driven choice of smoothness constant @@ -513,7 +517,7 @@ approximation: ```{r} ## Data-driven choice of M RDHonest(log(cn) ~ retired | elig_year, data=rcp, kern="triangular", - opt.criterion="MSE", sclass="H", T0=r$coefficients$estimate) + opt.criterion="MSE", sclass="H", T0=r$coefficients$estimate) ``` See @ArKo16honest for a discussion of the restrictions on the parameter space under which this method yields honest inference. 
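To summarize the user-facing effect of the vignette changes above: only the plotting function is renamed, and the `RDHonest` interface is untouched. A minimal sketch of the migrated calls, taken verbatim from the updated vignette (it assumes the `lee08` dataset bundled with the package, and `RDScatter` additionally requires ggplot2):

```r
library("RDHonest")

## Binned scatterplot; formerly plot_RDscatter(), now RDScatter()
RDScatter(voteshare ~ margin, data = lee08, subset = abs(lee08$margin) <= 50,
          avg = 50, propdotsize = FALSE, xlab = "Margin of victory",
          ylab = "Vote share in next election")

## Honest CI with MSE-optimal bandwidth; interface unchanged by this patch
RDHonest(voteshare ~ margin, data = lee08, kern = "triangular",
         M = 0.1, opt.criterion = "MSE", sclass = "H")
```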
diff --git a/doc/RDHonest.pdf b/doc/RDHonest.pdf index 172c779..ea04181 100644 Binary files a/doc/RDHonest.pdf and b/doc/RDHonest.pdf differ diff --git a/doc/manual.pdf b/doc/manual.pdf index cc0c6bf..463fdaf 100644 Binary files a/doc/manual.pdf and b/doc/manual.pdf differ diff --git a/man/plot_RDscatter.Rd b/man/RDScatter.Rd similarity index 94% rename from man/plot_RDscatter.Rd rename to man/RDScatter.Rd index 40fdb36..de06be6 100644 --- a/man/plot_RDscatter.Rd +++ b/man/RDScatter.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/plots.R -\name{plot_RDscatter} -\alias{plot_RDscatter} +\name{RDScatter} +\alias{RDScatter} \title{Scatterplot of binned raw observations} \usage{ -plot_RDscatter( +RDScatter( formula, data, subset, @@ -64,6 +64,6 @@ that is first in \code{data} and then in the environment of \code{formula}. } \examples{ -plot_RDscatter(log(earnings)~yearat14, data=cghs, cutoff=1947, +RDScatter(log(earnings)~yearat14, data=cghs, cutoff=1947, avg=Inf, propdotsize=TRUE) } diff --git a/man/RDSmoothnessBound.Rd b/man/RDSmoothnessBound.Rd index 9337a68..f196189 100644 --- a/man/RDSmoothnessBound.Rd +++ b/man/RDSmoothnessBound.Rd @@ -60,8 +60,9 @@ RDSmoothnessBound(r, s=2) { \cite{Michal Kolesár and Christoph Rothe. Inference in regression -discontinuity designs with a discrete running variable. American Economic -Review, 108(8):2277—-2304, August 2018. \doi{10.1257/aer.20160945}} + discontinuity designs with a discrete running variable. + American Economic Review, 108(8):2277--2304, + August 2018. \doi{10.1257/aer.20160945}} } } diff --git a/tests/spelling.R b/tests/spelling.R index c61948b..33569a8 100644 --- a/tests/spelling.R +++ b/tests/spelling.R @@ -1,3 +1,3 @@ -if(requireNamespace("spelling", quietly = TRUE)) - spelling::spell_check_test(vignettes = TRUE, error = FALSE, - skip_on_cran = TRUE) +if (requireNamespace("spelling", quietly = TRUE)) + spelling::spell_check_test(vignettes = TRUE, error = FALSE, + skip_on_cran = TRUE) diff --git a/tests/testthat/test_clustering.R b/tests/testthat/test_clustering.R index 370f283..c201002 100644 --- a/tests/testthat/test_clustering.R +++ b/tests/testthat/test_clustering.R @@ -57,10 +57,10 @@ test_that("Test clustering formulas", { expect_equal(p1h$coefficients$std.error, 792.2299439) p2c <- RDHonest(c~elig_year, data=rr, clusterid=clusterid, - point.inference=TRUE) - h <- p2c$coefficients$bandwidth #8.27778 + point.inference=TRUE, h=8.27778063886) + h <- p2c$coefficients$bandwidth m3 <- lm(c~elig_year, data=rr, subset=abs(elig_year)<=h, - weights =(1 - abs(elig_year/h))) + weights = (1 - abs(elig_year/h))) ## sqrt(sandwich::vcovCL(m3, type="HC0", ## cluster=clusterid[abs(rr$elig_year)<=h], ## cadjust=FALSE)[1, 1]) @@ -88,18 +88,6 @@ test_that("Test clustering formulas", { dd <- data.frame(y=log(rr$c), x=rr$elig_year, "(clusterid)"=clusterid) names(dd)[3]<- "(clusterid)" d <- NPRData(dd, cutoff=0, class="SRD") - r0 <- NPRPrelimVar.fit(d) + r0 <- PrelimVar(d) expect_equal(r0$rho, -0.0008690793197) - - ## Alternative - ## h <- IKBW.fit(d) - ## m4 <- lm(d$Y~d$X*I(d$X>0), subset=abs(d$X)<=h, weights=(1-abs(d$X/h))) - ## r <- dfadjust::dfadjustSE(m4, IK=TRUE, - ## clustervar=as.factor(d$clusterid[abs(d$X)<=h])) - ## print(r$rho, digits=10) - - - - - }) diff --git a/tests/testthat/test_frd.R b/tests/testthat/test_frd.R index 9307b20..4f4961d 100644 --- a/tests/testthat/test_frd.R +++ b/tests/testthat/test_frd.R @@ -5,11 +5,11 @@ test_that("Selected bw is infinite", { cutoff=0, "FRD") ## Expect 
using all data - r0 <- NPRHonest.fit(d, M=c(0, 0), kern="triangular", opt.criterion="OCI", - T0=0)$coefficients - r1 <- NPRHonest.fit(d, M=c(0, 0), kern="uniform", opt.criterion="FLCI", - T0=0)$coefficients - r2 <- NPRreg.fit(d, Inf, "uniform") + r0 <- NPRHonest(d, M=c(0, 0), kern="triangular", opt.criterion="OCI", + T0=0)$coefficients + r1 <- NPRHonest(d, M=c(0, 0), kern="uniform", opt.criterion="FLCI", + T0=0)$coefficients + r2 <- NPReg(d, Inf, "uniform") expect_equal(c(r1$std.error, r1$estimate), c(r2$se, r2$estimate)) expect_identical(r1$maximum.bias, 0) @@ -18,9 +18,8 @@ test_that("Selected bw is infinite", { expect_equal(r1$bandwidth, max(abs(d$X))) d <- NPRData(lee08, cutoff=0, "SRD") - r1 <- NPRHonest.fit(d, M=0, kern="uniform", - opt.criterion="MSE")$coefficients - r2 <- NPRreg.fit(d, Inf, "uniform") + r1 <- NPRHonest(d, M=0, kern="uniform", opt.criterion="MSE")$coefficients + r2 <- NPReg(d, Inf, "uniform") expect_equal(c(r1$std.error, r1$estimate), c(r2$se, r2$estimate)) expect_identical(r1$maximum.bias, 0) @@ -30,13 +29,13 @@ context("Test FRD") test_that("FRD data example check", { d <- NPRData(cbind(logcn=log(rcp[, 6]), rcp[, c(3, 2)]), cutoff=0, "FRD") - M <- NPR_MROT.fit(d) - r1 <- NPRHonest.fit(d, M, kern="triangular", opt.criterion="MSE", - T0=0)$coefficients - r2 <- NPRHonest.fit(d, M, kern="triangular", opt.criterion="MSE", - T0=r1$estimate)$coefficients - r1a <- NPRHonest.fit(d, M, kern="triangular", opt.criterion="MSE", - T0=0, alpha=r1$p.value)$coefficients + M <- MROT(d) + r1 <- NPRHonest(d, M, kern="triangular", opt.criterion="MSE", + T0=0)$coefficients + r2 <- NPRHonest(d, M, kern="triangular", opt.criterion="MSE", + T0=r1$estimate)$coefficients + r1a <- NPRHonest(d, M, kern="triangular", opt.criterion="MSE", T0=0, + alpha=r1$p.value)$coefficients expect_equal(r1a$conf.high, 0) expect_equal(max(abs(expect_equal(r1a$conf.high, 0))), 0) ## With positive T0, expect greater effective M, and thus smaller bandwidth @@ -48,12 +47,12 @@ test_that("FRD data example check", { test_that("FRD with almost perfect first stage", { d <- NPRData(data.frame(y=lee08$voteshare, d=lee08$margin>0, x=lee08$margin), cutoff=0, "FRD") - M <- NPR_MROT.fit(d) - r0 <- NPRHonest.fit(d, M, kern="triangular", opt.criterion="FLCI") - r1 <- NPRHonest.fit(d, M, kern="triangular", opt.criterion="FLCI", - T0=r0$coefficients$estimate)$coefficients - r2 <- NPRHonest.fit(NPRData(lee08, cutoff=0, "SRD"), unname(M[1]), - kern="triangular", opt.criterion="FLCI")$coefficients + M <- MROT(d) + r0 <- NPRHonest(d, M, kern="triangular", opt.criterion="FLCI") + r1 <- NPRHonest(d, M, kern="triangular", opt.criterion="FLCI", + T0=r0$coefficients$estimate)$coefficients + r2 <- NPRHonest(NPRData(lee08, cutoff=0, "SRD"), unname(M[1]), + kern="triangular", opt.criterion="FLCI")$coefficients expect_lt(max(abs(r2[2:6]-r2[2:6])), 3e-7) set.seed(42) @@ -61,12 +60,12 @@ test_that("FRD with almost perfect first stage", { d=lee08$margin+rnorm(n=length(lee08$margin), sd=0.1)>0, x=lee08$margin) d <- NPRData(df, cutoff=0, "FRD") - M <- NPR_MROT.fit(d) - r0 <- NPRHonest.fit(d, M, kern="triangular", opt.criterion="MSE") - r1 <- NPRHonest.fit(d, M, kern="triangular", opt.criterion="MSE", - T0=r0$coefficients$estimate)$coefficients - r2 <- NPRHonest.fit(NPRData(lee08, cutoff=0, "SRD"), unname(M[1]), - kern="triangular", opt.criterion="MSE")$coefficients + M <- MROT(d) + r0 <- NPRHonest(d, M, kern="triangular", opt.criterion="MSE") + r1 <- NPRHonest(d, M, kern="triangular", opt.criterion="MSE", + 
T0=r0$coefficients$estimate)$coefficients + r2 <- NPRHonest(NPRData(lee08, cutoff=0, "SRD"), unname(M[1]), + kern="triangular", opt.criterion="MSE")$coefficients expect_lt(abs(r1$estimate*r1$first.stage-r2$estimate), 1e-2) expect_lt(abs(r1$bandwidth-r2$bandwidth), 0.1) expect_lt(abs(r0$coefficients$conf.low-r1$conf.low), 0.01) @@ -75,11 +74,11 @@ test_that("FRD with almost perfect first stage", { test_that("FRD interface", { d <- NPRData(cbind(logf=log(rcp[1:10000, 6]), rcp[1:10000, c(3, 2)]), cutoff=0, "FRD") - M <- NPR_MROT.fit(d) + M <- MROT(d) rcp1 <- rcp[1:10000, ] - r1 <- NPRHonest.fit(d, M, kern="triangular", opt.criterion="OCI", T0=0) + r1 <- NPRHonest(d, M, kern="triangular", opt.criterion="OCI", T0=0) p1 <- RDHonest(log(cn)~retired | elig_year, data=rcp1, cutoff=0, M=M, - kern="triangular", opt.criterion="OCI", T0=0) + kern="triangular", opt.criterion="OCI", T0=0) expect_equal(r1$coefficients$estimate, p1$coefficients$estimate) p1.1 <- RDHonest(log(cn)~retired | elig_year, data=rcp1, cutoff=0, kern="triangular", T0=0, h=6) @@ -91,11 +90,11 @@ test_that("FRD interface", { expect_lt(abs(p1.2$coefficients$conf.high- 0.17195578263), 1e-6) expect_lt(abs(p1.2$coefficients$estimate+0.172378084757), 1e-6) - r2 <- NPRHonest.fit(d, M, kern="triangular", opt.criterion="OCI", - T0=r1$coefficients$estimate) + r2 <- NPRHonest(d, M, kern="triangular", opt.criterion="OCI", + T0=r1$coefficients$estimate) p2 <- RDHonest(log(cn)~retired | elig_year, data=rcp1, cutoff=0, M=M, - kern="triangular", opt.criterion="OCI", - T0=p1$coefficients$estimate) + kern="triangular", opt.criterion="OCI", + T0=p1$coefficients$estimate) expect_equal(r2$coefficients$estimate, p2$coefficients$estimate) ## codecov.io check expect_equal(capture.output(print(r2))[1:7], @@ -105,17 +104,16 @@ test_that("FRD interface", { expect_equal(capture.output(print(p2, digits=4))[14], "Maximal leverage for fuzzy RD Parameter: 0.00361") - r3 <- NPRHonest.fit(d, M, kern="triangular", h=7, - T0=r1$coefficients$estimate) + r3 <- NPRHonest(d, M, kern="triangular", h=7, T0=r1$coefficients$estimate) p3 <- RDHonest(log(cn)~retired | elig_year, data=rcp1, cutoff=0, M=M, - kern="triangular", opt.criterion="OCI", - T0=p1$coefficients$estimate, h=7) + kern="triangular", opt.criterion="OCI", + T0=p1$coefficients$estimate, h=7) expect_equal(r3$coefficients$estimate, p3$coefficients$estimate) - r4 <- NPROptBW.fit(d, M, kern="triangular", opt.criterion="OCI", - T0=r1$coefficients$estimate) + r4 <- OptBW(d, M, kern="triangular", opt.criterion="OCI", + T0=r1$coefficients$estimate) p4 <- RDHonest(log(cn)~retired | elig_year, data=rcp1, cutoff=0, M=M, - kern="triangular", opt.criterion="OCI", - T0=p1$coefficients$estimate) + kern="triangular", opt.criterion="OCI", + T0=p1$coefficients$estimate) expect_identical(r4, p4$coefficients$bandwidth) }) diff --git a/tests/testthat/test_lpp.R b/tests/testthat/test_lpp.R index 913e9bf..a4c730f 100644 --- a/tests/testthat/test_lpp.R +++ b/tests/testthat/test_lpp.R @@ -2,12 +2,12 @@ context("Test Inference at a point") test_that("Inference at point agrees with RD", { d <- NPRData(lee08, cutoff=0, "SRD") - rde <- NPRHonest.fit(d, h=5, M=2)$coefficients + rde <- NPRHonest(d, h=5, M=2)$coefficients dp <- NPRData(lee08[lee08$margin>=0, ], cutoff=0, "IP") dm <- NPRData(lee08[lee08$margin<0, ], cutoff=0, "IP") - p0 <- NPRHonest.fit(dp, h=5, M=2) + p0 <- NPRHonest(dp, h=5, M=2) pp <- p0$coefficients - mm <- NPRHonest.fit(dm, h=5, M=2)$coefficients + mm <- NPRHonest(dm, h=5, M=2)$coefficients 
expect_equal(pp$estimate-mm$estimate, rde$estimate) expect_equal(pp$std.error^2+mm$std.error^2, rde$std.error^2) expect_equal(pp$maximum.bias+mm$maximum.bias, rde$maximum.bias) @@ -20,9 +20,9 @@ test_that("Inference at point agrees with RD", { expect_equal(capture.output(print(p0))[1:7], capture.output(print(p2))[6:12]) - r <- NPRHonest.fit(d, h=7, M=2)$coefficients - rm <- NPRHonest.fit(dp, h=7, M=2)$coefficients - rp <- NPRHonest.fit(dm, h=7, M=2)$coefficients + r <- NPRHonest(d, h=7, M=2)$coefficients + rm <- NPRHonest(dp, h=7, M=2)$coefficients + rp <- NPRHonest(dm, h=7, M=2)$coefficients expect_equal(r$maximum.bias, rm$maximum.bias+rp$maximum.bias) expect_equal(sqrt(rp$std.error^2+rm$std.error^2), r$std.error) @@ -33,9 +33,9 @@ test_that("MROT matches paper", { Mh <- RDHonest(mortHS~povrate, data=headst, cutoff=0, h=0)$coefficients$M expect_equal(Mh, 0.29939992) - Mp <- RDHonest(mortHS~povrate, data=headst, subset=(povrate>=0), + Mp <- RDHonest(mortHS~povrate, data=headst, subset = (povrate>=0), cutoff=0, h=0, point.inference=TRUE) - Mm <- RDHonest(mortHS~povrate, data=headst, subset=(povrate<0), + Mm <- RDHonest(mortHS~povrate, data=headst, subset = (povrate<0), h=0, point.inference=TRUE) expect_equal(Mp$coefficients$M, Mh) expect_lt(Mm$coefficients$M, Mh) @@ -44,29 +44,29 @@ test_that("MROT matches paper", { test_that("ROT bandwidth check", { ## Interior d <- NPRData(lee08, cutoff=0, "IP") - b1 <- ROTBW.fit(d, kern="uniform") + b1 <- ROTBW(d, kern="uniform") ## f0 using Silverman: f0 <- 0.0089934638 C <- 9/8 # nu0/(4*mu_2^2) ll <- lm(d$Y~d$X+I(d$X^2)+I(d$X^3)+I(d$X^4)) - h <- (C*sigma(ll)^2/(length(d$X)*f0*ll$coefficients[3]^2))^(1/5) + h <- (C*sigma(ll)^2 / (length(d$X)*f0*ll$coefficients[3]^2))^(1/5) expect_equal(b1, unname(h)) dp <- NPRData(lee08[lee08$margin>0, ], cutoff=0, "IP") - bp1 <- ROTBW.fit(dp, kern="uniform") + bp1 <- ROTBW(dp, kern="uniform") ## f0 using Silverman: f0 <- 0.0079735105 C <- 36 # nu0/(4*mu_2^2) ll <- lm(dp$Y~dp$X+I(dp$X^2)+I(dp$X^3)+I(dp$X^4)) der <- unname(ll$coefficients[3]) - h <- (C*sigma(ll)^2/(length(dp$X)*f0*der^2))^(1/5) + h <- (C*sigma(ll)^2 / (length(dp$X)*f0*der^2))^(1/5) expect_equal(bp1, h) }) test_that("Optimal bandwidth calculations", { - rr <- RDHonest(voteshare ~ margin, data=lee08, subset=(margin>0), + rr <- RDHonest(voteshare ~ margin, data=lee08, subset = (margin>0), kern="uniform", opt.criterion="FLCI", point.inference=TRUE) expect_equal(rr$coefficients$conf.high.onesided, 55.24963853) @@ -76,7 +76,7 @@ test_that("Optimal bandwidth calculations", { dp <- NPRData(lee08[lee08$margin>0, ], cutoff=0, "IP") Mh <- rr$coefficients$M - re <- RDHonest(voteshare ~ margin, data=lee08, subset=(margin>0), + re <- RDHonest(voteshare ~ margin, data=lee08, subset = (margin>0), kern="uniform", opt.criterion="FLCI", point.inference=TRUE, se.method="EHW") ## Should match regression @@ -92,12 +92,13 @@ test_that("Optimal bandwidth calculations", { opt.criterion="MSE", point.inference=TRUE) r <- capture.output(print(r1, digits=4)) expect_equal(r[11], "Bandwidth: 13.41, Kernel: triangular") - r2 <- NPROptBW.fit(dp, M=2*Mh, opt.criterion="MSE") + r2 <- OptBW(dp, M=2*Mh, opt.criterion="MSE") expect_identical(r2, r1$coefficients$bandwidth) expect_lt(abs(r2- 13.4109133), 1e-6) ## Make sure we're getting positive worst-case bias - r <- RDHonest(voteshare ~ margin, data=lee08, subset=(margin>0), cutoff=20, + r <- RDHonest(voteshare ~ margin, data=lee08, subset = (margin>0), + cutoff=20, kern="uniform", opt.criterion="MSE", point.inference=TRUE) 
expect_equal(r$coefficients$maximum.bias, 0.2482525) }) diff --git a/tests/testthat/test_npr.R b/tests/testthat/test_npr.R index 0c9a3e5..12d7985 100644 --- a/tests/testthat/test_npr.R +++ b/tests/testthat/test_npr.R @@ -33,7 +33,7 @@ test_that("Test LPreg", { expect_identical(r0e$eff.obs, r1e$eff.obs) }) -test_that("Test NPRreg", { +test_that("Test NPReg", { ## Replicate Ludwig-Miller lumi <- headst[!is.na(headst$mortHS), c("mortHS", "povrate")] mort <- NPRData(lumi, cutoff=0, "SRD") @@ -43,14 +43,14 @@ test_that("Test NPRreg", { t2 <- data.frame() bws <- c(9, 18, 36) for (bw in bws) { - rfe <- NPRreg.fit(mort, bw, "uniform", order=1, se.method="EHW") - rfn <- NPRreg.fit(mort, bw, "uniform", order=1, se.method="nn") + rfe <- NPReg(mort, bw, "uniform", order=1, se.method="EHW") + rfn <- NPReg(mort, bw, "uniform", order=1, se.method="nn") t1 <- rbind(t1, data.frame(bw=bw, estimate=rfe$estimate, EHW=rfe$se, nn=rfn$se)) - rme <- NPRreg.fit(mortm, bw, "uniform", order=1, se.method="EHW") - rmn <- NPRreg.fit(mortm, bw, "uniform", order=1, se.method="nn") - rpe <- NPRreg.fit(mortp, bw, "uniform", order=1, se.method="EHW") - rpn <- NPRreg.fit(mortp, bw, "uniform", order=1, se.method="nn") + rme <- NPReg(mortm, bw, "uniform", order=1, se.method="EHW") + rmn <- NPReg(mortm, bw, "uniform", order=1, se.method="nn") + rpe <- NPReg(mortp, bw, "uniform", order=1, se.method="EHW") + rpn <- NPReg(mortp, bw, "uniform", order=1, se.method="nn") t2 <- rbind(t2, data.frame(bw=bw, estimate=rpe$estimate-rme$estimate, EHW=sqrt(rpe$se^2+rme$se^2), nn=sqrt(rpn$se^2+rmn$se^2))) @@ -66,13 +66,13 @@ test_that("Test NPRreg", { dr <- NPRData(cbind(logcn=log(rcp[, 6]), rcp[, 2, drop=FALSE]), cutoff=0, "SRD") d <- NPRData(cbind(logcn=log(rcp[, 6]), rcp[, c(3, 2)]), cutoff=0, "FRD") - rf <- NPRreg.fit(df, 10, "uniform", order=1, se.method="EHW") - rr <- NPRreg.fit(dr, 10, "uniform", order=1, se.method="EHW") + rf <- NPReg(df, 10, "uniform", order=1, se.method="EHW") + rr <- NPReg(dr, 10, "uniform", order=1, se.method="EHW") expect_equal(unname(c(rf$estimate, round(rf$se, 4))), c(0.43148435, 0.0181)) expect_equal(unname(c(rr$estimate, round(rr$se, 4))), c(-0.0355060, 0.0211)) - r1 <- NPRreg.fit(d, 10, "uniform", order=1, se.method="EHW") + r1 <- NPReg(d, 10, "uniform", order=1, se.method="EHW") expect_equal(c(r1$estimate, r1$se), c(-0.08228802, 0.0483039)) ## Numbers from @@ -83,8 +83,7 @@ test_that("Test NPRreg", { ## summary(r4, vcov = sandwich::sandwich, ## diagnostics=TRUE)$coefficients[2, 1:2] expect_lt(abs(r1$estimate- rr$estimate/rf$estimate), 1e-10) - r2 <- NPRreg.fit(d, 5, function(x) abs(x)<=1, order=0, - se.method="EHW") + r2 <- NPReg(d, 5, function(x) abs(x)<=1, order=0, se.method="EHW") ## r3 <- AER::ivreg(logcn~retired | Z, subset=(abs(elig_year)<=5), data=dt) ## summary(r3, vcov = sandwich::sandwich, ## diagnostics = TRUE)$coefficients[2, 1:2] diff --git a/tests/testthat/test_rd.R b/tests/testthat/test_rd.R index 050ea99..8a65ee7 100644 --- a/tests/testthat/test_rd.R +++ b/tests/testthat/test_rd.R @@ -22,9 +22,9 @@ test_that("Test class constructor sorting", { -RDHonest(voteshare ~ I(-margin), data=lee08, M=0, h=2)$coefficients$estimate) expect_equal(RDHonest(cn~retired | elig_year, data=rcp, cutoff=0, - M=c(1, 0.1), h=3)$coefficients$estimate, + M=c(1, 0.1), h=3)$coefficients$estimate, RDHonest(cn~retired | I(2*elig_year), data=rcp, cutoff=0, - M=c(1, 0.1), h=6)$coefficients$estimate) + M=c(1, 0.1), h=6)$coefficients$estimate) }) test_that("IK bandwidth calculations", { @@ -33,7 +33,7 @@ test_that("IK 
bandwidth calculations", { dig <- getOption("digits") options(digits=8) - r1 <- capture.output(IKBW.fit(d, verbose=TRUE)) + r1 <- capture.output(IKBW(d, verbose=TRUE)) expect_equal(r1[c(2, 4, 5, 6, 8)], c(" h1: 14.44507 ", " f(0): 0.0089622411 ", " sigma^2_{+}(0): 12.024424 ^2", @@ -41,35 +41,35 @@ test_that("IK bandwidth calculations", { " h_{2, +}: 60.513312 h_{2, -}: 60.993358 ")) options(digits=dig) - expect_equal(IKBW.fit(d), 29.3872649956) + expect_equal(IKBW(d), 29.3872649956) - r <- NPRreg.fit(d, IKBW.fit(d, kern="uniform"), "uniform") + r <- NPReg(d, IKBW(d, kern="uniform"), "uniform") expect_equal(r$estimate, 8.0770003749) - d <- NPRPrelimVar.fit(NPRData(lee08, cutoff=0, "SRD"), se.initial="EHW") + d <- PrelimVar(NPRData(lee08, cutoff=0, "SRD"), se.initial="EHW") expect_equal(sqrt(mean(d$sigma2[d$p])), 12.58183131) expect_equal(sqrt(mean(d$sigma2[d$m])), 10.79067278) }) test_that("Plots", { if (requireNamespace("ggplot2", quietly = TRUE)) { - expect_silent(invisible(plot_RDscatter( - earnings~yearat14, data=cghs, - cutoff=1947, avg=Inf, propdotsize=TRUE))) - expect_silent(invisible(plot_RDscatter( - voteshare~margin, data=lee08, subset=abs(lee08$margin)<=50, - avg=50, propdotsize=FALSE, - xlab="margin", ylab="effect"))) - } + expect_silent(invisible(RDScatter(earnings~yearat14, data=cghs, + cutoff=1947, avg=Inf, + propdotsize=TRUE))) + expect_silent(invisible(RDScatter(voteshare~margin, data=lee08, + subset=abs(lee08$margin)<=50, avg=50, + propdotsize=FALSE, xlab="margin", + ylab="effect"))) + } }) test_that("Honest inference in Lee and LM data", { ## Replicate 1606.01200v2, except we no longer provide SilvermanNN - r1 <- RDHonest(mortHS ~ povrate, data=headst, - kern="uniform", h=18, M=0.1, se.method="EHW") - r2 <- RDHonest(mortHS ~ povrate, data=headst, - kern="uniform", h=18, M=0.1, se.method="nn") + r1 <- RDHonest(mortHS ~ povrate, data=headst, kern="uniform", h=18, M=0.1, + se.method="EHW") + r2 <- RDHonest(mortHS ~ povrate, data=headst, kern="uniform", h=18, M=0.1, + se.method="nn") ## Should match regression rl <- lm(mortHS ~ povrate*I(povrate>=0), data=headst, subset=abs(povrate)<=18) @@ -94,15 +94,15 @@ test_that("Honest inference in Lee and LM data", { expect_equal(r1o[16], "24 observations with missing values dropped") d <- NPRData(headst[!is.na(headst$mortHS), c("mortHS", "povrate")], - cutoff=0, "SRD") - d <- NPRPrelimVar.fit(d, se.initial="Silverman") + cutoff=0, "SRD") + d <- PrelimVar(d, se.initial="Silverman") es <- function(kern, se.method) { - NPRHonest.fit(d, M=0.0076085544, kern=kern, sclass="H", - se.method=se.method, J=3, alpha=0.05, opt.criterion="MSE") + NPRHonest(d, M=0.0076085544, kern=kern, sclass="H", se.method=se.method, + J=3, alpha=0.05, opt.criterion="MSE") } ff <- function(h, kern, se.method) { - NPRHonest.fit(d, M=0.0076085544, kern=kern, sclass="H", - se.method=se.method, J=3, alpha=0.05, h=h) + NPRHonest(d, M=0.0076085544, kern=kern, sclass="H", se.method=se.method, + J=3, alpha=0.05, h=h) } ## In this case the objective is not unimodal, but the modification still @@ -135,9 +135,8 @@ test_that("Honest inference in Lee and LM data", { M=0.2, opt.criterion="MSE", se.method="supplied.var") ## expect_equal(unname(r$lower), 2.2838100315) expect_equal(unname(r$coefficients$conf.low.onesided), 2.983141711) - r <- RDHonest(voteshare ~ margin, data=lee08, kern="optimal", - M=0.04, opt.criterion="OCI", se.method="supplied.var", - beta=0.8) + r <- RDHonest(voteshare ~ margin, data=lee08, kern="optimal", M=0.04, + opt.criterion="OCI", 
se.method="supplied.var", beta=0.8) expect_equal(r$coefficients$conf.low.onesided, 2.761343298) r <- RDHonest(voteshare ~ margin, data=lee08, kern="optimal", M=0.04, opt.criterion="FLCI", se.method="supplied.var") @@ -184,14 +183,13 @@ test_that("Honest inference in Lee and LM data", { ## expect_equal(r3$h, 5.0590753991) ## Decrease M, these results are not true minima... - d <- NPRPrelimVar.fit(NPRData(lee08, cutoff=0, "SRD"), - se.initial="Silverman") - r1 <- NPRHonest.fit(d, M=0.01, kern="uniform", opt.criterion="MSE", - beta=0.8, sclass="T")$coefficients - r2 <- NPRHonest.fit(d, M=0.01, kern="uniform", opt.criterion="MSE", - beta=0.8, sclass="H")$coefficients - r3 <- NPROptBW.fit(d, kern = "uniform", M = 0.1, - opt.criterion = "MSE", sclass = "T") + d <- PrelimVar(NPRData(lee08, cutoff=0, "SRD"), se.initial="Silverman") + r1 <- NPRHonest(d, M=0.01, kern="uniform", opt.criterion="MSE", beta=0.8, + sclass="T")$coefficients + r2 <- NPRHonest(d, M=0.01, kern="uniform", opt.criterion="MSE", beta=0.8, + sclass="H")$coefficients + r3 <- OptBW(d, kern = "uniform", M = 0.1, opt.criterion = "MSE", + sclass = "T") expect_equal(unname(r1$bandwidth), 12.85186708) expect_equal(unname(r2$conf.low.onesided), 6.056860266) expect_equal(unname(r3), 5.086645484) @@ -215,21 +213,19 @@ test_that("Honest inference in Lee and LM data", { }) test_that("BME CIs match paper", { - r1 <- RDHonestBME(log(earnings)~yearat14, data=cghs, - cutoff=1947, h=6, order=1) - expect_equal(as.numeric(r1$coefficients[ - c("estimate", "std.error", "conf.low", - "conf.high", "eff.obs")]), + r1 <- RDHonestBME(log(earnings)~yearat14, data=cghs, cutoff=1947, h=6, + order=1)$coefficients + expect_equal(as.numeric(r1[c("estimate", "std.error", "conf.low", + "conf.high", "eff.obs")]), c(0.0212923111, 0.03272404599, -0.13218978736, 0.17499392431, 20883)) - r2 <- RDHonestBME(log(earnings)~yearat14, cghs, cutoff=1947, - regformula="y~I(x>=0)+x+I(x^2)+I(x^3)+I(x^4)") - ## expect_equal(r1$CI, c(-0.23749230603, 0.34429708773)) + r2 <- RDHonestBME(log(earnings)~yearat14, cghs, + regformula="y~I(x>=0)+x+I(x^2)+I(x^3)+I(x^4)", + cutoff=1947)$coefficients ## Slightly different numbers with bug fixed - expect_equal(as.numeric(r2$coefficients[ - c("estimate", "std.error", "conf.low", - "conf.high", "eff.obs")]), + expect_equal(as.numeric(r2[c("estimate", "std.error", "conf.low", + "conf.high", "eff.obs")]), c(0.05481510635, 0.02975117527, -0.2473386327, 0.3548003576, 73954)) r3 <- RDHonestBME(log(cghs$earnings)~yearat14, data=cghs, h=3, @@ -237,9 +233,9 @@ test_that("BME CIs match paper", { r4 <- RDHonestBME(log(cghs$earnings)~yearat14, data=cghs, h=3, order=0, cutoff=1947) expect_equal(r3$coefficients, r4$coefficients) - r5 <- RDHonestBME(duration~age, data=rebp, subset=(period==1 & female==0), + r5 <- RDHonestBME(duration~age, data=rebp, subset = (period==1 & female==0), order=1, h=Inf, cutoff=50) - r6 <- RDHonestBME(duration~age, data=rebp, subset=(period==1 & female==0), + r6 <- RDHonestBME(duration~age, data=rebp, subset = (period==1 & female==0), order=3, h=1, cutoff=50, alpha=0.1) r5 <- capture.output(print(r5, digits=5)) est <- paste0(" Sharp RD parameter 14.798 ", @@ -271,7 +267,7 @@ test_that("Optimizing bw", { x <- sample(xsupp, 100, prob=xprobs, replace=TRUE) d <- data.frame(y=rnorm(100, sd=1), x=x) r1 <- RDHonest(y~x, data=d, cutoff=0, M=40, kern="triangular", - opt.criterion="FLCI") + opt.criterion="FLCI") r2 <- RDHonest(y~x, data=d, cutoff=0, M=40, kern="triangular", h=0.51) expect_equal(r1$coefficients$conf.low, 
r2$coefficients$conf.low) }) diff --git a/tests/testthat/test_weights.R b/tests/testthat/test_weights.R index c630a9d..07c8d0d 100644 --- a/tests/testthat/test_weights.R +++ b/tests/testthat/test_weights.R @@ -5,7 +5,7 @@ test_that("Test weighting using cghs", { d <- s0$data ## Make 10 groups - d$mod <- floor(10*(d$Y - floor(d$Y))) + d$mod <- floor(10 * (d$Y - floor(d$Y))) ## Make cells by group and year d$cell <- d$mod/10+d$X dd <- data.frame() @@ -22,8 +22,8 @@ test_that("Test weighting using cghs", { d2 <- NPRData(data.frame(y=log(cghs$earnings), x=cghs$yearat14), cutoff= 1947, "SRD") ## Initial estimates - r2 <- NPRreg.fit(d2, 5, "triangular") - r1 <- NPRreg.fit(d1, 5, "triangular") + r2 <- NPReg(d2, 5, "triangular") + r1 <- NPReg(d1, 5, "triangular") ## Checks weights match wp1 <- vapply(unique(d1$X[d1$p]), function(j) sum(r1$w[d1$X==j]), @@ -47,12 +47,12 @@ test_that("Test weighting using cghs", { d1$sigma2[d1$m] <- mean(r2$sigma2[r2$w!=0 & d2$m])/d1$w[d1$m] v1 <- sqrt(sum(wp1^2/np)*mean(r2$sigma2[r2$w!=0 & d2$p])+ - sum(wm1^2/nm)*mean(r2$sigma2[r2$w!=0 & d2$m])) + sum(wm1^2/nm)*mean(r2$sigma2[r2$w!=0 & d2$m])) - m2 <- NPRHonest.fit(d2, M=1, kern="triangular", h=5, - se.method="supplied.var")$coefficients - m1 <- NPRHonest.fit(d1, M=1, kern="triangular", h=5, - se.method="supplied.var")$coefficients + m2 <- NPRHonest(d2, M=1, kern="triangular", h=5, + se.method="supplied.var")$coefficients + m1 <- NPRHonest(d1, M=1, kern="triangular", h=5, + se.method="supplied.var")$coefficients expect_equal(v1, m1$std.error) expect_equal(m1[2:6], m2[2:6]) diff --git a/vignettes/RDHonest.Rmd b/vignettes/RDHonest.Rmd index ccbb617..0fc8a29 100644 --- a/vignettes/RDHonest.Rmd +++ b/vignettes/RDHonest.Rmd @@ -72,17 +72,16 @@ library("RDHonest") ## Plots -The package provides a function `plot_RDscatter` to plot the raw data. To remove +The package provides a function `RDScatter` to plot the raw data. To remove some noise, the function plots averages over `avg` observations. The function takes a formula as an argument. ```{r, fig.width=4.5, fig.height=3.5, fig.cap="Lee (2008) data"} ## plot 25-bin averages for observations at most 50 points ## away from the cutoff. See Figure 1. -plot_RDscatter(voteshare~margin, data=lee08, subset=abs(lee08$margin)<=50, - avg=50, propdotsize=FALSE, - xlab="Margin of victory", - ylab="Vote share in next election") +RDScatter(voteshare~margin, data=lee08, subset=abs(lee08$margin)<=50, + avg=50, propdotsize=FALSE, xlab="Margin of victory", + ylab="Vote share in next election") ``` The running variable in the Oreopoulos dataset is discrete. It is therefore averages over.
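As a rough sketch of the binning logic (an illustration only; the package's actual implementation lives in `R/plots.R` and may differ), averaging over `avg` observations amounts to sorting each side of the cutoff by the running variable and averaging within consecutive blocks. The helper `bin_avg` below is hypothetical, not package code:

```r
## Hypothetical helper, not package code: sort each side of the cutoff by the
## running variable and average x and y within consecutive blocks of `avg`
## observations.
bin_avg <- function(y, x, cutoff, avg = 25) {
    one_side <- function(side) {
        o <- order(x[side])
        g <- ceiling(seq_along(o) / avg)   # block index: 1, 1, ..., 2, 2, ...
        data.frame(x = as.vector(tapply(x[side][o], g, mean)),
                   y = as.vector(tapply(y[side][o], g, mean)))
    }
    rbind(one_side(x < cutoff), one_side(x >= cutoff))
}
## Usage sketch: plot(bin_avg(lee08$voteshare, lee08$margin, cutoff = 0))
```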
```{r, fig.width=4.5, fig.height=3.5, fig.cap="Oreopoulos (2006) data"} ## see Figure 2 -f2 <- plot_RDscatter(I(log(earnings))~yearat14, data=cghs, cutoff=1947, - avg=Inf, xlab="Year aged 14", ylab="Log earnings", - propdotsize=TRUE) +f2 <- RDScatter(I(log(earnings))~yearat14, data=cghs, cutoff=1947, + avg=Inf, xlab="Year aged 14", ylab="Log earnings", + propdotsize=TRUE) ## Adjust size of dots if they are too big f2 + ggplot2::scale_size_area(max_size = 4) ``` @@ -211,10 +210,10 @@ optimality criterion: ```{r} RDHonest(voteshare ~ margin, data=lee08, kern="triangular", - M=0.1, opt.criterion="MSE", sclass="H") + M=0.1, opt.criterion="MSE", sclass="H") ## Choose bws optimal for length of CI RDHonest(voteshare ~ margin, data=lee08, kern="triangular", M=0.1, - opt.criterion="FLCI", sclass="H") + opt.criterion="FLCI", sclass="H") ``` ### Inference when running variable is discrete As an example, consider the @oreopoulos06 data, in which the running variable is ```{r} ## Replicate Table 2, column (10) RDHonest(log(earnings) ~ yearat14, cutoff=1947, - data=cghs, kern="uniform", M=0.04, opt.criterion="FLCI", sclass="H") + data=cghs, kern="uniform", M=0.04, opt.criterion="FLCI", sclass="H") ## Triangular kernel generally gives tighter CIs RDHonest(log(earnings) ~ yearat14, cutoff=1947, - data=cghs, kern="triangular", M=0.04, opt.criterion="FLCI", sclass="H") + data=cghs, kern="triangular", M=0.04, opt.criterion="FLCI", sclass="H") ``` In addition, the package provides function `RDHonestBME` that calculates honest no worse at the cutoff than away from the cutoff as in Section 5.2 in @KoRo16. ## Replicate Table 2, column (6), run local linear regression (order=1) ## with a uniform kernel (other kernels are not yet implemented) RDHonestBME(log(earnings) ~ yearat14, cutoff=1947, - data=cghs, h=3, order=1) + data=cghs, h=3, order=1) ``` -Let us describe the implementation of the variance estimator $\hat{V}(W)$ used to construct the CI as described in Section 5.2 in @KoRo16. Suppose the point +Let us describe the implementation of the variance estimator $\hat{V}(W)$ used +to construct the CI as described in Section 5.2 in @KoRo16. Suppose the point estimate is given by the first element of the regression of the outcome $y_i$ on $m(x_i)$. For instance, local linear regression with uniform kernel and bandwidth $h$ corresponds to $m(x)=I(|x|\leq h)\cdot(I(x>c_0),1,x, x\cdot @@ -347,9 +347,10 @@ $M$ a priori, rather than attempting to use a data-driven method. Therefore, one should, whenever possible, use problem-specific knowledge to decide what choice of $M$ is reasonable a priori.
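Because the honest CI can only widen as $M$ increases, a useful complement to an a priori choice is a sensitivity check across a grid of candidate values. The sketch below reuses the `RDHonest` calls shown in the chunks above; the grid of $M$ values is arbitrary, not a recommendation:

```r
## Sensitivity sketch: recompute the honest CI over an illustrative grid of
## smoothness bounds M to gauge how conclusions depend on the a priori choice.
Ms <- c(0.05, 0.1, 0.2)
ci <- t(sapply(Ms, function(M) {
    co <- RDHonest(voteshare ~ margin, data = lee08, kern = "triangular",
                   M = M, opt.criterion = "FLCI", sclass = "H")$coefficients
    c(M = M, conf.low = co$conf.low, conf.high = co$conf.high)
}))
ci
```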
-For cases in which this is difficult, the function `NPR_MROT.fit` implements the method -considered in Section 3.4.1 in @ArKo16honest based on a global polynomial -approximation: +For cases in which this is difficult, the function `RDHonest` +implements the method considered in Section 3.4.1 in @ArKo16honest based on a +global polynomial approximation: + ```{r} ## Data-driven choice of M RDHonest(voteshare ~ margin, data=lee08, kern="uniform", sclass="H", @@ -369,11 +370,11 @@ estimate used to compute optimal bandwidths: ```{r} r1 <- RDHonest(voteshare ~ margin, data=lee08, kern="optimal", M=0.1, - opt.criterion="FLCI", - se.method="nn")$coefficients + opt.criterion="FLCI", + se.method="nn")$coefficients r2 <- RDHonest(voteshare ~ margin, data=lee08, kern="triangular", M=0.1, - opt.criterion="FLCI", se.method="nn", - sclass="T")$coefficients + opt.criterion="FLCI", se.method="nn", + sclass="T")$coefficients r1$conf.high-r1$conf.low r2$conf.high-r2$conf.low ``` @@ -383,10 +384,13 @@ r2$conf.high-r2$conf.low The package also implements lower-bound estimates for the smoothness constant $M$ for the Taylor and Hölder smoothness class, as described in the supplements to @KoRo16 and @ArKo16optimal +TODO: `PrelimVar` is internal + + ```{r} ## Add variance estimate to the Lee (2008) data so that the RDSmoothnessBound ## function doesn't have to compute them each time -## dl <- NPRPrelimVar.fit(dl, se.initial="nn") +## dl <- PrelimVar(dl, se.initial="nn") ### Only use three-point averages for the 100 points closest to the cutoff, ### and report results separately for points above and below cutoff @@ -412,9 +416,9 @@ d$cell <- d$mod/100+d$yearat14 ## Data with cell averages dd <- data.frame() for (j in unique(d$cell)){ - dd <- rbind(dd, data.frame(y=mean(log(d$earnings)[d$cell==j]), - x=mean(d$yearat14[d$cell==j]), - weights=length(d$yearat14[d$cell==j]))) + dd <- rbind(dd, data.frame(y=mean(log(d$earnings)[d$cell==j]), + x=mean(d$yearat14[d$cell==j]), + weights=length(d$yearat14[d$cell==j]))) } ``` @@ -491,11 +495,11 @@ estimator of the treatment effect ```{r} ## Initial estimate of treatment effect for optimal bandwidth calculations r <- RDHonest(log(cn) ~ retired | elig_year, data=rcp, kern="triangular", - M=c(0.001, 0.002), opt.criterion="MSE", sclass="H", T0=0) + M=c(0.001, 0.002), opt.criterion="MSE", sclass="H", T0=0) ## Use it to compute optimal bandwidth RDHonest(log(cn) ~ retired | elig_year, data=rcp, kern="triangular", - M=c(0.001, 0.002), opt.criterion="MSE", sclass="H", - T0=r$coefficients$estimate) + M=c(0.001, 0.002), opt.criterion="MSE", sclass="H", + T0=r$coefficients$estimate) ``` ## Data-driven choice of smoothness constant @@ -507,13 +511,14 @@ attempting to use a data-driven method. Therefore, one should, whenever possible, use problem-specific knowledge to decide what choices of $M_1$ and $M_2$ are reasonable a priori.
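The same sensitivity check extends to the fuzzy RD case by varying the pair $(M_1, M_2)$ jointly. The sketch below reuses the call and the initial estimate `r` from the chunk above; the grid of pairs is illustrative only:

```r
## Sensitivity sketch for fuzzy RD: vary the pair (M1, M2) jointly over an
## illustrative grid, reusing the initial estimate `r` computed above as T0.
for (Mpair in list(c(0.0005, 0.001), c(0.001, 0.002), c(0.002, 0.004))) {
    co <- RDHonest(log(cn) ~ retired | elig_year, data = rcp,
                   kern = "triangular", M = Mpair, opt.criterion = "MSE",
                   sclass = "H", T0 = r$coefficients$estimate)$coefficients
    cat(sprintf("M1=%.4f, M2=%.4f: CI=[%.3f, %.3f]\n",
                Mpair[1], Mpair[2], co$conf.low, co$conf.high))
}
```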
-For cases in which this is difficult, the function `NPR_MROT.fit` implements the method -considered in Section 3.4.1 in @ArKo16honest based on a global polynomial +For cases in which this is difficult, the function `RDHonest` implements the +method considered in Section 3.4.1 in @ArKo16honest based on a global polynomial approximation: + ```{r} ## Data-driven choice of M RDHonest(log(cn) ~ retired | elig_year, data=rcp, kern="triangular", - opt.criterion="MSE", sclass="H", T0=r$coefficients$estimate) + opt.criterion="MSE", sclass="H", T0=r$coefficients$estimate) ``` See @ArKo16honest for a discussion of the restrictions on the parameter space under which this method yields honest inference.
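For intuition, the rule of thumb of Section 3.4.1 in @ArKo16honest can be sketched as follows: fit a global polynomial (a quartic here) separately on each side of the cutoff and bound the smoothness constant by the largest second derivative it implies. The helper `rot_M` below is a hypothetical illustration of the idea, not the package's implementation, which may differ in details such as the polynomial order:

```r
## Hypothetical illustration of the rule-of-thumb idea, not package code: fit
## a global quartic on each side of the cutoff and bound M by the largest
## second derivative it implies over the observed support.
rot_M <- function(y, x, cutoff = 0) {
    side_max <- function(side) {
        fit <- lm(y ~ x + I(x^2) + I(x^3) + I(x^4),
                  data = data.frame(y = y[side], x = x[side]))
        b <- coef(fit)
        ## f(x) = b1 + b2 x + b3 x^2 + b4 x^3 + b5 x^4, so
        ## f''(x) = 2 b3 + 6 b4 x + 12 b5 x^2
        max(abs(2 * b[3] + 6 * b[4] * x[side] + 12 * b[5] * x[side]^2))
    }
    max(side_max(x < cutoff), side_max(x >= cutoff))
}
## Usage sketch: rot_M(log(rcp$cn), rcp$elig_year) for the reduced-form bound
```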