Update vignette and documentation

kolesarm · Mar 13, 2024 · a309e49 · a309e49
1 parent 661fe6c
commit a309e49
Show file tree

Hide file tree

Showing 21 changed files with 1,700 additions and 948 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: RDHonest
 Title: Honest Inference in Regression Discontinuity Designs
-Version: 0.5.0
+Version: 0.9.0
 Authors@R:
     c(person(given = "Michal",
             family = "Kolesár",
@@ -13,7 +13,7 @@ Authors@R:
              email = "[email protected]"))
 Description: Honest and nearly-optimal confidence intervals in fuzzy and sharp
     regression discontinuity designs and for inference at a point based on local
-    linear regression.
+    linear regression. Supports covariates, clustering, and weighting.
 Depends: R (>= 4.3.0)
 License: GPL-3
 Encoding: UTF-8

diff --git a/R/RDHonest.R b/R/RDHonest.R
@@ -13,15 +13,15 @@
 #' @param cutoff specifies the RD cutoff in the running variable. For inference
 #'     at a point, specifies the point \eqn{x_0} at which to calculate the
 #'     conditional mean.
-#' @param kern specifies kernel function used in the local regression. It can
-#'     either be a string equal to \code{"triangular"} (\eqn{k(u)=(1-|u|)_{+}}),
-#'     \code{"epanechnikov"} (\eqn{k(u)=(3/4)(1-u^2)_{+}}), or \code{"uniform"}
-#'     (\eqn{k(u)= (|u|<1)/2}), or else a kernel function. If equal to
-#'     \code{"optimal"}, use the finite-sample optimal linear estimator under
-#'     Taylor smoothness class, instead of a local linear estimator.
-#' @param se.method Vector with methods for estimating standard error of
-#'     estimate. If \code{NULL}, standard errors are not computed. The elements
-#'     of the vector can consist of the following methods:
+#' @param kern specifies the kernel function used in the local regression. It
+#'     can either be a string equal to \code{"triangular"}
+#'     (\eqn{k(u)=(1-|u|)_{+}}), \code{"epanechnikov"}
+#'     (\eqn{k(u)=(3/4)(1-u^2)_{+}}), or \code{"uniform"} (\eqn{k(u)=
+#'     (|u|<1)/2}), or else a kernel function. If equal to \code{"optimal"}, use
+#'     the finite-sample optimal linear estimator under Taylor smoothness class,
+#'     instead of a local linear estimator.
+#' @param se.method method for estimating standard error of the estimate, one
+#'     of:
 #'
 #' \describe{
 #'     \item{"nn"}{Nearest neighbor method}
@@ -30,12 +30,13 @@
 #'     (local polynomial estimators only).}
 #'
 #'    \item{"supplied.var"}{Use conditional variance supplied by \code{sigmaY2}
-#'         instead of computing residuals}
+#'         instead of computing residuals. For fuzzy RD, \code{sigmaD2} and
+#'         \code{sigmaYD} also need to be supplied in this case.}
 #'
 #' }
-#' @param J Number of nearest neighbors, if "nn" is specified in
-#'     \code{se.method}.
-#' @param opt.criterion Optimality criterion that bandwidth is designed to
+#' @param J Number of nearest neighbors, if \code{se.method="nn"} is specified.
+#'     Otherwise ignored.
+#' @param opt.criterion Optimality criterion that the bandwidth is designed to
 #'     optimize. The options are:
 #'
 #'    \describe{
@@ -51,66 +52,99 @@
 #'     }
 #'
 #'     The methods use conditional variance given by \code{sigmaY2}, if
-#'     supplied. Otherwise, for the purpose of estimating the optimal bandwidth,
-#'     conditional variance is estimated using the method specified by
-#'     \code{se.initial}.
+#'     supplied. For fuzzy RD, \code{sigmaD2} and \code{sigmaYD} also need to be
+#'     supplied in this case. Otherwise, the methods use preliminary variance
+#'     estimates based on assuming homoskedasticity on either side of the
+#'     cutoff.
 #' @param beta Determines quantile of excess length to optimize, if bandwidth
 #'     optimizes given quantile of excess length of one-sided confidence
-#'     intervals; otherwise ignored.
+#'     intervals (\code{opt.criterion="OCI"}); otherwise ignored.
 #' @param alpha determines confidence level, \code{1-alpha} for
 #'     constructing/optimizing confidence intervals.
-#' @param M Bound on second derivative of the conditional mean function.
+#' @param M Bound on second derivative of the conditional mean function, a
+#'     numeric vector of length one. For fuzzy RD, \code{M} needs to be a
+#'     numeric vector of length two, specifying the smoothness of the
+#'     conditional mean for the outcome and treatment, respectively.
 #' @param sclass Smoothness class, either \code{"T"} for Taylor or \code{"H"}
 #'     for Hölder class.
 #' @param h bandwidth, a scalar parameter. If not supplied, optimal bandwidth is
 #'     computed according to criterion given by \code{opt.criterion}.
 #' @param weights Optional vector of weights to weight the observations (useful
-#'     for aggregated data). Disregarded if optimal kernel is used.
+#'     for aggregated data). The weights are intepreted as the number of
+#'     observations that each aggregated datapoint averages over. Disregarded if
+#'     optimal kernel is used.
 #' @param point.inference Do inference at a point determined by \code{cutoff}
 #'     instead of RD.
 #' @param T0 Initial estimate of the treatment effect for calculating the
-#'     optimal bandwidth. Only relevant for Fuzzy RD.
+#'     optimal bandwidth. Only relevant for fuzzy RD.
 #' @param sigmaY2 Supply variance of outcome. Ignored when kernel is optimal.
 #' @param sigmaD2 Supply variance of treatment (fuzzy RD only).
 #' @param sigmaYD Supply covariance of treatment and outcome (fuzzy RD only).
-#' @param clusterid Cluster id for cluster-robust standard errors
+#' @param clusterid Vector specifying cluster membership. If supplied,
+#'     \code{se.method="EHW"} is required, and standard errors use
+#'     cluster-robust variance formulas.
 #' @return Returns an object of class \code{"RDResults"}. The function
 #'     \code{print} can be used to obtain and print a summary of the results. An
-#'     object of class \code{"RDResults"} is a list containing the following
-#'     components
+#'     object of class \code{"RDResults"} is a list containing four components.
+#'     First, a data frame \code{"coefficients"} containing the following
+#'     columns:
 #'
 #'     \describe{
-#'   \item{\code{estimate}}{Point estimate. This estimate is MSE-optimal if
-#'                   \code{opt.criterion="MSE"}}
+#'     \item{\code{term}}{type of parameter being estimated}
 #'
-#'   \item{\code{lff}}{Least favorable function, only relevant for optimal
-#'              estimator under Taylor class.}
+#'     \item{\code{estimate}}{point estimate}
 #'
-#'   \item{\code{maxbias}}{Maximum bias of \code{estimate}}
+#'     \item{\code{std.error}}{standard error of \code{estimate}}
 #'
-#'   \item{\code{sd}}{Standard deviation of estimate}
+#'     \item{\code{maximum.bias}}{maximum bias of \code{estimate}}
 #'
-#'   \item{\code{lower}, \code{upper}}{Lower (upper) end-point of a one-sided CI
-#'         based on \code{estimate}. This CI is optimal if
-#'         \code{opt.criterion=="OCI"}}
+#'     \item{\code{conf.low}, \code{conf.high}}{lower (upper) end-point of a
+#'     two-sided CI based on \code{estimate}}
 #'
-#'   \item{\code{hl}}{Half-length of a two-sided CI based on \code{estimate}, so
-#'             that the CI is given by \code{c(estimate-hl, estimate+hl)}. The
-#'             CI is optimal if \code{opt.criterion="FLCI"}}
+#'     \item{\code{conf.low.onesided}, \code{conf.high.onesided}}{lower (upper)
+#'         end-point of a one-sided CIs based on \code{estimate}}
 #'
-#'   \item{\code{eff.obs}}{Effective number of observations used by
-#'             \code{estimate}}
+#'     \item{\code{bandwidth}}{bandwidth used. If \code{kern="optimal"}, the
+#'       smoothing parameters \code{bandwidth.m} and \code{bandwidth.p} on
+#'       either side of the cutoff are reported intead}
 #'
-#'   \item{\code{h}}{Bandwidth used}
+#'     \item{\code{eff.obs}}{number of effective observations}
 #'
-#'   \item{\code{naive}}{Coverage of CI that ignores bias and uses
-#'                \code{qnorm(1-alpha/2)} as critical value}
+#'     \item{\code{leverage}}{maximal leverage of \code{estimate}}
 #'
-#'   \item{\code{call}}{the matched call}
+#'     \item{\code{cv}}{critical value used to compute two-sided CIs}
+#'
+#'     \item{\code{alpha}}{coverage level, as specified by option \code{alpha}}
+#'
+#'     \item{\code{method}}{\code{sclass} is used}
+#'
+#'      \item{\code{M}}{curvature bound used for worst-case bias
+#'         calculations. For fuzzy RD, equals
+#'         \code{(abs(estimate)*M.fs+M.rf)/first.stage}}
+#'
+#'      \item{\code{M.rf}, \code{M.fs}}{curvature bound for the outcome (i.e.
+#'      reduced-form) and first-stage regressions. Fuzzy RD only.}
+#'
+#'       \item{\code{first.stage}}{estimate of the first-stage coefficient.
+#'       Fuzzy RD only.}
+#'
+#'       \item{\code{kernel}}{kernel used}
+#'
+#'        \item{\code{p.value}}{p-value for testing the null of no effect}
+#'      }
+#'
+#'     Second, a list called \code{"data"} containing the data used for
+#'     estimation. This is useful mostly for internal calculations. Third, an
+#'     object of class \code{"lm"} containing the local linear regression
+#'     estimates. Finally, a \code{call} object containing the matched call
+#'     called \code{"call"}.
+#'
+#'     If \code{kern="optimal"}, the \code{"lm"} object is empty, and the
+#'     numeric vectors \code{"delta"} and \code{"omega"} are returned in
+#'     addition. These correspond to the parameters in the modulus problem used
+#'     to compute the optimal estimation weights.
 #'
-#'   \item{\code{fs}}{Estimate of the first-stage coefficient (sharp RD only)}
 #'
-#' }
 #' @references{
 #'
 #' \cite{Timothy B. Armstrong and Michal Kolesár. Optimal inference in a class
@@ -121,10 +155,6 @@
 #' intervals in nonparametric regression. Quantitative Economics, 11(1):1–39,
 #' January 2020. \doi{10.3982/QE1199}}
 #'
-#' \cite{Guido W. Imbens and Karthik Kalyanaraman. Optimal bandwidth choice for
-#' the regression discontinuity estimator. The Review of Economic Studies,
-#' 79(3):933–959, July 2012. \doi{10.1093/restud/rdr043}}
-#'
 #' \cite{Michal Kolesár and Christoph Rothe. Inference in regression
 #' discontinuity designs with a discrete running variable. American Economic
 #' Review, 108(8):2277—-2304, August 2018. \doi{10.1257/aer.20160945}}
@@ -348,7 +378,7 @@ print.RDResults <- function(x, digits = getOption("digits"), ...) {
     fmt <- function(x) format(x, digits=digits, width=digits+1)
     y <- x$coefficients
     cat("Inference for ", y$term, " (using ", y$method,
-        " class), confidence level ", 100*(1-y$alpha), "%:\n", sep="")
+        " class), confidence level ", 100 * (1-y$alpha), "%:\n", sep="")
     nm <- c("Estimate", "Std. Error", "Maximum Bias")
     colnames(y)[2:4] <- nm
     y$"Confidence Interval" <- paste0("(", fmt(y$conf.low), ", ",
@@ -377,7 +407,7 @@ print.RDResults <- function(x, digits = getOption("digits"), ...) {
     if (!is.null(y$bandwidth)) {
         cat("\nBased on local regression with bandwidth: ", fmt(y$bandwidth),
             ", kernel: ", y$kernel, "\nRegression coefficients:\n", sep="")
-        print.default(format(coef(x$lm), digits = max(3L, digits - 3L)),
+        print.default(format(stats::coef(x$lm), digits = max(3L, digits - 3L)),
                       print.gap = 2L, quote = FALSE)
     } else {
         cat("\nSmoothing parameters below and above cutoff: ",

diff --git a/R/RD_bme.R b/R/RD_bme.R
@@ -14,13 +14,13 @@ RDlpformula <- function(order) {
 
 #' Honest CIs in sharp RD with discrete regressors under BME function class
 #'
-#' Computes honest CIs for local polynomial regression with uniform kernel under
-#' the assumption that the conditional mean lies in the bounded misspecification
-#' error (BME) class of functions, as considered in Kolesár and Rothe (2018).
-#' This class formalizes the notion that the fit of the chosen model is no worse
-#' at the cutoff than elsewhere in the estimation window.
+#' Computes honest CIs for local polynomial regression with uniform kernel in
+#' sharp RD under the assumption that the conditional mean lies in the bounded
+#' misspecification error (BME) class of functions, as considered in Kolesár and
+#' Rothe (2018). This class formalizes the notion that the fit of the chosen
+#' model is no worse at the cutoff than elsewhere in the estimation window.
 #'
-#' @template RDFormula
+#' @template RDFormulaSimple
 #' @param cutoff specifies the RD cutoff in the running variable.
 #' @param h bandwidth, a scalar parameter.
 #' @param alpha determines confidence level, \eqn{1-\alpha}{1-alpha}
@@ -44,6 +44,9 @@ RDlpformula <- function(order) {
 #'
 #'    \item{\code{"call"}}{The matched call.}
 #'
+#'    \item{\code{"lm"}}{An \code{"lm"} object containing the fitted
+#'    regression.}
+#'
 #'    \item{\code{"na.action"}}{(If relevant) information on the special
 #'    handling of \code{NA}s.}
 #'

diff --git a/R/plots.R b/R/plots.R
@@ -4,7 +4,7 @@
 #' average.
 #'
 #' @param cutoff specifies the RD cutoff for the running variable.
-#' @template RDFormula
+#' @template RDFormulaSimple
 #' @param avg Number of observations to average over. If set to \code{Inf}, then
 #'     take averages for each possible value of the running variable (convenient
 #'     when the running variable is discrete).