diff --git a/DESCRIPTION b/DESCRIPTION index e3f2702..f104149 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: RDHonest Title: Honest Inference in Regression Discontinuity Designs -Version: 0.5.0 +Version: 0.9.0 Authors@R: c(person(given = "Michal", family = "Kolesár", @@ -13,7 +13,7 @@ Authors@R: email = "timothy.armstrong@usc.edu")) Description: Honest and nearly-optimal confidence intervals in fuzzy and sharp regression discontinuity designs and for inference at a point based on local - linear regression. + linear regression. Supports covariates, clustering, and weighting. Depends: R (>= 4.3.0) License: GPL-3 Encoding: UTF-8 diff --git a/R/RDHonest.R b/R/RDHonest.R index 2c0a82a..747bc0c 100644 --- a/R/RDHonest.R +++ b/R/RDHonest.R @@ -13,15 +13,15 @@ #' @param cutoff specifies the RD cutoff in the running variable. For inference #' at a point, specifies the point \eqn{x_0} at which to calculate the #' conditional mean. -#' @param kern specifies kernel function used in the local regression. It can -#' either be a string equal to \code{"triangular"} (\eqn{k(u)=(1-|u|)_{+}}), -#' \code{"epanechnikov"} (\eqn{k(u)=(3/4)(1-u^2)_{+}}), or \code{"uniform"} -#' (\eqn{k(u)= (|u|<1)/2}), or else a kernel function. If equal to -#' \code{"optimal"}, use the finite-sample optimal linear estimator under -#' Taylor smoothness class, instead of a local linear estimator. -#' @param se.method Vector with methods for estimating standard error of -#' estimate. If \code{NULL}, standard errors are not computed. The elements -#' of the vector can consist of the following methods: +#' @param kern specifies the kernel function used in the local regression. It +#' can either be a string equal to \code{"triangular"} +#' (\eqn{k(u)=(1-|u|)_{+}}), \code{"epanechnikov"} +#' (\eqn{k(u)=(3/4)(1-u^2)_{+}}), or \code{"uniform"} (\eqn{k(u)= +#' (|u|<1)/2}), or else a kernel function. If equal to \code{"optimal"}, use +#' the finite-sample optimal linear estimator under Taylor smoothness class, +#' instead of a local linear estimator. +#' @param se.method method for estimating standard error of the estimate, one +#' of: #' #' \describe{ #' \item{"nn"}{Nearest neighbor method} @@ -30,12 +30,13 @@ #' (local polynomial estimators only).} #' #' \item{"supplied.var"}{Use conditional variance supplied by \code{sigmaY2} -#' instead of computing residuals} +#' instead of computing residuals. For fuzzy RD, \code{sigmaD2} and +#' \code{sigmaYD} also need to be supplied in this case.} #' #' } -#' @param J Number of nearest neighbors, if "nn" is specified in -#' \code{se.method}. -#' @param opt.criterion Optimality criterion that bandwidth is designed to +#' @param J Number of nearest neighbors, if \code{se.method="nn"} is specified. +#' Otherwise ignored. +#' @param opt.criterion Optimality criterion that the bandwidth is designed to #' optimize. The options are: #' #' \describe{ @@ -51,66 +52,99 @@ #' } #' #' The methods use conditional variance given by \code{sigmaY2}, if -#' supplied. Otherwise, for the purpose of estimating the optimal bandwidth, -#' conditional variance is estimated using the method specified by -#' \code{se.initial}. +#' supplied. For fuzzy RD, \code{sigmaD2} and \code{sigmaYD} also need to be +#' supplied in this case. Otherwise, the methods use preliminary variance +#' estimates based on assuming homoskedasticity on either side of the +#' cutoff. 
#' @param beta Determines quantile of excess length to optimize, if bandwidth #' optimizes given quantile of excess length of one-sided confidence -#' intervals; otherwise ignored. +#' intervals (\code{opt.criterion="OCI"}); otherwise ignored. #' @param alpha determines confidence level, \code{1-alpha} for #' constructing/optimizing confidence intervals. -#' @param M Bound on second derivative of the conditional mean function. +#' @param M Bound on second derivative of the conditional mean function, a +#' numeric vector of length one. For fuzzy RD, \code{M} needs to be a +#' numeric vector of length two, specifying the smoothness of the +#' conditional mean for the outcome and treatment, respectively. #' @param sclass Smoothness class, either \code{"T"} for Taylor or \code{"H"} #' for Hölder class. #' @param h bandwidth, a scalar parameter. If not supplied, optimal bandwidth is #' computed according to criterion given by \code{opt.criterion}. #' @param weights Optional vector of weights to weight the observations (useful -#' for aggregated data). Disregarded if optimal kernel is used. +#' for aggregated data). The weights are interpreted as the number of +#' observations that each aggregated datapoint averages over. Disregarded if +#' optimal kernel is used. #' @param point.inference Do inference at a point determined by \code{cutoff} #' instead of RD. #' @param T0 Initial estimate of the treatment effect for calculating the -#' optimal bandwidth. Only relevant for Fuzzy RD. +#' optimal bandwidth. Only relevant for fuzzy RD. #' @param sigmaY2 Supply variance of outcome. Ignored when kernel is optimal. #' @param sigmaD2 Supply variance of treatment (fuzzy RD only). #' @param sigmaYD Supply covariance of treatment and outcome (fuzzy RD only). -#' @param clusterid Cluster id for cluster-robust standard errors +#' @param clusterid Vector specifying cluster membership. If supplied, +#' \code{se.method="EHW"} is required, and standard errors use +#' cluster-robust variance formulas. #' @return Returns an object of class \code{"RDResults"}. The function #' \code{print} can be used to obtain and print a summary of the results. An -#' object of class \code{"RDResults"} is a list containing the following -#' components +#' object of class \code{"RDResults"} is a list containing four components. +#' First, a data frame \code{"coefficients"} containing the following +#' columns: #' #' \describe{ -#' \item{\code{estimate}}{Point estimate. This estimate is MSE-optimal if -#' \code{opt.criterion="MSE"}} +#' \item{\code{term}}{type of parameter being estimated} #' -#' \item{\code{lff}}{Least favorable function, only relevant for optimal -#' estimator under Taylor class.} +#' \item{\code{estimate}}{point estimate} #' -#' \item{\code{maxbias}}{Maximum bias of \code{estimate}} +#' \item{\code{std.error}}{standard error of \code{estimate}} #' -#' \item{\code{sd}}{Standard deviation of estimate} +#' \item{\code{maximum.bias}}{maximum bias of \code{estimate}} #' -#' \item{\code{lower}, \code{upper}}{Lower (upper) end-point of a one-sided CI -#' based on \code{estimate}. This CI is optimal if -#' \code{opt.criterion=="OCI"}} +#' \item{\code{conf.low}, \code{conf.high}}{lower (upper) end-point of a +#' two-sided CI based on \code{estimate}} #' -#' \item{\code{hl}}{Half-length of a two-sided CI based on \code{estimate}, so -#' that the CI is given by \code{c(estimate-hl, estimate+hl)}. 
The -#' CI is optimal if \code{opt.criterion="FLCI"}} +#' \item{\code{conf.low.onesided}, \code{conf.high.onesided}}{lower (upper) +#' end-point of a one-sided CI based on \code{estimate}} #' -#' \item{\code{eff.obs}}{Effective number of observations used by -#' \code{estimate}} +#' \item{\code{bandwidth}}{bandwidth used. If \code{kern="optimal"}, the +#' smoothing parameters \code{bandwidth.m} and \code{bandwidth.p} on +#' either side of the cutoff are reported instead} #' -#' \item{\code{h}}{Bandwidth used} +#' \item{\code{eff.obs}}{number of effective observations} #' -#' \item{\code{naive}}{Coverage of CI that ignores bias and uses -#' \code{qnorm(1-alpha/2)} as critical value} +#' \item{\code{leverage}}{maximal leverage of \code{estimate}} #' -#' \item{\code{call}}{the matched call} +#' \item{\code{cv}}{critical value used to compute two-sided CIs} +#' +#' \item{\code{alpha}}{coverage level, as specified by option \code{alpha}} +#' +#' \item{\code{method}}{the \code{sclass} used} +#' +#' \item{\code{M}}{curvature bound used for worst-case bias +#' calculations. For fuzzy RD, equals +#' \code{(abs(estimate)*M.fs+M.rf)/first.stage}} +#' +#' \item{\code{M.rf}, \code{M.fs}}{curvature bound for the outcome (i.e. +#' reduced-form) and first-stage regressions. Fuzzy RD only.} +#' +#' \item{\code{first.stage}}{estimate of the first-stage coefficient. +#' Fuzzy RD only.} +#' +#' \item{\code{kernel}}{kernel used} +#' +#' \item{\code{p.value}}{p-value for testing the null of no effect} +#' } +#' +#' Second, a list called \code{"data"} containing the data used for +#' estimation. This is useful mostly for internal calculations. Third, an +#' object of class \code{"lm"} containing the local linear regression +#' estimates. Finally, a \code{call} object called \code{"call"}, +#' containing the matched call. +#' +#' If \code{kern="optimal"}, the \code{"lm"} object is empty, and the +#' numeric vectors \code{"delta"} and \code{"omega"} are returned in +#' addition. These correspond to the parameters in the modulus problem used +#' to compute the optimal estimation weights. #' -#' \item{\code{fs}}{Estimate of the first-stage coefficient (sharp RD only)} #' -#' } #' @references{ #' #' \cite{Timothy B. Armstrong and Michal Kolesár. Optimal inference in a class #' @@ -121,10 +155,6 @@ #' intervals in nonparametric regression. Quantitative Economics, 11(1):1–39, #' January 2020. \doi{10.3982/QE1199}} #' -#' \cite{Guido W. Imbens and Karthik Kalyanaraman. Optimal bandwidth choice for -#' the regression discontinuity estimator. The Review of Economic Studies, -#' 79(3):933–959, July 2012. \doi{10.1093/restud/rdr043}} -#' #' \cite{Michal Kolesár and Christoph Rothe. Inference in regression #' discontinuity designs with a discrete running variable. American Economic #' Review, 108(8):2277—-2304, August 2018. \doi{10.1257/aer.20160945}} @@ -348,7 +378,7 @@ print.RDResults <- function(x, digits = getOption("digits"), ...) { fmt <- function(x) format(x, digits=digits, width=digits+1) y <- x$coefficients cat("Inference for ", y$term, " (using ", y$method, - " class), confidence level ", 100*(1-y$alpha), "%:\n", sep="") + " class), confidence level ", 100 * (1-y$alpha), "%:\n", sep="") nm <- c("Estimate", "Std. Error", "Maximum Bias") colnames(y)[2:4] <- nm y$"Confidence Interval" <- paste0("(", fmt(y$conf.low), ", ", @@ -377,7 +407,7 @@ print.RDResults <- function(x, digits = getOption("digits"), ...) 
{ if (!is.null(y$bandwidth)) { cat("\nBased on local regression with bandwidth: ", fmt(y$bandwidth), ", kernel: ", y$kernel, "\nRegression coefficients:\n", sep="") - print.default(format(coef(x$lm), digits = max(3L, digits - 3L)), + print.default(format(stats::coef(x$lm), digits = max(3L, digits - 3L)), print.gap = 2L, quote = FALSE) } else { cat("\nSmoothing parameters below and above cutoff: ", diff --git a/R/RD_bme.R b/R/RD_bme.R index ef61228..22b4d86 100644 --- a/R/RD_bme.R +++ b/R/RD_bme.R @@ -14,13 +14,13 @@ RDlpformula <- function(order) { #' Honest CIs in sharp RD with discrete regressors under BME function class #' -#' Computes honest CIs for local polynomial regression with uniform kernel under -#' the assumption that the conditional mean lies in the bounded misspecification -#' error (BME) class of functions, as considered in Kolesár and Rothe (2018). -#' This class formalizes the notion that the fit of the chosen model is no worse -#' at the cutoff than elsewhere in the estimation window. +#' Computes honest CIs for local polynomial regression with uniform kernel in +#' sharp RD under the assumption that the conditional mean lies in the bounded +#' misspecification error (BME) class of functions, as considered in Kolesár and +#' Rothe (2018). This class formalizes the notion that the fit of the chosen +#' model is no worse at the cutoff than elsewhere in the estimation window. #' -#' @template RDFormula +#' @template RDFormulaSimple #' @param cutoff specifies the RD cutoff in the running variable. #' @param h bandwidth, a scalar parameter. #' @param alpha determines confidence level, \eqn{1-\alpha}{1-alpha} @@ -44,6 +44,9 @@ RDlpformula <- function(order) { #' #' \item{\code{"call"}}{The matched call.} #' +#' \item{\code{"lm"}}{An \code{"lm"} object containing the fitted +#' regression.} +#' #' \item{\code{"na.action"}}{(If relevant) information on the special #' handling of \code{NA}s.} #' diff --git a/R/plots.R b/R/plots.R index 4f4d1ca..5725932 100644 --- a/R/plots.R +++ b/R/plots.R @@ -4,7 +4,7 @@ #' average. #' #' @param cutoff specifies the RD cutoff for the running variable. -#' @template RDFormula +#' @template RDFormulaSimple #' @param avg Number of observations to average over. If set to \code{Inf}, then #' take averages for each possible value of the running variable (convenient #' when the running variable is discrete). diff --git a/doc/RDHonest.R b/doc/RDHonest.R index 201556c..dbca8a1 100644 --- a/doc/RDHonest.R +++ b/doc/RDHonest.R @@ -4,11 +4,9 @@ knitr::opts_knit$set(self.contained = FALSE) knitr::opts_chunk$set(tidy = TRUE, collapse=TRUE, comment = "#>", tidy.opts=list(blank=FALSE, width.cutoff=55)) -## ----------------------------------------------------------------------------- -library("RDHonest") - ## ----fig.width=4.5, fig.height=3.5, fig.cap="Lee (2008) data"----------------- -## plot 25-bin averages in for observations 50 at most points +library("RDHonest") +## plot 50-bin averages for observations at most 50 points ## away from the cutoff. See Figure 1. 
RDScatter(voteshare~margin, data=lee08, subset=abs(lee08$margin)<=50, avg=50, propdotsize=FALSE, xlab="Margin of victory", @@ -16,7 +14,7 @@ RDScatter(voteshare~margin, data=lee08, subset=abs(lee08$margin)<=50, ## ----fig.width=4.5, fig.height=3.5, fig.cap="Oreopoulos (2006) data"---------- ## see Figure 2 -f2 <- RDScatter(I(log(earnings))~yearat14, data=cghs, cutoff=1947, +f2 <- RDScatter(log(earnings)~yearat14, data=cghs, cutoff=1947, avg=Inf, xlab="Year aged 14", ylab="Log earnings", propdotsize=TRUE) ## Adjust size of dots if they are too big @@ -30,97 +28,130 @@ CVb(1/2, alpha=0.05) CVb(0:5, alpha=0.1) ## ----------------------------------------------------------------------------- -RDHonest(voteshare~margin, data=lee08, kern="uniform", M=0.1, h=10, sclass="T") -RDHonest(voteshare~margin, data=lee08, kern="uniform", M=0.1, h=10, sclass="H") +r0 <- RDHonest(voteshare~margin, data=lee08, kern="triangular", M=0.1, h=8) +print(r0) ## ----------------------------------------------------------------------------- RDHonest(voteshare ~ margin, data=lee08, kern="triangular", - M=0.1, opt.criterion="MSE", sclass="H") + M=0.1, opt.criterion="MSE") ## Choose bws optimal for length of CI RDHonest(voteshare ~ margin, data=lee08, kern="triangular", M=0.1, - opt.criterion="FLCI", sclass="H") + opt.criterion="FLCI") ## ----------------------------------------------------------------------------- -## Replicate Table 2, column (10) +## Data-driven choice of M +RDHonest(voteshare ~ margin, data=lee08) + +## ----------------------------------------------------------------------------- +## Replicate Table 2, column (10) in Kolesar and Rothe (2018) RDHonest(log(earnings) ~ yearat14, cutoff=1947, data=cghs, kern="uniform", M=0.04, opt.criterion="FLCI", sclass="H") -## Triangular kernel generally gives tigher CIs -RDHonest(log(earnings) ~ yearat14, cutoff=1947, - data=cghs, kern="triangular", M=0.04, opt.criterion="FLCI", sclass="H") ## ----------------------------------------------------------------------------- ## Replicate Table 2, column (6), run local linear regression (order=1) -## with a uniform kernel (other kernels are not yet implemented) +## with a uniform kernel (other kernels are not implemented for RDHonestBME) RDHonestBME(log(earnings) ~ yearat14, cutoff=1947, data=cghs, h=3, order=1) ## ----------------------------------------------------------------------------- -## Data-driven choice of M -RDHonest(voteshare ~ margin, data=lee08, kern="uniform", sclass="H", - opt.criterion="MSE") +## Initial estimate of treatment effect for optimal bandwidth calculations +r <- RDHonest(log(cn) | retired ~ elig_year, data=rcp, kern="triangular", + M=c(0.001, 0.002), opt.criterion="MSE", sclass="H", T0=0) +## Use it to compute optimal bandwidth +RDHonest(log(cn) | retired ~ elig_year, data=rcp, kern="triangular", + M=c(0.001, 0.002), opt.criterion="MSE", sclass="H", + T0=r$coefficients$estimate) ## ----------------------------------------------------------------------------- -r1 <- RDHonest(voteshare ~ margin, data=lee08, kern="optimal", M=0.1, - opt.criterion="FLCI", - se.method="nn")$coefficients -r2 <- RDHonest(voteshare ~ margin, data=lee08, kern="triangular", M=0.1, - opt.criterion="FLCI", se.method="nn", - sclass="T")$coefficients -r1$conf.high-r1$conf.low -r2$conf.high-r2$conf.low +## Data-driven choice of M +RDHonest(log(cn) | retired ~ elig_year, data=rcp, kern="triangular", + opt.criterion="MSE", sclass="H", T0=r$coefficients$estimate) ## 
----------------------------------------------------------------------------- -r1 <- RDHonest(voteshare ~ margin, data=lee08, M=0.1, se.method="nn") -### Only use three point-average for averages of a 100 points closest to cutoff, -### and report results separately for points above and below cutoff -RDSmoothnessBound(r1, s=100, separate=TRUE, multiple=FALSE, sclass="T") +## No covariates +rn <- RDHonest(mortHS ~ povrate, data=headst) +## Use Percent attending school aged 14-17, urban, black, +## and their interaction as covariates. +rc <- RDHonest(mortHS ~ povrate | urban*black + sch1417, data=headst) +rn +rc -## Pool estimates based on observations below and above cutoff, and -## use three-point averages over the entire support of the running variable -RDSmoothnessBound(r1, s=100, separate=FALSE, multiple=TRUE, sclass="H") +## ----------------------------------------------------------------------------- +100 * (1 - (rc$coefficients$conf.high-rc$coefficients$conf.low) / + (rn$coefficients$conf.high-rn$coefficients$conf.low)) ## ----------------------------------------------------------------------------- -d <- cghs -## Make 20 groups based on observation number -d$mod <- seq_along(d$yearat14) %% 20 -## Make cells defined as intersection of group and year -d$cell <- d$mod/100+d$yearat14 -## Data with cell averages dd <- data.frame() -for (j in unique(d$cell)){ - dd <- rbind(dd, data.frame(y=mean(log(d$earnings)[d$cell==j]), - x=mean(d$yearat14[d$cell==j]), - weights=length(d$yearat14[d$cell==j]))) +## Collapse data by running variable +for (j in unique(cghs$yearat14)) { + ix <- cghs$yearat14==j + df <- data.frame(y=mean(log(cghs$earnings[ix])), x=j, + weights=sum(ix), + sigma2=var(log(cghs$earnings[ix]))/sum(ix)) + dd <- rbind(dd, df) } ## ----------------------------------------------------------------------------- -RDHonest(log(earnings)~yearat14, cutoff=1947, h=5, data=cghs, M=0.1, - se.method="nn") -RDHonest(y~x, cutoff=1947, weights=weights, h=5, data=dd, M=0.1, - se.method="nn") +s0 <- RDHonest(log(earnings)~yearat14, cutoff=1947, data=cghs) +## keep same bandwidth +s1 <- RDHonest(y~x, cutoff=1947, data=dd, weights=weights, + sigmaY2=sigma2, se.method="supplied.var", + h=s0$coefficients$bandwidth) +## Results are identical: +s0 +s1 ## ----------------------------------------------------------------------------- -## Assumes first column in the data frame corresponds to outcome, -## second to the treatment variable, and third to the running variable -## Outcome here is log of non-durables consumption -## dr <- FRDData(cbind(logf=log(rcp[, 6]), rcp[, c(3, 2)]), cutoff=0) +r0 <- RDHonest(log(cn)|retired~elig_year, data=rcp, h=7) +dd <- data.frame(x=sort(unique(rcp$elig_year)), y=NA, d=NA, weights=NA, + sig11=NA, sig12=NA, sig21=NA, sig22=NA) +for (j in seq_len(NROW(dd))) { + ix <- rcp$elig_year==dd$x[j] + Y <- cbind(log(rcp$cn[ix]), rcp$retired[ix]) + dd[j, -1] <- c(colMeans(Y), sum(ix), as.vector(var(Y))/sum(ix)) +} +r1 <- RDHonest(y|d~x, data=dd, weights=weights, + sigmaY2=sig11, T0=0, sigmaYD=sig21, + sigmaD2=sig22, h=7, + se.method="supplied.var") +## Outputs match up to numerical precision +max(abs(r0$coefficients[2:11]-r1$coefficients[2:11])) ## ----------------------------------------------------------------------------- -## Initial estimate of treatment effect for optimal bandwidth calculations -r <- RDHonest(log(cn) ~ retired | elig_year, data=rcp, kern="triangular", - M=c(0.001, 0.002), opt.criterion="MSE", sclass="H", T0=0) -## Use it to compute optimal bandwidth 
-RDHonest(log(cn) ~ retired | elig_year, data=rcp, kern="triangular", - M=c(0.001, 0.002), opt.criterion="MSE", sclass="H", - T0=r$coefficients$estimate) +## make fake clusters +set.seed(42) +clusterid <- sample(1:50, NROW(lee08), replace=TRUE) +sc <- RDHonest(voteshare~margin, data=lee08, se.method="EHW", + clusterid=clusterid, M=0.14, h=7) +## Since clusters are unrelated to outcomes, not clustering +## should yield similar standard errors +sn <- RDHonest(voteshare~margin, data=lee08, se.method="EHW", + M=0.14, h=7) +sc +sn ## ----------------------------------------------------------------------------- -## Data-driven choice of M -RDHonest(log(cn) ~ retired | elig_year, data=rcp, kern="triangular", - opt.criterion="MSE", sclass="H", T0=r$coefficients$estimate) +r1 <- RDHonest(voteshare ~ margin, data=lee08, M=0.1, se.method="nn") +### Only use three-point averages for the 100 points closest to the cutoff, +### and report results separately for points above and below cutoff +RDSmoothnessBound(r1, s=100, separate=TRUE, multiple=FALSE, sclass="T") + +## Pool estimates based on observations below and above cutoff, and +## use three-point averages over the entire support of the running variable +RDSmoothnessBound(r1, s=100, separate=FALSE, multiple=TRUE, sclass="H") + +## ----------------------------------------------------------------------------- +r1 <- RDHonest(voteshare ~ margin, data=lee08, kern="optimal", M=0.1, + opt.criterion="FLCI", + se.method="nn")$coefficients +r2 <- RDHonest(voteshare ~ margin, data=lee08, kern="triangular", M=0.1, + opt.criterion="FLCI", se.method="nn", + sclass="T")$coefficients +r1$conf.high-r1$conf.low +r2$conf.high-r2$conf.low ## ----------------------------------------------------------------------------- -## Transform data, specify we're interested in inference at x0=20, +## Specify we're interested in inference at x0=20, ## and drop observations below cutoff RDHonest(voteshare ~ margin, data=lee08, subset=margin>0, cutoff=20, kern="uniform", diff --git a/doc/RDHonest.Rmd b/doc/RDHonest.Rmd index 8242d56..6ec6ac4 100644 --- a/doc/RDHonest.Rmd +++ b/doc/RDHonest.Rmd @@ -3,21 +3,21 @@ output: pdf_document: latex_engine: pdflatex citation_package: natbib - template: mk_Rpackage_template.tex toc: true toc_depth: 2 includes: in_header: vignette_head.tex keep_tex: true + number_sections: true title: "Honest inference in Regression Discontinuity Designs" author: "Michal Kolesár" date: "`r format(Sys.time(), '%B %d, %Y')`" geometry: margin=1in fontfamily: mathpazo -fontsize: 10pt +fontsize: 11pt bibliography: np-testing-library.bib vignette: > - %\VignetteIndexEntry{Honest inference in Sharp Regression Discontinuity} + %\VignetteIndexEntry{RDHonest} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- @@ -31,53 +31,57 @@ knitr::opts_chunk$set(tidy = TRUE, collapse=TRUE, comment = "#>", # Introduction -The package `RDHonest` implements confidence intervals for the regression -discontinuity parameter considered in @ArKo16optimal, @ArKo16honest, and -@KoRo16. In this vignette, we demonstrate the implementation of these confidence -intervals using datasets from @lee08, @oreopoulos06, and @battistin09, which are -included in the package as a data frame `lee08`, `cghs`, and `rcp`. The datasets -from @lalive08 and @LuMi07 that are used in @ArKo16honest, and @KoRo16 are also -included in the package as data frames `rebp` and `headst`. 
+The package `RDHonest` implements bias-aware inference methods in regression +discontinuity (RD) designs developed in @ArKo18optimal, @ArKo20, and +@KoRo16. In this vignette, we demonstrate the implementation of these methods +using datasets from @lee08, @oreopoulos06, @battistin09, and @LuMi07, which +are included in the package as the data frames `lee08`, `cghs`, `rcp`, and `headst`. +The dataset from @lalive08 used in @KoRo16 is also included in the package as +the data frame `rebp`. # Sharp RD ## Model -In the sharp regression discontinuity model, we observe units $i=1,\dotsc,n$, -with the outcome $y_i$ for the $i$th unit given by $$ y_i = f(x_i) + u_i, $$ -where $f(x_i)$ is the expectation of $y_i$ conditional on the running variable -$x_i$ and $u_i$ is the regression error. A unit is treated if and only if the -running variable $x_{i}$ lies weakly above a known cutoff $x_{i}\geq c_{0}$. The -parameter of interest is given by the jump of $f$ at the cutoff, $$ -\beta=\lim_{x\downarrow c_{0}}f(x)-\lim_{x\uparrow c_{0}}f(x).$$ Let -$\sigma^2(x_i)$ denote the conditional variance of $u_i$. - -In the @lee08 dataset, the running variable corresponds to the margin of victory of +We observe units $i=1,\dotsc, n$, with the outcome $Y_i$ for the $i$th unit given +by +\begin{equation} +Y_i = f_Y(x_i) + u_{Y, i} +\end{equation} +where $f_Y(x_i)$ is the expectation of $Y_i$ +conditional on the running variable $x_i$ and $u_{Y, i}$ is the regression error that +is conditionally mean zero by definition. + +A unit is assigned to treatment if and only if the running variable $x_{i}$ lies +weakly above a known cutoff. We denote the assignment indicator by +$Z_{i}=I\{x_{i}\geq c_{0}\}$. In a sharp RD design, all units comply with the +assigned treatment, so that the observed treatment coincides with treatment +assignment, $D_{i}=Z_{i}$. The parameter of interest is given by the jump of $f_Y$ +at the cutoff, $$ \tau_Y=\lim_{x\downarrow c_{0}}f_Y(x)-\lim_{x\uparrow +c_{0}}f_Y(x).$$ Under mild continuity conditions, $\tau_{Y}$ can +be interpreted as the effect of the treatment for units at the threshold +(@htv01). Let $\sigma_Y^2(x_i)$ denote the conditional variance of $Y_i$. + +In the @lee08 dataset `lee08`, the running variable corresponds to the margin of victory of a Democratic candidate in a US House election, and the treatment corresponds to winning the election. Therefore, the cutoff is zero. The outcome of interest is the Democratic vote share in the following election. -The Oreopoulos dataset consists of a subsample of British workers, and it -exploits a change in minimum school leaving age in the UK from 14 to 15, which -occurred in 1947. The running variable is the year in which the individual turned -14, with the cutoff equal to 1947 so that the "treatment" is being subject to a -higher minimum school-leaving age. The outcome is log earnings in 1998. - -Some of the functions in the package require the data to be transformed into a -custom `RDData` format. This can be accomplished with the `RDData` function: - -```{r} -library("RDHonest") -``` +The `cghs` dataset from @oreopoulos06 consists of a subsample of British +workers. The RD design exploits a change in minimum school-leaving age in the UK +from 14 to 15, which occurred in 1947. The running variable is the year in which +the individual turned 14, with the cutoff equal to 1947 so that the "treatment" +is being subject to a higher minimum school-leaving age. The outcome is log +earnings in 1998. 
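A quick look at the two data frames confirms the variables used throughout this vignette (a base-R peek; only the columns referenced in the examples below are annotated):

```{r}
## Outcome and running variable in the Lee (2008) and Oreopoulos (2006) data
head(lee08)  # voteshare (outcome), margin (running variable, cutoff 0)
head(cghs)   # earnings (outcome), yearat14 (running variable, cutoff 1947)
```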
## Plots The package provides a function `RDScatter` to plot the raw data. To remove -some noise, the function plots averages over `avg` number of observations. The -function takes an `RDData` object as an argument +some noise, the function plots averages over `avg` number of observations. ```{r, fig.width=4.5, fig.height=3.5, fig.cap="Lee (2008) data"} -## plot 25-bin averages in for observations 50 at most points +library("RDHonest") +## plot 50-bin averages for observations at most 50 points ## away from the cutoff. See Figure 1. RDScatter(voteshare~margin, data=lee08, subset=abs(lee08$margin)<=50, avg=50, propdotsize=FALSE, xlab="Margin of victory", @@ -86,13 +90,13 @@ RDScatter(voteshare~margin, data=lee08, subset=abs(lee08$margin)<=50, The running variable in the Oreopoulos dataset is discrete. It is therefore natural to plot the average outcome by each value of the running variable, which -is achieved using by setting `avg=Inf`. The option `dotsize="count"` makes the +is achieved by setting `avg=Inf`. The option `propdotsize=TRUE` makes the size of the points proportional to the number of observations that the point averages over. ```{r, fig.width=4.5, fig.height=3.5, fig.cap="Oreopoulos (2006) data"} ## see Figure 2 -f2 <- RDScatter(I(log(earnings))~yearat14, data=cghs, cutoff=1947, +f2 <- RDScatter(log(earnings)~yearat14, data=cghs, cutoff=1947, avg=Inf, xlab="Year aged 14", ylab="Log earnings", propdotsize=TRUE) ## Adjust size of dots if they are too big @@ -102,53 +106,110 @@ f2 + ggplot2::scale_size_area(max_size = 4) ## Inference based on local polynomial estimates The function `RDHonest` constructs one- and two-sided confidence intervals (CIs) -around local linear and local quadratic estimators using either a user-supplied -bandwidth (which is allowed to differ on either side of the cutoff), or +around local linear estimators using either a user-supplied bandwidth, or bandwidth that is optimized for a given performance criterion. The sense of honesty is that, if the regression errors are normally distributed with known variance, the CIs are guaranteed to achieve correct coverage _in finite samples_, and achieve correct coverage asymptotically uniformly over the parameter space otherwise. Furthermore, because the CIs explicitly take into -account the possible bias of the estimators, the asymptotic approximation -doesn't rely on the bandwidth to shrink to zero at a particular rate. - -To describe the form of the CIs, let $\hat{\beta}_{h_{+},h_{-}}$ denote a a -local polynomial estimator with bandwidth equal to $h_{+}$ above the cutoff and -equal to $h_{-}$ below the cutoff. Let $\beta_{h_{+},h_{-}}(f)$ denote its -expectation conditional on the covariates when the regression function equals -$f$. Then the bias of the estimator is given by $\beta_{h_{+},h_{-}}(f)-\beta$. -Let $$ -B(\hat{\beta}_{h_{+},h_{-}})=\sup_{f\in\mathcal{F}}|\beta_{h_{+},h_{-}}(f)-\beta| -$$ denote the worst-case bias over the parameter space $\mathcal{F}$. Then the -lower limit of a one-sided CI is given by $$\hat{\beta}_{h_{+},h_{-}}- -B(\hat{\beta}_{h_{+},h_{-}})-z_{1-\alpha}\widehat{se}(\hat{\beta}_{h_{+},h_{-}}), -$$ where $z_{1-\alpha}$ is the $1-\alpha$ quantile of a standard normal -distribution, and $\widehat{se}(\hat{\beta}_{h_{+},h_{-}})$ is the standard -error (an estimate of the standard deviation of the estimator). Subtracting the -worst-case bias in addition to the usual critical value times standard error -ensures correct coverage at all points in the parameter space. 
- -A two-sided CI is given by $$ \hat{\beta}_{h_{+},h_{-}} \pm -cv_{1-\alpha}(B(\hat{\beta}_{h_{+},h_{-}})/\widehat{se}(\hat{\beta}_{h_{+},h_{-}}))\times -\widehat{se}(\hat{\beta}_{h_{+},h_{-}}),$$ where the critical value function -$cv_{1-\alpha}(b)$ corresponds to the $1-\alpha$ quantile of the $|N(b,1)|$ -distribution. To see why using this critical value ensures honesty, decompose -the $t$-statistic as $$ -\frac{\hat{\beta}_{h_{+},h_{-}}-\beta}{\widehat{se}(\hat{\beta}_{h_{+},h_{-}})} -= -\frac{\hat{\beta}_{h_{+},h_{-}}-\beta_{h_{+},h_{-}}(f)}{\widehat{se}(\hat{\beta}_{h_{+},h_{-}})} -+\frac{\beta_{h_{+},h_{-}}(f)-\beta}{\widehat{se}(\hat{\beta}_{h_{+},h_{-}})} $$ -By a central limit theorem, the first term on the right-hand side will by -distributed standard normal, irrespective of the bias. The second term is -bounded in absolute value by -$B(\hat{\beta}_{h_{+},h_{-}})/\widehat{se}(\hat{\beta}_{h_{+},h_{-}})$, so that, -in large samples, the $1-\alpha$ quantile of the absolute value of the -$t$-statistic will be bounded by -$cv_{1-\alpha}(B(\hat{\beta}_{h_{+},h_{-}})/\widehat{se}(\hat{\beta}_{h_{+},h_{-}}))$. -This approach gives tighter CIs than simply adding and subtracting -$B(\hat{\beta}_{h_+,h_-})$ from the point estimate, in addition to adding and subtracting $z_{1-\alpha}\widehat{se}(\hat{\beta}_{h_+,h_-})$ - -The function `CVb` gives these critical values: +account the possible bias of the estimators, the asymptotic approximation does +not rely on the bandwidth shrinking to zero at a particular rate---in fact, +the CIs are valid even if the bandwidth is fixed as $n\to\infty$. + +We first estimate $\tau_{Y}$ using local linear regression: instead of using all +available observations, we only use observations within a narrow estimation +window around the cutoff, determined by a bandwidth $h$. Within this estimation +window, we may choose to give more weight to observations closer to the +cutoff---this weighting is determined by a kernel $K$. The local linear regression +is just a weighted least squares (WLS) regression of $Y_{i}$ onto the treatment +indicator, the running variable, and their interaction, weighting each +observation using kernel weights $K(x_{i}/h)$. The local linear regression +estimator $\hat{\tau}_{Y, h}$ of $\tau_{Y}$ is given by the first element of the +vector of regression coefficients from this regression, +$$\hat{\beta}_{Y, h}= + \left(\sum_{i=1}^{n}K(x_{i}/h)m(x_{i})m(x_{i})'\right)^{-1} + \sum_{i=1}^{n}K(x_{i}/h)m(x_{i})Y_{i},$$ +where $m(x)=(I\{x\geq 0\}, I\{x\geq 0\}x, 1, x)'$ collects all the regressors, and we normalize the cutoff to zero. +When the kernel is uniform, $K(u)=I\{\abs{u}\leq 1\}$, $\hat{\tau}_{Y, h}$ is +simply the treatment coefficient from an OLS regression of $Y_{i}$ onto +$m(x_{i})$ for observations that are within distance $h$ of the cutoff. Other +kernel choices may weight observations within the estimation window $[-h, h]$ +differently, giving more weight to observations that are relatively closer to +the cutoff. 
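To make this concrete, here is a minimal sketch (not the package's internal code) that reproduces the local linear point estimate for the `lee08` data by running the weighted least squares regression directly; the bandwidth $h=8$ and the triangular kernel are arbitrary illustration choices that match the `RDHonest` call shown further below:

```{r}
## Local linear estimate as a weighted least squares regression (sketch)
h <- 8
d <- subset(lee08, abs(margin) <= h)
d$treat <- as.numeric(d$margin >= 0)   # treatment indicator I{x >= 0}
w <- pmax(1 - abs(d$margin)/h, 0)      # triangular kernel weights K(x/h)
wls <- lm(voteshare ~ treat*margin, data=d, weights=w)
## The coefficient on the treatment indicator is the estimated jump at the
## cutoff; it should agree with the RDHonest point estimate for the same
## bandwidth and kernel.
coef(wls)["treat"]
```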
+ +Equivalently, using the Frisch–Waugh–Lovell theorem, $\hat{\tau}_{Y, h}$ may +also be computed by first running an auxiliary WLS regression of the +treatment indicator $D_{i}$ onto the remaining regressors, +$(I\{x_{i}\geq 0\}x_{i}, 1, x_{i})$ and then running a WLS regression of the +outcome $Y_{i}$ onto the residuals $\tilde{D}_{i}$ from this auxiliary +regression, +$$\hat{\tau}_{Y, h}=\sum_{i=1}^{n}k_{i, h}Y_{i}, \quad k_{i, h} + =\frac{K(x_{i}/h)\tilde{D}_{i}}{\sum_{i=1}^{n}K(x_{i}/h)\tilde{D}_{i}^{2}}.$$ +This representation makes it clear that the estimator is simply a weighted average +of the outcomes. By definition of the residual $\tilde{D}_{i}$, the weights sum +to zero, and satisfy $\sum_{i}k_{i, h}x_{i}=\sum_{i}k_{i, h}x_{i}I\{x_{i}\geq 0\}=0$: +this ensures that our estimate of the jump at $0$ is unbiased when the +regression function is piecewise linear inside the estimation window. + +### Bias-aware confidence intervals + +The estimator $\hat{\tau}_{Y, h}$ is a regression estimator, so it will be +asymptotically normal under mild regularity conditions. In particular, if the +residuals $u_{Y, i}$ are well-behaved, a sufficient condition is that none of the +weights $k_{i, h}$ are too influential in the sense that the maximal leverage +goes to zero, as we discuss in the diagnostics section. + +Due to the asymptotic normality, the simplest approach to inference is to use +the usual CI, +$\hat{\tau}_{Y, h}\pm z_{1-\alpha/2}\hatse(\hat{\tau}_{Y, h})$, where +$z_{\alpha}$ is the $\alpha$ quantile of the standard normal distribution. +However, this CI will typically undercover relative to its nominal +confidence level $1-\alpha$ because it's not correctly centered: unless the +regression function $f_{Y}$ is exactly linear inside the estimation window, the +estimator $\hat{\tau}_{Y, h}$ will be biased. If $f_{Y}$ is "close" to linear, +then this bias will be small, but if it is wiggly, the bias may be substantial, +leading to severe coverage distortions. + +The idea behind bias-aware inference methods is to bound the potential bias of +the estimator by making an explicit assumption on the smoothness of $f_{Y}$. A +convenient way of doing this is to bound the curvature of $f_{Y}$ by +restricting its second derivative. To allow $f_{Y}$ to be discontinuous at zero, +we assume that it's twice differentiable on either side of the cutoff, with a +second derivative bounded by a known constant $M$. The choice of the exact +curvature parameter $M$ is key to implementing bias-aware methods, and we +discuss it below. Once $M$ is selected, we can work out the potential +finite-sample bias of the estimator and account for it in the CI construction. +In particular, it turns out that the absolute value of the bias of +$\hat{\tau}_{Y, h}$ is maximized at the piecewise quadratic function $x\mapsto +M x^{2}(I\{x< 0\}-I\{x\geq 0\})/2$, so that $$\abs{E[\hat{\tau}_{Y, +h}-\tau_{Y}]}\leq B_{M, h}:=-\frac{M}{2}\sum_{i=1}^{n} k_{i, +h}x_{i}^{2}\operatorname{sign}(x_{i}).$$ Hence, a simple way to ensure that we +achieve correct coverage regardless of the true shape of the regression function +$f_{Y}$ (so long as $\abs{f_{Y}''(x)}\leq M$) is to simply enlarge the usual CI +by this bias bound, leading to the CI $\hat{\tau}_{Y, h}\pm (B_{M, +h}+z_{1-\alpha/2}\hatse(\hat{\tau}_{Y, h}))$. We can actually do slightly better +than this, since the bias bound can't simultaneously be binding on both +endpoints of the CI. 
In particular, observe that in large samples, the +$t$-statistic $(\hat{\tau}_{Y, h}-\tau_{Y})/\hatse(\hat{\tau}_{Y, h})$ is +normally distributed with variance one, and mean that is bounded by $t=B_{M, +h}/\hatse(\hat{\tau}_{Y, h})$ (ignoring sampling variability in the standard +error, which is negligible in large samples). To ensure correct coverage, we +therefore replace the usual critical value $z_{1-\alpha/2}$, with the $1-\alpha$ +quantile of the folded normal distribution $\abs{\mathcal{N}(t,1)}$, +$\cv_{\alpha}(t)$ (note $\cv_{\alpha}(0)=z_{1-\alpha/2}$). This leads to the +bias-aware CI +\begin{equation} +\hat{\tau}_{Y, h} \pm +\cv_{\alpha}(B_{M, h}/\hatse(\hat{\tau}_{Y, h})) \hatse(\hat{\tau}_{Y, h}) +\end{equation} +Notice the bias bound $B_{M, h}$ accounts for the exact *finite-sample bias* of the estimator. + The only asymptotic approximation we have used in its +construction is the approximate normality of the estimator $\hat{\tau}_{Y, h}$, +which obtains without any restrictions on $f_{Y}$---we only need the maximal +leverage to be close to zero, mirroring a standard leverage condition from +parametric regression settings. + +The function `CVb` gives the critical values $\cv_{\alpha}(t)$: ```{r, } CVb(0, alpha=0.05) ## Usual critical value CVb(1/2, alpha=0.05) CVb(0:5, alpha=0.1) ``` -### Parameter space - -To implement the honest CIs, one needs to specify the parameter space -$\mathcal{F}$. The function `RDHonest` computes honest CIs when the parameter -space $\mathcal{F}$ corresponds to a second-order Taylor or second-order Hölder -smoothness class, which capture two different types of smoothness restrictions. -The second-order Taylor class assumes that $f$ lies in the the class of -functions $$\mathcal{F}_{\text{Taylor}}(M)= \left \{f_{+}-f_{-}\colon -f_{+}\in\mathcal{F}_{T}(M; [c_{0} -%] %( -,\infty)),\; -f_{-}\in\mathcal{F}_{T}(M;(-\infty, c_{0})) \right\},$$ - -where $\mathcal{F}_{T}(M;\mathcal{X})$ consists of functions $f$ such that the -approximation error from second-order Taylor expansion of $f(x)$ about $c_{0}$ -is bounded by $M|x|^{2}/2$, uniformly over $\mathcal{X}$: \begin{align*} -\mathcal{F}_{T}(M;\mathcal{X}) =\left\{f\colon \left| -f(x)-f(c_0)-f'(c_0)x\right| \leq M|x|^2/2\text{ all }x\in\mathcal{X}\right\}. -\end{align*} The class $\mathcal{F}_{T}(M;\mathcal{X})$ formalizes the idea that -the second derivative of $f$ at zero should be bounded by $M$. See Section 2 in -@ArKo16optimal (note the constant $C$ in that paper equals $C=M/2$ here). This -class is doesn't impose smoothness away from boundary, which may be undesirable -in some empirical applications. The Hölder class addresses this problem by -bounding the second derivative globally. In particular, it assumes that $f$ lies -in the class of functions $$\mathcal{F}_{\text{Hölder}}(M)= \left -\{f_{+}-f_{-}\colon f_{+}\in\mathcal{F}_{H}(M;[c_{0}%] %( ,\infty)),\; -f_{-}\in\mathcal{F}_{H}(M;(-\infty, c_{0})) \right\},$$ - -where $$ \mathcal{F}_{H}(M;\mathcal{X})=\{f\colon |f'(x)-f'(y)|\leq M|x-y| -\;\;x,y\in\mathcal{X}\}.$$ - -The smoothness class is specified using the option `sclass`. CIs around a local -linear estimator with bandwidth that equals to 10 on either side of the cutoff -when the parameter space is given by a Taylor and Hölder smoothness class, -respectively, with $M=0.1$: +The function `RDHonest` puts all these steps together. 
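Before turning to the package call, the pieces above can also be assembled by hand. The sketch below (an illustration, not the package's internal code) computes the weights $k_{i, h}$, the worst-case bias $B_{M, h}$, a simple plug-in standard error, and the resulting bias-aware CI for the `lee08` data with $M=0.1$, $h=8$, and a triangular kernel; it uses an EHW-type variance estimate, so the standard error will differ slightly from the package's nearest-neighbor default:

```{r}
## Bias-aware CI assembled by hand (sketch): M=0.1, h=8, triangular kernel
h <- 8; M <- 0.1; alpha <- 0.05
d <- subset(lee08, abs(margin) <= h)
K <- pmax(1 - abs(d$margin)/h, 0)                 # kernel weights K(x/h)
d$treat <- as.numeric(d$margin >= 0)
## Weights k_i: residualize the treatment indicator (Frisch-Waugh-Lovell)
Dt <- residuals(lm(treat ~ margin + I(treat*margin), data=d, weights=K))
k <- K*Dt / sum(K*Dt^2)
est <- sum(k*d$voteshare)                         # point estimate
B <- abs(M/2 * sum(k*d$margin^2*sign(d$margin)))  # worst-case bias bound B_{M,h}
u <- residuals(lm(voteshare ~ treat*margin, data=d, weights=K))
se <- sqrt(sum(k^2*u^2))                          # plug-in (EHW-type) std. error
## 1-alpha quantile of |N(B/se, 1)|, i.e. the critical value cv_alpha(B/se)
cv <- sqrt(qchisq(1 - alpha, df=1, ncp=(B/se)^2))
c(estimate=est, conf.low=est - cv*se, conf.high=est + cv*se)
```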
Specifying curvature +parameter $M=0.1$, bandwidth $h=8$, and a triangular kernel yields: ```{r} -RDHonest(voteshare~margin, data=lee08, kern="uniform", M=0.1, h=10, sclass="T") -RDHonest(voteshare~margin, data=lee08, kern="uniform", M=0.1, h=10, sclass="H") +r0 <- RDHonest(voteshare~margin, data=lee08, kern="triangular", M=0.1, h=8) +print(r0) ``` -The confidence intervals use the nearest-neighbor method to estimate the -standard error by default (this can be changed using the option `se.method`, see -help file for `RDHonest`). The package reports two-sided as well one-sided CIs -(with lower as well as upper limit) by default. - -Instead of specifying a bandwidth, one can just specify the smoothness class and -smoothness constant $M$, and the bandwidth will be chosen optimally for a given -optimality criterion: +- The default for calculating the standard errors is to use the nearest neighbor + method. Specifying `se.method="EHW"` changes them to the regression-based + heteroskedasticity-robust Eicker-Huber-White standard errors. It can be shown + that unless the regression function $f_{Y}$ is linear inside the estimation + window, the `EHW` standard errors generally overestimate the conditional + variance. +- The default option `sclass="H"` specifies the parameter space as a second-order + Hölder smoothness class, which formalizes our assumption above that the second + derivative of $f_Y$ is bounded by $M$ on either side of the cutoff. The + package also allows the user to use a Taylor smoothness class by setting + `sclass="T"`. This changes the computation of the worst-case bias, and allows + $f_Y$ to correspond to any function such that the approximation error from a + second-order Taylor expansion around the cutoff is bounded by $M x^{2}/2$. For + more discussion, see Section 2 in @ArKo18optimal (note the constant $C$ in + that paper equals $C=M/2$ here). +- Other options for `kern` are `"uniform"` and `"epanechnikov"`, or the user can + also supply their own kernel function. +- `RDHonest` reports two-sided as well as one-sided CIs. One-sided CIs simply + subtract off the worst-case bias bound $B_{M, h}$ in addition to subtracting + the standard error times the $z_{1-\alpha}$ critical value from the estimate. + It also reports the p-value for the hypothesis that $\tau_Y=0$. +- `RDHonest` also reports the fitted regression coefficients $\hat{\beta}_{Y, + h}$, and returns the `lm` object under `r0$lm`. We see from the above that the + fitted slopes below and above the cutoff differ by + `r round(unname(r0$lm$coefficients[2]), 2)`, for instance. + +## Automatic bandwidth choice + +Instead of specifying a bandwidth, one can just specify the curvature parameter +$M$, and the bandwidth will be chosen optimally for a given optimality +criterion---minimizing the worst-case MSE of the estimator, or minimizing the +length of the resulting confidence interval. Typically, this makes little difference: ```{r} RDHonest(voteshare ~ margin, data=lee08, kern="triangular", - M=0.1, opt.criterion="MSE", sclass="H") + M=0.1, opt.criterion="MSE") ## Choose bws optimal for length of CI RDHonest(voteshare ~ margin, data=lee08, kern="triangular", M=0.1, - opt.criterion="FLCI", sclass="H") + opt.criterion="FLCI") ``` -### Inference when running variable is discrete +To compute the optimal bandwidth, the package assumes homoskedastic variance on +either side of the cutoff, which it estimates based on a preliminary local +linear regression using the @ImKa12 bandwidth selector. 
This homoskedasticity +assumption is dropped when the final standard errors are computed. + +Notice that, when the variance function $\sigma^2_Y(x)$ is known, neither the +conditional variance of the estimator, $\sd(\hat{\tau}_{Y, +h})^{2}=\sum_{i=1}^{n}k_{i, h}^{2}\sigma^{2}_{Y}(x_{i})$, nor the bias bound +$B_{M, h}$ depend on the outcome data. Therefore, the MSE and the length of the +infeasible CI, $2\cv_{\alpha}(B_{M, h}/\sd(\hat{\tau}_{Y, h})) +\sd(\hat{\tau}_{Y, h})$, do not depend on the outcome data. To stress this +property, we refer to this infeasible CI (and, with some abuse of terminology, +also the feasible version in eq. (2)) as a fixed-length confidence interval +(FLCI). As a consequence of this property, optimizing the bandwidth for CI +length does not impact the coverage of the resulting CI. + +## Choice of curvature parameter + +The curvature parameter $M$ is the most important implementation choice. It +would be convenient if one could use data-driven methods to automate its +selection. Unfortunately, if one only assumes that the second derivative of +$f_{Y}$ is bounded by some constant $M$, it is not possible to do that: one +cannot use data to select $M$ without distorting coverage +(@low97, @ArKo18optimal). This result is essentially an instance of the general +issue with pre-testing or model selection rules, such as using +cross-validation or information criteria like AIC or BIC to pick which controls +to include in a regression: doing so leads to distorted confidence intervals. +Here the curvature parameter $M$ indexes the size of the model: a large $M$ is +the analog of saying that all available covariates need to be included in the +model to purge omitted variables bias; a small $M$ is the analog of saying that +a small subset of them will do. Just like one needs to use institutional +knowledge of the problem at hand to decide which covariates to include in a +regression, ideally one uses problem-specific knowledge to select $M$. Analogous +to reporting results based on different subsets of controls in columns of a +table with regression results, one can vary the choice of $M$ by way of +sensitivity analysis. + +Depending on the problem at hand, it may be difficult to translate +problem-specific intuition about how close we think the regression function is +to a linear function into a statement about the curvature parameter $M$. In such +cases, it is convenient to have a rule of thumb for selecting $M$ using the +data. To do this, we need to impose additional restrictions on $f_{Y}$ besides +assuming that its second derivative is bounded in order to get around the +results on impossibility of post-model selection inference discussed above. An +appealing way of doing this is to relate the assumption about the local +smoothness of $f_{Y}$ at the cutoff point, which drives the bias of the +estimator but is difficult to measure in the data, to the global smoothness of +$f_{Y}$, which is much easier to measure. In our implementation, we measure +global smoothness by fitting a global quartic polynomial $\check{f}$, separately +on either side of the cutoff, following @ArKo20. We assume the local second +derivative of $f_{Y}$, $M$, is no larger than the maximum second derivative of +the global polynomial approximation $\check{f}$. 
Under this assumption, we can +calibrate $M$ by setting +$$\hat{M}_{ROT}=\sup_{x\in[x_{\min}, x_{\max}]}\abs{\check{f}''(x)}.$$ +There are +different ways of relating local and global smoothness, which lead to different +calibrations of $M$. For instance, @ImWa19 propose fitting a global quadratic +polynomial instead, and then multiplying the maximal curvature of the fitted +model by a constant such as $2$ or $4$. An important question for future +research is to figure out whether there is a way of relating local and global +smoothness that is empirically appealing across a wide range of scenarios. See +also @NoRo21 for a discussion of how to visualize the choice of $M$ to aid with its +interpretation. + +When the user doesn't supply `M`, the package uses the rule of thumb +$\hat{M}_{ROT}$, and prints a message to inform the user: +```{r} +## Data-driven choice of M +RDHonest(voteshare ~ margin, data=lee08) +``` + + + + + +## Inference when running variable is discrete The confidence intervals described above can also be used when the running variable is discrete, with $G$ support points: their construction makes no assumptions on the nature of the running variable (see Section 5.1 in @KoRo16 for more detailed discussion). -Note that units that lies exactly at the cutoff are considered treated, since -the definition of treatment is that the running variable - $x_i\geq c_0$. +Units that lie exactly at the cutoff are considered treated, since the +definition of treatment assignment is that the running variable lies weakly +above the cutoff, $x_i\geq c_0$. As an example, consider the @oreopoulos06 data, in which the running variable is age in years: ```{r} -## Replicate Table 2, column (10) +## Replicate Table 2, column (10) in Kolesar and Rothe (2018) RDHonest(log(earnings) ~ yearat14, cutoff=1947, data=cghs, kern="uniform", M=0.04, opt.criterion="FLCI", sclass="H") -## Triangular kernel generally gives tigher CIs -RDHonest(log(earnings) ~ yearat14, cutoff=1947, - data=cghs, kern="triangular", M=0.04, opt.criterion="FLCI", sclass="H") ``` In addition, the package provides function `RDHonestBME` that calculates honest @@ -243,14 +367,14 @@ no worse at the cutoff than away from the cutoff as in Section 5.2 in @KoRo16. ```{r} ## Replicate Table 2, column (6), run local linear regression (order=1) -## with a uniform kernel (other kernels are not yet implemented) +## with a uniform kernel (other kernels are not implemented for RDHonestBME) RDHonestBME(log(earnings) ~ yearat14, cutoff=1947, data=cghs, h=3, order=1) ``` Let us describe the implementation of the variance estimator $\hat{V}(W)$ used -to construct the CI as described in in Section 5.2 in @KoRo16. Suppose the point +to construct the CI following Section 5.2 in @KoRo16. Suppose the point estimate is given by the first element of the regression of the outcome $y_i$ on $m(x_i)$. For instance, local linear regression with uniform kernel and bandwidth $h$ corresponds to $m(x)=I(|x|\leq h)\cdot(I(x>c_0),1,x, x\cdot @@ -267,7 +391,7 @@ $x_g$ is the $g$th support point of $x_i$, is given by the $g$th element of the regression estimand $S^{-1}E[w(x_i)y_i]$, where $S=E[w(x_i)w(x_i)']$. Let $\hat{\mu}=\hat{S}^{-1}\sum_i w(x_i)y_i$, where $\hat{S}=\sum_i w(x_i)w(x_i)'$ denote the least squares estimator. Then an estimate of -$(\delta(x_1),\dotsc,\delta(x_G))'$ is given by $\hat{\delta}$, the vector with +$(\delta(x_1), \dotsc, \delta(x_G))'$ is given by $\hat{\delta}$, the vector with elements $\hat{\mu}_g-x_g\hat{\theta}$. 
By standard regression results, the asymptotic distribution of $\hat{\theta}$ @@ -309,7 +433,7 @@ T_i'=\begin{pmatrix} Note that the upper left block and lower right block correspond simply to the Eicker-Huber-White estimators of the asymptotic variance of $\hat{\theta}$ and $\hat{\mu}$. By the delta method, a consistent estimator of the asymptotic -variance of $(\hat\delta,\hat{\theta}_1)$ is given by +variance of $(\hat\delta, \hat{\theta}_1)$ is given by \begin{equation*} \hat{\Sigma}= \begin{pmatrix} @@ -323,222 +447,458 @@ e_1'& 0\\ where $X$ is a matrix with $g$th row equal to $x_g'$, and $e_1$ is the first unit vector. -Recall that in the notation of @KoRo16, $W=(g^-,g^+,s^-,s^+)$, and $g^{+}$ and +Recall that in the notation of @KoRo16, $W=(g^-, g^+, s^-, s^+)$, and $g^{+}$ and $g^{-}$ are such that $x_{g^{-}}< c_0\leq x_{g^{+}}$, and -$s^{+},s^{-}\in\{-1,1\}$. An upper limit for a right-sided CI for +$s^{+}, s^{-}\in\{-1,1\}$. An upper limit for a right-sided CI for $\theta_1+b(W)$ is then given by \begin{equation*} \hat{\theta}_{1}+s^{+}\hat\delta(x_{g^+})+ s^{-}\hat\delta(x_{g^-})+z_{1-\alpha}\hat{V}(W), \end{equation*} - where $\hat{V}(W)=a(W)'\hat{\Sigma}a(W)$, and $a(W)\in\mathbb{R}^{G_{h}+1}$ denotes a vector with the $g_{-}$th element equal to $s^{-}$, $(G_{h}^{-}+g_{+})$th element equal to $s^{+}$, the last element equal to one, and the remaining elements equal to zero. The rest of the construction then follows the description in Section 5.2 in @KoRo16. -## Data-driven choice of smoothness constant -Without further restrictions, the smoothness constant $M$ cannot be data-driven: -to maintain honesty over the whole function class, a researcher must choose -$M$ a priori, rather than attempting to use a data-driven method. Therefore, -one should, whenever possible, use problem-specific knowledge to decide what -choice of $M$ is reasonable a priori. +# Fuzzy RD + +## Model + +In a fuzzy RD design, the treatment status $D_i$ of a unit does not necessarily +equal the treatment assignment $Z_i=I\{x_{i}\geq c_{0}\}$. Instead, the +treatment assignment induces a jump in the treatment probability at the cutoff. +Correspondingly, we augment the outcome model with a first stage that measures +the effect of the running variable on the treatment: +\begin{equation} +Y_i = f_Y(x_i) + u_{Y, i}, \qquad D_i = f_D(x_i) + u_{D, i}, +\end{equation} +where $f_D, f_Y$ are the conditional mean functions. + +To account for imperfect compliance, the fuzzy RD parameter scales the jump in the +outcome equation $\tau_Y$ by the jump in the treatment probability at the +cutoff, $\tau_D=\lim_{x\downarrow c_{0}}f_D(x)-\lim_{x\uparrow c_{0}}f_D(x)$. +This fuzzy RD parameter, $\theta=\tau_{Y}/\tau_{D}$, measures the local average +treatment effect for individuals at the threshold who comply with the +treatment assignment, provided mild continuity conditions and a monotonicity +condition hold (@htv01). Under perfect compliance, the treatment +probability jumps all the way from zero to one at the threshold so that +$\tau_{D}=1$, and the two parameters coincide. + +For example, in the @battistin09 dataset, the treatment variable is an indicator +for retirement, and the running variable is the number of years since being eligible +to retire. The cutoff is $0$. Individuals exactly at the cutoff are dropped from +the dataset. If there were individuals exactly at the cutoff, they would be assumed +to receive the treatment assignment (i.e. to be eligible for retirement). 
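As a preview of this ratio structure (estimation details follow in the next subsection), the fuzzy estimate can be pieced together from two sharp RD fits that share the same bandwidth and kernel. The following is an illustrative sketch using the `rcp` data; the bandwidth $h=7$ is an arbitrary choice here:

```{r}
## Sketch: fuzzy RD estimate as a ratio of two sharp RD estimates (same h, kernel)
rf <- RDHonest(log(cn) ~ elig_year, data=rcp, h=7)      # reduced form (outcome)
fs <- RDHonest(retired ~ elig_year, data=rcp, h=7)      # first stage (treatment)
fz <- RDHonest(log(cn) | retired ~ elig_year, data=rcp, h=7)
## The ratio of the two jumps should agree with the fuzzy estimate
c(ratio=rf$coefficients$estimate / fs$coefficients$estimate,
  fuzzy=fz$coefficients$estimate)
```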
+ +## Inference based on local polynomial estimates + +A natural estimator for the fuzzy RD parameter $\theta$ is the sample analog +based on local linear regression, +\begin{equation*} + \hat{\theta}_{h}=\frac{\hat{\tau}_{Y, h}}{\hat{\tau}_{D, h}}. +\end{equation*} -Unlike in the sharp case, our bias-aware CIs do rely on the consistency of the +estimator, which generally requires the bandwidth to shrink with the sample +size. Since this estimator is a ratio of regression coefficients, it follows by +the delta method that so long as $\tau_{D}\neq 0$, the estimator will be +asymptotically normal in large samples. In fact, the estimator is equivalent to +a weighted IV regression of $Y_i$ onto $D_i$, using $Z_i$ as an instrument, and +$x_i$ and its interaction with $Z_i$ as controls, so the variance formula is +analogous to the IV variance formula: +\begin{equation*} +\sd(\hat{\theta}_h)^2=\frac{\sd(\tau_{Y, h})^2+\theta^2\sd(\tau_{D, h})^2 +-2\cov(\tau_{D, h}, \tau_{Y, h})\theta}{\tau_D^2}, +\end{equation*} +where $\cov(\tau_{D, h}, \tau_{Y, h})=\sum_i +k_{i, h}^2\cov(Y_i, D_i\mid x_i)$ is the covariance of the +estimators. + +If the second derivative of $f_Y$ is bounded by $M_Y$ and the second derivative +of $f_D$ is bounded by $M_D$, a linearization argument from Section 3.2.3 in +@ArKo20 shows that the bias can be bounded in large samples by $B_{M, h}$, with +$M=(M_{Y}+\abs{\theta}M_{D})/\abs{\tau_{D}}$, which now depends on $\theta$ +itself. Therefore, optimal bandwidth calculations will require a preliminary +estimate of $\abs{\theta}$, which can be passed to `RDHonest` via the option +`T0`. Like in the sharp case, the optimal bandwidth calculations assume +homoskedastic covariance of $(u_{Y, i}, u_{D, i})$ on either side of the cutoff, +which are estimated based on a preliminary local linear regression for both the +outcome and first stage equation, with bandwidth given by the @ImKa12 bandwidth +selector applied to the outcome equation. ```{r} -## Data-driven choice of M -RDHonest(voteshare ~ margin, data=lee08, kern="uniform", sclass="H", - opt.criterion="MSE") +## Initial estimate of treatment effect for optimal bandwidth calculations +r <- RDHonest(log(cn) | retired ~ elig_year, data=rcp, kern="triangular", + M=c(0.001, 0.002), opt.criterion="MSE", sclass="H", T0=0) +## Use it to compute optimal bandwidth +RDHonest(log(cn) | retired ~ elig_year, data=rcp, kern="triangular", + M=c(0.001, 0.002), opt.criterion="MSE", sclass="H", + T0=r$coefficients$estimate) ``` -See @ArKo16honest for a discussion of the restrictions on the -parameter space under which this method yields honest inference. -## Optimal inference -For the second-order Taylor smoothness class, the function `RDHonest`, with -`kernel="optimal"`, computes finite-sample optimal estimators and confidence -intervals, as described in Section 2.2 in @ArKo16optimal. This typically yields -tighter CIs. 
Comparing the lengths of two-sided CIs with optimally chosen -bandwidths, using Silverman's rule of thumb to estimate the preliminary variance -estimate used to compute optimal bandwidths: +Like in the sharp RD case, without further restrictions, the curvature +parameters $M_Y$ and $M_D$ cannot be data-driven: to maintain honesty over the +whole function class, a researcher must choose them a priori, rather than +attempting to use a data-driven method. Therefore, one should, whenever +possible, use problem-specific knowledge to decide what choices of $M_Y$ and +$M_D$ are reasonable a priori. + +For cases in which this is difficult, the function `RDHonest` implements the +rule of thumb @ArKo20 described earlier, based on computing the global +smoothness of both $f_Y$ and $f_D$ using a quartic polynomial. When the user +doesn't supply the curvature bounds, the package uses the rule of thumb +$\hat{M}_{ROT}$, and prints a message to inform the user: ```{r} -r1 <- RDHonest(voteshare ~ margin, data=lee08, kern="optimal", M=0.1, - opt.criterion="FLCI", - se.method="nn")$coefficients -r2 <- RDHonest(voteshare ~ margin, data=lee08, kern="triangular", M=0.1, - opt.criterion="FLCI", se.method="nn", - sclass="T")$coefficients -r1$conf.high-r1$conf.low -r2$conf.high-r2$conf.low +## Data-driven choice of M +RDHonest(log(cn) | retired ~ elig_year, data=rcp, kern="triangular", + opt.criterion="MSE", sclass="H", T0=r$coefficients$estimate) ``` +See @ArKo20 for a discussion of the restrictions on the +parameter space under which this method yields honest inference. -## Specification testing +# Extensions -The package also implements lower-bound estimates for the smoothness constant -$M$ for the Taylor and Hölder smoothness class, as described in the supplements -to @KoRo16 and @ArKo16optimal +## Covariates + +RD datasets often contain information on a vector of $K$ pre-treatment +covariates $W_{i}$, such as pre-intervention outcomes, demographic, or +socioeconomic characteristics of the units. Similar to randomized controlled +trials, while the presence of covariates doesn't help to weaken the fundamental +identifying assumptions, augmenting the RD estimator with predetermined +covariates can increase its precision. +Let us first describe covariate adjustment in the sharp RD case. We implement +the covariate adjustment studied in @ccft19, namely to include $W_{i}$ as one of +the regressors in the WLS regression, regressing +$Y_{i}$ onto $m(x_{i})$ and $W_{i}$. As in the case with no covariates, we weight each observation with the +kernel weight $K(x_{i}/h)$. This leads to the estimator +\begin{equation*} + \tilde{\tau}_{Y, h}=\tilde{\beta}_{Y, h,1}, \qquad \tilde{\beta}_{Y, h}=\left(\sum_{i=1}^{n}K(x_{i}/h) + \tilde{m}(x_{i}, W_{i})\tilde{m}(x_{i}, W_{i})'\right)^{-1} + \sum_{i=1}^{n}K(x_{i}/h)\tilde{m}(x_{i}, W_{i})Y_{i}, +\end{equation*} +where $\tilde{m}(x_{i}, W_{i})=(m(x_{i})', W_{i}')'$. Denote the coefficient on +$W_{i}$ in this regression by $\tilde{\gamma}_{Y, h}$; this corresponds to the +last $K$ elements of $\tilde{\beta}_{Y, h}$. As in the case without covariates, +we first take the bandwidth $h$ as given, and defer bandwidth selection choice +to the end of this subsection. + +To motivate the estimator under our framework, and to derive bias-aware CIs +that incorporate covariates, we need to formalize the assumption that the +covariates are predetermined (without any assumptions on the covariates, it is +optimal to ignore the covariates and use the unadjusted estimator +$\hat{\tau}_{Y, h}$). 
Let $f_{W}(x)=E[W_{i}\mid X_{i}=x]$ denote the regression +function from regressing the covariates on the running variable, and let +\begin{equation*} + \Sigma_{WW}(x)=\operatorname{var}(W_{i}\mid X_{i}=x), \qquad + \Sigma_{WY}(x)=\cov(W_{i}, Y_{i}\mid X_{i}=x). +\end{equation*} +We assume that the variance and covariance functions are continuous, except +possibly at zero. Let +$\gamma_{Y}=(\Sigma_{WW}(0_{+})+\Sigma_{WW}(0_{-}))^{-1} +(\Sigma_{WY}(0_{+})+\Sigma_{WY}(0_{-}))$ denote the coefficient on $W_{i}$ when +we regress $Y_{i}$ onto $W_{i}$ for observations at the cutoff. Let +$\tilde{Y}_{i}:=Y_{i}-W_{i}'\gamma_{Y}$ denote the covariate-adjusted outcome. +To formalize the assumption that the covariates are pre-determined, we assume +that $\tau_{W}=\lim_{x\downarrow 0}f_{W}(0)-\lim_{x\uparrow 0}f_{W}(0)=0$, which +implies that $\tau_{Y}$ can be identified as the jump in the covariate-adjusted +outcome $\tilde{Y}_{i}$ at $0$. Following Appendix B.1 in +@ArKo18optimal, we also assume that the covariate-adjusted outcome +varies smoothly with the running variable (except for a possible jump at the +cutoff), in that the second derivative of +\begin{equation*} + \tilde{f}(x):=f_{Y}(x)-f_{W}(x)'\gamma_{Y} +\end{equation*} +is bounded by a known constant $\tilde{M}$. In addition, we assume $f_{W}$ has +bounded second derivatives. + +Under these assumptions, if $\gamma_{Y}$ was known and hence $\tilde{Y}_{i}$ was +directly observable, we could estimate $\tau$ as in the case without covariates, +replacing $M$ with $\tilde{M}$ and $Y_{i}$ with +$\tilde{Y}_{i}$. Furthermore, as discussed in @ArKo18optimal, such +approach would be optimal under homoskedasticity assumptions. +Although $\gamma_{Y}$ is unknown, it turns out that the estimator +$\tilde{\tau}_{Y, h}$ has the same large sample behavior as the infeasible +estimator $\hat{\tau}_{\tilde{Y}, h}$. To show this, note that by standard +regression algebra, $\tilde{\tau}_{Y, h}$ can equivalently be written as +\begin{equation*} + \tilde{\tau}_{Y, h}=\hat{\tau}_{Y-W'\tilde{\gamma}_{Y, h}, h}= + \hat{\tau}_{\tilde{Y}, h}- + \sum_{k=1}^{K}\hat{\tau}_{W_{k}, h}(\tilde{\gamma}_{Y, h, k}-\gamma_{Y, k}). +\end{equation*} +The first equality says that covariate-adjusted estimate is the same as an +unadjusted estimate that replaces the original outcome $Y_{i}$ with the +covariate-adjusted outcome $Y_{i}-W_{i}'\tilde{\gamma}_{Y, h}$. The second +equality uses the decomposition +$Y_{i}-W_{i}'\tilde{\gamma}_{Y, h}=\tilde{Y}_{i}-W_{i}' +(\tilde{\gamma}_{Y h}-\gamma_{Y})$ +to write the estimator as a sum of the infeasible estimator and a linear +combination of "placebo RD estimators" $\hat{\tau}_{W_{k}, h}$, that +replace $Y_{i}$ in the outcome equation with the $k$th element of $W_{i}$, +Since $f_{W}$ has bounded second derivatives, these placebo estimators converge +to zero, with rate that is at least as fast as the rate of convergence of the +infeasible estimator $\hat{\tau}_{\tilde{Y}, h}$: +$\hat{\tau}_{W_{k}, h}=O_{p}(B_{\tilde{M}, h}+\sd(\hat{\tau}_{\tilde{Y}, h}))$. +Furthermore, under regularity conditions, $\tilde{\gamma}_{Y, h}$ converges to +$\gamma_{Y}$, so that the second term in the previous display is asymptotically +negligible relative to the first. 
Consequently, we can form bias-aware CIs
+based on $\tilde{\tau}_{Y, h}$ as in the case without covariates, treating
+the covariate-adjusted outcome $Y_{i}-W_{i}'\tilde{\gamma}_{Y, h}$ as the outcome,
+\begin{equation*}
+    \tilde{\tau}_{Y, h}\pm
+    \cv_{1-\alpha}(B_{\tilde{M}, h} / \sd(\hat{\tau}_{\tilde{Y}, h}))\sd(\hat{\tau}_{\tilde{Y}, h}), \qquad
+    \sd(\hat{\tau}_{\tilde{Y}, h})^{2}=\sum_{i=1}^{n}k_{i, h}^{2}\sigma^{2}_{\tilde{Y}}(x_{i}),
+\end{equation*}
+where
+$\sigma^{2}_{\tilde{Y}}(x_{i})=\sigma^{2}_{Y}(x_{i})+{\gamma}_{Y}'\Sigma_{WW}(x_{i}){\gamma}_{Y}-2{\gamma}_{Y}'\Sigma_{WY}(x_{i})$.
+If the covariates are effective at explaining variation in the outcomes, then
+the quantity
+$\sum_{i}k_{i, h}^{2}\left({\gamma}_{Y}'\Sigma_{WW}(x_{i}){\gamma}_{Y}-2{\gamma}_{Y}'\Sigma_{WY}(x_{i})
+\right)$ will be negative, and
+$\sd(\hat{\tau}_{\tilde{Y}, h})\leq \sd(\hat{\tau}_{Y, h})$. If the smoothness of
+the covariate-adjusted conditional mean function $f_{Y}-f_{W}'\gamma_{Y}$ is
+greater than the smoothness of the unadjusted conditional mean function
+$f_{Y}$, so that $\tilde{M}\leq M$, then using the covariates will help tighten
+the confidence intervals.
+
+Implementation of covariate-adjustment requires a choice of $\tilde{M}$, and
+computing the optimal bandwidth requires a preliminary estimate of the variance
+of the covariate-adjusted outcome. In our implementation, we first estimate the
+model without covariates (using a rule of thumb to calibrate $M$, the bound on
+the second derivative of $f_{Y}$), and compute the bandwidth $\check{h}$ that's
+MSE optimal without covariates. Based on this bandwidth, we compute a
+preliminary estimate $\check{\gamma}_{Y, \check{h}}$ of $\gamma_{Y}$, and use
+this preliminary estimate to compute the covariate-adjusted outcome
+$Y_{i}-W_{i}'\check{\gamma}_{Y, \check{h}}$. If $\tilde{M}$ is not supplied, we
+calibrate $\tilde{M}$ using the rule of thumb, using this covariate-adjusted
+outcome as the outcome. Similarly, we use this covariate-adjusted outcome as the
+outcome to compute a preliminary estimator of the conditional variance
+$\sigma^{2}_{\tilde{Y}}(x_{i})$, for optimal bandwidth calculations, as in the
+case without covariates.
+
+A demonstration using the `headst` data:
```{r}
-r1 <- RDHonest(voteshare ~ margin, data=lee08, M=0.1, se.method="nn")
-### Only use three point-average for averages of a 100 points closest to cutoff,
-### and report results separately for points above and below cutoff
-RDSmoothnessBound(r1, s=100, separate=TRUE, multiple=FALSE, sclass="T")
+## No covariates
+rn <- RDHonest(mortHS ~ povrate, data=headst)
+## Use Percent attending school aged 14-17, urban, black,
+## and their interaction as covariates.
+rc <- RDHonest(mortHS ~ povrate | urban*black + sch1417, data=headst)
+rn
+rc
+```

-## Pool estimates based on observations below and above cutoff, and
-## use three-point averages over the entire support of the running variable
-RDSmoothnessBound(r1, s=100, separate=FALSE, multiple=TRUE, sclass="H")
+We see that the inclusion of covariates leads to a reduction in the
+rule-of-thumb curvature and also smaller standard errors (this would be true
+even if the bandwidth was kept fixed). Correspondingly, the CIs are tighter by
+about 6 percentage points:
+```{r}
+100 * (1 - (rc$coefficients$conf.high-rc$coefficients$conf.low) /
+    (rn$coefficients$conf.high-rn$coefficients$conf.low))
```

-## Weighted regression
+In the fuzzy RD case, we need to covariate-adjust the treatment $D_i$ as well
+as the outcome.
+The implementation mirrors the sharp case. Define $\gamma_{D}$
+analogously to $\gamma_{Y}$, and assume that the second derivative of
+$f_{Y}(x)-f_{W}(x)'\gamma_{Y}$ is bounded by a known constant $\tilde{M}_{Y}$,
+and that the second derivative of $f_{D}(x)-f_{W}(x)'\gamma_{D}$ is bounded by
+a known constant $\tilde{M}_{D}$. The covariate-adjusted estimator is given by
+$\tilde{\theta}_{h}=\tilde{\tau}_{Y, h}/\tilde{\tau}_{D, h}$, with variances and
+worst-case bias computed as in the case without covariates, replacing the
+treatment and outcome with their covariate-adjusted versions.
+
+
+## Aggregated data and weighted regression

In some cases, data is only observed as cell averages. For instance, suppose
that instead of observing the original `cghs` data, we only observe averages for
cells as follows:

```{r}
-d <- cghs
-## Make 20 groups based on observation number
-d$mod <- seq_along(d$yearat14) %% 20
-## Make cells defined as intersection of group and year
-d$cell <- d$mod/100+d$yearat14
-## Data with cell averages
dd <- data.frame()
-for (j in unique(d$cell)){
-    dd <- rbind(dd, data.frame(y=mean(log(d$earnings)[d$cell==j]),
-                               x=mean(d$yearat14[d$cell==j]),
-                               weights=length(d$yearat14[d$cell==j])))
+## Collapse data by running variable
+for (j in unique(cghs$yearat14)) {
+    ix <- cghs$yearat14==j
+    df <- data.frame(y=mean(log(cghs$earnings[ix])), x=j,
+                     weights=sum(ix),
+                     sigma2=var(log(cghs$earnings[ix]))/sum(ix))
+    dd <- rbind(dd, df)
}
```

The column `weights` gives the number of observations that each cell averages
over. In this case, if we weight the observations using `weights`, we can
-recover the original estimates (and the same worst-case bias):
+recover the original estimates (and the same worst-case bias). If we use the
+estimates of the conditional variance of the outcome, `dd$sigma2`, then we can
+also replicate the standard error calculations:

```{r}
-RDHonest(log(earnings)~yearat14, cutoff=1947, h=5, data=cghs, M=0.1,
-         se.method="nn")
-RDHonest(y~x, cutoff=1947, weights=weights, h=5, data=dd, M=0.1,
-         se.method="nn")
+s0 <- RDHonest(log(earnings)~yearat14, cutoff=1947, data=cghs)
+## keep same bandwidth
+s1 <- RDHonest(y~x, cutoff=1947, data=dd, weights=weights,
+               sigmaY2=sigma2, se.method="supplied.var",
+               h=s0$coefficients$bandwidth)
+## Results are identical:
+s0
+s1
```

-Note the variance estimates don't quite match, since the variance estimator is
-different, but the worst-case bias and the point estimate are identical.
-
-# Initial bandwidth estimation
-
-We use @FaGi96 rule of thumb and @ImKa12 bandwidth selectors for preliminary
-variance estimation.
-
-
-# Fuzzy RD
-
-## Model
-
-In a fuzzy RD design, units are assigned to treatment if their running variable
-$x_{i}$ weakly exceeds a cutoff $x_i\geq c_{0}$. However, the actual treatment
-$d_{i}$ does not perfectly comply with the treatment assignment. Instead, the cutoff
-induces a jump in the treatment probability. The resulting reduced-form and
-first-stage regressions are given by
-\begin{align*}
-    y_{i}&=f_{1}(x_{i})+u_{i1}, & d_{i}&=f_{2}(d_{i})+u_{i2},
-\end{align*}
-See Section 3.3 in @ArKo16honest for a more detailed description.
-
-In the @battistin09 dataset, the treatment variable is an indicator for
-retirement, and the running variable is number of years since being eligible to
-retire. The cutoff is $0$. Individuals exactly at the cutoff are dropped from
-the dataset. If there were individuals exactly at the cutoff, they are assumed
-to be assigned to the treatment group.
-
-Similarly to the `RDData` function, the `FRDData` function transforms the data
-into an appropriate format:
+Without supplying the variance estimates and specifying
+`se.method="supplied.var"`, the standard errors will not match, since the
+collapsed data is generally not sufficient to learn about the true
+variability of the collapsed outcomes.
+The same method works in fuzzy designs, but one has to also save the conditional
+variance of the treatment and its covariance with the outcome:

```{r}
-## Assumes first column in the data frame corresponds to outcome,
-## second to the treatment variable, and third to the running variable
-## Outcome here is log of non-durables consumption
-## dr <- FRDData(cbind(logf=log(rcp[, 6]), rcp[, c(3, 2)]), cutoff=0)
+r0 <- RDHonest(log(cn)|retired~elig_year, data=rcp, h=7)
+dd <- data.frame(x=sort(unique(rcp$elig_year)), y=NA, d=NA, weights=NA,
+                 sig11=NA, sig12=NA, sig21=NA, sig22=NA)
+for (j in seq_len(NROW(dd))) {
+    ix <- rcp$elig_year==dd$x[j]
+    Y <- cbind(log(rcp$cn[ix]), rcp$retired[ix])
+    dd[j, -1] <- c(colMeans(Y), sum(ix), as.vector(var(Y))/sum(ix))
+}
+r1 <- RDHonest(y|d~x, data=dd, weights=weights,
+               sigmaY2=sig11, T0=0, sigmaYD=sig21,
+               sigmaD2=sig22, h=7,
+               se.method="supplied.var")
+## Outputs match up to numerical precision
+max(abs(r0$coefficients[2:11]-r1$coefficients[2:11]))
```

-## Inference based on local polynomial estimates
+## Clustering
+
+In some applications, the data are collected by clustered sampling. In such
+cases, the user can specify a vector `clusterid` signifying cluster membership.
+In this case, preliminary bandwidth calculations assume that the regression
+errors have a Moulton-type structure, with homoskedasticity on either side of
+the cutoff: \begin{equation*} \cov(Y_{i}, Y_{j})=\begin{cases} \sigma^{2}_{+} &
+\text{if $i=j$ and $x_{i}\geq 0$,} \\ \sigma^{2}_{-} & \text{if $i=j$ and
+$x_{i}< 0$,} \\ \rho & \text{if $i\neq j$ and $g(i)=g(j)$,} \\ 0 &
+\text{otherwise,} \end{cases} \end{equation*} where $g(i)\in\{1,\dotsc, G\}$
+denotes cluster membership. Since it appears difficult to generalize the nearest
+neighbor variance estimator to clustering, we use regression-based
+cluster-robust variance formulas to compute estimator variances, so that option
+`se.method="EHW"` is required.

-The function `RDHonest` constructs one- and two-sided confidence intervals
-(CIs) around local linear and local quadratic estimators using either a
-user-supplied bandwidth (which is allowed to differ on either side of the
-cutoff), or bandwidth that is optimized for a given performance criterion.
+```{r}
+## make fake clusters
+set.seed(42)
+clusterid <- sample(1:50, NROW(lee08), replace=TRUE)
+sc <- RDHonest(voteshare~margin, data=lee08, se.method="EHW",
+               clusterid=clusterid, M=0.14, h=7)
+## Since clusters are unrelated to outcomes, not clustering
+## should yield similar standard errors
+sn <- RDHonest(voteshare~margin, data=lee08, se.method="EHW",
+               M=0.14, h=7)
+sc
+sn
+```

-### Parameter space and initial estimate
-To implement the honest CIs, one needs to specify the parameter space
-$\mathcal{F}$ for $f_1$ and $f_2$. The function `RDHonest` computes honest CIs
-when $f_1$ and $f_2$ both lie in a second-order Taylor or second-order Hölder
-smoothness class, $\mathcal{F}_{T}(M_1, M_2)$ and
-$\mathcal{F}_{\text{Hölder}}(M_1, M_2)$, where the smoothness constants $M_1$
-and $M_2$ for the reduced form and the first stage are allowed to differ. 
Also, -since the worst-case bias calculation requires an estimate of the treatment -effect, for optimal bandwidth calculations, the user needs to supply an initial -estimator of the treatment effect +## Specification testing +The package also implements lower-bound estimates for the smoothness constant +$M$ for the Taylor and Hölder smoothness class, as described in the supplements +to @KoRo16 and @ArKo18optimal ```{r} -## Initial estimate of treatment effect for optimal bandwidth calculations -r <- RDHonest(log(cn) ~ retired | elig_year, data=rcp, kern="triangular", - M=c(0.001, 0.002), opt.criterion="MSE", sclass="H", T0=0) -## Use it to compute optimal bandwidth -RDHonest(log(cn) ~ retired | elig_year, data=rcp, kern="triangular", - M=c(0.001, 0.002), opt.criterion="MSE", sclass="H", - T0=r$coefficients$estimate) -``` +r1 <- RDHonest(voteshare ~ margin, data=lee08, M=0.1, se.method="nn") +### Only use three point-average for averages of a 100 points closest to cutoff, +### and report results separately for points above and below cutoff +RDSmoothnessBound(r1, s=100, separate=TRUE, multiple=FALSE, sclass="T") -## Data-driven choice of smoothness constant +## Pool estimates based on observations below and above cutoff, and +## use three-point averages over the entire support of the running variable +RDSmoothnessBound(r1, s=100, separate=FALSE, multiple=TRUE, sclass="H") +``` -Like in the sharp RD case, Without further restrictions, the smoothness -constants $M_1$ and $M_2$ cannot be data-driven: to maintain honesty over the -whole function class, a researcher must choose them a priori, rather than -attempting to use a data-driven method. Therefore, one should, whenever -possible, use problem-specific knowledge to decide what choices of $M_1$ and -$M_2$ are reasonable a priori. +## Optimal weights under Taylor smoothness class -For cases in which this is difficult, the function `RDHonest` implements the -method considered in Section 3.4.1 in @ArKo16honest based on a global polynomial -approximation: +For the second-order Taylor smoothness class, the function `RDHonest`, with +`kernel="optimal"`, computes finite-sample optimal estimators and confidence +intervals, as described in Section 2.2 in @ArKo18optimal. This typically yields +tighter CIs: ```{r} -## Data-driven choice of M -RDHonest(log(cn) ~ retired | elig_year, data=rcp, kern="triangular", - opt.criterion="MSE", sclass="H", T0=r$coefficients$estimate) +r1 <- RDHonest(voteshare ~ margin, data=lee08, kern="optimal", M=0.1, + opt.criterion="FLCI", + se.method="nn")$coefficients +r2 <- RDHonest(voteshare ~ margin, data=lee08, kern="triangular", M=0.1, + opt.criterion="FLCI", se.method="nn", + sclass="T")$coefficients +r1$conf.high-r1$conf.low +r2$conf.high-r2$conf.low ``` -See @ArKo16honest for a discussion of the restrictions on the -parameter space under which this method yields honest inference. - # Inference at a point -The package can also perform inference at -a point, and optimal bandwidth selection for inference at a point. Suppose, for example, one -was interested in the vote share for candidates with margin of victory equal to -20 points: +The package can also perform inference at a point, and optimal bandwidth +selection for inference at a point. 
Suppose, for example, one was interested in +the vote share for candidates with margin of victory equal to 20 points: ```{r} -## Transform data, specify we're interested in inference at x0=20, +## Specify we're interested in inference at x0=20, ## and drop observations below cutoff RDHonest(voteshare ~ margin, data=lee08, subset=margin>0, cutoff=20, kern="uniform", opt.criterion="MSE", sclass="H", point.inference=TRUE) ``` -# Lindeberg weights - -In fuzzy RD, by Theorem B.2 in the appendix to @ArKo16honest, the estimator is asymptotically equivalent to -\begin{equation*} -\sum_i w_i (Y_i-D_i\beta)/first stage, -\end{equation*} -so the Lindeberg weights are the same as in the sharp case. - +To compute the optimal bandwidth, the package assumes homoskedastic variance on +either side of the cutoff, which it estimates based on a preliminary local +linear regression using the @FaGi96 rule of thumb bandwidth selector. This +homoskedasticity assumption is dropped when the final standard errors are +computed. + +# Diagnostics: leverage and effective observations + +The estimators in this package are just weighted regression estimators, or +ratios of regression estimators in the fuzzy RD case. Regression estimators are +linear in outcomes, taking the form $\sum_i k_{i, h} Y_i$, where $k_{i, h}$ are +estimation weights, returned by `data$est_w` part of the `RDHonest` output (see +expression for $\hat{\tau}_{Y, h}$ above). + +For the sampling distribution of the estimator to be well-approximated by a +normal distribution, it is important that these regression weights not be too +large: asymptotic normality requires $L_{\max}=\max_j k_{j, h}^2/\sum_i k_{i, +h}^2\to 0$. If uniform kernel is used, the weights $k_{i, h}$ are just the +diagonal elements of the partial projection matrix. We therefore refer to +$L_{\max}$ as maximal (partial) leverage, and it is reported in the `RDHonest` +output. The package issues a warning if the maximal leverage exceeds $0.1$---in +such cases using a bigger bandwidth is advised. + +In the fuzzy RD case, by Theorem B.2 in the appendix to @ArKo20, the estimator +is asymptotically equivalent to $\sum_i k_{i, h} (Y_i-D_i\theta)/\tau_D$, where +$k_{i, h}$ are the weights for $\hat{\tau}_{Y, h}$. The maximal leverage +calculations are thus analogous to the sharp case. + +With local regression methods, it is clear that observations outside the +estimation window don't contribute to estimation, reducing the effective sample +size. If the uniform kernel is used, the package therefore reports the number of +observations inside the estimation window as the "number of effective +observations". + +To make this number comparable across different kernels, observe that, under +homoskedasticity, the variance of a linear estimator $\sum_{i} k_{i} Y_i$ is +$\sigma^2\sum_{i} k_i^2$. We expect this to scale in inverse proportion to the +sample size: with twice as many observations and the same bandwidth, we expect +the variance to halve. Therefore, if the variance ratio relative to a uniform +kernel estimator with weights $\sum_i k_{uniform, i}Y_i$ is $\sigma^2\sum_{i} +k_{uniform, i}^2/\sigma^2\sum_{i} k_i^2=\sum_{i} k_{uniform, i}^2/\sum_{i} +k_i^2$, the precision of this estimator is the same as if we used a uniform +kernel, but with $\sum_{i} k_{uniform, i}^2/\sum_{i} k_i^2$ as many +observations. Correspondingly, we define the number of effective observations +for other kernels as the number of observations inside the estimation window +times $\sum_{i} k_{uniform, i}^2/\sum_{i} k_i^2$. 
With this definition, using a +triangular kernel typically yields effective samples sizes equal to about 80% of +the number of observations inside the estimation window. + +Finally, to assess which observations are important for pinning down the +estimate, it can be useful to explicitly plot the estimation weights. # References diff --git a/doc/RDHonest.pdf b/doc/RDHonest.pdf index 6d9f466..33aae85 100644 Binary files a/doc/RDHonest.pdf and b/doc/RDHonest.pdf differ diff --git a/doc/manual.pdf b/doc/manual.pdf index 463fdaf..ac2a7bb 100644 Binary files a/doc/manual.pdf and b/doc/manual.pdf differ diff --git a/inst/WORDLIST b/inst/WORDLIST index 5b11833..011117b 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -5,6 +5,8 @@ CMD Christoph Econometrica Eicker +FLCI +Frisch Guglielmo Hölder Imbens @@ -13,38 +15,50 @@ Kalyanaraman Karthik Lalive Lalive's -Lindeberg +Lovell Optimality Oreopoulos REBP Rettore Rothe SES -Silverman's Stata +WLS cdot coercible +cov cv dotsc downarrow +eq estimand frac geq +hatse +heteroskedasticity +homoskedastic +homoskedasticity infty leq lim +mapsto mathbb mathcal misspecification +monotonicity +neq nn +operatorname optimality overset pmatrix +pre preprint priori qquad -se +quartic +sd th +unadjusted uparrow vStata -widehat diff --git a/man-roxygen/RDFormula.R b/man-roxygen/RDFormula.R index d793190..8946ee2 100644 --- a/man-roxygen/RDFormula.R +++ b/man-roxygen/RDFormula.R @@ -2,8 +2,13 @@ #' \code{subset} is evaluated in the same way as variables in \code{formula}, #' that is first in \code{data} and then in the environment of \code{formula}. -#' @param formula object of class \code{"formula"} (or one that can be coerced -#' to that class) of the form \code{outcome ~ running_variable} +#' @param formula an object of class \code{"formula"} (or one that can be +#' coerced to that class). The formula syntax is \code{outcome ~ +#' running_variable} for inference at a point. For sharp RD, it is +#' \code{outcome ~ running_variable} if there are no covariates, or +#' \code{outcome ~ running_variable | covariates} if covariates are present. +#' For fuzzy RD, it is \code{outcome | treatment ~ running_variable | +#' covariates}, with \code{covariates} optional. #' @param data optional data frame, list or environment (or object coercible by #' \code{as.data.frame} to a data frame) containing the outcome and running #' variables in the model. If not found in \code{data}, the variables are diff --git a/man-roxygen/RDFormulaSimple.R b/man-roxygen/RDFormulaSimple.R new file mode 100644 index 0000000..d793190 --- /dev/null +++ b/man-roxygen/RDFormulaSimple.R @@ -0,0 +1,17 @@ +#' @section Note: +#' \code{subset} is evaluated in the same way as variables in \code{formula}, +#' that is first in \code{data} and then in the environment of \code{formula}. + +#' @param formula object of class \code{"formula"} (or one that can be coerced +#' to that class) of the form \code{outcome ~ running_variable} +#' @param data optional data frame, list or environment (or object coercible by +#' \code{as.data.frame} to a data frame) containing the outcome and running +#' variables in the model. If not found in \code{data}, the variables are +#' taken from \code{environment(formula)}, typically the environment from +#' which the function is called. +#' @param subset optional vector specifying a subset of observations to be used +#' in the fitting process. +#' @param na.action function which indicates what should happen when the data +#' contain \code{NA}s. 
The default is set by the \code{na.action} setting of +#' \code{options} (usually \code{na.omit}). Another possible value is +#' \code{na.fail} diff --git a/man/RDHonest.Rd b/man/RDHonest.Rd index 6bfd682..f241b82 100644 --- a/man/RDHonest.Rd +++ b/man/RDHonest.Rd @@ -29,8 +29,13 @@ RDHonest( ) } \arguments{ -\item{formula}{object of class \code{"formula"} (or one that can be coerced -to that class) of the form \code{outcome ~ running_variable}} +\item{formula}{an object of class \code{"formula"} (or one that can be +coerced to that class). The formula syntax is \code{outcome ~ +running_variable} for inference at a point. For sharp RD, it is +\code{outcome ~ running_variable} if there are no covariates, or +\code{outcome ~ running_variable | covariates} if covariates are present. +For fuzzy RD, it is \code{outcome | treatment ~ running_variable | +covariates}, with \code{covariates} optional.} \item{data}{optional data frame, list or environment (or object coercible by \code{as.data.frame} to a data frame) containing the outcome and running @@ -42,27 +47,33 @@ which the function is called.} in the fitting process.} \item{weights}{Optional vector of weights to weight the observations (useful -for aggregated data). Disregarded if optimal kernel is used.} +for aggregated data). The weights are intepreted as the number of +observations that each aggregated datapoint averages over. Disregarded if +optimal kernel is used.} \item{cutoff}{specifies the RD cutoff in the running variable. For inference at a point, specifies the point \eqn{x_0} at which to calculate the conditional mean.} -\item{M}{Bound on second derivative of the conditional mean function.} +\item{M}{Bound on second derivative of the conditional mean function, a +numeric vector of length one. For fuzzy RD, \code{M} needs to be a +numeric vector of length two, specifying the smoothness of the +conditional mean for the outcome and treatment, respectively.} -\item{kern}{specifies kernel function used in the local regression. It can -either be a string equal to \code{"triangular"} (\eqn{k(u)=(1-|u|)_{+}}), -\code{"epanechnikov"} (\eqn{k(u)=(3/4)(1-u^2)_{+}}), or \code{"uniform"} -(\eqn{k(u)= (|u|<1)/2}), or else a kernel function. If equal to -\code{"optimal"}, use the finite-sample optimal linear estimator under -Taylor smoothness class, instead of a local linear estimator.} +\item{kern}{specifies the kernel function used in the local regression. It +can either be a string equal to \code{"triangular"} +(\eqn{k(u)=(1-|u|)_{+}}), \code{"epanechnikov"} +(\eqn{k(u)=(3/4)(1-u^2)_{+}}), or \code{"uniform"} (\eqn{k(u)= +(|u|<1)/2}), or else a kernel function. If equal to \code{"optimal"}, use +the finite-sample optimal linear estimator under Taylor smoothness class, +instead of a local linear estimator.} \item{na.action}{function which indicates what should happen when the data contain \code{NA}s. The default is set by the \code{na.action} setting of \code{options} (usually \code{na.omit}). Another possible value is \code{na.fail}} -\item{opt.criterion}{Optimality criterion that bandwidth is designed to +\item{opt.criterion}{Optimality criterion that the bandwidth is designed to optimize. The options are: \describe{ @@ -78,16 +89,16 @@ contain \code{NA}s. The default is set by the \code{na.action} setting of } The methods use conditional variance given by \code{sigmaY2}, if - supplied. 
Otherwise, for the purpose of estimating the optimal bandwidth, - conditional variance is estimated using the method specified by - \code{se.initial}.} + supplied. For fuzzy RD, \code{sigmaD2} and \code{sigmaYD} also need to be + supplied in this case. Otherwise, the methods use preliminary variance + estimates based on assuming homoskedasticity on either side of the + cutoff.} \item{h}{bandwidth, a scalar parameter. If not supplied, optimal bandwidth is computed according to criterion given by \code{opt.criterion}.} -\item{se.method}{Vector with methods for estimating standard error of - estimate. If \code{NULL}, standard errors are not computed. The elements - of the vector can consist of the following methods: +\item{se.method}{method for estimating standard error of the estimate, one + of: \describe{ \item{"nn"}{Nearest neighbor method} @@ -96,7 +107,8 @@ computed according to criterion given by \code{opt.criterion}.} (local polynomial estimators only).} \item{"supplied.var"}{Use conditional variance supplied by \code{sigmaY2} - instead of computing residuals} + instead of computing residuals. For fuzzy RD, \code{sigmaD2} and + \code{sigmaYD} also need to be supplied in this case.} }} @@ -105,16 +117,16 @@ constructing/optimizing confidence intervals.} \item{beta}{Determines quantile of excess length to optimize, if bandwidth optimizes given quantile of excess length of one-sided confidence -intervals; otherwise ignored.} +intervals (\code{opt.criterion="OCI"}); otherwise ignored.} -\item{J}{Number of nearest neighbors, if "nn" is specified in -\code{se.method}.} +\item{J}{Number of nearest neighbors, if \code{se.method="nn"} is specified. +Otherwise ignored.} \item{sclass}{Smoothness class, either \code{"T"} for Taylor or \code{"H"} for Hölder class.} \item{T0}{Initial estimate of the treatment effect for calculating the -optimal bandwidth. Only relevant for Fuzzy RD.} +optimal bandwidth. Only relevant for fuzzy RD.} \item{point.inference}{Do inference at a point determined by \code{cutoff} instead of RD.} @@ -125,46 +137,71 @@ instead of RD.} \item{sigmaYD}{Supply covariance of treatment and outcome (fuzzy RD only).} -\item{clusterid}{Cluster id for cluster-robust standard errors} +\item{clusterid}{Vector specifying cluster membership. If supplied, +\code{se.method="EHW"} is required, and standard errors use +cluster-robust variance formulas.} } \value{ Returns an object of class \code{"RDResults"}. The function \code{print} can be used to obtain and print a summary of the results. An - object of class \code{"RDResults"} is a list containing the following - components + object of class \code{"RDResults"} is a list containing four components. + First, a data frame \code{"coefficients"} containing the following + columns: \describe{ - \item{\code{estimate}}{Point estimate. This estimate is MSE-optimal if - \code{opt.criterion="MSE"}} + \item{\code{term}}{type of parameter being estimated} - \item{\code{lff}}{Least favorable function, only relevant for optimal - estimator under Taylor class.} + \item{\code{estimate}}{point estimate} - \item{\code{maxbias}}{Maximum bias of \code{estimate}} + \item{\code{std.error}}{standard error of \code{estimate}} - \item{\code{sd}}{Standard deviation of estimate} + \item{\code{maximum.bias}}{maximum bias of \code{estimate}} - \item{\code{lower}, \code{upper}}{Lower (upper) end-point of a one-sided CI - based on \code{estimate}. 
This CI is optimal if - \code{opt.criterion=="OCI"}} + \item{\code{conf.low}, \code{conf.high}}{lower (upper) end-point of a + two-sided CI based on \code{estimate}} - \item{\code{hl}}{Half-length of a two-sided CI based on \code{estimate}, so - that the CI is given by \code{c(estimate-hl, estimate+hl)}. The - CI is optimal if \code{opt.criterion="FLCI"}} + \item{\code{conf.low.onesided}, \code{conf.high.onesided}}{lower (upper) + end-point of a one-sided CIs based on \code{estimate}} - \item{\code{eff.obs}}{Effective number of observations used by - \code{estimate}} + \item{\code{bandwidth}}{bandwidth used. If \code{kern="optimal"}, the + smoothing parameters \code{bandwidth.m} and \code{bandwidth.p} on + either side of the cutoff are reported intead} - \item{\code{h}}{Bandwidth used} + \item{\code{eff.obs}}{number of effective observations} - \item{\code{naive}}{Coverage of CI that ignores bias and uses - \code{qnorm(1-alpha/2)} as critical value} + \item{\code{leverage}}{maximal leverage of \code{estimate}} - \item{\code{call}}{the matched call} + \item{\code{cv}}{critical value used to compute two-sided CIs} - \item{\code{fs}}{Estimate of the first-stage coefficient (sharp RD only)} + \item{\code{alpha}}{coverage level, as specified by option \code{alpha}} -} + \item{\code{method}}{\code{sclass} is used} + + \item{\code{M}}{curvature bound used for worst-case bias + calculations. For fuzzy RD, equals + \code{(abs(estimate)*M.fs+M.rf)/first.stage}} + + \item{\code{M.rf}, \code{M.fs}}{curvature bound for the outcome (i.e. + reduced-form) and first-stage regressions. Fuzzy RD only.} + + \item{\code{first.stage}}{estimate of the first-stage coefficient. + Fuzzy RD only.} + + \item{\code{kernel}}{kernel used} + + \item{\code{p.value}}{p-value for testing the null of no effect} + } + + Second, a list called \code{"data"} containing the data used for + estimation. This is useful mostly for internal calculations. Third, an + object of class \code{"lm"} containing the local linear regression + estimates. Finally, a \code{call} object containing the matched call + called \code{"call"}. + + If \code{kern="optimal"}, the \code{"lm"} object is empty, and the + numeric vectors \code{"delta"} and \code{"omega"} are returned in + addition. These correspond to the parameters in the modulus problem used + to compute the optimal estimation weights. } \description{ Calculate estimators and bias-aware CIs for the sharp or fuzzy RD parameter, @@ -201,10 +238,6 @@ of regression models. Econometrica, 86(2):655–683, March 2018. intervals in nonparametric regression. Quantitative Economics, 11(1):1–39, January 2020. \doi{10.3982/QE1199}} -\cite{Guido W. Imbens and Karthik Kalyanaraman. Optimal bandwidth choice for -the regression discontinuity estimator. The Review of Economic Studies, -79(3):933–959, July 2012. \doi{10.1093/restud/rdr043}} - \cite{Michal Kolesár and Christoph Rothe. Inference in regression discontinuity designs with a discrete running variable. American Economic Review, 108(8):2277—-2304, August 2018. \doi{10.1257/aer.20160945}} diff --git a/man/RDHonestBME.Rd b/man/RDHonestBME.Rd index 92a7a2a..4e2df54 100644 --- a/man/RDHonestBME.Rd +++ b/man/RDHonestBME.Rd @@ -63,17 +63,20 @@ An object of class \code{"RDResults"}. 
This is a list with at least \item{\code{"call"}}{The matched call.} + \item{\code{"lm"}}{An \code{"lm"} object containing the fitted + regression.} + \item{\code{"na.action"}}{(If relevant) information on the special handling of \code{NA}s.} } } \description{ -Computes honest CIs for local polynomial regression with uniform kernel under -the assumption that the conditional mean lies in the bounded misspecification -error (BME) class of functions, as considered in Kolesár and Rothe (2018). -This class formalizes the notion that the fit of the chosen model is no worse -at the cutoff than elsewhere in the estimation window. +Computes honest CIs for local polynomial regression with uniform kernel in +sharp RD under the assumption that the conditional mean lies in the bounded +misspecification error (BME) class of functions, as considered in Kolesár and +Rothe (2018). This class formalizes the notion that the fit of the chosen +model is no worse at the cutoff than elsewhere in the estimation window. } \section{Note}{ diff --git a/tests/testthat/test_clustering.R b/tests/testthat/test_clustering.R index 6ede954..42448c5 100644 --- a/tests/testthat/test_clustering.R +++ b/tests/testthat/test_clustering.R @@ -11,11 +11,12 @@ test_that("Test clustering formulas", { expect_message(f0c <- RDHonest(c|retired ~ elig_year, data=rr, clusterid=seq_along(rr$c), se.method="EHW")) expect_equal(f0$coefficients, f0c$coefficients) - expect_message(p0 <- RDHonest(c~elig_year, data=rr, + + expect_message(p0 <- RDHonest(c~elig_year, data=rr[1:100, ], h=10, se.method="EHW", point.inference=TRUE)) - expect_message(p0c <- RDHonest(c~elig_year, data=rr, - clusterid=seq_along(rr$c), se.method="EHW", - point.inference=TRUE)) + expect_message(p0c <- RDHonest(c~elig_year, data=rr[1:100, ], h=10, + clusterid=seq_along(rr$c[1:100]), + se.method="EHW", point.inference=TRUE)) expect_equal(p0$coefficients, p0c$coefficients) ## Check we match sandwich::vcovCL: uniform + triangular diff --git a/tests/testthat/test_covariates.R b/tests/testthat/test_covariates.R index 150b20d..cc2b2cc 100644 --- a/tests/testthat/test_covariates.R +++ b/tests/testthat/test_covariates.R @@ -47,10 +47,10 @@ test_that("Test covariates", { dd <- sort(r0$lm$coefficients)-sort(m3$coefficients) expect_lt(max(abs(dd)), 1e-8) - ## TODO: return lm object, so we can do normal inference: print method - - ## TODO: speed up - - ## TODO: pass function as kern - + ## pass function as kern + expect_message(r00 <- RDHonest(log(c)|retired ~ elig_year|food, data=df, + weights=survey_year, + h=r0$coefficients$bandwidth, + kern=function(u) pmax(1-abs(u), 0))) + expect_equal(r00$coefficients, r00$coefficients) }) diff --git a/tests/testthat/test_interface.R b/tests/testthat/test_interface.R index 4ff0ad2..e745471 100644 --- a/tests/testthat/test_interface.R +++ b/tests/testthat/test_interface.R @@ -1,7 +1,9 @@ test_that("Test inputs", { expect_error(r2 <- RDHonest(log(cn)~retired|elig_year, data=rcp, M=1, T0=0)) expect_error(r2 <- RDHonest(log(cn)~elig_year, data=rcp, M=c(1, 1))) - + expect_error(r2 <- RDHonest(log(cn)~elig_year, data=rcp, kern="Unif")) + expect_error(RDHonest(log(cn)|retired~elig_year, data=rcp, M=1, T0=0, + kern="optimal")) expect_message(pp <- RDHonest(voteshare~margin, data=lee08, M=2, h=5, subset=I(margin>0))$coefficients) expect_equal(as.numeric(pp[c(2, 3, 11)]), diff --git a/tests/testthat/test_rd.R b/tests/testthat/test_rd.R index 6cfa5c6..9c83761 100644 --- a/tests/testthat/test_rd.R +++ b/tests/testthat/test_rd.R @@ -141,11 +141,11 @@ 
test_that("Honest inference in Lee and LM data", { expect_equal(ro[8], paste0("margin 5.8864375 1.4472117", " 0.80247208 (2.6646087, 9.1082664)")) ## Try nn - r1 <- RDHonest(voteshare ~ margin, data=lee08, kern="optimal", M=0.04, - opt.criterion="FLCI", se.method="nn") - expect_equal(as.numeric(r1$coefficients[2:6]), - c(5.886437523, 1.196647571, 0.8024720847, 3.099785428, - 8.673089618)) + r1 <- RDHonest(voteshare ~ margin, data=lee08[(1:1000)*6, ], kern="optimal", + M=0.04, opt.criterion="FLCI", se.method="nn") + expect_equal(as.numeric(r1$coefficients[2:8]), + c(5.3946873132, 2.5735905969, 1.6959559231, -0.5711839065, + 11.3605585330, -0.5344484375, 11.3238230640)) ## We no longer allow different bw below and above cutoff ## r <- RDHonest(voteshare ~ margin, data=lee08, kern="triangular", ## M=0.04, opt.criterion="MSE", se.method="supplied.var", @@ -195,21 +195,21 @@ test_that("Honest inference in Lee and LM data", { ## Missing values expect_error(RDHonest(mortHS ~ povrate, data=headst, kern="uniform", h=12, na.action="na.fail")) - expect_message(r1 <- RDHonest(mortHS ~ povrate, data=headst, kern="uniform", - na.action="na.omit")) + expect_message(r1 <- RDHonest(mortHS ~ povrate, data=headst[c(2500:3000), ], + kern="uniform", na.action="na.omit")) r1 <- capture.output(print(r1, digits=6)) expect_equal(r1[c(8, 11, 12, 22)], - c(paste0("I(povrate>0) -3.17121 1.44439", - " 0.759228 (-6.35207, 0.00964571)"), - "Number of effective observations: 239", - "Maximal leverage for sharp RD parameter: 0.019615", - "24 observations with missing values dropped")) + c(paste0("I(povrate>0) -2.75044 2.08871 ", + "1.49488 (-7.70164, 2.20076)"), + "Number of effective observations: 65", + "Maximal leverage for sharp RD parameter: 0.0615608", + "3 observations with missing values dropped")) expect_message(r2 <- RDHonest(mortHS ~ povrate, data=headst, - kern="epanechnikov", + kern="epanechnikov", h=9.42625, point.inference=TRUE, cutoff=4)) expect_equal(capture.output(print(r2, digits=6))[c(8, 11, 19)], c(paste0("(Intercept) 2.63181 0.300132 ", - "0.152587 (1.97505, 3.28856)"), + "0.152588 (1.97505, 3.28856)"), "Number of effective observations: 373.642", " 2.631805 -0.000162 ")) }) @@ -307,9 +307,3 @@ test_that("Adaptation bounds", { b2 <- RDTEfficiencyBound(r, 2*0.005, opt.criterion="FLCI") expect_equal(unname(c(b1, b2)), c(0.9956901, 0.95870418)) }) - - -## rf <- RDHonest(log(cn)~retired | elig_year, data=rcp, cutoff=0) -## rs <- RDHonest(log(cn)~elig_year, data=rcp, cutoff=0) -## ro <- RDHonest(log(cn)~elig_year, data=rcp, cutoff=0, kern="optimal") -## rp <- RDHonest(log(cn)~elig_year, data=rcp, cutoff=1, point.inference=TRUE) diff --git a/vignettes/RDHonest.Rmd b/vignettes/RDHonest.Rmd index 21b396c..6ec6ac4 100644 --- a/vignettes/RDHonest.Rmd +++ b/vignettes/RDHonest.Rmd @@ -3,21 +3,21 @@ output: pdf_document: latex_engine: pdflatex citation_package: natbib - template: mk_Rpackage_template.tex toc: true toc_depth: 2 includes: in_header: vignette_head.tex keep_tex: true + number_sections: true title: "Honest inference in Regression Discontinuity Designs" author: "Michal Kolesár" date: "`r format(Sys.time(), '%B %d, %Y')`" geometry: margin=1in fontfamily: mathpazo -fontsize: 10pt +fontsize: 11pt bibliography: np-testing-library.bib vignette: > - %\VignetteIndexEntry{Honest inference in Sharp Regression Discontinuity} + %\VignetteIndexEntry{RDHonest} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- @@ -31,53 +31,57 @@ knitr::opts_chunk$set(tidy = TRUE, collapse=TRUE, comment = "#>", # 
Introduction -The package `RDHonest` implements confidence intervals for the regression -discontinuity parameter considered in @ArKo16optimal, @ArKo16honest, and -@KoRo16. In this vignette, we demonstrate the implementation of these confidence -intervals using datasets from @lee08, @oreopoulos06, and @battistin09, which are -included in the package as a data frame `lee08`, `cghs`, and `rcp`. The datasets -from @lalive08 and @LuMi07 that are used in @ArKo16honest, and @KoRo16 are also -included in the package as data frames `rebp` and `headst`. +The package `RDHonest` implements bias-aware inference methods in regression +discontinuity (RD) designs developed in @ArKo18optimal, @ArKo20, and +@KoRo16. In this vignette, we demonstrate the implementation of these methods +using datasets from @lee08, @oreopoulos06, and @battistin09 and @LuMi07, which +are included in the package as a data frame `lee08`, `cghs`, `rcp` and `headst`. +The dataset from @lalive08 used in @KoRo16 is also included in the package as +data frame `rebp`. # Sharp RD ## Model -In the sharp regression discontinuity model, we observe units $i=1,\dotsc,n$, -with the outcome $y_i$ for the $i$th unit given by $$ y_i = f(x_i) + u_i, $$ -where $f(x_i)$ is the expectation of $y_i$ conditional on the running variable -$x_i$ and $u_i$ is the regression error. A unit is treated if and only if the -running variable $x_{i}$ lies weakly above a known cutoff $x_{i}\geq c_{0}$. The -parameter of interest is given by the jump of $f$ at the cutoff, $$ -\beta=\lim_{x\downarrow c_{0}}f(x)-\lim_{x\uparrow c_{0}}f(x).$$ Let -$\sigma^2(x_i)$ denote the conditional variance of $u_i$. - -In the @lee08 dataset, the running variable corresponds to the margin of victory of +We observe units $i=1,\dotsc, n$, with the outcome $Y_i$ for the $i$th unit given +by +\begin{equation} +Y_i = f_Y(x_i) + u_{Y, i} +\end{equation} +where $f_Y(x_i)$ is the expectation of $Y_i$ +conditional on the running variable $x_i$ and $u_{Y, i}$ is the regression error that +is conditionally mean zero by definition. + +A unit is assigned to treatment if and only if the running variable $x_{i}$ lies +weakly above a known cutoff. We denote the assignment indicator by +$Z_{i}=I\{x_{i}\geq c_{0}\}$. In a sharp RD design, all units comply with the +assigned treatment, so that the observed treatment coincides with treatment +assignment, $D_{i}=Z_{i}$. The parameter of interest is given by the jump of $f$ +at the cutoff, $$ \tau_Y=\lim_{x\downarrow c_{0}}f_Y(x)-\lim_{x\uparrow +c_{0}}f_Y(x).$$ Under mild continuity conditions, $\tau_{Y}$ can +be interpreted as the effect of the treatment for units at the threshold +(@htv01). Let $\sigma_Y^2(x_i)$ denote the conditional variance of $Y_i$. + +In the @lee08 dataset `lee08`, the running variable corresponds to the margin of victory of a Democratic candidate in a US House election, and the treatment corresponds to winning the election. Therefore, the cutoff is zero. The outcome of interest is the Democratic vote share in the following election. -The Oreopoulos dataset consists of a subsample of British workers, and it -exploits a change in minimum school leaving age in the UK from 14 to 15, which -occurred in 1947. The running variable is the year in which the individual turned -14, with the cutoff equal to 1947 so that the "treatment" is being subject to a -higher minimum school-leaving age. The outcome is log earnings in 1998. - -Some of the functions in the package require the data to be transformed into a -custom `RDData` format. 
This can be accomplished with the `RDData` function: - -```{r} -library("RDHonest") -``` +The `cghs` dataset from @oreopoulos06 consists of a subsample of British +workers. The RD design exploits a change in minimum school-leaving age in the UK +from 14 to 15, which occurred in 1947. The running variable is the year in which +the individual turned 14, with the cutoff equal to 1947 so that the "treatment" +is being subject to a higher minimum school-leaving age. The outcome is log +earnings in 1998. ## Plots The package provides a function `RDScatter` to plot the raw data. To remove -some noise, the function plots averages over `avg` number of observations. The -function takes an `RDData` object as an argument +some noise, the function plots averages over `avg` number of observations. ```{r, fig.width=4.5, fig.height=3.5, fig.cap="Lee (2008) data"} -## plot 25-bin averages in for observations 50 at most points +library("RDHonest") +## plot 50-bin averages in for observations 50 at most points ## away from the cutoff. See Figure 1. RDScatter(voteshare~margin, data=lee08, subset=abs(lee08$margin)<=50, avg=50, propdotsize=FALSE, xlab="Margin of victory", @@ -86,13 +90,13 @@ RDScatter(voteshare~margin, data=lee08, subset=abs(lee08$margin)<=50, The running variable in the Oreopoulos dataset is discrete. It is therefore natural to plot the average outcome by each value of the running variable, which -is achieved using by setting `avg=Inf`. The option `dotsize="count"` makes the +is achieved using by setting `avg=Inf`. The option `propdotsize=TRUE` makes the size of the points proportional to the number of observations that the point averages over. ```{r, fig.width=4.5, fig.height=3.5, fig.cap="Oreopoulos (2006) data"} ## see Figure 2 -f2 <- RDScatter(I(log(earnings))~yearat14, data=cghs, cutoff=1947, +f2 <- RDScatter(log(earnings)~yearat14, data=cghs, cutoff=1947, avg=Inf, xlab="Year aged 14", ylab="Log earnings", propdotsize=TRUE) ## Adjust size of dots if they are too big @@ -102,53 +106,110 @@ f2 + ggplot2::scale_size_area(max_size = 4) ## Inference based on local polynomial estimates The function `RDHonest` constructs one- and two-sided confidence intervals (CIs) -around local linear and local quadratic estimators using either a user-supplied -bandwidth (which is allowed to differ on either side of the cutoff), or +around local linear estimators using either a user-supplied bandwidth, or bandwidth that is optimized for a given performance criterion. The sense of honesty is that, if the regression errors are normally distributed with known variance, the CIs are guaranteed to achieve correct coverage _in finite samples_, and achieve correct coverage asymptotically uniformly over the parameter space otherwise. Furthermore, because the CIs explicitly take into -account the possible bias of the estimators, the asymptotic approximation -doesn't rely on the bandwidth to shrink to zero at a particular rate. - -To describe the form of the CIs, let $\hat{\beta}_{h_{+},h_{-}}$ denote a a -local polynomial estimator with bandwidth equal to $h_{+}$ above the cutoff and -equal to $h_{-}$ below the cutoff. Let $\beta_{h_{+},h_{-}}(f)$ denote its -expectation conditional on the covariates when the regression function equals -$f$. Then the bias of the estimator is given by $\beta_{h_{+},h_{-}}(f)-\beta$. -Let $$ -B(\hat{\beta}_{h_{+},h_{-}})=\sup_{f\in\mathcal{F}}|\beta_{h_{+},h_{-}}(f)-\beta| -$$ denote the worst-case bias over the parameter space $\mathcal{F}$. 
Then the -lower limit of a one-sided CI is given by $$\hat{\beta}_{h_{+},h_{-}}- -B(\hat{\beta}_{h_{+},h_{-}})-z_{1-\alpha}\widehat{se}(\hat{\beta}_{h_{+},h_{-}}), -$$ where $z_{1-\alpha}$ is the $1-\alpha$ quantile of a standard normal -distribution, and $\widehat{se}(\hat{\beta}_{h_{+},h_{-}})$ is the standard -error (an estimate of the standard deviation of the estimator). Subtracting the -worst-case bias in addition to the usual critical value times standard error -ensures correct coverage at all points in the parameter space. - -A two-sided CI is given by $$ \hat{\beta}_{h_{+},h_{-}} \pm -cv_{1-\alpha}(B(\hat{\beta}_{h_{+},h_{-}})/\widehat{se}(\hat{\beta}_{h_{+},h_{-}}))\times -\widehat{se}(\hat{\beta}_{h_{+},h_{-}}),$$ where the critical value function -$cv_{1-\alpha}(b)$ corresponds to the $1-\alpha$ quantile of the $|N(b,1)|$ -distribution. To see why using this critical value ensures honesty, decompose -the $t$-statistic as $$ -\frac{\hat{\beta}_{h_{+},h_{-}}-\beta}{\widehat{se}(\hat{\beta}_{h_{+},h_{-}})} -= -\frac{\hat{\beta}_{h_{+},h_{-}}-\beta_{h_{+},h_{-}}(f)}{\widehat{se}(\hat{\beta}_{h_{+},h_{-}})} -+\frac{\beta_{h_{+},h_{-}}(f)-\beta}{\widehat{se}(\hat{\beta}_{h_{+},h_{-}})} $$ -By a central limit theorem, the first term on the right-hand side will by -distributed standard normal, irrespective of the bias. The second term is -bounded in absolute value by -$B(\hat{\beta}_{h_{+},h_{-}})/\widehat{se}(\hat{\beta}_{h_{+},h_{-}})$, so that, -in large samples, the $1-\alpha$ quantile of the absolute value of the -$t$-statistic will be bounded by -$cv_{1-\alpha}(B(\hat{\beta}_{h_{+},h_{-}})/\widehat{se}(\hat{\beta}_{h_{+},h_{-}}))$. -This approach gives tighter CIs than simply adding and subtracting -$B(\hat{\beta}_{h_+,h_-})$ from the point estimate, in addition to adding and subtracting $z_{1-\alpha}\widehat{se}(\hat{\beta}_{h_+,h_-})$ - -The function `CVb` gives these critical values: +account the possible bias of the estimators, the asymptotic approximation does +not rely on the bandwidth to shrinking to zero at a particular rate---in fact, +the CIs are valid even if the bandwidth is fixed as $n\to\infty$. + +We first estimate $\tau_{Y}$ using local linear regression: instead of using all +available observations, we only use observations within a narrow estimation +window around the cutoff, determined by a bandwidth $h$. Within this estimation +window, we may choose to give more weight to observations closer to the +cutoff---this weighing is determined a kernel $K$. The local linear regression +is just a weighted least squares (WLS) regression of $Y_{i}$ onto the treatment +indicator, the running variable, and their interaction, weighting each +observation using kernel weights $K(x_{i}/h)$. The local linear regression +estimator $\hat{\tau}_{Y, h}$ or $\tau_{Y}$ is given by the first element of the +vector of regression coefficients from this regression, +$$\hat{\beta}_{Y, h}= + \left(\sum_{i=1}^{n}K(x_{i}/h)m(x_{i})m(x_{i})'\right)^{-1} + \sum_{i=1}^{n}K(x_{i}/h)m(x_{i})Y_{i},$$ +where $m(x)=(I\{x\geq 0\}, I\{x\geq 0\}x, 1, x)'$ collects all the regressors, and we normalize the cutoff to zero. +When the kernel is uniform, $K(u)=I\{\abs{u}\leq 1\}$, $\hat{\tau}_{Y, h}$ is +simply the treatment coefficient from an OLS regression of $Y_{i}$ onto +$m(x_{i})$ for observations that are within distance $h$ of the cutoff. 
+Other kernel choices may weight observations within the estimation window
+$[-h, h]$ differently, giving more weight to observations that are relatively
+closer to the cutoff.
+
+Equivalently, using the Frisch–Waugh–Lovell theorem, $\hat{\tau}_{Y, h}$ may
+also be computed by first running an auxiliary WLS regression of the
+treatment indicator $D_{i}$ onto the remaining regressors,
+$(I\{x_{i}\geq 0\}x_{i}, 1, x_{i})$ and then running a WLS regression of the
+outcome $Y_{i}$ onto the residuals $\tilde{D}_{i}$ from this auxiliary
+regression,
+$$\hat{\tau}_{Y, h}=\sum_{i=1}^{n}k_{i, h}Y_{i}, \quad k_{i, h}
+    =\frac{K(x_{i}/h)\tilde{D}_{i}}{\sum_{i=1}^{n}K(x_{i}/h)\tilde{D}_{i}^{2}}.$$
+This representation makes it clear that the estimator is simply a weighted
+average of the outcomes. By definition of the residual $\tilde{D}_{i}$, the
+weights sum to zero, and satisfy
+$\sum_{i}k_{i, h}x_{i}=\sum_{i}k_{i, h}x_{i}I\{x_{i}\geq 0\}=0$:
+this ensures that our estimate of the jump at $0$ is unbiased when the
+regression function is piecewise linear inside the estimation window.
+
+### Bias-aware confidence intervals
+
+The estimator $\hat{\tau}_{Y, h}$ is a regression estimator, so it will be
+asymptotically normal under mild regularity conditions. In particular, if the
+residuals $u_{Y, i}$ are well-behaved, a sufficient condition is that none of the
+weights $k_{i, h}$ are too influential in the sense that the maximal leverage
+goes to zero, as we discuss in the diagnostics section.
+
+Due to the asymptotic normality, the simplest approach to inference is to use
+the usual CI,
+$\hat{\tau}_{Y, h}\pm z_{1-\alpha/2}\hatse(\hat{\tau}_{Y, h})$, where
+$z_{\alpha}$ is the $\alpha$ quantile of the standard normal distribution.
+However, this CI will typically undercover relative to its nominal
+confidence level $1-\alpha$ because it's not correctly centered: unless the
+regression function $f_{Y}$ is exactly linear inside the estimation window, the
+estimator $\hat{\tau}_{Y, h}$ will be biased. If $f_{Y}$ is "close" to linear,
+then this bias will be small, but if it is wiggly, the bias may be substantial,
+leading to severe coverage distortions.
+
+The idea behind bias-aware inference methods is to bound the potential bias of
+the estimator by making an explicit assumption on the smoothness of $f_{Y}$. A
+convenient way of doing this is to bound the curvature of $f_{Y}$ by
+restricting its second derivative. To allow $f_{Y}$ to be discontinuous at zero,
+we assume that it's twice differentiable on either side of the cutoff, with a
+second derivative bounded by a known constant $M$. The choice of the exact
+curvature parameter $M$ is key to implementing bias-aware methods, and we
+discuss it below. Once $M$ is selected, we can work out the potential
+finite-sample bias of the estimator and account for it in the CI construction.
+In particular, it turns out that the absolute value of the bias of
+$\hat{\tau}_{Y, h}$ is maximized at the piecewise quadratic function $x\mapsto
+M x^{2}(I\{x< 0\}-I\{x\geq 0\})/2$, so that $$\abs{E[\hat{\tau}_{Y,
+h}-\tau_{Y}]}\leq B_{M, h}:=-\frac{M}{2}\sum_{i=1}^{n} k_{i,
+h}x_{i}^{2}\operatorname{sign}(x_{i}).$$ Hence, a simple way to ensure that we
+achieve correct coverage regardless of the true shape of the regression function
+$f_{Y}$ (so long as $\abs{f_{Y}''(x)}\leq M$) is to simply enlarge the usual CI
+by this bias bound, leading to the CI
+$\hat{\tau}_{Y, h}\pm (B_{M, h}+z_{1-\alpha/2}\hatse(\hat{\tau}_{Y, h}))$.
We can actually to slightly better +than this, since the bias bound can't simultaneously be binding on both +endpoints of the CI. In particular, observe that in large samples, the +$t$-statistic $(\hat{\tau}_{Y, h}-\tau_{Y})/\hatse(\hat{\tau}_{Y, h})$ is +normally distributed with variance one, and mean that is bounded by $t=B_{M, +h}/\hatse(\hat{\tau}_{Y, h})$ (ignoring sampling variability in the standard +error, which is negligible in large samples). To ensure correct coverage, we +therefore replace the usual critical value $z_{1-\alpha/2}$, with the $1-\alpha$ +quantile of folded normal distribution $\abs{\mathcal{N}(t,1)}$, +$\cv_{\alpha}(t)$ (note $\cv_{\alpha}(0)=z_{1-\alpha/2}$). This leads to the +bias-aware CI +\begin{equation} +\hat{\tau}_{Y, h} \pm +\cv_{\alpha}(B_{M, h}/\hatse(\hat{\tau}_{Y, h})) \hatse(\hat{\tau}_{Y, h}) +\end{equation} +Notice the bias bound $B_{M, h}$ accounts for the exact *finite-sample bias* of the estimator. + The only asymptotic approximation we have used in its +construction is the approximate normality of the estimator $\hat{\tau}_{Y, h}$, +which obtains without any restrictions on $f_{Y}$---we only need the maximal +leverage to be close to zero, mirroring a standard leverage condition from +parametric regression settings. + +The function `CVb` gives the critical values $\cv_{\alpha}(t)$: ```{r, } CVb(0, alpha=0.05) ## Usual critical value @@ -158,83 +219,146 @@ CVb(1/2, alpha=0.05) CVb(0:5, alpha=0.1) ``` -### Parameter space - -To implement the honest CIs, one needs to specify the parameter space -$\mathcal{F}$. The function `RDHonest` computes honest CIs when the parameter -space $\mathcal{F}$ corresponds to a second-order Taylor or second-order Hölder -smoothness class, which capture two different types of smoothness restrictions. -The second-order Taylor class assumes that $f$ lies in the the class of -functions $$\mathcal{F}_{\text{Taylor}}(M)= \left \{f_{+}-f_{-}\colon -f_{+}\in\mathcal{F}_{T}(M; [c_{0} -%] %( -,\infty)),\; -f_{-}\in\mathcal{F}_{T}(M;(-\infty, c_{0})) \right\},$$ - -where $\mathcal{F}_{T}(M;\mathcal{X})$ consists of functions $f$ such that the -approximation error from second-order Taylor expansion of $f(x)$ about $c_{0}$ -is bounded by $M|x|^{2}/2$, uniformly over $\mathcal{X}$: \begin{align*} -\mathcal{F}_{T}(M;\mathcal{X}) =\left\{f\colon \left| -f(x)-f(c_0)-f'(c_0)x\right| \leq M|x|^2/2\text{ all }x\in\mathcal{X}\right\}. -\end{align*} The class $\mathcal{F}_{T}(M;\mathcal{X})$ formalizes the idea that -the second derivative of $f$ at zero should be bounded by $M$. See Section 2 in -@ArKo16optimal (note the constant $C$ in that paper equals $C=M/2$ here). This -class is doesn't impose smoothness away from boundary, which may be undesirable -in some empirical applications. The Hölder class addresses this problem by -bounding the second derivative globally. In particular, it assumes that $f$ lies -in the class of functions $$\mathcal{F}_{\text{Hölder}}(M)= \left -\{f_{+}-f_{-}\colon f_{+}\in\mathcal{F}_{H}(M;[c_{0}%] %( ,\infty)),\; -f_{-}\in\mathcal{F}_{H}(M;(-\infty, c_{0})) \right\},$$ - -where $$ \mathcal{F}_{H}(M;\mathcal{X})=\{f\colon |f'(x)-f'(y)|\leq M|x-y| -\;\;x,y\in\mathcal{X}\}.$$ - -The smoothness class is specified using the option `sclass`. CIs around a local -linear estimator with bandwidth that equals to 10 on either side of the cutoff -when the parameter space is given by a Taylor and Hölder smoothness class, -respectively, with $M=0.1$: +The function `RDHonest` puts all these steps together. 
Specifying curvature +parameter $M=0.1$, bandwidth $h=8$, and a triangular kernel yields: ```{r} -RDHonest(voteshare~margin, data=lee08, kern="uniform", M=0.1, h=10, sclass="T") -RDHonest(voteshare~margin, data=lee08, kern="uniform", M=0.1, h=10, sclass="H") +r0 <- RDHonest(voteshare~margin, data=lee08, kern="triangular", M=0.1, h=8) +print(r0) ``` -The confidence intervals use the nearest-neighbor method to estimate the -standard error by default (this can be changed using the option `se.method`, see -help file for `RDHonest`). The package reports two-sided as well one-sided CIs -(with lower as well as upper limit) by default. - -Instead of specifying a bandwidth, one can just specify the smoothness class and -smoothness constant $M$, and the bandwidth will be chosen optimally for a given -optimality criterion: +- The default for calculating the standard errors is to use the nearest neighbor + method. Specifying `se.method="EHW"` changes them to the regression-based + heteroskedasticity-robust Eicker-Huber-White standard errors. It can be shown + that unless the regression function $f_{Y}$ is linear inside the estimation + window, the `EHW` standard errors generally overestimate the conditional + variance. +- The default option `sclass="H"` specifies the parameter space as second-order + Hölder smoothness class, which formalizes our assumption above that the second + derivative of $f_Y$ is bounded by $M$ on either side of the cutoff. The + package also allows the user to use a Taylor smoothness class by setting + `sclass="T"`. This changes the computation of the worst-case bias, and allows + $f_Y$ to correspond to any function such that the approximation error from a + second-order Taylor expansion around the cutoff is bounded by $M x^{2}/2$. For + more discussion, see Section 2 in @ArKo18optimal (note the constant $C$ in + that paper equals $C=M/2$ here). +- Other options for `kern` are `"uniform"` and `"epanechnikov"`, or the user can + also supply their own kernel function. +- `RDHonest` reports two-sided as well one-sided CIs. One-sided CIs simply + subtract off the worst-case bias bound $B_{M, h}$ in addition to subtracting + the standard error times the $z_{1-\alpha}$ critical value from the estimate. + It also reports the p-value for the hypothesis that $\tau_Y=0$. +- `RDHonest` also reports the fitted regression coefficients $\hat{\beta}_{Y, + h}$, and returns the `lm` object under `r0$lm`. We see from the above that the + fitted slopes below and above the cutoff differ by + `r round(unname(r0$lm$coefficients[2]), 2)`, for instance. + +## Automatic bandwidth choice + +Instead of specifying a bandwidth, one can just specify the curvature parameter +$M$, and the bandwidth will be chosen optimally for a given optimality +criterion---minimizing the worst-case MSE of the estimator, or minimizing the +length the resulting confidence interval. Typically, this makes little difference: ```{r} RDHonest(voteshare ~ margin, data=lee08, kern="triangular", - M=0.1, opt.criterion="MSE", sclass="H") + M=0.1, opt.criterion="MSE") ## Choose bws optimal for length of CI RDHonest(voteshare ~ margin, data=lee08, kern="triangular", M=0.1, - opt.criterion="FLCI", sclass="H") + opt.criterion="FLCI") +``` + +To compute the optimal bandwidth, the package assumes homoskedastic variance on +either side of the cutoff, which it estimates based on a preliminary local +linear regression using the @ImKa12 bandwidth selector. This homoskedasticity +assumption is dropped when the final standard errors are computed. 
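To see what the `opt.criterion="FLCI"` choice optimizes, one can trace out the honest CI half-length over a grid of bandwidths. The following sketch (an illustration only, using the `lee08` data and the curvature bound $M=0.1$ from above) shows how the half-length varies with $h$:
```{r}
hs <- seq(4, 20, by=4)
half.length <- sapply(hs, function(h) {
    cf <- RDHonest(voteshare ~ margin, data=lee08, kern="triangular",
                   M=0.1, h=h)$coefficients
    (cf$conf.high - cf$conf.low)/2
})
rbind(h=hs, half.length=round(half.length, 3))
```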
+ +Notice that, when the variance function $\sigma^2_Y(x)$ is known, neither the +conditional variance of the estimator, $\sd(\hat{\tau}_{Y, +h})^{2}=\sum_{i=1}^{n}k_{i, h}^{2}\sigma^{2}_{Y}(x_{i})$, nor the bias bound +$B_{M, h}$ depend on the outcome data. Therefore, the MSE and the length of the +infeasible CI, $2\cv_{\alpha}(B_{M, h}/\sd(\hat{\tau}_{Y, h})) +\sd(\hat{\tau}_{Y, h})$, do not depend on the outcome data. To stress this +property, we refer to this infeasible CI (and, with some abuse of terminology, +also the feasible version in eq. (2)) as a fixed-length confidence interval +(FLCI). As a consequence of this property, optimizing the bandwidth for CI +length does not impact the coverage of the resulting CI. + +## Choice of curvature parameter + +The curvature parameter $M$ is the most important implementation choice. It +would be convenient if one could use data-driven methods to automate its +selection. Unfortunately, if one only assume that the second derivative of +$f_{Y}$ is bounded by some constant $M$, it is not possible to do that: one +cannot use data to select $M$ without distorting coverage +(@low97, @ArKo18optimal). This result is essentially an instance of the general +issue with using pre-testing or using model selection rules, such as using +cross-validation or information criteria like AIC or BIC to pick which controls +to include in a regression: doing so leads to distorted confidence intervals. +Here the curvature parameter $M$ indexes the size of the model: a large $M$ is +the analog of saying that all available covariates need to be included in the +model to purge omitted variables bias; a small $M$ is the analog of saying that +a small subset of them will do. Just like one needs to use institutional +knowledge of the problem at hand to decide which covariates to include in a +regression, ideally one uses problem-specific knowledge to select $M$. Analogous +to reporting results based on different subsets of controls in columns of a +table with regression results, one can vary the choice of $M$ by way of +sensitivity analysis. + +Depending on the problem at hand, it may be difficult to translate +problem-specific intuition about how close we think the regression function is +to a linear function into a statement about the curvature parameter $M$. In such +cases, it is convenient to have a rule of thumb for selecting $M$ using the +data. To do this, we need to impose additional restrictions on $f_{Y}$ besides +assuming that its second derivative is bounded in order to get around the +results on impossibility of post-model selection inference discussed above. An +appealing way of doing this is to relate the assumption about the local +smoothness of $f_{Y}$ at the cutoff point, which drives the bias of the +estimator but is difficult to measure in the data, to the global smoothness of +$f_{Y}$, which is much easier to measure. In our implementation, we measure +global smoothness by fitting a global quartic polynomial $\check{f}$, separately +on either side of the cutoff, following @ArKo20. We assume the local second +derivative of $f_{Y}$, $M$, is no larger than the maximum second derivative of +the global polynomial approximation $\check{f}$. Under this assumption, we can +calibrate $M$ by setting +$$\hat{M}_{ROT}=\sup_{x\in[x_{\min}, x_{\max}]}\abs{\check{f}''(x)}.$$ +There are +different ways of relating local and global smoothness, which lead to different +calibrations of $M$. 
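As a concrete illustration of this calibration (a sketch written for this discussion; the package's internal implementation may differ in its details), one can fit quartics on either side of the cutoff in the `lee08` data and take the largest implied second derivative:
```{r}
rot <- function(y, x) {
    b <- coef(lm(y ~ poly(x, 4, raw=TRUE)))[3:5]   # coefficients on x^2, x^3, x^4
    ## |f''(x)| = |2 b2 + 6 b3 x + 12 b4 x^2| is maximized either at an endpoint
    ## of the support or at the vertex of this parabola
    xs <- c(range(x), -b[2]/(4*b[3]))
    xs <- xs[xs >= min(x) & xs <= max(x)]
    max(abs(2*b[1] + 6*b[2]*xs + 12*b[3]*xs^2))
}
below <- lee08$margin < 0
max(rot(lee08$voteshare[below], lee08$margin[below]),
    rot(lee08$voteshare[!below], lee08$margin[!below]))
```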
For instance, @ImWa19 propose fitting a global quadratic +polynomial instead, and then multiplying the maximal curvature of the fitted +model by a constant such as $2$ or $4$. An important question for future +research is to figure out whether there is a way of relating local and global +smoothness that is empirically appealing across a wide range of scenarios. See +also @NoRo21 for a discussion how to visualize the choice of $M$ to aid with its +interpretation. + +When the user doesn't supply `M`, the package uses the rule of thumb +$\hat{M}_{ROT}$, and prints a message to inform the user: +```{r} +## Data-driven choice of M +RDHonest(voteshare ~ margin, data=lee08) ``` -### Inference when running variable is discrete + + + + +## Inference when running variable is discrete The confidence intervals described above can also be used when the running variable is discrete, with $G$ support points: their construction makes no assumptions on the nature of the running variable (see Section 5.1 in @KoRo16 for more detailed discussion). -Note that units that lies exactly at the cutoff are considered treated, since -the definition of treatment is that the running variable - $x_i\geq c_0$. +Units that lie exactly at the cutoff are considered treated, since the +definition of treatment assignment is that the running variable lies weakly +above the cutoff, $x_i\geq c_0$. As an example, consider the @oreopoulos06 data, in which the running variable is age in years: ```{r} -## Replicate Table 2, column (10) +## Replicate Table 2, column (10) in Kolesar and Rothe (2018) RDHonest(log(earnings) ~ yearat14, cutoff=1947, data=cghs, kern="uniform", M=0.04, opt.criterion="FLCI", sclass="H") -## Triangular kernel generally gives tigher CIs -RDHonest(log(earnings) ~ yearat14, cutoff=1947, - data=cghs, kern="triangular", M=0.04, opt.criterion="FLCI", sclass="H") ``` In addition, the package provides function `RDHonestBME` that calculates honest @@ -243,14 +367,14 @@ no worse at the cutoff than away from the cutoff as in Section 5.2 in @KoRo16. ```{r} ## Replicate Table 2, column (6), run local linear regression (order=1) -## with a uniform kernel (other kernels are not yet implemented) +## with a uniform kernel (other kernels are not implemented for RDHonestBME) RDHonestBME(log(earnings) ~ yearat14, cutoff=1947, data=cghs, h=3, order=1) ``` Let us describe the implementation of the variance estimator $\hat{V}(W)$ used -to construct the CI as described in in Section 5.2 in @KoRo16. Suppose the point +to construct the CI following Section 5.2 in @KoRo16. Suppose the point estimate is given by the first element of the regression of the outcome $y_i$ on $m(x_i)$. For instance, local linear regression with uniform kernel and bandwidth $h$ corresponds to $m(x)=I(|x|\leq h)\cdot(I(x>c_0),1,x, x\cdot @@ -267,7 +391,7 @@ $x_g$ is the $g$th support point of $x_i$, is given by the $g$th element of the regression estimand $S^{-1}E[w(x_i)y_i]$, where $S=E[w(x_i)w(x_i)']$. Let $\hat{\mu}=\hat{S}^{-1}\sum_i w(x_i)y_i$, where $\hat{S}=\sum_i w(x_i)w(x_i)'$ denote the least squares estimator. Then an estimate of -$(\delta(x_1),\dotsc,\delta(x_G))'$ is given by $\hat{\delta}$, the vector with +$(\delta(x_1), \dotsc, \delta(x_G))'$ is given by $\hat{\delta}$, the vector with elements $\hat{\mu}_g-x_g\hat{\theta}$. 
By standard regression results, the asymptotic distribution of $\hat{\theta}$
@@ -309,7 +433,7 @@ T_i'=\begin{pmatrix}
Note that the upper left block and lower right block correspond simply to the
Eicker-Huber-White estimators of the asymptotic variance of $\hat{\theta}$ and
$\hat{\mu}$. By the delta method, a consistent estimator of the asymptotic
-variance of $(\hat\delta,\hat{\theta}_1)$ is given by
+variance of $(\hat\delta, \hat{\theta}_1)$ is given by
\begin{equation*}
\hat{\Sigma}=
\begin{pmatrix}
@@ -323,222 +447,458 @@ e_1'& 0\\
where $X$ is a matrix with $g$th row equal to $x_g'$, and $e_1$ is the first
unit vector.

-Recall that in the notation of @KoRo16, $W=(g^-,g^+,s^-,s^+)$, and $g^{+}$ and
+Recall that in the notation of @KoRo16, $W=(g^-, g^+, s^-, s^+)$, and $g^{+}$ and
$g^{-}$ are such that $x_{g^{-}}< c_0\leq x_{g^{+}}$, and
-$s^{+},s^{-}\in\{-1,1\}$. An upper limit for a right-sided CI for
+$s^{+}, s^{-}\in\{-1,1\}$. An upper limit for a right-sided CI for
$\theta_1+b(W)$ is then given by
\begin{equation*}
\hat{\theta}_{1}+s^{+}\hat\delta(x_{g^+})+
s^{-}\hat\delta(x_{g^-})+z_{1-\alpha}\hat{V}(W),
\end{equation*}
- where $\hat{V}(W)=a(W)'\hat{\Sigma}a(W)$, and $a(W)\in\mathbb{R}^{G_{h}+1}$
denotes a vector with the $g_{-}$th element equal to $s^{-}$,
$(G_{h}^{-}+g_{+})$th element equal to $s^{+}$, the last element equal to one,
and the remaining elements equal to zero. The rest of the construction then
follows the description in Section 5.2 in @KoRo16.

-## Data-driven choice of smoothness constant

-Without further restrictions, the smoothness constant $M$ cannot be data-driven:
-to maintain honesty over the whole function class, a researcher must choose
-$M$ a priori, rather than attempting to use a data-driven method. Therefore,
-one should, whenever possible, use problem-specific knowledge to decide what
-choice of $M$ is reasonable a priori.
+# Fuzzy RD
+
+## Model
+
+In a fuzzy RD design, the treatment status $D_i$ of a unit does not necessarily
+equal the treatment assignment $Z_i=I\{x_{i}\geq c_{0}\}$. Instead, the
+treatment assignment induces a jump in the treatment probability at the cutoff.
+Correspondingly, we augment the outcome model with a first stage that measures
+the effect of the running variable on the treatment:
+\begin{equation}
+Y_i = f_Y(x_i) + u_{Y, i}, \qquad D_i = f_D(x_i) + u_{D, i},
+\end{equation}
+where $f_D, f_Y$ are the conditional mean functions.
+
+To account for imperfect compliance, the fuzzy RD parameter scales the jump in the
+outcome equation $\tau_Y$ by the jump in the treatment probability at the
+cutoff, $\tau_D=\lim_{x\downarrow c_{0}}f_D(x)-\lim_{x\uparrow c_{0}}f_D(x)$.
+This fuzzy RD parameter, $\theta=\tau_{Y}/\tau_{D}$, measures the local average
+treatment effect for individuals at the threshold who comply with the
+treatment assignment, provided mild continuity conditions and a monotonicity
+condition hold (@htv01). Under perfect compliance, the treatment
+probability jumps all the way from zero to one at the threshold so that
+$\tau_{D}=1$, and the two parameters coincide.
+
+For example, in the @battistin09 dataset, the treatment variable is an indicator
+for retirement, and the running variable is the number of years since being eligible
+to retire. The cutoff is $0$. Individuals exactly at the cutoff are dropped from
+the dataset. If there were individuals exactly at the cutoff, they are assumed
+to receive the treatment assignment (i.e. be eligible for retirement).
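Before turning to estimation of $\theta$, it can be instructive to look at the first stage on its own: a sharp RD with the treatment indicator as the outcome estimates the jump $\tau_D$ in the retirement probability at the cutoff. This is simply an illustrative use of `RDHonest`; the bandwidth of 7 years and the curvature bound 0.002 (the first-stage bound used in the example below) are assumptions for the example:
```{r}
## First stage: the jump in the retirement probability is well below one,
## so compliance at the cutoff is imperfect
RDHonest(retired ~ elig_year, data=rcp, kern="triangular", M=0.002, h=7)
```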
-For cases in which this is difficult, the internal function `RDHonest` -implements the method considered in Section 3.4.1 in @ArKo16honest based on a -global polynomial approximation: +## Inference based on local polynomial estimates + +A natural estimator for the fuzzy RD parameter $\theta$ is the sample analog +based on local linear regression, +\begin{equation*} + \hat{\theta}_{h}=\frac{\hat{\tau}_{Y, h}}{\hat{\tau}_{D, h}}. +\end{equation*} + +Unlike in the sharp case, our bias-aware CIs do rely on the consistency of the +estimator, which generally requires the bandwidth to shrink with the sample +size. Since this estimator is a ratio of regression coefficients, it follows by +the delta method that so long as $\tau_{D}\neq 0$, the estimator will be +asymptotically normal in large samples. In fact, the estimator is equivalent to +a weighted IV regression of $Y_i$ onto $D_i$, using $Z_i$ as an instrument, and +$x_i$ and its interaction with $Z_i$ as controls, so the variance formula is +analogous to the IV variance formula: +\begin{equation*} +\sd(\hat{\theta}_h)^2=\frac{\sd(\tau_{Y, h})^2+\theta^2\sd(\tau_{D, h})^2 +-2\cov(\tau_{D, h}, \tau_{Y, h})\theta}{\tau_D^2}, +\end{equation*} +where $\cov(\tau_{D, h}, \tau_{Y, h})=\sum_i +k_{i, h}^2\cov(Y_i, D_i\mid x_i)$ is the covariance of the +estimators. + +If the second derivative of $f_Y$ is bounded by $M_Y$ and the second derivative +of $f_D$ is bounded by $M_D$, a linearization argument from Section 3.2.3 in +@ArKo20 that the bias can be bounded in large samples by $B_{M, h}$, with +$M=(M_{Y}+\abs{\theta}M_{D})/\abs{\tau_{D}}$, which now depends on $\theta$ +itself. Therefore, optimal bandwidth calculations will require a preliminary +estimate of $\abs{\theta}$, which can be passed to `RDHonest` via the option +`T0`. Like in the sharp case, the optimal bandwidth calculations assume +homoskedastic covariance of $(u_{Y, i}, u_{D, i})$ on either side of the cutoff, +which are estimated based on a preliminary local linear regression for both the +outcome and first stage equation, with bandwidth given by the @ImKa12 bandwidth +selector applied to the outcome equation. ```{r} -## Data-driven choice of M -RDHonest(voteshare ~ margin, data=lee08, kern="uniform", sclass="H", - opt.criterion="MSE") +## Initial estimate of treatment effect for optimal bandwidth calculations +r <- RDHonest(log(cn) | retired ~ elig_year, data=rcp, kern="triangular", + M=c(0.001, 0.002), opt.criterion="MSE", sclass="H", T0=0) +## Use it to compute optimal bandwidth +RDHonest(log(cn) | retired ~ elig_year, data=rcp, kern="triangular", + M=c(0.001, 0.002), opt.criterion="MSE", sclass="H", + T0=r$coefficients$estimate) ``` -See @ArKo16honest for a discussion of the restrictions on the -parameter space under which this method yields honest inference. -## Optimal inference +## Choice of curvature parameters -For the second-order Taylor smoothness class, the function `RDHonest`, with -`kernel="optimal"`, computes finite-sample optimal estimators and confidence -intervals, as described in Section 2.2 in @ArKo16optimal. This typically yields -tighter CIs. 
Comparing the lengths of two-sided CIs with optimally chosen -bandwidths, using Silverman's rule of thumb to estimate the preliminary variance -estimate used to compute optimal bandwidths: +Like in the sharp RD case, without further restrictions, the curvature +parameters $M_Y$ and $M_D$ cannot be data-driven: to maintain honesty over the +whole function class, a researcher must choose them a priori, rather than +attempting to use a data-driven method. Therefore, one should, whenever +possible, use problem-specific knowledge to decide what choices of $M_Y$ and +$M_D$ are reasonable a priori. + +For cases in which this is difficult, the function `RDHonest` implements the +rule of thumb @ArKo20 described earlier, based on computing the global +smoothness of both $f_Y$ and $f_D$ using a quartic polynomial. When the user +doesn't supply the curvature bounds, the package uses the rule of thumb +$\hat{M}_{ROT}$, and prints a message to inform the user: ```{r} -r1 <- RDHonest(voteshare ~ margin, data=lee08, kern="optimal", M=0.1, - opt.criterion="FLCI", - se.method="nn")$coefficients -r2 <- RDHonest(voteshare ~ margin, data=lee08, kern="triangular", M=0.1, - opt.criterion="FLCI", se.method="nn", - sclass="T")$coefficients -r1$conf.high-r1$conf.low -r2$conf.high-r2$conf.low +## Data-driven choice of M +RDHonest(log(cn) | retired ~ elig_year, data=rcp, kern="triangular", + opt.criterion="MSE", sclass="H", T0=r$coefficients$estimate) ``` +See @ArKo20 for a discussion of the restrictions on the +parameter space under which this method yields honest inference. -## Specification testing +# Extensions -The package also implements lower-bound estimates for the smoothness constant -$M$ for the Taylor and Hölder smoothness class, as described in the supplements -to @KoRo16 and @ArKo16optimal +## Covariates + +RD datasets often contain information on a vector of $K$ pre-treatment +covariates $W_{i}$, such as pre-intervention outcomes, demographic, or +socioeconomic characteristics of the units. Similar to randomized controlled +trials, while the presence of covariates doesn't help to weaken the fundamental +identifying assumptions, augmenting the RD estimator with predetermined +covariates can increase its precision. +Let us first describe covariate adjustment in the sharp RD case. We implement +the covariate adjustment studied in @ccft19, namely to include $W_{i}$ as one of +the regressors in the WLS regression, regressing +$Y_{i}$ onto $m(x_{i})$ and $W_{i}$. As in the case with no covariates, we weight each observation with the +kernel weight $K(x_{i}/h)$. This leads to the estimator +\begin{equation*} + \tilde{\tau}_{Y, h}=\tilde{\beta}_{Y, h,1}, \qquad \tilde{\beta}_{Y, h}=\left(\sum_{i=1}^{n}K(x_{i}/h) + \tilde{m}(x_{i}, W_{i})\tilde{m}(x_{i}, W_{i})'\right)^{-1} + \sum_{i=1}^{n}K(x_{i}/h)\tilde{m}(x_{i}, W_{i})Y_{i}, +\end{equation*} +where $\tilde{m}(x_{i}, W_{i})=(m(x_{i})', W_{i}')'$. Denote the coefficient on +$W_{i}$ in this regression by $\tilde{\gamma}_{Y, h}$; this corresponds to the +last $K$ elements of $\tilde{\beta}_{Y, h}$. As in the case without covariates, +we first take the bandwidth $h$ as given, and defer bandwidth selection choice +to the end of this subsection. + +To motivate the estimator under our framework, and to derive bias-aware CIs +that incorporate covariates, we need to formalize the assumption that the +covariates are predetermined (without any assumptions on the covariates, it is +optimal to ignore the covariates and use the unadjusted estimator +$\hat{\tau}_{Y, h}$). 
Let $f_{W}(x)=E[W_{i}\mid X_{i}=x]$ denote the regression +function from regressing the covariates on the running variable, and let +\begin{equation*} + \Sigma_{WW}(x)=\operatorname{var}(W_{i}\mid X_{i}=x), \qquad + \Sigma_{WY}(x)=\cov(W_{i}, Y_{i}\mid X_{i}=x). +\end{equation*} +We assume that the variance and covariance functions are continuous, except +possibly at zero. Let +$\gamma_{Y}=(\Sigma_{WW}(0_{+})+\Sigma_{WW}(0_{-}))^{-1} +(\Sigma_{WY}(0_{+})+\Sigma_{WY}(0_{-}))$ denote the coefficient on $W_{i}$ when +we regress $Y_{i}$ onto $W_{i}$ for observations at the cutoff. Let +$\tilde{Y}_{i}:=Y_{i}-W_{i}'\gamma_{Y}$ denote the covariate-adjusted outcome. +To formalize the assumption that the covariates are pre-determined, we assume +that $\tau_{W}=\lim_{x\downarrow 0}f_{W}(0)-\lim_{x\uparrow 0}f_{W}(0)=0$, which +implies that $\tau_{Y}$ can be identified as the jump in the covariate-adjusted +outcome $\tilde{Y}_{i}$ at $0$. Following Appendix B.1 in +@ArKo18optimal, we also assume that the covariate-adjusted outcome +varies smoothly with the running variable (except for a possible jump at the +cutoff), in that the second derivative of +\begin{equation*} + \tilde{f}(x):=f_{Y}(x)-f_{W}(x)'\gamma_{Y} +\end{equation*} +is bounded by a known constant $\tilde{M}$. In addition, we assume $f_{W}$ has +bounded second derivatives. + +Under these assumptions, if $\gamma_{Y}$ was known and hence $\tilde{Y}_{i}$ was +directly observable, we could estimate $\tau$ as in the case without covariates, +replacing $M$ with $\tilde{M}$ and $Y_{i}$ with +$\tilde{Y}_{i}$. Furthermore, as discussed in @ArKo18optimal, such +approach would be optimal under homoskedasticity assumptions. +Although $\gamma_{Y}$ is unknown, it turns out that the estimator +$\tilde{\tau}_{Y, h}$ has the same large sample behavior as the infeasible +estimator $\hat{\tau}_{\tilde{Y}, h}$. To show this, note that by standard +regression algebra, $\tilde{\tau}_{Y, h}$ can equivalently be written as +\begin{equation*} + \tilde{\tau}_{Y, h}=\hat{\tau}_{Y-W'\tilde{\gamma}_{Y, h}, h}= + \hat{\tau}_{\tilde{Y}, h}- + \sum_{k=1}^{K}\hat{\tau}_{W_{k}, h}(\tilde{\gamma}_{Y, h, k}-\gamma_{Y, k}). +\end{equation*} +The first equality says that covariate-adjusted estimate is the same as an +unadjusted estimate that replaces the original outcome $Y_{i}$ with the +covariate-adjusted outcome $Y_{i}-W_{i}'\tilde{\gamma}_{Y, h}$. The second +equality uses the decomposition +$Y_{i}-W_{i}'\tilde{\gamma}_{Y, h}=\tilde{Y}_{i}-W_{i}' +(\tilde{\gamma}_{Y h}-\gamma_{Y})$ +to write the estimator as a sum of the infeasible estimator and a linear +combination of "placebo RD estimators" $\hat{\tau}_{W_{k}, h}$, that +replace $Y_{i}$ in the outcome equation with the $k$th element of $W_{i}$, +Since $f_{W}$ has bounded second derivatives, these placebo estimators converge +to zero, with rate that is at least as fast as the rate of convergence of the +infeasible estimator $\hat{\tau}_{\tilde{Y}, h}$: +$\hat{\tau}_{W_{k}, h}=O_{p}(B_{\tilde{M}, h}+\sd(\hat{\tau}_{\tilde{Y}, h}))$. +Furthermore, under regularity conditions, $\tilde{\gamma}_{Y, h}$ converges to +$\gamma_{Y}$, so that the second term in the previous display is asymptotically +negligible relative to the first. 
Consequently, we can form bias-aware CIs
+based on $\tilde{\tau}_{Y, h}$ as in the case without covariates, treating
+the covariate-adjusted outcome $Y_{i}-W_{i}'\tilde{\gamma}_{Y}$ as the outcome,
+\begin{equation*}
+  \tilde{\tau}_{Y, h}\pm
+  \cv_{\alpha}(B_{\tilde{M}, h} / \sd(\hat{\tau}_{\tilde{Y}, h}))\sd(\hat{\tau}_{\tilde{Y}, h}), \qquad
+  \sd(\hat{\tau}_{\tilde{Y}, h})^{2}=\sum_{i=1}^{n}k_{i, h}^{2}\sigma^{2}_{\tilde{Y}}(x_{i}),
+\end{equation*}
+where
+$\sigma^{2}_{\tilde{Y}}(x_{i})=\sigma^{2}_{Y}(x_{i})+{\gamma}_{Y}'\Sigma_{WW}(x_{i}){\gamma}_{Y}-2{\gamma}_{Y}'\Sigma_{WY}(x_{i})$.
+If the covariates are effective at explaining variation in the outcomes, then
+the quantity
+$\sum_{i}k_{i, h}^{2}\left({\gamma}_{Y}'\Sigma_{WW}(x_{i}){\gamma}_{Y}-2{\gamma}_{Y}'\Sigma_{WY}(x_{i})
+\right)$ will be negative, and
+$\sd(\hat{\tau}_{\tilde{Y}, h})\leq \sd(\hat{\tau}_{Y, h})$. If the smoothness of
+the covariate-adjusted conditional mean function $f_{Y}-f_{W}'\gamma_{Y}$ is
+greater than the smoothness of the unadjusted conditional mean function
+$f_{Y}$, so that $\tilde{M}\leq M$, then using the covariates will help tighten
+the confidence intervals.
+
+Implementation of covariate-adjustment requires a choice of $\tilde{M}$, and
+computing the optimal bandwidth requires a preliminary estimate of the variance
+of the covariate-adjusted outcome. In our implementation, we first estimate the
+model without covariates (using a rule of thumb to calibrate $M$, the bound on
+the second derivative of $f_{Y}$), and compute the bandwidth $\check{h}$ that's
+MSE optimal without covariates. Based on this bandwidth, we compute a
+preliminary estimate $\check{\gamma}_{Y, \check{h}}$ of $\gamma_{Y}$, and use
+this preliminary estimate to compute the covariate-adjusted outcome
+$Y_{i}-W_{i}'\check{\gamma}_{Y, \check{h}}$. If $\tilde{M}$ is not supplied, we
+calibrate $\tilde{M}$ using the rule of thumb, using this covariate-adjusted
+outcome as the outcome. Similarly, we use this covariate-adjusted outcome as the
+outcome to compute a preliminary estimator of the conditional variance
+$\sigma^{2}_{\tilde{Y}}(x_{i})$, for optimal bandwidth calculations, as in the
+case without covariates.
+
+A demonstration using the `headst` data:
```{r}
-r1 <- RDHonest(voteshare ~ margin, data=lee08, M=0.1, se.method="nn")
-### Only use three point-average for averages of a 100 points closest to cutoff,
-### and report results separately for points above and below cutoff
-RDSmoothnessBound(r1, s=100, separate=TRUE, multiple=FALSE, sclass="T")
+## No covariates
+rn <- RDHonest(mortHS ~ povrate, data=headst)
+## Use Percent attending school aged 14-17, urban, black,
+## and their interaction as covariates.
+rc <- RDHonest(mortHS ~ povrate | urban*black + sch1417, data=headst)
+rn
+rc
+```

-## Pool estimates based on observations below and above cutoff, and
-## use three-point averages over the entire support of the running variable
-RDSmoothnessBound(r1, s=100, separate=FALSE, multiple=TRUE, sclass="H")
+We see that the inclusion of covariates leads to a reduction in the
+rule-of-thumb curvature and also smaller standard errors (this would be true
+even if the bandwidth was kept fixed). Correspondingly, the CIs are tighter by
+about 6 percentage points:
+```{r}
+100 * (1 - (rc$coefficients$conf.high-rc$coefficients$conf.low) /
+           (rn$coefficients$conf.high-rn$coefficients$conf.low))
```

-## Weighted regression
+In the fuzzy RD case, we need to covariate-adjust the treatment $D_i$ as well as
+the outcome.
The implementation mirrors the sharp case. Define $\gamma_{D}$
+analogously to $\gamma_{Y}$, and assume that the second derivative of
+$f_{Y}(x)-f_{W}(x)'\gamma_{Y}$ is bounded by a known constant $\tilde{M}_{Y}$,
+and that the second derivative of $f_{D}(x)-f_{W}(x)'\gamma_{D}$ is bounded by
+a known constant $\tilde{M}_{D}$. The covariate-adjusted estimator is given by
+$\tilde{\theta}_{h}=\tilde{\tau}_{Y, h}/\tilde{\tau}_{D, h}$, with variances and
+worst-case bias computed as in the case without covariates, replacing the
+treatment and outcome with their covariate-adjusted versions.
+
+
+## Aggregated data and weighted regression

In some cases, data is only observed as cell averages. For instance, suppose
that instead of observing the original `cghs` data, we only observe averages
for cells as follows:
```{r}
-d <- cghs
-## Make 20 groups based on observation number
-d$mod <- seq_along(d$yearat14) %% 20
-## Make cells defined as intersection of group and year
-d$cell <- d$mod/100+d$yearat14
-## Data with cell averages
 dd <- data.frame()
-for (j in unique(d$cell)){
-    dd <- rbind(dd, data.frame(y=mean(log(d$earnings)[d$cell==j]),
-                               x=mean(d$yearat14[d$cell==j]),
-                               weights=length(d$yearat14[d$cell==j])))
+## Collapse data by running variable
+for (j in unique(cghs$yearat14)) {
+    ix <- cghs$yearat14==j
+    df <- data.frame(y=mean(log(cghs$earnings[ix])), x=j,
+                     weights=sum(ix),
+                     sigma2=var(log(cghs$earnings[ix]))/sum(ix))
+    dd <- rbind(dd, df)
 }
```

The column `weights` gives the number of observations that each cell averages
over. In this case, if we weight the observations using `weights`, we can
-recover the original estimates (and the same worst-case bias):
+recover the original estimates (and the same worst-case bias). If we use the
+estimates of the conditional variance of the outcome, `dd$sigma2`, then we can
+also replicate the standard error calculations:
```{r}
-RDHonest(log(earnings)~yearat14, cutoff=1947, h=5, data=cghs, M=0.1,
-         se.method="nn")
-RDHonest(y~x, cutoff=1947, weights=weights, h=5, data=dd, M=0.1,
-         se.method="nn")
+s0 <- RDHonest(log(earnings)~yearat14, cutoff=1947, data=cghs)
+## keep same bandwidth
+s1 <- RDHonest(y~x, cutoff=1947, data=dd, weights=weights,
+               sigmaY2=sigma2, se.method="supplied.var",
+               h=s0$coefficients$bandwidth)
+## Results are identical:
+s0
+s1
```

-Note the variance estimates don't quite match, since the variance estimator is
-different, but the worst-case bias and the point estimate are identical.
-
-# Initial bandwidth estimation
-
-We use @FaGi96 rule of thumb and @ImKa12 bandwidth selectors for preliminary
-variance estimation.
-
-
-# Fuzzy RD
-
-## Model
-
-In a fuzzy RD design, units are assigned to treatment if their running variable
-$x_{i}$ weakly exceeds a cutoff $x_i\geq c_{0}$. However, the actual treatment
-$d_{i}$ does not perfectly comply with the treatment assignment. Instead, the cutoff
-induces a jump in the treatment probability. The resulting reduced-form and
-first-stage regressions are given by
-\begin{align*}
-  y_{i}&=f_{1}(x_{i})+u_{i1}, & d_{i}&=f_{2}(d_{i})+u_{i2},
-\end{align*}
-See Section 3.3 in @ArKo16honest for a more detailed description.
-
-In the @battistin09 dataset, the treatment variable is an indicator for
-retirement, and the running variable is number of years since being eligible to
-retire. The cutoff is $0$. Individuals exactly at the cutoff are dropped from
-the dataset. If there were individuals exactly at the cutoff, they are assumed
-to be assigned to the treatment group.
- -Similarly to the `RDData` function, the `FRDData` function transforms the data -into an appropriate format: +Without supplying the variance estimates and specifying +`se.method="supplied.var"`, the variance estimates will not match, since the +collapsed data is not generally not sufficient to learn about the true +variability of the collapsed outcomes. +The same method works in fuzzy designs, but one has to also save the conditional +variance of the treatment and its covariance with the outcome: ```{r} -## Assumes first column in the data frame corresponds to outcome, -## second to the treatment variable, and third to the running variable -## Outcome here is log of non-durables consumption -## dr <- FRDData(cbind(logf=log(rcp[, 6]), rcp[, c(3, 2)]), cutoff=0) +r0 <- RDHonest(log(cn)|retired~elig_year, data=rcp, h=7) +dd <- data.frame(x=sort(unique(rcp$elig_year)), y=NA, d=NA, weights=NA, + sig11=NA, sig12=NA, sig21=NA, sig22=NA) +for (j in seq_len(NROW(dd))) { + ix <- rcp$elig_year==dd$x[j] + Y <- cbind(log(rcp$cn[ix]), rcp$retired[ix]) + dd[j, -1] <- c(colMeans(Y), sum(ix), as.vector(var(Y))/sum(ix)) +} +r1 <- RDHonest(y|d~x, data=dd, weights=weights, + sigmaY2=sig11, T0=0, sigmaYD=sig21, + sigmaD2=sig22, h=7, + se.method="supplied.var") +## Outputs match up to numerical precision +max(abs(r0$coefficients[2:11]-r1$coefficients[2:11])) ``` -## Inference based on local polynomial estimates +## Clustering + +In some applications, the data are collected by clustered sampling. In such +cases, the user can specify a vector `clusterid` signifying cluster membership. +In this case, preliminary bandwidth calculations assume that the regression +errors have a Moulton-type structure, with homoskedasticity on either side of +the cutoff: \begin{equation*} \cov(Y_{i}, Y_{j})=\begin{cases} \sigma^{2}_{+} & +\text{if $i=j$ and $x_{i}\geq 0$,} \\ \sigma^{2}_{-} & \text{if $i=j$ and +$x_{i}< 0$,} \\ \rho & \text{if $i\neq j$ and $g(i)=g(j)$,} \\ 0 & +\text{otherwise,} \end{cases} \end{equation*} where $g(i)\in\{1,\dotsc, G\}$ +denotes cluster membership. Since it appears difficult to generalize the nearest +neighbor variance estimator to clustering, we use regression-based +cluster-robust variance formulas to compute estimator variances, so that option +`se.method="EHW"` is required. -The function `RDHonest` constructs one- and two-sided confidence intervals -(CIs) around local linear and local quadratic estimators using either a -user-supplied bandwidth (which is allowed to differ on either side of the -cutoff), or bandwidth that is optimized for a given performance criterion. +```{r} +## make fake clusters +set.seed(42) +clusterid <- sample(1:50, NROW(lee08), replace=TRUE) +sc <- RDHonest(voteshare~margin, data=lee08, se.method="EHW", + clusterid=clusterid, M=0.14, h=7) +## Since clusters are unrelated to outcomes, not clustering +## should yield similar standard errors +sn <- RDHonest(voteshare~margin, data=lee08, se.method="EHW", + M=0.14, h=7) +sc +sn +``` -### Parameter space and initial estimate -To implement the honest CIs, one needs to specify the parameter space -$\mathcal{F}$ for $f_1$ and $f_2$. The function `RDHonest` computes honest CIs -when $f_1$ and $f_2$ both lie in a second-order Taylor or second-order Hölder -smoothness class, $\mathcal{F}_{T}(M_1, M_2)$ and -$\mathcal{F}_{\text{Hölder}}(M_1, M_2)$, where the smoothness constants $M_1$ -and $M_2$ for the reduced form and the first stage are allowed to differ. 
Also, -since the worst-case bias calculation requires an estimate of the treatment -effect, for optimal bandwidth calculations, the user needs to supply an initial -estimator of the treatment effect +## Specification testing +The package also implements lower-bound estimates for the smoothness constant +$M$ for the Taylor and Hölder smoothness class, as described in the supplements +to @KoRo16 and @ArKo18optimal ```{r} -## Initial estimate of treatment effect for optimal bandwidth calculations -r <- RDHonest(log(cn) | retired ~ elig_year, data=rcp, kern="triangular", - M=c(0.001, 0.002), opt.criterion="MSE", sclass="H", T0=0) -## Use it to compute optimal bandwidth -RDHonest(log(cn) | retired ~ elig_year, data=rcp, kern="triangular", - M=c(0.001, 0.002), opt.criterion="MSE", sclass="H", - T0=r$coefficients$estimate) -``` +r1 <- RDHonest(voteshare ~ margin, data=lee08, M=0.1, se.method="nn") +### Only use three point-average for averages of a 100 points closest to cutoff, +### and report results separately for points above and below cutoff +RDSmoothnessBound(r1, s=100, separate=TRUE, multiple=FALSE, sclass="T") -## Data-driven choice of smoothness constant +## Pool estimates based on observations below and above cutoff, and +## use three-point averages over the entire support of the running variable +RDSmoothnessBound(r1, s=100, separate=FALSE, multiple=TRUE, sclass="H") +``` -Like in the sharp RD case, Without further restrictions, the smoothness -constants $M_1$ and $M_2$ cannot be data-driven: to maintain honesty over the -whole function class, a researcher must choose them a priori, rather than -attempting to use a data-driven method. Therefore, one should, whenever -possible, use problem-specific knowledge to decide what choices of $M_1$ and -$M_2$ are reasonable a priori. +## Optimal weights under Taylor smoothness class -For cases in which this is difficult, the function `RDHonest` implements the -method considered in Section 3.4.1 in @ArKo16honest based on a global polynomial -approximation: +For the second-order Taylor smoothness class, the function `RDHonest`, with +`kernel="optimal"`, computes finite-sample optimal estimators and confidence +intervals, as described in Section 2.2 in @ArKo18optimal. This typically yields +tighter CIs: ```{r} -## Data-driven choice of M -RDHonest(log(cn) | retired ~ elig_year, data=rcp, kern="triangular", - opt.criterion="MSE", sclass="H", T0=r$coefficients$estimate) +r1 <- RDHonest(voteshare ~ margin, data=lee08, kern="optimal", M=0.1, + opt.criterion="FLCI", + se.method="nn")$coefficients +r2 <- RDHonest(voteshare ~ margin, data=lee08, kern="triangular", M=0.1, + opt.criterion="FLCI", se.method="nn", + sclass="T")$coefficients +r1$conf.high-r1$conf.low +r2$conf.high-r2$conf.low ``` -See @ArKo16honest for a discussion of the restrictions on the -parameter space under which this method yields honest inference. - # Inference at a point -The package can also perform inference at -a point, and optimal bandwidth selection for inference at a point. Suppose, for example, one -was interested in the vote share for candidates with margin of victory equal to -20 points: +The package can also perform inference at a point, and optimal bandwidth +selection for inference at a point. 
Suppose, for example, one was interested in +the vote share for candidates with margin of victory equal to 20 points: ```{r} -## Transform data, specify we're interested in inference at x0=20, +## Specify we're interested in inference at x0=20, ## and drop observations below cutoff RDHonest(voteshare ~ margin, data=lee08, subset=margin>0, cutoff=20, kern="uniform", opt.criterion="MSE", sclass="H", point.inference=TRUE) ``` -# Lindeberg weights - -In fuzzy RD, by Theorem B.2 in the appendix to @ArKo16honest, the estimator is asymptotically equivalent to -\begin{equation*} -\sum_i w_i (Y_i-D_i\beta)/first stage, -\end{equation*} -so the Lindeberg weights are the same as in the sharp case. - +To compute the optimal bandwidth, the package assumes homoskedastic variance on +either side of the cutoff, which it estimates based on a preliminary local +linear regression using the @FaGi96 rule of thumb bandwidth selector. This +homoskedasticity assumption is dropped when the final standard errors are +computed. + +# Diagnostics: leverage and effective observations + +The estimators in this package are just weighted regression estimators, or +ratios of regression estimators in the fuzzy RD case. Regression estimators are +linear in outcomes, taking the form $\sum_i k_{i, h} Y_i$, where $k_{i, h}$ are +estimation weights, returned by `data$est_w` part of the `RDHonest` output (see +expression for $\hat{\tau}_{Y, h}$ above). + +For the sampling distribution of the estimator to be well-approximated by a +normal distribution, it is important that these regression weights not be too +large: asymptotic normality requires $L_{\max}=\max_j k_{j, h}^2/\sum_i k_{i, +h}^2\to 0$. If uniform kernel is used, the weights $k_{i, h}$ are just the +diagonal elements of the partial projection matrix. We therefore refer to +$L_{\max}$ as maximal (partial) leverage, and it is reported in the `RDHonest` +output. The package issues a warning if the maximal leverage exceeds $0.1$---in +such cases using a bigger bandwidth is advised. + +In the fuzzy RD case, by Theorem B.2 in the appendix to @ArKo20, the estimator +is asymptotically equivalent to $\sum_i k_{i, h} (Y_i-D_i\theta)/\tau_D$, where +$k_{i, h}$ are the weights for $\hat{\tau}_{Y, h}$. The maximal leverage +calculations are thus analogous to the sharp case. + +With local regression methods, it is clear that observations outside the +estimation window don't contribute to estimation, reducing the effective sample +size. If the uniform kernel is used, the package therefore reports the number of +observations inside the estimation window as the "number of effective +observations". + +To make this number comparable across different kernels, observe that, under +homoskedasticity, the variance of a linear estimator $\sum_{i} k_{i} Y_i$ is +$\sigma^2\sum_{i} k_i^2$. We expect this to scale in inverse proportion to the +sample size: with twice as many observations and the same bandwidth, we expect +the variance to halve. Therefore, if the variance ratio relative to a uniform +kernel estimator with weights $\sum_i k_{uniform, i}Y_i$ is $\sigma^2\sum_{i} +k_{uniform, i}^2/\sigma^2\sum_{i} k_i^2=\sum_{i} k_{uniform, i}^2/\sum_{i} +k_i^2$, the precision of this estimator is the same as if we used a uniform +kernel, but with $\sum_{i} k_{uniform, i}^2/\sum_{i} k_i^2$ as many +observations. Correspondingly, we define the number of effective observations +for other kernels as the number of observations inside the estimation window +times $\sum_{i} k_{uniform, i}^2/\sum_{i} k_i^2$. 
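The figure quoted below for the triangular kernel can be checked directly. The following sketch (our own hand computation, mirroring the weight construction shown earlier in this vignette and assuming the `lee08` data with $h=8$) computes the ratio $\sum_{i} k_{\text{uniform}, i}^2/\sum_{i} k_i^2$:
```{r}
h <- 8
dw <- subset(lee08, abs(margin) <= h)
D <- as.numeric(dw$margin >= 0)
kw <- function(K) {    # estimation weights k_{i,h} for kernel weights K
    Dtil <- lm(D ~ I(D*dw$margin) + dw$margin, weights=K)$residuals
    K*Dtil / sum(K*Dtil^2)
}
k.tri <- kw(pmax(1 - abs(dw$margin/h), 0))   # triangular kernel
k.uni <- kw(rep(1, nrow(dw)))                # uniform kernel
sum(k.uni^2) / sum(k.tri^2)                  # effective-observation ratio
```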
With this definition, using a +triangular kernel typically yields effective samples sizes equal to about 80% of +the number of observations inside the estimation window. + +Finally, to assess which observations are important for pinning down the +estimate, it can be useful to explicitly plot the estimation weights. # References diff --git a/vignettes/mk_Rpackage_template.tex b/vignettes/mk_Rpackage_template.tex deleted file mode 100644 index 5a62e71..0000000 --- a/vignettes/mk_Rpackage_template.tex +++ /dev/null @@ -1,168 +0,0 @@ -\documentclass[$if(fontsize)$$fontsize$,$endif$$if(lang)$$babel-lang$,$endif$$if(papersize)$$papersize$,$endif$$for(classoption)$$classoption$$sep$,$endfor$]{article} - -$if(geometry)$ -\usepackage[$geometry$]{geometry} -$endif$ - -$if(fontfamily)$ -\usepackage[$fontfamilyoptions$]{$fontfamily$} -$else$ -\usepackage{libertine} -\usepackage[cmintegrals, cmbraces, libertine]{newtxmath} -$endif$ - -\usepackage[$if(fontenc)$$fontenc$$else$T1$endif$]{fontenc} -\usepackage[utf8]{inputenc} - -\setcounter{secnumdepth}{2} - -$if(listings)$ -\usepackage{listings} -$endif$ - -$if(lhs)$ -\lstnewenvironment{code}{\lstset{language=r,basicstyle=\small\ttfamily}}{} -$endif$ - -$if(highlighting-macros)$ -$highlighting-macros$ -$endif$ -$if(verbatim-in-note)$ -\usepackage{fancyvrb} -$endif$ -% Above clashes with xcolor, removed \usepackage{xcolor} below. Otherwise -% replace "highlighting-macros" with content from .tex file generated with -% build_vignettes(clean=FALSE) - - -$if(tables)$ -\usepackage{longtable,booktabs} -$endif$ - -$if(graphics)$ -\usepackage{graphicx,grffile} -\makeatletter -\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi} -\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi} -\makeatother -% Scale images if necessary, so that they will not overflow the page -% margins by default, and it is still possible to overwrite the defaults -% using explicit options in \includegraphics[width, height, ...]{} -\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio} -$endif$ - -$if(title)$ -\title{$title$$if(subtitle)$: $subtitle$$endif$ $if(anonymous)$$else$$if(thanks)$\thanks{$thanks$} $endif$$endif$ } -$endif$ - -$if(anonymous)$$else$\author{$for(author)$$author$$sep$\and$endfor$}$endif$ - -\date{$if(date)$$date$$endif$} - -\usepackage{titlesec} -\titleformat*{\section}{\large\bfseries} -\titleformat*{\subsection}{\normalsize\itshape} -\titleformat*{\subsubsection}{\normalsize\itshape} -\titleformat*{\paragraph}{\normalsize\itshape} -\titleformat*{\subparagraph}{\normalsize\itshape} - -$if(natbib)$ -\usepackage{natbib} -\bibliographystyle{$if(biblio-style)$$biblio-style$$else$plainnat$endif$} -\usepackage[strings]{underscore} % protect underscores in most circumstances -$endif$ - -$if(biblatex)$ -\usepackage$if(biblio-style)$[style=$biblio-style$]$endif${biblatex} -$if(biblatexoptions)$\ExecuteBibliographyOptions{$for(biblatexoptions)$$biblatexoptions$$sep$,$endfor$}$endif$ -$for(bibliography)$ -\addbibresource{$bibliography$} -$endfor$ -$endif$ - - -\usepackage{setspace} -\onehalfspacing - -\makeatletter -\@ifpackageloaded{hyperref}{}{% -\ifxetex - \PassOptionsToPackage{hyphens}{url}\usepackage[setpagesize=false, % page size defined by xetex - unicode=false, % unicode breaks when used with xetex - xetex]{hyperref} -\else - \PassOptionsToPackage{hyphens}{url}\usepackage[unicode=true]{hyperref} -\fi -} - -\definecolor{webbrown}{rgb}{.6,0,0} -\makeatother -\hypersetup{breaklinks=true, - bookmarks=true, - 
pdfauthor={$if(anonymous)$$else$$for(author)$$author$$sep$ and $endfor$$endif$}, - pdfkeywords = {$if(keywords)$$keywords$$endif$}, - pdftitle={$title$$if(subtitle)$: $subtitle$$endif$}, - colorlinks=true, - citecolor=$if(citecolor)$$citecolor$$else$webbrown$endif$, - urlcolor=$if(urlcolor)$$urlcolor$$else$webbrown$endif$, - linkcolor=$if(linkcolor)$$linkcolor$$else$webrown$endif$, - pdfborder={0 0 0}} - -% set default figure placement to htbp -\makeatletter -\def\fps@figure{htbp} -\makeatother - -\providecommand{\tightlist}{% -\setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}} - -$for(header-includes)$ -$header-includes$ -$endfor$ - -$if(endnotes)$ -\usepackage{endnotes} -\renewcommand{\enotesize}{\normalsize} -\let\footnote=\endnote -$endif$ - -\begin{document} - -$if(title)$ -\maketitle -$endif$ - -$if(toc)$ -{ -\hypersetup{linkcolor=black} -\setcounter{tocdepth}{$toc-depth$} -\tableofcontents -} -$endif$ - -$body$ -$if(endnotes)$ -\newpage -\theendnotes -$endif$ -\newpage -\singlespacing -$if(natbib)$ -$if(bibliography)$ -$if(biblio-title)$ -$if(book-class)$ -\renewcommand\bibname{$biblio-title$} -$else$ -\renewcommand\refname{$biblio-title$} -$endif$ -$endif$ -\bibliography{$for(bibliography)$$bibliography$$sep$,$endfor$} -$endif$ -$endif$ -$if(biblatex)$ -\printbibliography$if(biblio-title)$[title=$biblio-title$]$endif$ -$endif$ -$for(include-after)$ -$include-after$ -$endfor$ -\end{document} diff --git a/vignettes/np-testing-library.bib b/vignettes/np-testing-library.bib index 0a04af0..13400f4 100644 --- a/vignettes/np-testing-library.bib +++ b/vignettes/np-testing-library.bib @@ -1,3 +1,65 @@ +@article{ccft19, +title = {Regression Discontinuity Designs Using Covariates}, +author = {Calonico, Sebastian and Cattaneo, Matias D. and Farrell, Max H. and Titiunik, Roc{\'i}o}, +year = 2019, +month = jul, +journal = {The Review of Economics and Statistics}, +volume = 101, +number = 3, +pages = {442--451}, +doi = {10.1162/rest_a_00760}, +} + + +@online{NoRo21, +title = {Bias-Aware Inference in Fuzzy Regression Discontinuity Designs}, +author = {Noack, Claudia and Rothe, Christoph}, +year = 2021, +month = feb, +archiveprefix = {arXiv}, +eprint = {1906.04631}, +eprinttype = {arxiv}, +} + +@article{ImWa19, +title = {Optimized Regression Discontinuity Designs}, +author = {Imbens, Guido W. and Wager, Stefan}, +year = 2019, +month = may, +journaltitle = {The Review of Economics and Statistics}, +volume = 101, +number = 2, +pages = {264--278}, +doi = {10.1162/rest_a_00793}, +} + + +@article{low97, +title = {On Nonparametric Confidence Intervals}, +author = {Low, Mark G.}, +year = 1997, +month = dec, +journaltitle = {The Annals of Statistics}, +volume = 25, +number = 6, +pages = {2547--2554}, +doi = {10.1214/aos/1030741084}, +} + + +@article{htv01, +title = {Identification and Estimation of Treatment Effects with a Regression-Discontinuity Design}, +author = {Hahn, Jinyong and Todd, Petra E. and van der Klaauw, Wilbert}, +year = 2001, +month = jan, +journal = {Econometrica}, +volume = 69, +number = 1, +pages = {201--209}, +doi = {10.1111/1468-0262.00183}, +} + + @article{ImKa12, title = {Optimal Bandwidth Choice for the Regression Discontinuity Estimator}, author = {Imbens, Guido W. and Kalyanaraman, Karthik}, @@ -70,7 +132,7 @@ @article{lalive08 pages = {785--806}, } -@article{ArKo16optimal, +@article{ArKo18optimal, author = {Armstrong, Timothy B. 
and Kolesár, Michal}, title = {Optimal Inference in a Class of Regression Models}, journal = {Econometrica}, @@ -82,7 +144,7 @@ @article{ArKo16optimal pages = {655--683} } -@article{ArKo16honest, +@article{ArKo20, author = {Armstrong, Timothy B. and Kolesár, Michal}, title = {Simple and Honest Confidence Intervals in Nonparametric Regression}, journal = {Quantitative Economics}, diff --git a/vignettes/vignette_head.tex b/vignettes/vignette_head.tex index bde3b81..1503fa0 100644 --- a/vignettes/vignette_head.tex +++ b/vignettes/vignette_head.tex @@ -4,3 +4,8 @@ % and \right, or use \abs[\Bigg]{\frac{a}{b}} \DeclarePairedDelimiter\abs{\lvert}{\rvert} \DeclarePairedDelimiter\norm{\lVert}{\rVert} +\DeclareMathOperator{\cv}{cv} % formerly tilde chi +\DeclareMathOperator{\se}{se} % standard error +\DeclareMathOperator{\sd}{sd} % standard deviation +\newcommand{\hatse}{\widehat{\se}} % standard error estimator +\DeclareMathOperator{\cov}{cov} %