Merge pull request #55 from WorldHealthOrganization/speed
Add a fast approximation for simple surveys
Showing 9 changed files with 542 additions and 177 deletions.
@@ -0,0 +1,40 @@
#' To support different types of computation for special cases of the data,
#' we define here a compute API that different backends can hook into.
#' @noRd
NULL

#' Computes the sample size by indicator and subset
#' @noRd
compute_prevalence_sample_size_by <- function(
  data, indicator, subset_col_name
) {
  UseMethod("compute_prevalence_sample_size_by")
}

#' Computes prevalence of rates for a given indicator, subset and cutoff
#' @noRd
compute_prevalence_estimates_for_column_by <- function(
  data, indicator_name, subset_col_name, prev_col_name
) {
  UseMethod("compute_prevalence_estimates_for_column_by")
}

#' Computes prevalence of zscores by indicator and subset
#' @noRd
compute_prevalence_zscore_summaries_by <- function(
  data, indicator, subset_col_name
) {
  UseMethod("compute_prevalence_zscore_summaries_by")
}

#' Returns the values for a specific column
#' @noRd
column_values <- function(x, col) {
  UseMethod("column_values")
}

#' Returns the column names of the data
#' @noRd
column_names <- function(x) {
  UseMethod("column_names")
}
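These five generics are the whole surface a backend has to implement. As a rough sketch of how an alternative backend could hook in (the class name flat_design and its list layout are made up for illustration; they are not part of this changeset):

#' @export
column_names.flat_design <- function(x) {
  # illustrative backend: plain data.frame stored under x$df
  names(x$df)
}

#' @export
column_values.flat_design <- function(x, col) {
  x$df[[col]]
}

The three prevalence generics would be implemented the same way, one *.flat_design method per generic.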
@@ -0,0 +1,168 @@
#' The following is a fast approximation of what the survey package
#' computes when cluster/strata/sw are NULL.
#'
#' T. Lumley (2024) "survey: analysis of complex survey samples".
#' R package version 4.4.
#'
#' The speedup and memory savings can be substantial, in particular for very
#' large surveys with > 100,000 observations.
#'
#' "Simple" here refers to the absence of cluster/strata/sw, which makes the
#' structure a bit simpler.
#' @noRd

#' @export
column_names.simple_design <- function(x) {
  colnames(x$data)
}

#' @export
column_values.simple_design <- function(x, col) {
  x$data[[col]]
}
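The simple_design constructor is not shown in this excerpt, but the two accessors above imply it is a classed list with a data element holding the underlying data.frame. A hypothetical construction, only to illustrate how dispatch works (the column names are invented):

d <- structure(
  list(data = data.frame(gender = c("f", "m"), zlen = c(-0.4, 1.1))),
  class = "simple_design"
)
column_names(d)          # "gender" "zlen"
column_values(d, "zlen") # -0.4  1.1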

#' @export
compute_prevalence_zscore_summaries_by.simple_design <- function(
    data, indicator, subset_col_name) {
  zscore_col_name <- prev_zscore_value_column(indicator)
  compute_and_aggregate(
    data,
    zscore_col_name,
    subset_col_name,
    compute = zscore_estimate,
    empty_data_prototype = data.frame(
      r = NA_real_,
      se = NA_real_,
      ll = NA_real_,
      ul = NA_real_,
      stdev = NA_real_
    )
  )
}

#' @export
compute_prevalence_estimates_for_column_by.simple_design <- function(
    data, indicator_name, subset_col_name, prev_col_name) {
  compute_and_aggregate(
    data,
    prev_col_name,
    subset_col_name,
    compute = logit_rate_estimate,
    empty_data_prototype = data.frame(
      r = NA_real_,
      se = NA_real_,
      ll = NA_real_,
      ul = NA_real_
    )
  )
}

#' @export
compute_prevalence_sample_size_by.simple_design <- function(
    data, indicator, subset_col_name) {
  column_name <- prev_prevalence_column_name(indicator)
  compute_and_aggregate(
    data,
    column_name,
    subset_col_name,
    compute = sample_size,
    empty_data_prototype = data.frame(
      pop = 0,
      unwpop = 0
    )
  )
}

#' Applies `compute` to `value_column` within each level of `subset_column`
#' and row-binds the per-group results; group levels that come back empty
#' are filled with `empty_data_prototype`.
#' @noRd
#' @importFrom stats aggregate
compute_and_aggregate <- function(
    data, value_column, subset_column, compute, empty_data_prototype) {
  col_values <- column_values(data, value_column)
  N <- length(col_values)
  values <- aggregate(
    col_values,
    by = list(Group = column_values(data, subset_column)),
    FUN = compute,
    simplify = FALSE,
    drop = FALSE,
    N = N,
    empty_data_prototype = empty_data_prototype
  )
  Group <- values$Group
  values <- lapply(values$x, function(df) {
    if (is.null(df)) {
      return(empty_data_prototype)
    }
    df
  })
  data <- do.call(rbind, values)
  cbind(Group, data)
}
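A toy end-to-end call of the aggregation helper, illustrative only: it reuses the hypothetical simple_design construction sketched earlier and the sample_size compute function defined further down in this file.

d <- structure(
  list(data = data.frame(
    flag   = c(1, 0, NA, 1),
    region = c("A", "A", "B", "B")
  )),
  class = "simple_design"
)
compute_and_aggregate(
  d, "flag", "region",
  compute = sample_size,
  empty_data_prototype = data.frame(pop = 0, unwpop = 0)
)
# => one row per region: A with pop/unwpop 2, B with pop/unwpop 1
#    (the NA in region B is not counted by sample_size)

The is.null() guard in the helper exists so that group levels for which aggregate() returns nothing still yield a row, filled from empty_data_prototype.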

#' Mean of a 0/1 column with a confidence interval computed on the logit
#' scale via an intercept-only quasibinomial GLM; results are on the
#' percentage scale.
#' @noRd
#' @importFrom stats glm plogis qt quasibinomial
logit_rate_estimate <- function(x, N, empty_data_prototype) {
  x_orig <- x
  x <- x[!is.na(x)]
  if (length(x) == 0) {
    return(empty_data_prototype)
  }
  n <- length(x)
  r <- mean(x == 1)
  se <- sample_se(x, r, n, N)

  cis <- if (r == 1) {
    c(1, 1)
  } else if (r == 0) {
    c(0, 0)
  } else {
    glm_model <- glm(x_orig ~ 1, family = quasibinomial(link = "logit"))
    summary_model <- summary(glm_model)
    r_var <- summary_model$cov.unscaled[1, 1]
    logit_r <- summary_model$coefficients[1, "Estimate"]
    scale <- N / (N - 1)
    se_logit <- sqrt(scale * r_var)
    t_value <- qt(1 - prevalence_significance_level / 2, df = N - 1)
    as.numeric(plogis(
      logit_r + se_logit * c(-t_value, t_value)
    ))
  }

  data.frame(
    r = r * 100,
    se = se * 100,
    ll = cis[1L] * 100,
    ul = cis[2L] * 100
  )
}
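A minimal usage sketch, not part of the PR: it assumes prevalence_significance_level is the package-level constant referenced above (set to 0.05 here for illustration) and uses a made-up 0/1 vector.

prevalence_significance_level <- 0.05 # assumed value, normally defined by the package
flags <- c(1, 0, 1, 1, NA, 0, 1, 1, 0, 1)
logit_rate_estimate(
  flags,
  N = length(flags),
  empty_data_prototype = data.frame(
    r = NA_real_, se = NA_real_, ll = NA_real_, ul = NA_real_
  )
)
# one row with r/se/ll/ul on the percentage scale; the CI is built on the
# logit scale and transformed back with plogis()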

#' Mean, standard error, t-based confidence interval and standard deviation
#' of a z-score column.
#' @noRd
#' @importFrom stats sd plogis qt
zscore_estimate <- function(x, N, empty_data_prototype) {
  x <- x[!is.na(x)]
  if (length(x) == 0) {
    return(empty_data_prototype)
  }
  n <- length(x)
  r <- mean(x)
  stdev <- sd(x)
  se <- sample_se(x, r, n, N)
  t_value <- qt(1 - prevalence_significance_level / 2, N - 1)
  cis <- r + se * c(-t_value, t_value)
  data.frame(
    r = r,
    se = se,
    ll = cis[1L],
    ul = cis[2L],
    stdev = stdev
  )
}
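The same kind of sketch for the z-score path, with the same prevalence_significance_level assumption as above and invented z-scores:

prevalence_significance_level <- 0.05 # assumed value, normally defined by the package
z <- c(-1.2, 0.3, NA, 0.8, -0.5)
zscore_estimate(
  z,
  N = length(z),
  empty_data_prototype = data.frame(
    r = NA_real_, se = NA_real_, ll = NA_real_, ul = NA_real_, stdev = NA_real_
  )
)
# one row: mean z-score, its SE, symmetric t-based CI bounds and the SD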

#' Number of non-missing observations; without sampling weights the weighted
#' and unweighted counts coincide.
#' @noRd
sample_size <- function(x, N, empty_data_prototype) {
  n <- as.numeric(sum(!is.na(x)))
  data.frame(pop = n, unwpop = n)
}

#' Standard error used for the subset estimates above.
#' @noRd
sample_se <- function(x, x_mean, n, N) {
  scale <- N / (N - 1)
  x_centered_and_scaled <- (x - x_mean) / n * sqrt(scale)
  sqrt(sum(x_centered_and_scaled^2))
}
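For reference, the function above reduces to a single closed-form expression; this restatement is only an algebraic paraphrase, not additional code in the PR.

sample_se_closed_form <- function(x, x_mean, n, N) {
  # identical to sample_se() above, written as one expression
  sqrt(N / (N - 1)) * sqrt(sum((x - x_mean)^2)) / n
}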