Merge pull request #55 from WorldHealthOrganization/speed
Add a fast approximation for simple surveys
dirkschumacher authored Oct 29, 2024
2 parents 7e42f47 + df715c8 commit 2d96147
Showing 9 changed files with 542 additions and 177 deletions.
5 changes: 4 additions & 1 deletion DESCRIPTION
@@ -22,14 +22,17 @@ Imports:
    survey
Suggests:
    testthat
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.2
Collate:
    'anthro-package.R'
    'utils.R'
    'prevalence-survey.R'
    'prevalence-simple.R'
    'assertions.R'
    'prevalence.R'
    'z-score-helper.R'
    'api.R'
    'prevalence-compute-api.R'
    'z-score-arm-circumference-for-age.R'
    'z-score-bmi-for-age.R'
    'z-score-head-circumference-for-age.R'
16 changes: 16 additions & 0 deletions NAMESPACE
@@ -1,14 +1,30 @@
# Generated by roxygen2: do not edit by hand

S3method(column_names,simple_design)
S3method(column_names,survey_design)
S3method(column_values,simple_design)
S3method(column_values,survey_design)
S3method(compute_prevalence_estimates_for_column_by,simple_design)
S3method(compute_prevalence_estimates_for_column_by,survey_design)
S3method(compute_prevalence_sample_size_by,simple_design)
S3method(compute_prevalence_sample_size_by,survey_design)
S3method(compute_prevalence_zscore_summaries_by,simple_design)
S3method(compute_prevalence_zscore_summaries_by,survey_design)
export(anthro_api_compute_prevalence)
export(anthro_api_compute_zscore)
export(anthro_api_compute_zscore_adjusted)
export(anthro_api_standardize_oedema_var)
export(anthro_api_standardize_sex_var)
export(anthro_prevalence)
export(anthro_zscores)
importFrom(stats,aggregate)
importFrom(stats,as.formula)
importFrom(stats,confint)
importFrom(stats,glm)
importFrom(stats,plogis)
importFrom(stats,qt)
importFrom(stats,quasibinomial)
importFrom(stats,sd)
importFrom(stats,setNames)
importFrom(survey,degf)
importFrom(survey,svyby)
5 changes: 5 additions & 0 deletions NEWS.md
@@ -1,5 +1,10 @@
# anthro (development version)

## Performance

* For inputs with `cluster`/`strata = NULL` and `sw = NULL` (or all weights
  equal to 1), a faster method was added to compute `anthro_prevalence` in
  this special case.
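
As a sketch of when this fast path applies: the call below leaves `cluster`,
`strata` and `sw` at `NULL`, so the new simple-design backend is used instead
of the survey package. Argument names follow the documented
`anthro_prevalence` API; the data is synthetic and purely illustrative.

```r
library(anthro)

set.seed(1)
n <- 500
prev <- anthro_prevalence(
  sex = sample(c(1, 2), n, replace = TRUE),
  age = sample(0:59, n, replace = TRUE), # age in months
  is_age_in_month = TRUE,
  weight = runif(n, 3, 20), # kg
  lenhei = runif(n, 45, 110), # cm
  # cluster, strata and sw are NULL, which triggers the
  # fast approximation for simple surveys
  cluster = NULL,
  strata = NULL,
  sw = NULL
)
```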

## Bugfix

* Fixed a bug during prevalence computation when a value used as a group
40 changes: 40 additions & 0 deletions R/prevalence-compute-api.R
@@ -0,0 +1,40 @@
#' To support different types of computation for special cases of the data,
#' we define here a compute API that different backends can hook into.
#' @noRd
NULL

#' Computes the sample size by indicator and subset
#' @noRd
compute_prevalence_sample_size_by <- function(
    data, indicator, subset_col_name
) {
  UseMethod("compute_prevalence_sample_size_by")
}

#' Computes prevalence rate estimates for a given indicator, subset and cutoff
#' @noRd
compute_prevalence_estimates_for_column_by <- function(
    data, indicator_name, subset_col_name, prev_col_name
) {
  UseMethod("compute_prevalence_estimates_for_column_by")
}

#' Computes z-score summaries by indicator and subset
#' @noRd
compute_prevalence_zscore_summaries_by <- function(
    data, indicator, subset_col_name
) {
  UseMethod("compute_prevalence_zscore_summaries_by")
}

#' Returns the values for a specific column
#' @noRd
column_values <- function(x, col) {
  UseMethod("column_values")
}

#' Returns the column names of the data
#' @noRd
column_names <- function(x) {
  UseMethod("column_names")
}
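
These generics dispatch on the class of the design object, so a backend hooks
in simply by providing S3 methods. A minimal sketch with an invented
`toy_design` class (not part of the package), assuming the generics above are
defined; only the two accessor generics are shown:

```r
# An invented design class wrapping a plain data.frame
toy_design <- function(data) {
  structure(list(data = data), class = "toy_design")
}

# Hook into the compute API via S3 methods
column_names.toy_design <- function(x) {
  colnames(x$data)
}

column_values.toy_design <- function(x, col) {
  x$data[[col]]
}

d <- toy_design(data.frame(a = 1:3, b = c("x", "y", "z")))
column_names(d)       #> "a" "b"
column_values(d, "a") #> 1 2 3
```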
168 changes: 168 additions & 0 deletions R/prevalence-simple.R
@@ -0,0 +1,168 @@
#' The following is a fast approximation of what the survey package
#' computes when cluster/strata/sw are NULL.
#'
#' T. Lumley (2024) "survey: analysis of complex survey samples".
#' R package version 4.4.
#'
#' The speedup and memory savings can be substantial, in particular for very
#' large surveys with > 100,000 observations.
#'
#' The "simple" here refers to the absence of cluster/strata/sw, which makes
#' the structure a bit simpler.
#' @noRd

#' @export
column_names.simple_design <- function(x) {
  colnames(x$data)
}

#' @export
column_values.simple_design <- function(x, col) {
  x$data[[col]]
}

#' @export
compute_prevalence_zscore_summaries_by.simple_design <- function(
    data, indicator, subset_col_name) {
  zscore_col_name <- prev_zscore_value_column(indicator)
  compute_and_aggregate(
    data,
    zscore_col_name,
    subset_col_name,
    compute = zscore_estimate,
    empty_data_prototype = data.frame(
      r = NA_real_,
      se = NA_real_,
      ll = NA_real_,
      ul = NA_real_,
      stdev = NA_real_
    )
  )
}

#' @export
compute_prevalence_estimates_for_column_by.simple_design <- function(
    data, indicator_name, subset_col_name, prev_col_name) {
  compute_and_aggregate(
    data,
    prev_col_name,
    subset_col_name,
    compute = logit_rate_estimate,
    empty_data_prototype = data.frame(
      r = NA_real_,
      se = NA_real_,
      ll = NA_real_,
      ul = NA_real_
    )
  )
}

#' @export
compute_prevalence_sample_size_by.simple_design <- function(
    data, indicator, subset_col_name) {
  column_name <- prev_prevalence_column_name(indicator)
  compute_and_aggregate(
    data,
    column_name,
    subset_col_name,
    compute = sample_size,
    empty_data_prototype = data.frame(
      pop = 0,
      unwpop = 0
    )
  )
}

#' @importFrom stats aggregate
compute_and_aggregate <- function(
    data, value_column, subset_column, compute, empty_data_prototype) {
  col_values <- column_values(data, value_column)
  N <- length(col_values)
  values <- aggregate(
    col_values,
    by = list(Group = column_values(data, subset_column)),
    FUN = compute,
    simplify = FALSE,
    drop = FALSE,
    N = N,
    empty_data_prototype = empty_data_prototype
  )
  Group <- values$Group
  values <- lapply(values$x, function(df) {
    if (is.null(df)) {
      return(empty_data_prototype)
    }
    df
  })
  data <- do.call(rbind, values)
  cbind(Group, data)
}
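
`compute_and_aggregate` leans on two `aggregate` options: `simplify = FALSE`
keeps the one-row data frames produced by `compute` in a list column, and
`drop = FALSE` retains groups without observations, which can come back as
`NULL` entries and are then swapped for the prototype. A standalone sketch of
that behaviour with made-up data:

```r
# Level "c" has no observations
g <- factor(c("a", "a", "b"), levels = c("a", "b", "c"))
x <- c(1, 2, 10)

# FUN returns a one-row data.frame per group, like the
# compute functions in this file
f <- function(v) data.frame(r = mean(v), n = length(v))
agg <- aggregate(x, by = list(Group = g), FUN = f,
                 simplify = FALSE, drop = FALSE)

# Replace empty groups with a prototype row, then row-bind,
# mirroring compute_and_aggregate
rows <- lapply(agg$x, function(df) {
  if (is.null(df)) data.frame(r = NA_real_, n = 0) else df
})
cbind(agg["Group"], do.call(rbind, rows))
```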

#' @importFrom stats glm plogis qt quasibinomial
logit_rate_estimate <- function(x, N, empty_data_prototype) {
  x_orig <- x
  x <- x[!is.na(x)]
  if (length(x) == 0) {
    return(empty_data_prototype)
  }
  n <- length(x)
  r <- mean(x == 1)
  se <- sample_se(x, r, n, N)

  cis <- if (r == 1) {
    c(1, 1)
  } else if (r == 0) {
    c(0, 0)
  } else {
    glm_model <- glm(x_orig ~ 1, family = quasibinomial(link = "logit"))
    summary_model <- summary(glm_model)
    r_var <- summary_model$cov.unscaled[1, 1]
    logit_r <- summary_model$coefficients[1, "Estimate"]
    scale <- N / (N - 1)
    se_logit <- sqrt(scale * r_var)
    t_value <- qt(1 - prevalence_significance_level / 2, df = N - 1)
    cis <- plogis(
      logit_r + se_logit * c(-t_value, t_value)
    )
    as.numeric(cis)
  }

  data.frame(
    r = r * 100,
    se = se * 100,
    ll = cis[1L] * 100,
    ul = cis[2L] * 100
  )
}
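
The confidence interval is built on the logit scale and mapped back with
`plogis`, which keeps the bounds inside [0, 1] even for rates close to 0 or 1.
A standalone sketch of the same computation on made-up data, with the
significance level written out as 0.05 for illustration (the package stores it
in `prevalence_significance_level`):

```r
x <- c(rep(1, 12), rep(0, 88)) # made-up data: 12% prevalence
N <- length(x)

fit <- summary(glm(x ~ 1, family = quasibinomial(link = "logit")))
logit_r <- fit$coefficients[1, "Estimate"]
se_logit <- sqrt(N / (N - 1) * fit$cov.unscaled[1, 1])

t_value <- qt(1 - 0.05 / 2, df = N - 1)
# Back-transform the logit-scale interval to the probability scale
plogis(logit_r + se_logit * c(-t_value, t_value))
```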

#' @importFrom stats sd plogis qt
zscore_estimate <- function(x, N, empty_data_prototype) {
  x <- x[!is.na(x)]
  if (length(x) == 0) {
    return(empty_data_prototype)
  }
  n <- length(x)
  r <- mean(x)
  stdev <- sd(x)
  se <- sample_se(x, r, n, N)
  t_value <- qt(1 - prevalence_significance_level / 2, N - 1)
  cis <- r + se * c(-t_value, t_value)
  data.frame(
    r = r,
    se = se,
    ll = cis[1L],
    ul = cis[2L],
    stdev = stdev
  )
}

sample_size <- function(x, N, empty_data_prototype) {
  n <- as.numeric(sum(!is.na(x)))
  data.frame(pop = n, unwpop = n)
}

sample_se <- function(x, x_mean, n, N) {
  scale <- N / (N - 1)
  x_centered_and_scaled <- (x - x_mean) / n * sqrt(scale)
  sqrt(sum(x_centered_and_scaled^2))
}
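
`sample_se` computes `sqrt(N / (N - 1) * sum((x - x_mean)^2) / n^2)`; when
nothing is missing (`n == N`), this reduces to the classical standard error of
the mean, `sd(x) / sqrt(n)`. A quick check on toy data, assuming the functions
above are loaded:

```r
x <- c(2.1, 3.4, 1.9, 4.2, 2.8) # toy data, nothing missing
n <- N <- length(x)

sample_se(x, mean(x), n, N)
# same value as the textbook standard error of the mean:
sd(x) / sqrt(n)
```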