Skip to content

Commit

Permalink
Reformat code
Browse files Browse the repository at this point in the history
  • Loading branch information
hanneoberman committed Dec 13, 2022
1 parent e21ccd1 commit daa3d6c
Showing 1 changed file with 127 additions and 96 deletions.
223 changes: 127 additions & 96 deletions R/pattern.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,112 +12,142 @@
#' @examples
#' plot_pattern(mice::nhanes)
#' @export
plot_pattern <- function(data, vrb = "all", square = FALSE, rotate = FALSE, cluster = NULL, npat = NULL) {
if (!is.data.frame(data) & !is.matrix(data)) {
stop("Dataset should be a 'data.frame' or 'matrix'.")
}
if (vrb == "all") {
vrb <- names(data)
}
if (!is.null(cluster)) {
if (cluster %nin% names(data[, vrb])) {
stop("Cluster variable not recognized, please provide the variable name as a character string.")
plot_pattern <-
function(data,
vrb = "all",
square = FALSE,
rotate = FALSE,
cluster = NULL,
npat = NULL) {
if (!is.data.frame(data) & !is.matrix(data)) {
stop("Dataset should be a 'data.frame' or 'matrix'.")
}
}
if(!is.null(npat)) {
if (!is.numeric(npat) | npat < 1) {
stop("Number of patterns should be one or more. Please provide a positive numeric value.")
if (vrb == "all") {
vrb <- names(data)
}
if (".x" %in% vrb | ".y" %in% vrb) {
stop(
"The variable names '.x' and '.y' are used internally to produce the missing data pattern plot. Please exclude or rename your variable(s)."
)
}
if (!is.null(cluster)) {
if (cluster %nin% names(data[, vrb])) {
stop(
"Cluster variable not recognized, please provide the variable name as a character string."
)
}
}
if (!is.null(npat)) {
if (!is.numeric(npat) | npat < 1) {
stop("Number of patterns should be one or more. Please provide a positive numeric value.")
}
}
}
if(".x" %in% vrb | ".y" %in% vrb) {
stop("The variable names '.x' and '.y' are used internally to produce the missing data pattern plot. Please exclude or rename your variable(s).")
}

# get missing data pattern
pat <- mice::md.pattern(data[, vrb], plot = FALSE)
# get missing data pattern
pat <- mice::md.pattern(data[, vrb], plot = FALSE)

# filter npat most frequent patterns
if (!is.null(npat)) {
if (npat < (nrow(pat) - 1)) {
top_n_pat <- sort(as.numeric(row.names(pat)), decreasing = T)[1:npat]
pat <- pat[rownames(pat) %in% c(top_n_pat, ""),]
# filter npat most frequent patterns
if (!is.null(npat)) {
if (npat < (nrow(pat) - 1)) {
top_n_pat <-
sort(as.numeric(row.names(pat)), decreasing = TRUE)[1:npat]
pat <- pat[rownames(pat) %in% c(top_n_pat, ""), ]
} else {
warning("Number of patterns specified is equal to or greater than the total number of patterns. All missing data patterns are shown.")
warning(
"Number of patterns specified is equal to or greater than the total number of patterns. All missing data patterns are shown."
)
}
}
}

# extract pattern info
rws <- nrow(pat)
cls <- ncol(pat)
vrb <- colnames(pat)[-cls]
frq <- row.names(pat)[-rws]
na_row <- pat[-rws, cls]
na_col <- pat[rws, -cls]
# extract pattern info
rws <- nrow(pat)
cls <- ncol(pat)
vrb <- colnames(pat)[-cls]
frq <- row.names(pat)[-rws]
na_row <- pat[-rws, cls]
na_col <- pat[rws,-cls]

# add opacity for clustering
if (is.null(cluster)) {
pat_clean <- cbind(.opacity = 1, pat[-rws, vrb, drop=F])
} else {
pats <- purrr::map(split(data[, vrb], ~ get(cluster)), ~ {
mice::md.pattern(., plot = FALSE) %>%
pat_to_chr(., ord = vrb)
})
pat_used <- purrr::map_dfr(pats, ~ {
pat_to_chr(pat) %in% .x
}) %>%
rowMeans()
pat_clean <- data.frame(.opacity = pat_used, pat[-rws, vrb, drop=F])
}

# tidy the pattern
long <- data.frame(.y = 1:(rws - 1), pat_clean, row.names = NULL) %>%
tidyr::pivot_longer(cols = tidyselect::all_of(vrb), names_to = "x", values_to = ".where") %>%
dplyr::mutate(
.x = as.numeric(factor(.data$x, levels = vrb, ordered = TRUE)),
.where = factor(.data$.where, levels = c(0, 1), labels = c("missing", "observed")),
# TODO: always obs/always missing, add title, maybe make y axis prop to freq, add asterisk to clust var with caption that can tell that there is missingness in it
.opacity = as.numeric(.data$.opacity)
)
# add opacity for clustering
if (is.null(cluster)) {
pat_clean <- cbind(.opacity = 1, pat[-rws, vrb])
} else {
pats <- purrr::map(split(data[, vrb], ~ get(cluster)), ~ {
mice::md.pattern(., plot = FALSE) %>%
pat_to_chr(., ord = vrb)
})
pat_used <- purrr::map_dfr(pats, ~ {
pat_to_chr(pat) %in% .x
}) %>%
rowMeans()
pat_clean <- data.frame(.opacity = pat_used, pat[-rws, vrb])
}

# create the plot
gg <- ggplot2::ggplot(long, ggplot2::aes(.data$.x, .data$.y, fill = .data$.where, alpha = 0.1 + .data$.opacity / 2)) +
ggplot2::geom_tile(color = "black") +
ggplot2::scale_fill_manual(values = c("observed" = "#006CC2B3", "missing" = "#B61A51B3")) +
ggplot2::scale_alpha_continuous(limits = c(0, 1), guide = "none") +
ggplot2::scale_x_continuous(
breaks = 1:(cls - 1),
labels = na_col,
sec.axis = ggplot2::dup_axis(
labels = vrb,
name = "Column name"
)
) +
ggplot2::scale_y_reverse(
breaks = 1:(rws - 1),
labels = frq,
sec.axis = ggplot2::dup_axis(
labels = na_row,
name = "Number of missing entries\nper pattern"
# tidy the pattern
long <-
data.frame(.y = 1:(rws - 1), pat_clean, row.names = NULL) %>%
tidyr::pivot_longer(
cols = tidyselect::all_of(vrb),
names_to = "x",
values_to = ".where"
) %>%
dplyr::mutate(
.x = as.numeric(factor(
.data$x, levels = vrb, ordered = TRUE
)),
.where = factor(
.data$.where,
levels = c(0, 1),
labels = c("missing", "observed")
),
# TODO: always obs/always missing, add title, maybe make y axis prop to freq, add asterisk to clust var with caption that can tell that there is missingness in it
.opacity = as.numeric(.data$.opacity)
)
) +
ggplot2::labs(
x = "Number of missing entries\nper column",
y = "Pattern frequency",
fill = "",
alpha = ""
) +
theme_minimice()
if (square) {
gg <- gg + ggplot2::coord_fixed(expand = FALSE)
} else {
gg <- gg + ggplot2::coord_cartesian(expand = FALSE)
}
if (rotate) {
gg <- gg + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90))
}

return(gg)
}
# create the plot
gg <-
ggplot2::ggplot(
long,
ggplot2::aes(
.data$.x,
.data$.y,
fill = .data$.where,
alpha = 0.1 + .data$.opacity / 2
)
) +
ggplot2::geom_tile(color = "black") +
ggplot2::scale_fill_manual(values = c("observed" = "#006CC2B3", "missing" = "#B61A51B3")) +
ggplot2::scale_alpha_continuous(limits = c(0, 1), guide = "none") +
ggplot2::scale_x_continuous(
breaks = 1:(cls - 1),
labels = na_col,
sec.axis = ggplot2::dup_axis(labels = vrb,
name = "Column name")
) +
ggplot2::scale_y_reverse(
breaks = 1:(rws - 1),
labels = frq,
sec.axis = ggplot2::dup_axis(labels = na_row,
name = "Number of missing entries\nper pattern")
) +
ggplot2::labs(
x = "Number of missing entries\nper column",
y = "Pattern frequency",
fill = "",
alpha = ""
) +
theme_minimice()
if (square) {
gg <- gg + ggplot2::coord_fixed(expand = FALSE)
} else {
gg <- gg + ggplot2::coord_cartesian(expand = FALSE)
}
if (rotate) {
gg <-
gg + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90))
}

return(gg)
}

#' Utils function to process missing data pattern
#'
Expand All @@ -128,5 +158,6 @@ pat_to_chr <- function(pat, ord = NULL) {
if (is.null(ord)) {
ord <- colnames(pat)[-ncol(pat)]
}
apply(pat[-nrow(pat), ord], 1, function(x) paste(as.numeric(x), collapse = ""))
apply(pat[-nrow(pat), ord], 1, function(x)
paste(as.numeric(x), collapse = ""))
}

0 comments on commit daa3d6c

Please sign in to comment.