From b76c31d5a52850ec51d1a054a18c78219add044e Mon Sep 17 00:00:00 2001 From: Zihao Li Date: Mon, 22 Jan 2024 18:05:48 +0000 Subject: [PATCH 1/7] re-writing process_sc_all sds and alarm_telecare with data.table to improve the speed --- R/process_sc_all_alarms_telecare.R | 176 ++++++++++++----------- R/process_sc_all_sds.R | 222 +++++++++++++++-------------- 2 files changed, 204 insertions(+), 194 deletions(-) diff --git a/R/process_sc_all_alarms_telecare.R b/R/process_sc_all_alarms_telecare.R index 988d1f3e7..0e63ddbc4 100644 --- a/R/process_sc_all_alarms_telecare.R +++ b/R/process_sc_all_alarms_telecare.R @@ -17,97 +17,101 @@ process_sc_all_alarms_telecare <- function( write_to_disk = TRUE) { # Data Cleaning----------------------------------------------------- - replaced_dates <- data %>% - # If the end date is missing, set this to the end of the period - dplyr::mutate( - service_end_date = fix_sc_missing_end_dates( - .data$service_end_date, - .data$period_end_date - ), - # If the start_date is missing, set this to the start of the period - service_start_date = fix_sc_start_dates( - .data$service_start_date, - .data$period_start_date - ), - # Fix service_end_date if earlier than service_start_date by setting end_date to the end of fy - service_end_date = fix_sc_end_dates( - .data$service_start_date, - .data$service_end_date, - .data$period - ) - ) + # Convert to data.table + data.table::setDT(data) + data.table::setDT(sc_demog_lookup) + + # Fix dates and create new variables + data[, + service_end_date := fix_sc_missing_end_dates(service_end_date, + period_end_date)] + data[, + service_start_date := fix_sc_start_dates(service_start_date, + period_start_date)] + data[, + service_end_date := fix_sc_end_dates(service_start_date, + service_end_date, + period)] + + + # Rename columns + data.table::setnames( + data, + old = c("service_start_date", "service_end_date"), + new = c("record_keydate1", "record_keydate2") + ) + # Additional mutations + data[, + c("recid", + "smrtype", + "sc_send_lca") := list( + "AT", + data.table::fcase( + service_type == 1L, + "AT-Alarm", + service_type == 2L, + "AT-Tele", + default, + NA_character_ + ), + convert_sc_sending_location_to_lca(sending_location) + )] + data$person_id = paste0(data$sending_location, + "-", + data$social_care_id) - at_full_clean <- replaced_dates %>% - # rename for matching source variables - dplyr::rename( - record_keydate1 = "service_start_date", - record_keydate2 = "service_end_date" - ) %>% - # Include source variables - dplyr::mutate( - recid = "AT", - smrtype = dplyr::case_when( - .data$service_type == 1L ~ "AT-Alarm", - .data$service_type == 2L ~ "AT-Tele" - ), - # Create person id variable - person_id = stringr::str_glue("{sending_location}-{social_care_id}"), - # Use function for creating sc send lca variables - sc_send_lca = convert_sc_sending_location_to_lca(.data$sending_location) - ) %>% - # Match on demographics data (chi, gender, dob and postcode) - dplyr::left_join( - sc_demog_lookup, - by = c("sending_location", "social_care_id") - ) %>% - # when multiple social_care_id from sending_location for single CHI - # replace social_care_id with latest - replace_sc_id_with_latest() + # Join with sc_demog_lookup + data = sc_demog_lookup[data, on = .(sending_location, social_care_id)] - # Deal with episodes which have a package across quarters. - qtr_merge <- at_full_clean %>% - # use as.data.table to change the data format to data.table to accelerate - data.table::as.data.table() %>% - dplyr::group_by( - .data$sending_location, - .data$social_care_id, - .data$record_keydate1, - .data$smrtype, - .data$period - ) %>% - # Create a count for the package number across episodes - dplyr::mutate(pkg_count = dplyr::row_number()) %>% + # Replace social_care_id with latest if needed (assuming replace_sc_id_with_latest is a custom function) + data = replace_sc_id_with_latest(data) + + # Deal with episodes that have a package across quarters + data[, pkg_count := seq_len(.N), by = .(sending_location, + social_care_id, + record_keydate1, + smrtype, + period)] + + # Order data before summarizing + data = data %>% dplyr::group_by( + .data$sending_location, + .data$social_care_id, + .data$record_keydate1, + .data$smrtype, + .data$period + ) %>% # Sort prior to merging dplyr::arrange(.by_group = TRUE) %>% - # group for merging episodes - dplyr::group_by( - .data$sending_location, - .data$social_care_id, - .data$record_keydate1, - .data$smrtype, - .data$pkg_count - ) %>% - # merge episodes with packages across quarters - # drop variables not needed - dplyr::summarise( - sending_location = dplyr::last(.data$sending_location), - social_care_id = dplyr::last(.data$social_care_id), - sc_latest_submission = dplyr::last(.data$period), - record_keydate1 = dplyr::last(.data$record_keydate1), - record_keydate2 = dplyr::last(.data$record_keydate2), - smrtype = dplyr::last(.data$smrtype), - pkg_count = dplyr::last(.data$pkg_count), - chi = dplyr::last(.data$chi), - gender = dplyr::last(.data$gender), - dob = dplyr::last(.data$dob), - postcode = dplyr::last(.data$postcode), - recid = dplyr::last(.data$recid), - person_id = dplyr::last(.data$person_id), - sc_send_lca = dplyr::last(.data$sc_send_lca) - ) %>% - # change the data format from data.table to data.frame - tibble::as_tibble() + dplyr::ungroup() %>% + data.table::as.data.table() + + # Summarize to merge episodes + qtr_merge = data[, .( + sending_location = data.table::last(sending_location), + social_care_id = data.table::last(social_care_id), + sc_latest_submission = data.table::last(period), + record_keydate1 = data.table::last(record_keydate1), + record_keydate2 = data.table::last(record_keydate2), + smrtype = data.table::last(smrtype), + pkg_count = data.table::last(pkg_count), + chi = data.table::last(chi), + gender = data.table::last(gender), + dob = data.table::last(dob), + postcode = data.table::last(postcode), + recid = data.table::last(recid), + person_id = data.table::last(person_id), + sc_send_lca = data.table::last(sc_send_lca) + ), by = .(sending_location, + social_care_id, + record_keydate1, + smrtype, + pkg_count)] + + # Convert back to data.frame if necessary + qtr_merge = as.data.frame(qtr_merge) + if (write_to_disk) { write_file( diff --git a/R/process_sc_all_sds.R b/R/process_sc_all_sds.R index f9ca52f24..784ee3439 100644 --- a/R/process_sc_all_sds.R +++ b/R/process_sc_all_sds.R @@ -15,123 +15,129 @@ process_sc_all_sds <- function( sc_demog_lookup, write_to_disk = TRUE) { # Match on demographics data (chi, gender, dob and postcode) - matched_sds_data <- data %>% + matched_sds_data <- all_sds_extract %>% dplyr::left_join( sc_demog_lookup, by = c("sending_location", "social_care_id") ) %>% # when multiple social_care_id from sending_location for single CHI # replace social_care_id with latest - replace_sc_id_with_latest() - - # Data Cleaning --------------------------------------- - sds_full_clean <- matched_sds_data %>% - # Deal with SDS option 4 - # First turn the option flags into a logical T/F - dplyr::mutate(dplyr::across( - tidyselect::starts_with("sds_option_"), - ~ dplyr::case_when( - .x == 1L ~ TRUE, - .x == 0L ~ FALSE, - is.na(.x) ~ FALSE - ) - )) %>% - # SDS option 4 is derived when a person receives more than one option. - # e.g. if a person has options 1 and 2 then option 4 will be derived - dplyr::mutate( - sds_option_4 = rowSums( - dplyr::pick(tidyselect::starts_with("sds_option_")) - ) > 1L, - .after = .data$sds_option_3 - ) %>% - # If SDS start date is missing, assign start of FY - dplyr::mutate( - sds_start_date = fix_sc_start_dates( - .data$sds_start_date, - .data$sds_period_start_date - ), - # If SDS end date is missing, assign end of FY - sds_end_date = fix_sc_missing_end_dates( - .data$sds_end_date, - .data$sds_period_end_date - ), - # Fix sds_end_date is earlier than sds_start_date by setting end_date to be the end of fyear - sds_end_date = fix_sc_end_dates( - .data$sds_start_date, - .data$sds_end_date, - .data$period - ) - ) %>% - # rename for matching source variables - dplyr::rename( - record_keydate1 = .data$sds_start_date, - record_keydate2 = .data$sds_end_date - ) %>% - # Pivot longer on sds option variables - tidyr::pivot_longer( - cols = tidyselect::contains("sds_option_"), - names_to = "sds_option", - names_prefix = "sds_option_", - names_transform = list(sds_option = ~ paste0("SDS-", .x)), - values_to = "received" - ) %>% - # Only keep rows where they received a package and remove duplicates - dplyr::filter(.data$received) %>% - dplyr::distinct() %>% - # Include source variables + replace_sc_id_with_latest() %>% + # sds_options may contain only a few NA, replace NA by 0 dplyr::mutate( - smrtype = dplyr::case_when( - sds_option == "SDS-1" ~ "SDS-1", - sds_option == "SDS-2" ~ "SDS-2", - sds_option == "SDS-3" ~ "SDS-3", - sds_option == "SDS-4" ~ "SDS-4" - ), - recid = "SDS", - # Create person id variable - person_id = stringr::str_glue("{sending_location}-{social_care_id}"), - # Use function for creating sc send lca variables - sc_send_lca = convert_sc_sending_location_to_lca(.data$sending_location) + sds_option_1 = tidyr::replace_na(sds_option_1, 0), + sds_option_2 = tidyr::replace_na(sds_option_2, 0), + sds_option_3 = tidyr::replace_na(sds_option_3, 0) ) - final_data <- sds_full_clean %>% - # use as.data.table to change the data format to data.table to accelerate - data.table::as.data.table() %>% - dplyr::group_by( - .data$sending_location, - .data$social_care_id, - .data$smrtype - ) %>% - dplyr::arrange(.data$period, - .data$record_keydate1, - .by_group = TRUE - ) %>% - # Create a flag for episodes that are going to be merged - # Create an episode counter - dplyr::mutate( - distinct_episode = (.data$record_keydate1 > dplyr::lag(.data$record_keydate2)) %>% - tidyr::replace_na(TRUE), - episode_counter = cumsum(.data$distinct_episode) - ) %>% - # Group by episode counter and merge episodes - dplyr::group_by(.data$episode_counter, .add = TRUE) %>% - dplyr::summarise( - sc_latest_submission = dplyr::last(.data$period), - record_keydate1 = min(.data$record_keydate1), - record_keydate2 = max(.data$record_keydate2), - sending_location = dplyr::last(.data$sending_location), - social_care_id = dplyr::last(.data$social_care_id), - chi = dplyr::last(.data$chi), - gender = dplyr::last(.data$gender), - dob = dplyr::last(.data$dob), - postcode = dplyr::last(.data$postcode), - recid = dplyr::last(.data$recid), - person_id = dplyr::last(.data$person_id), - sc_send_lca = dplyr::last(.data$sc_send_lca) - ) %>% - dplyr::ungroup() %>% - dplyr::select(-.data$episode_counter) %>% - # change the data format from data.table to data.frame - tibble::as_tibble() + # Data Cleaning --------------------------------------- + # Convert matched_sds_data to data.table + sds_full_clean <- data.table::as.data.table(matched_sds_data) + rm(matched_sds_data) + + # Deal with SDS option 4 + # Convert option flags into logical T/F + cols_sds_option <- grep( + "^sds_option_", + names(sds_full_clean), + value = TRUE + ) + sds_full_clean[, (cols_sds_option) := lapply(.SD, function(x) { + data.table::fifelse(x == 1L, TRUE, FALSE) + }) + , .SDcols = cols_sds_option] + + # Derived SDS option 4 when a person receives more than one option + sds_full_clean[, + sds_option_4 := rowSums(.SD) > 1L, + .SDcols = cols_sds_option] + + # If SDS start date or end date is missing, assign start/end of FY + sds_full_clean[, + sds_start_date := fix_sc_start_dates(sds_start_date, sds_period_start_date)] + sds_full_clean[, + sds_end_date := fix_sc_missing_end_dates(sds_end_date, sds_period_end_date)] + sds_full_clean[, + sds_end_date := fix_sc_end_dates(sds_start_date, sds_end_date, period)] + + + + # Rename for matching source variables + data.table::setnames( + sds_full_clean, + c("sds_start_date", "sds_end_date"), + c("record_keydate1", "record_keydate2") + ) + + cols_sds_option <- grep( + "^sds_option_", + names(sds_full_clean), + value = TRUE + ) + # Pivot longer on sds option variables + sds_full_clean_long <- data.table::melt( + sds_full_clean, + id.vars = setdiff(names(sds_full_clean), cols_sds_option), + measure.vars = cols_sds_option, + variable.name = "sds_option", + value.name = "received" + ) + rm(sds_full_clean) + sds_full_clean_long <- sds_full_clean_long[received == TRUE, ] + sds_full_clean_long[, + sds_option := paste0("SDS-", sub("sds_option_", "", sds_option))] + + # Filter rows where they received a package and remove duplicates + sds_full_clean_long <- unique(sds_full_clean_long) + + # Include source variables + sds_full_clean_long[, c("smrtype", + "recid", + "sc_send_lca") := + list(sds_option, + "SDS", + convert_sc_sending_location_to_lca(sending_location))] + sds_full_clean_long$person_id = paste0(sds_full_clean_long$sending_location, + "-", + sds_full_clean_long$social_care_id) + + # Group, arrange and create flags for episodes + sds_full_clean_long[, + c("period_rank", + "record_keydate1_rank") := list(rank(period), + rank(record_keydate1)), + by = .(sending_location, social_care_id, smrtype)] + data.table::setorder(sds_full_clean_long, period_rank, record_keydate1_rank) + + sds_full_clean_long[, + distinct_episode := + (data.table::shift(record_keydate2, type = "lag") < record_keydate1) %>% + tidyr::replace_na(TRUE), + by = .(sending_location, social_care_id, smrtype)] + + sds_full_clean_long[, + episode_counter := + cumsum(distinct_episode), + by = .(sending_location, social_care_id, smrtype)] + + # Merge episodes by episode counter + final_data <- sds_full_clean_long[, .( + sc_latest_submission = data.table::last(period), + record_keydate1 = min(record_keydate1), + record_keydate2 = max(record_keydate2), + chi = data.table::last(chi), + gender = data.table::last(gender), + dob = data.table::last(dob), + postcode = data.table::last(postcode), + recid = data.table::last(recid), + person_id = data.table::last(person_id), + sc_send_lca = data.table::last(sc_send_lca) + ), by = .(sending_location, social_care_id, smrtype, episode_counter)] + rm(sds_full_clean_long) + + # Drop episode_counter and convert back to data.frame if needed + final_data <- as.data.frame(final_data[, -"episode_counter"]) + # final_data now holds the processed data in the format of a data.frame if (write_to_disk) { write_file( From 1996b2aa6f2da5c482c416a0003340c4c265b2c6 Mon Sep 17 00:00:00 2001 From: lizihao-anu Date: Mon, 22 Jan 2024 18:08:22 +0000 Subject: [PATCH 2/7] Update documentation --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 5123289dd..4bb0c6f18 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -73,4 +73,4 @@ Encoding: UTF-8 Language: en-GB LazyData: true Roxygen: list(markdown = TRUE) -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.0 From 13f5c8059901b951f4c5a46c2e4f39b346d39fff Mon Sep 17 00:00:00 2001 From: lizihao-anu Date: Mon, 22 Jan 2024 18:26:23 +0000 Subject: [PATCH 3/7] Style code --- R/create_individual_file.R | 3 +- R/get_fy_quarter_dates.R | 8 +- R/process_sc_all_alarms_telecare.R | 121 +++++++++++------- R/process_sc_all_sds.R | 87 ++++++++----- .../run_episode_file_1718.R | 3 +- .../run_episode_file_1819.R | 3 +- .../run_episode_file_1920.R | 3 +- .../run_episode_file_2021.R | 3 +- .../run_episode_file_2122.R | 3 +- .../run_episode_file_2223.R | 3 +- .../run_episode_file_2324.R | 3 +- 11 files changed, 147 insertions(+), 93 deletions(-) diff --git a/R/create_individual_file.R b/R/create_individual_file.R index d9316b41b..4ca2f96d7 100644 --- a/R/create_individual_file.R +++ b/R/create_individual_file.R @@ -483,7 +483,8 @@ add_ch_columns <- function(episode_file, prefix, condition) { ch_ep_end = dplyr::if_else( eval(condition), .data$record_keydate2, - lubridate::NA_Date_ ), + lubridate::NA_Date_ + ), # If end date is missing use the first day of next FY quarter ch_ep_end = dplyr::if_else( eval(condition) & is.na(.data$ch_ep_end), diff --git a/R/get_fy_quarter_dates.R b/R/get_fy_quarter_dates.R index cd4c3492c..a772099b8 100644 --- a/R/get_fy_quarter_dates.R +++ b/R/get_fy_quarter_dates.R @@ -15,7 +15,7 @@ start_fy_quarter <- function(quarter) { quarter_unique <- unique(quarter) - #check_quarter_format(quarter) + # check_quarter_format(quarter) cal_quarter_date_unique <- lubridate::yq(quarter_unique) @@ -47,7 +47,7 @@ start_fy_quarter <- function(quarter) { end_fy_quarter <- function(quarter) { quarter_unique <- unique(quarter) - #check_quarter_format(quarter) + # check_quarter_format(quarter) cal_quarter_date_unique <- lubridate::yq(quarter_unique) @@ -80,7 +80,7 @@ end_fy_quarter <- function(quarter) { start_next_fy_quarter <- function(quarter) { quarter_unique <- unique(quarter) - #check_quarter_format(quarter) + # check_quarter_format(quarter) cal_quarter_date_unique <- lubridate::yq(quarter_unique) @@ -112,7 +112,7 @@ start_next_fy_quarter <- function(quarter) { end_next_fy_quarter <- function(quarter) { quarter_unique <- unique(quarter) - #check_quarter_format(quarter) + # check_quarter_format(quarter) cal_quarter_date_unique <- lubridate::yq(quarter_unique) diff --git a/R/process_sc_all_alarms_telecare.R b/R/process_sc_all_alarms_telecare.R index 0e63ddbc4..bc417a8cd 100644 --- a/R/process_sc_all_alarms_telecare.R +++ b/R/process_sc_all_alarms_telecare.R @@ -22,16 +22,28 @@ process_sc_all_alarms_telecare <- function( data.table::setDT(sc_demog_lookup) # Fix dates and create new variables - data[, - service_end_date := fix_sc_missing_end_dates(service_end_date, - period_end_date)] - data[, - service_start_date := fix_sc_start_dates(service_start_date, - period_start_date)] - data[, - service_end_date := fix_sc_end_dates(service_start_date, - service_end_date, - period)] + data[ + , + service_end_date := fix_sc_missing_end_dates( + service_end_date, + period_end_date + ) + ] + data[ + , + service_start_date := fix_sc_start_dates( + service_start_date, + period_start_date + ) + ] + data[ + , + service_end_date := fix_sc_end_dates( + service_start_date, + service_end_date, + period + ) + ] # Rename columns @@ -42,53 +54,62 @@ process_sc_all_alarms_telecare <- function( ) # Additional mutations - data[, - c("recid", - "smrtype", - "sc_send_lca") := list( - "AT", - data.table::fcase( - service_type == 1L, - "AT-Alarm", - service_type == 2L, - "AT-Tele", - default, - NA_character_ - ), - convert_sc_sending_location_to_lca(sending_location) - )] - data$person_id = paste0(data$sending_location, - "-", - data$social_care_id) + data[ + , + c( + "recid", + "smrtype", + "sc_send_lca" + ) := list( + "AT", + data.table::fcase( + service_type == 1L, + "AT-Alarm", + service_type == 2L, + "AT-Tele", + default, + NA_character_ + ), + convert_sc_sending_location_to_lca(sending_location) + ) + ] + data$person_id <- paste0( + data$sending_location, + "-", + data$social_care_id + ) # Join with sc_demog_lookup - data = sc_demog_lookup[data, on = .(sending_location, social_care_id)] + data <- sc_demog_lookup[data, on = .(sending_location, social_care_id)] # Replace social_care_id with latest if needed (assuming replace_sc_id_with_latest is a custom function) - data = replace_sc_id_with_latest(data) + data <- replace_sc_id_with_latest(data) # Deal with episodes that have a package across quarters - data[, pkg_count := seq_len(.N), by = .(sending_location, - social_care_id, - record_keydate1, - smrtype, - period)] + data[, pkg_count := seq_len(.N), by = .( + sending_location, + social_care_id, + record_keydate1, + smrtype, + period + )] # Order data before summarizing - data = data %>% dplyr::group_by( - .data$sending_location, - .data$social_care_id, - .data$record_keydate1, - .data$smrtype, - .data$period - ) %>% + data <- data %>% + dplyr::group_by( + .data$sending_location, + .data$social_care_id, + .data$record_keydate1, + .data$smrtype, + .data$period + ) %>% # Sort prior to merging dplyr::arrange(.by_group = TRUE) %>% dplyr::ungroup() %>% data.table::as.data.table() # Summarize to merge episodes - qtr_merge = data[, .( + qtr_merge <- data[, .( sending_location = data.table::last(sending_location), social_care_id = data.table::last(social_care_id), sc_latest_submission = data.table::last(period), @@ -103,14 +124,16 @@ process_sc_all_alarms_telecare <- function( recid = data.table::last(recid), person_id = data.table::last(person_id), sc_send_lca = data.table::last(sc_send_lca) - ), by = .(sending_location, - social_care_id, - record_keydate1, - smrtype, - pkg_count)] + ), by = .( + sending_location, + social_care_id, + record_keydate1, + smrtype, + pkg_count + )] # Convert back to data.frame if necessary - qtr_merge = as.data.frame(qtr_merge) + qtr_merge <- as.data.frame(qtr_merge) if (write_to_disk) { diff --git a/R/process_sc_all_sds.R b/R/process_sc_all_sds.R index 784ee3439..65d5bef4e 100644 --- a/R/process_sc_all_sds.R +++ b/R/process_sc_all_sds.R @@ -44,21 +44,29 @@ process_sc_all_sds <- function( ) sds_full_clean[, (cols_sds_option) := lapply(.SD, function(x) { data.table::fifelse(x == 1L, TRUE, FALSE) - }) - , .SDcols = cols_sds_option] + }), + .SDcols = cols_sds_option + ] # Derived SDS option 4 when a person receives more than one option sds_full_clean[, - sds_option_4 := rowSums(.SD) > 1L, - .SDcols = cols_sds_option] + sds_option_4 := rowSums(.SD) > 1L, + .SDcols = cols_sds_option + ] # If SDS start date or end date is missing, assign start/end of FY - sds_full_clean[, - sds_start_date := fix_sc_start_dates(sds_start_date, sds_period_start_date)] - sds_full_clean[, - sds_end_date := fix_sc_missing_end_dates(sds_end_date, sds_period_end_date)] - sds_full_clean[, - sds_end_date := fix_sc_end_dates(sds_start_date, sds_end_date, period)] + sds_full_clean[ + , + sds_start_date := fix_sc_start_dates(sds_start_date, sds_period_start_date) + ] + sds_full_clean[ + , + sds_end_date := fix_sc_missing_end_dates(sds_end_date, sds_period_end_date) + ] + sds_full_clean[ + , + sds_end_date := fix_sc_end_dates(sds_start_date, sds_end_date, period) + ] @@ -84,41 +92,56 @@ process_sc_all_sds <- function( ) rm(sds_full_clean) sds_full_clean_long <- sds_full_clean_long[received == TRUE, ] - sds_full_clean_long[, - sds_option := paste0("SDS-", sub("sds_option_", "", sds_option))] + sds_full_clean_long[ + , + sds_option := paste0("SDS-", sub("sds_option_", "", sds_option)) + ] # Filter rows where they received a package and remove duplicates sds_full_clean_long <- unique(sds_full_clean_long) # Include source variables - sds_full_clean_long[, c("smrtype", - "recid", - "sc_send_lca") := - list(sds_option, - "SDS", - convert_sc_sending_location_to_lca(sending_location))] - sds_full_clean_long$person_id = paste0(sds_full_clean_long$sending_location, - "-", - sds_full_clean_long$social_care_id) + sds_full_clean_long[, c( + "smrtype", + "recid", + "sc_send_lca" + ) := + list( + sds_option, + "SDS", + convert_sc_sending_location_to_lca(sending_location) + )] + sds_full_clean_long$person_id <- paste0( + sds_full_clean_long$sending_location, + "-", + sds_full_clean_long$social_care_id + ) # Group, arrange and create flags for episodes sds_full_clean_long[, - c("period_rank", - "record_keydate1_rank") := list(rank(period), - rank(record_keydate1)), - by = .(sending_location, social_care_id, smrtype)] + c( + "period_rank", + "record_keydate1_rank" + ) := list( + rank(period), + rank(record_keydate1) + ), + by = .(sending_location, social_care_id, smrtype) + ] data.table::setorder(sds_full_clean_long, period_rank, record_keydate1_rank) sds_full_clean_long[, - distinct_episode := - (data.table::shift(record_keydate2, type = "lag") < record_keydate1) %>% - tidyr::replace_na(TRUE), - by = .(sending_location, social_care_id, smrtype)] + distinct_episode := + (data.table::shift(record_keydate2, type = "lag") < record_keydate1) %>% + tidyr::replace_na(TRUE), + by = .(sending_location, social_care_id, smrtype) + ] sds_full_clean_long[, - episode_counter := - cumsum(distinct_episode), - by = .(sending_location, social_care_id, smrtype)] + episode_counter := + cumsum(distinct_episode), + by = .(sending_location, social_care_id, smrtype) + ] # Merge episodes by episode counter final_data <- sds_full_clean_long[, .( diff --git a/Run_SLF_Files_manually/run_episode_file_1718.R b/Run_SLF_Files_manually/run_episode_file_1718.R index 9be2eb9c6..ab75b94d7 100644 --- a/Run_SLF_Files_manually/run_episode_file_1718.R +++ b/Run_SLF_Files_manually/run_episode_file_1718.R @@ -4,7 +4,8 @@ library(createslf) year <- "1718" processed_data_list <- targets::tar_read("processed_data_list_1718", - store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets")) + store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets") +) # Run episode file create_episode_file(processed_data_list, year = year) %>% diff --git a/Run_SLF_Files_manually/run_episode_file_1819.R b/Run_SLF_Files_manually/run_episode_file_1819.R index 7dec9e5c1..cd5a7435f 100644 --- a/Run_SLF_Files_manually/run_episode_file_1819.R +++ b/Run_SLF_Files_manually/run_episode_file_1819.R @@ -4,7 +4,8 @@ library(createslf) year <- "1819" processed_data_list <- targets::tar_read("processed_data_list_1819", - store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets")) + store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets") +) # Run episode file create_episode_file(processed_data_list, year = year) %>% diff --git a/Run_SLF_Files_manually/run_episode_file_1920.R b/Run_SLF_Files_manually/run_episode_file_1920.R index 066bd27b7..a9dc591b1 100644 --- a/Run_SLF_Files_manually/run_episode_file_1920.R +++ b/Run_SLF_Files_manually/run_episode_file_1920.R @@ -4,7 +4,8 @@ library(createslf) year <- "1920" processed_data_list <- targets::tar_read("processed_data_list_1920", - store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets")) + store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets") +) # Run episode file create_episode_file(processed_data_list, year = year) %>% diff --git a/Run_SLF_Files_manually/run_episode_file_2021.R b/Run_SLF_Files_manually/run_episode_file_2021.R index 8354f49ae..37708ee8b 100644 --- a/Run_SLF_Files_manually/run_episode_file_2021.R +++ b/Run_SLF_Files_manually/run_episode_file_2021.R @@ -4,7 +4,8 @@ library(createslf) year <- "2021" processed_data_list <- targets::tar_read("processed_data_list_2021", - store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets")) + store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets") +) # Run episode file create_episode_file(processed_data_list, year = year) %>% diff --git a/Run_SLF_Files_manually/run_episode_file_2122.R b/Run_SLF_Files_manually/run_episode_file_2122.R index 4057770d1..47400e2d1 100644 --- a/Run_SLF_Files_manually/run_episode_file_2122.R +++ b/Run_SLF_Files_manually/run_episode_file_2122.R @@ -4,7 +4,8 @@ library(createslf) year <- "2122" processed_data_list <- targets::tar_read("processed_data_list_2122", - store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets")) + store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets") +) # Run episode file create_episode_file(processed_data_list, year = year) %>% diff --git a/Run_SLF_Files_manually/run_episode_file_2223.R b/Run_SLF_Files_manually/run_episode_file_2223.R index 5df7b5db6..e64a57f32 100644 --- a/Run_SLF_Files_manually/run_episode_file_2223.R +++ b/Run_SLF_Files_manually/run_episode_file_2223.R @@ -4,7 +4,8 @@ library(createslf) year <- "2223" processed_data_list <- targets::tar_read("processed_data_list_2223", - store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets")) + store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets") +) # Run episode file create_episode_file(processed_data_list, year = year) %>% diff --git a/Run_SLF_Files_manually/run_episode_file_2324.R b/Run_SLF_Files_manually/run_episode_file_2324.R index af9a3efe5..4a7f0ad29 100644 --- a/Run_SLF_Files_manually/run_episode_file_2324.R +++ b/Run_SLF_Files_manually/run_episode_file_2324.R @@ -4,7 +4,8 @@ library(createslf) year <- "2324" processed_data_list <- targets::tar_read("processed_data_list_2324", - store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets")) + store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets") +) # Run episode file create_episode_file(processed_data_list, year = year) %>% From c1a6764d786bb5022aa057d73079a9a8483e8335 Mon Sep 17 00:00:00 2001 From: Zihao Li Date: Wed, 28 Feb 2024 10:41:28 +0000 Subject: [PATCH 4/7] changes in line with new process_sc_all_sds dplyr version --- R/process_sc_all_sds.R | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/R/process_sc_all_sds.R b/R/process_sc_all_sds.R index 65d5bef4e..23e542efd 100644 --- a/R/process_sc_all_sds.R +++ b/R/process_sc_all_sds.R @@ -15,14 +15,17 @@ process_sc_all_sds <- function( sc_demog_lookup, write_to_disk = TRUE) { # Match on demographics data (chi, gender, dob and postcode) - matched_sds_data <- all_sds_extract %>% - dplyr::left_join( + matched_sds_data <- data %>% + dplyr::filter(.data$sds_start_date_after_period_end_date != 1) %>% + dplyr::right_join( sc_demog_lookup, by = c("sending_location", "social_care_id") ) %>% # when multiple social_care_id from sending_location for single CHI # replace social_care_id with latest replace_sc_id_with_latest() %>% + dplyr::select(-sds_start_date_after_period_end_date) %>% + dplyr::distinct() %>% # sds_options may contain only a few NA, replace NA by 0 dplyr::mutate( sds_option_1 = tidyr::replace_na(sds_option_1, 0), @@ -65,10 +68,12 @@ process_sc_all_sds <- function( ] sds_full_clean[ , - sds_end_date := fix_sc_end_dates(sds_start_date, sds_end_date, period) + sds_end_date := fix_sc_end_dates(sds_start_date, sds_end_date, sds_period_end_date) ] - + sds_full_clean[, c("sds_period_start_date", + "sds_period_end_date", + "sds_start_date_after_end_date") := NULL] # Rename for matching source variables data.table::setnames( @@ -77,6 +82,8 @@ process_sc_all_sds <- function( c("record_keydate1", "record_keydate2") ) + sds_full_clean = unique(sds_full_clean) + cols_sds_option <- grep( "^sds_option_", names(sds_full_clean), @@ -121,14 +128,19 @@ process_sc_all_sds <- function( sds_full_clean_long[, c( "period_rank", - "record_keydate1_rank" + "record_keydate1_rank", + "record_keydate2_rank" ) := list( rank(period), - rank(record_keydate1) + rank(record_keydate1), + rank(record_keydate2) ), by = .(sending_location, social_care_id, smrtype) ] - data.table::setorder(sds_full_clean_long, period_rank, record_keydate1_rank) + data.table::setorder(sds_full_clean_long, + period_rank, + record_keydate1_rank, + record_keydate2_rank) sds_full_clean_long[, distinct_episode := @@ -138,8 +150,7 @@ process_sc_all_sds <- function( ] sds_full_clean_long[, - episode_counter := - cumsum(distinct_episode), + episode_counter := cumsum(distinct_episode), by = .(sending_location, social_care_id, smrtype) ] From dc08189eee7bb301274f277685225a941f8d5d33 Mon Sep 17 00:00:00 2001 From: lizihao-anu Date: Wed, 28 Feb 2024 10:44:57 +0000 Subject: [PATCH 5/7] Style code --- R/process_sc_all_sds.R | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/R/process_sc_all_sds.R b/R/process_sc_all_sds.R index 23e542efd..a1a1db24a 100644 --- a/R/process_sc_all_sds.R +++ b/R/process_sc_all_sds.R @@ -71,9 +71,11 @@ process_sc_all_sds <- function( sds_end_date := fix_sc_end_dates(sds_start_date, sds_end_date, sds_period_end_date) ] - sds_full_clean[, c("sds_period_start_date", - "sds_period_end_date", - "sds_start_date_after_end_date") := NULL] + sds_full_clean[, c( + "sds_period_start_date", + "sds_period_end_date", + "sds_start_date_after_end_date" + ) := NULL] # Rename for matching source variables data.table::setnames( @@ -82,7 +84,7 @@ process_sc_all_sds <- function( c("record_keydate1", "record_keydate2") ) - sds_full_clean = unique(sds_full_clean) + sds_full_clean <- unique(sds_full_clean) cols_sds_option <- grep( "^sds_option_", @@ -137,10 +139,12 @@ process_sc_all_sds <- function( ), by = .(sending_location, social_care_id, smrtype) ] - data.table::setorder(sds_full_clean_long, - period_rank, - record_keydate1_rank, - record_keydate2_rank) + data.table::setorder( + sds_full_clean_long, + period_rank, + record_keydate1_rank, + record_keydate2_rank + ) sds_full_clean_long[, distinct_episode := From da3989f5b986695d9b469db0cdf31af4125472c0 Mon Sep 17 00:00:00 2001 From: Zihao Li Date: Wed, 28 Feb 2024 11:03:08 +0000 Subject: [PATCH 6/7] remove duplicate columns --- R/process_sc_all_alarms_telecare.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/process_sc_all_alarms_telecare.R b/R/process_sc_all_alarms_telecare.R index bc417a8cd..08275c396 100644 --- a/R/process_sc_all_alarms_telecare.R +++ b/R/process_sc_all_alarms_telecare.R @@ -113,7 +113,6 @@ process_sc_all_alarms_telecare <- function( sending_location = data.table::last(sending_location), social_care_id = data.table::last(social_care_id), sc_latest_submission = data.table::last(period), - record_keydate1 = data.table::last(record_keydate1), record_keydate2 = data.table::last(record_keydate2), smrtype = data.table::last(smrtype), pkg_count = data.table::last(pkg_count), From 1d81977e054188402c59062ae87458b7a52f9677 Mon Sep 17 00:00:00 2001 From: Zihao Li Date: Wed, 28 Feb 2024 11:20:23 +0000 Subject: [PATCH 7/7] remove duplicated columns --- R/process_sc_all_alarms_telecare.R | 4 ---- 1 file changed, 4 deletions(-) diff --git a/R/process_sc_all_alarms_telecare.R b/R/process_sc_all_alarms_telecare.R index 08cb9faa7..0c24892d0 100644 --- a/R/process_sc_all_alarms_telecare.R +++ b/R/process_sc_all_alarms_telecare.R @@ -110,12 +110,8 @@ process_sc_all_alarms_telecare <- function( # Summarize to merge episodes qtr_merge <- data[, .( - sending_location = data.table::last(sending_location), - social_care_id = data.table::last(social_care_id), sc_latest_submission = data.table::last(period), record_keydate2 = data.table::last(record_keydate2), - smrtype = data.table::last(smrtype), - pkg_count = data.table::last(pkg_count), chi = data.table::last(chi), gender = data.table::last(gender), dob = data.table::last(dob),