From b76c31d5a52850ec51d1a054a18c78219add044e Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Mon, 22 Jan 2024 18:05:48 +0000
Subject: [PATCH 1/7] re-writing process_sc_all sds and alarm_telecare with
 data.table to improve the speed

---
 R/process_sc_all_alarms_telecare.R | 176 ++++++++++++-----------
 R/process_sc_all_sds.R             | 222 +++++++++++++++--------------
 2 files changed, 204 insertions(+), 194 deletions(-)

diff --git a/R/process_sc_all_alarms_telecare.R b/R/process_sc_all_alarms_telecare.R
index 988d1f3e7..0e63ddbc4 100644
--- a/R/process_sc_all_alarms_telecare.R
+++ b/R/process_sc_all_alarms_telecare.R
@@ -17,97 +17,101 @@ process_sc_all_alarms_telecare <- function(
     write_to_disk = TRUE) {
   # Data Cleaning-----------------------------------------------------
 
-  replaced_dates <- data %>%
-    # If the end date is missing, set this to the end of the period
-    dplyr::mutate(
-      service_end_date = fix_sc_missing_end_dates(
-        .data$service_end_date,
-        .data$period_end_date
-      ),
-      # If the start_date is missing, set this to the start of the period
-      service_start_date = fix_sc_start_dates(
-        .data$service_start_date,
-        .data$period_start_date
-      ),
-      # Fix service_end_date if earlier than service_start_date by setting end_date to the end of fy
-      service_end_date = fix_sc_end_dates(
-        .data$service_start_date,
-        .data$service_end_date,
-        .data$period
-      )
-    )
+  # Convert to data.table
+  data.table::setDT(data)
+  data.table::setDT(sc_demog_lookup)
+
+  # Fix dates and create new variables
+  data[,
+       service_end_date := fix_sc_missing_end_dates(service_end_date,
+                                                    period_end_date)]
+  data[,
+       service_start_date := fix_sc_start_dates(service_start_date,
+                                                period_start_date)]
+  data[,
+       service_end_date := fix_sc_end_dates(service_start_date,
+                                            service_end_date,
+                                            period)]
+
+
+  # Rename columns
+  data.table::setnames(
+    data,
+    old = c("service_start_date", "service_end_date"),
+    new = c("record_keydate1", "record_keydate2")
+  )
 
+  # Additional mutations
+  data[,
+       c("recid",
+         "smrtype",
+         "sc_send_lca") := list(
+           "AT",
+           data.table::fcase(
+             service_type == 1L,
+             "AT-Alarm",
+             service_type == 2L,
+             "AT-Tele",
+             default,
+             NA_character_
+           ),
+           convert_sc_sending_location_to_lca(sending_location)
+         )]
+  data$person_id = paste0(data$sending_location,
+                          "-",
+                          data$social_care_id)
 
-  at_full_clean <- replaced_dates %>%
-    # rename for matching source variables
-    dplyr::rename(
-      record_keydate1 = "service_start_date",
-      record_keydate2 = "service_end_date"
-    ) %>%
-    # Include source variables
-    dplyr::mutate(
-      recid = "AT",
-      smrtype = dplyr::case_when(
-        .data$service_type == 1L ~ "AT-Alarm",
-        .data$service_type == 2L ~ "AT-Tele"
-      ),
-      # Create person id variable
-      person_id = stringr::str_glue("{sending_location}-{social_care_id}"),
-      # Use function for creating sc send lca variables
-      sc_send_lca = convert_sc_sending_location_to_lca(.data$sending_location)
-    ) %>%
-    # Match on demographics data (chi, gender, dob and postcode)
-    dplyr::left_join(
-      sc_demog_lookup,
-      by = c("sending_location", "social_care_id")
-    ) %>%
-    # when multiple social_care_id from sending_location for single CHI
-    # replace social_care_id with latest
-    replace_sc_id_with_latest()
+  # Join with sc_demog_lookup
+  data = sc_demog_lookup[data, on = .(sending_location, social_care_id)]
 
-  # Deal with episodes which have a package across quarters.
-  qtr_merge <- at_full_clean %>%
-    # use as.data.table to change the data format to data.table to accelerate
-    data.table::as.data.table() %>%
-    dplyr::group_by(
-      .data$sending_location,
-      .data$social_care_id,
-      .data$record_keydate1,
-      .data$smrtype,
-      .data$period
-    ) %>%
-    # Create a count for the package number across episodes
-    dplyr::mutate(pkg_count = dplyr::row_number()) %>%
+  # Replace social_care_id with latest if needed (assuming replace_sc_id_with_latest is a custom function)
+  data = replace_sc_id_with_latest(data)
+
+  # Deal with episodes that have a package across quarters
+  data[, pkg_count := seq_len(.N), by = .(sending_location,
+                                          social_care_id,
+                                          record_keydate1,
+                                          smrtype,
+                                          period)]
+
+  # Order data before summarizing
+  data = data %>% dplyr::group_by(
+    .data$sending_location,
+    .data$social_care_id,
+    .data$record_keydate1,
+    .data$smrtype,
+    .data$period
+  ) %>%
     # Sort prior to merging
     dplyr::arrange(.by_group = TRUE) %>%
-    # group for merging episodes
-    dplyr::group_by(
-      .data$sending_location,
-      .data$social_care_id,
-      .data$record_keydate1,
-      .data$smrtype,
-      .data$pkg_count
-    ) %>%
-    # merge episodes with packages across quarters
-    # drop variables not needed
-    dplyr::summarise(
-      sending_location = dplyr::last(.data$sending_location),
-      social_care_id = dplyr::last(.data$social_care_id),
-      sc_latest_submission = dplyr::last(.data$period),
-      record_keydate1 = dplyr::last(.data$record_keydate1),
-      record_keydate2 = dplyr::last(.data$record_keydate2),
-      smrtype = dplyr::last(.data$smrtype),
-      pkg_count = dplyr::last(.data$pkg_count),
-      chi = dplyr::last(.data$chi),
-      gender = dplyr::last(.data$gender),
-      dob = dplyr::last(.data$dob),
-      postcode = dplyr::last(.data$postcode),
-      recid = dplyr::last(.data$recid),
-      person_id = dplyr::last(.data$person_id),
-      sc_send_lca = dplyr::last(.data$sc_send_lca)
-    ) %>%
-    # change the data format from data.table to data.frame
-    tibble::as_tibble()
+    dplyr::ungroup() %>%
+    data.table::as.data.table()
+
+  # Summarize to merge episodes
+  qtr_merge = data[, .(
+    sending_location = data.table::last(sending_location),
+    social_care_id = data.table::last(social_care_id),
+    sc_latest_submission = data.table::last(period),
+    record_keydate1 = data.table::last(record_keydate1),
+    record_keydate2 = data.table::last(record_keydate2),
+    smrtype = data.table::last(smrtype),
+    pkg_count = data.table::last(pkg_count),
+    chi = data.table::last(chi),
+    gender = data.table::last(gender),
+    dob = data.table::last(dob),
+    postcode = data.table::last(postcode),
+    recid = data.table::last(recid),
+    person_id = data.table::last(person_id),
+    sc_send_lca = data.table::last(sc_send_lca)
+  ), by = .(sending_location,
+            social_care_id,
+            record_keydate1,
+            smrtype,
+            pkg_count)]
+
+  # Convert back to data.frame if necessary
+  qtr_merge = as.data.frame(qtr_merge)
+
 
   if (write_to_disk) {
     write_file(
diff --git a/R/process_sc_all_sds.R b/R/process_sc_all_sds.R
index f9ca52f24..784ee3439 100644
--- a/R/process_sc_all_sds.R
+++ b/R/process_sc_all_sds.R
@@ -15,123 +15,129 @@ process_sc_all_sds <- function(
     sc_demog_lookup,
     write_to_disk = TRUE) {
   # Match on demographics data (chi, gender, dob and postcode)
-  matched_sds_data <- data %>%
+  matched_sds_data <- all_sds_extract %>%
     dplyr::left_join(
       sc_demog_lookup,
       by = c("sending_location", "social_care_id")
     ) %>%
     # when multiple social_care_id from sending_location for single CHI
     # replace social_care_id with latest
-    replace_sc_id_with_latest()
-
-  # Data Cleaning ---------------------------------------
-  sds_full_clean <- matched_sds_data %>%
-    # Deal with SDS option 4
-    # First turn the option flags into a logical T/F
-    dplyr::mutate(dplyr::across(
-      tidyselect::starts_with("sds_option_"),
-      ~ dplyr::case_when(
-        .x == 1L ~ TRUE,
-        .x == 0L ~ FALSE,
-        is.na(.x) ~ FALSE
-      )
-    )) %>%
-    # SDS option 4 is derived when a person receives more than one option.
-    # e.g. if a person has options 1 and 2 then option 4 will be derived
-    dplyr::mutate(
-      sds_option_4 = rowSums(
-        dplyr::pick(tidyselect::starts_with("sds_option_"))
-      ) > 1L,
-      .after = .data$sds_option_3
-    ) %>%
-    # If SDS start date is missing, assign start of FY
-    dplyr::mutate(
-      sds_start_date = fix_sc_start_dates(
-        .data$sds_start_date,
-        .data$sds_period_start_date
-      ),
-      # If SDS end date is missing, assign end of FY
-      sds_end_date = fix_sc_missing_end_dates(
-        .data$sds_end_date,
-        .data$sds_period_end_date
-      ),
-      # Fix sds_end_date is earlier than sds_start_date by setting end_date to be the end of fyear
-      sds_end_date = fix_sc_end_dates(
-        .data$sds_start_date,
-        .data$sds_end_date,
-        .data$period
-      )
-    ) %>%
-    # rename for matching source variables
-    dplyr::rename(
-      record_keydate1 = .data$sds_start_date,
-      record_keydate2 = .data$sds_end_date
-    ) %>%
-    # Pivot longer on sds option variables
-    tidyr::pivot_longer(
-      cols = tidyselect::contains("sds_option_"),
-      names_to = "sds_option",
-      names_prefix = "sds_option_",
-      names_transform = list(sds_option = ~ paste0("SDS-", .x)),
-      values_to = "received"
-    ) %>%
-    # Only keep rows where they received a package and remove duplicates
-    dplyr::filter(.data$received) %>%
-    dplyr::distinct() %>%
-    # Include source variables
+    replace_sc_id_with_latest() %>%
+    # sds_options may contain only a few NA, replace NA by 0
     dplyr::mutate(
-      smrtype = dplyr::case_when(
-        sds_option == "SDS-1" ~ "SDS-1",
-        sds_option == "SDS-2" ~ "SDS-2",
-        sds_option == "SDS-3" ~ "SDS-3",
-        sds_option == "SDS-4" ~ "SDS-4"
-      ),
-      recid = "SDS",
-      # Create person id variable
-      person_id = stringr::str_glue("{sending_location}-{social_care_id}"),
-      # Use function for creating sc send lca variables
-      sc_send_lca = convert_sc_sending_location_to_lca(.data$sending_location)
+      sds_option_1 = tidyr::replace_na(sds_option_1, 0),
+      sds_option_2 = tidyr::replace_na(sds_option_2, 0),
+      sds_option_3 = tidyr::replace_na(sds_option_3, 0)
     )
 
-  final_data <- sds_full_clean %>%
-    # use as.data.table to change the data format to data.table to accelerate
-    data.table::as.data.table() %>%
-    dplyr::group_by(
-      .data$sending_location,
-      .data$social_care_id,
-      .data$smrtype
-    ) %>%
-    dplyr::arrange(.data$period,
-      .data$record_keydate1,
-      .by_group = TRUE
-    ) %>%
-    # Create a flag for episodes that are going to be merged
-    # Create an episode counter
-    dplyr::mutate(
-      distinct_episode = (.data$record_keydate1 > dplyr::lag(.data$record_keydate2)) %>%
-        tidyr::replace_na(TRUE),
-      episode_counter = cumsum(.data$distinct_episode)
-    ) %>%
-    # Group by episode counter and merge episodes
-    dplyr::group_by(.data$episode_counter, .add = TRUE) %>%
-    dplyr::summarise(
-      sc_latest_submission = dplyr::last(.data$period),
-      record_keydate1 = min(.data$record_keydate1),
-      record_keydate2 = max(.data$record_keydate2),
-      sending_location = dplyr::last(.data$sending_location),
-      social_care_id = dplyr::last(.data$social_care_id),
-      chi = dplyr::last(.data$chi),
-      gender = dplyr::last(.data$gender),
-      dob = dplyr::last(.data$dob),
-      postcode = dplyr::last(.data$postcode),
-      recid = dplyr::last(.data$recid),
-      person_id = dplyr::last(.data$person_id),
-      sc_send_lca = dplyr::last(.data$sc_send_lca)
-    ) %>%
-    dplyr::ungroup() %>%
-    dplyr::select(-.data$episode_counter) %>%
-    # change the data format from data.table to data.frame
-    tibble::as_tibble()
+  # Data Cleaning ---------------------------------------
+  # Convert matched_sds_data to data.table
+  sds_full_clean <- data.table::as.data.table(matched_sds_data)
+  rm(matched_sds_data)
+
+  # Deal with SDS option 4
+  # Convert option flags into logical T/F
+  cols_sds_option <- grep(
+    "^sds_option_",
+    names(sds_full_clean),
+    value = TRUE
+  )
+  sds_full_clean[, (cols_sds_option) := lapply(.SD, function(x) {
+    data.table::fifelse(x == 1L, TRUE, FALSE)
+  })
+  , .SDcols = cols_sds_option]
+
+  # Derived SDS option 4 when a person receives more than one option
+  sds_full_clean[,
+                 sds_option_4 := rowSums(.SD) > 1L,
+                 .SDcols = cols_sds_option]
+
+  # If SDS start date or end date is missing, assign start/end of FY
+  sds_full_clean[,
+                 sds_start_date := fix_sc_start_dates(sds_start_date, sds_period_start_date)]
+  sds_full_clean[,
+                 sds_end_date := fix_sc_missing_end_dates(sds_end_date, sds_period_end_date)]
+  sds_full_clean[,
+                 sds_end_date := fix_sc_end_dates(sds_start_date, sds_end_date, period)]
+
+
+
+  # Rename for matching source variables
+  data.table::setnames(
+    sds_full_clean,
+    c("sds_start_date", "sds_end_date"),
+    c("record_keydate1", "record_keydate2")
+  )
+
+  cols_sds_option <- grep(
+    "^sds_option_",
+    names(sds_full_clean),
+    value = TRUE
+  )
+  # Pivot longer on sds option variables
+  sds_full_clean_long <- data.table::melt(
+    sds_full_clean,
+    id.vars = setdiff(names(sds_full_clean), cols_sds_option),
+    measure.vars = cols_sds_option,
+    variable.name = "sds_option",
+    value.name = "received"
+  )
+  rm(sds_full_clean)
+  sds_full_clean_long <- sds_full_clean_long[received == TRUE, ]
+  sds_full_clean_long[,
+                      sds_option := paste0("SDS-", sub("sds_option_", "", sds_option))]
+
+  # Filter rows where they received a package and remove duplicates
+  sds_full_clean_long <- unique(sds_full_clean_long)
+
+  # Include source variables
+  sds_full_clean_long[, c("smrtype",
+                          "recid",
+                          "sc_send_lca") :=
+                        list(sds_option,
+                             "SDS",
+                             convert_sc_sending_location_to_lca(sending_location))]
+  sds_full_clean_long$person_id = paste0(sds_full_clean_long$sending_location,
+                                         "-",
+                                         sds_full_clean_long$social_care_id)
+
+  # Group, arrange and create flags for episodes
+  sds_full_clean_long[,
+                      c("period_rank",
+                        "record_keydate1_rank") := list(rank(period),
+                                                        rank(record_keydate1)),
+                      by = .(sending_location, social_care_id, smrtype)]
+  data.table::setorder(sds_full_clean_long, period_rank, record_keydate1_rank)
+
+  sds_full_clean_long[,
+                      distinct_episode :=
+                        (data.table::shift(record_keydate2, type = "lag") < record_keydate1) %>%
+                        tidyr::replace_na(TRUE),
+                      by = .(sending_location, social_care_id, smrtype)]
+
+  sds_full_clean_long[,
+                      episode_counter :=
+                        cumsum(distinct_episode),
+                      by = .(sending_location, social_care_id, smrtype)]
+
+  # Merge episodes by episode counter
+  final_data <- sds_full_clean_long[, .(
+    sc_latest_submission = data.table::last(period),
+    record_keydate1 = min(record_keydate1),
+    record_keydate2 = max(record_keydate2),
+    chi = data.table::last(chi),
+    gender = data.table::last(gender),
+    dob = data.table::last(dob),
+    postcode = data.table::last(postcode),
+    recid = data.table::last(recid),
+    person_id = data.table::last(person_id),
+    sc_send_lca = data.table::last(sc_send_lca)
+  ), by = .(sending_location, social_care_id, smrtype, episode_counter)]
+  rm(sds_full_clean_long)
+
+  # Drop episode_counter and convert back to data.frame if needed
+  final_data <- as.data.frame(final_data[, -"episode_counter"])
+  # final_data now holds the processed data in the format of a data.frame
 
   if (write_to_disk) {
     write_file(

From 1996b2aa6f2da5c482c416a0003340c4c265b2c6 Mon Sep 17 00:00:00 2001
From: lizihao-anu <lizihao-anu@users.noreply.github.com>
Date: Mon, 22 Jan 2024 18:08:22 +0000
Subject: [PATCH 2/7] Update documentation

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 5123289dd..4bb0c6f18 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -73,4 +73,4 @@ Encoding: UTF-8
 Language: en-GB
 LazyData: true
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.0

From 13f5c8059901b951f4c5a46c2e4f39b346d39fff Mon Sep 17 00:00:00 2001
From: lizihao-anu <lizihao-anu@users.noreply.github.com>
Date: Mon, 22 Jan 2024 18:26:23 +0000
Subject: [PATCH 3/7] Style code

---
 R/create_individual_file.R                    |   3 +-
 R/get_fy_quarter_dates.R                      |   8 +-
 R/process_sc_all_alarms_telecare.R            | 121 +++++++++++-------
 R/process_sc_all_sds.R                        |  87 ++++++++-----
 .../run_episode_file_1718.R                   |   3 +-
 .../run_episode_file_1819.R                   |   3 +-
 .../run_episode_file_1920.R                   |   3 +-
 .../run_episode_file_2021.R                   |   3 +-
 .../run_episode_file_2122.R                   |   3 +-
 .../run_episode_file_2223.R                   |   3 +-
 .../run_episode_file_2324.R                   |   3 +-
 11 files changed, 147 insertions(+), 93 deletions(-)

diff --git a/R/create_individual_file.R b/R/create_individual_file.R
index d9316b41b..4ca2f96d7 100644
--- a/R/create_individual_file.R
+++ b/R/create_individual_file.R
@@ -483,7 +483,8 @@ add_ch_columns <- function(episode_file, prefix, condition) {
       ch_ep_end = dplyr::if_else(
         eval(condition),
         .data$record_keydate2,
-        lubridate::NA_Date_  ),
+        lubridate::NA_Date_
+      ),
       # If end date is missing use the first day of next FY quarter
       ch_ep_end = dplyr::if_else(
         eval(condition) & is.na(.data$ch_ep_end),
diff --git a/R/get_fy_quarter_dates.R b/R/get_fy_quarter_dates.R
index cd4c3492c..a772099b8 100644
--- a/R/get_fy_quarter_dates.R
+++ b/R/get_fy_quarter_dates.R
@@ -15,7 +15,7 @@
 start_fy_quarter <- function(quarter) {
   quarter_unique <- unique(quarter)
 
-  #check_quarter_format(quarter)
+  # check_quarter_format(quarter)
 
   cal_quarter_date_unique <- lubridate::yq(quarter_unique)
 
@@ -47,7 +47,7 @@ start_fy_quarter <- function(quarter) {
 end_fy_quarter <- function(quarter) {
   quarter_unique <- unique(quarter)
 
-  #check_quarter_format(quarter)
+  # check_quarter_format(quarter)
 
   cal_quarter_date_unique <- lubridate::yq(quarter_unique)
 
@@ -80,7 +80,7 @@ end_fy_quarter <- function(quarter) {
 start_next_fy_quarter <- function(quarter) {
   quarter_unique <- unique(quarter)
 
-  #check_quarter_format(quarter)
+  # check_quarter_format(quarter)
 
   cal_quarter_date_unique <- lubridate::yq(quarter_unique)
 
@@ -112,7 +112,7 @@ start_next_fy_quarter <- function(quarter) {
 end_next_fy_quarter <- function(quarter) {
   quarter_unique <- unique(quarter)
 
-  #check_quarter_format(quarter)
+  # check_quarter_format(quarter)
 
   cal_quarter_date_unique <- lubridate::yq(quarter_unique)
 
diff --git a/R/process_sc_all_alarms_telecare.R b/R/process_sc_all_alarms_telecare.R
index 0e63ddbc4..bc417a8cd 100644
--- a/R/process_sc_all_alarms_telecare.R
+++ b/R/process_sc_all_alarms_telecare.R
@@ -22,16 +22,28 @@ process_sc_all_alarms_telecare <- function(
   data.table::setDT(sc_demog_lookup)
 
   # Fix dates and create new variables
-  data[,
-       service_end_date := fix_sc_missing_end_dates(service_end_date,
-                                                    period_end_date)]
-  data[,
-       service_start_date := fix_sc_start_dates(service_start_date,
-                                                period_start_date)]
-  data[,
-       service_end_date := fix_sc_end_dates(service_start_date,
-                                            service_end_date,
-                                            period)]
+  data[
+    ,
+    service_end_date := fix_sc_missing_end_dates(
+      service_end_date,
+      period_end_date
+    )
+  ]
+  data[
+    ,
+    service_start_date := fix_sc_start_dates(
+      service_start_date,
+      period_start_date
+    )
+  ]
+  data[
+    ,
+    service_end_date := fix_sc_end_dates(
+      service_start_date,
+      service_end_date,
+      period
+    )
+  ]
 
 
   # Rename columns
@@ -42,53 +54,62 @@ process_sc_all_alarms_telecare <- function(
   )
 
   # Additional mutations
-  data[,
-       c("recid",
-         "smrtype",
-         "sc_send_lca") := list(
-           "AT",
-           data.table::fcase(
-             service_type == 1L,
-             "AT-Alarm",
-             service_type == 2L,
-             "AT-Tele",
-             default,
-             NA_character_
-           ),
-           convert_sc_sending_location_to_lca(sending_location)
-         )]
-  data$person_id = paste0(data$sending_location,
-                          "-",
-                          data$social_care_id)
+  data[
+    ,
+    c(
+      "recid",
+      "smrtype",
+      "sc_send_lca"
+    ) := list(
+      "AT",
+      data.table::fcase(
+        service_type == 1L,
+        "AT-Alarm",
+        service_type == 2L,
+        "AT-Tele",
+        default,
+        NA_character_
+      ),
+      convert_sc_sending_location_to_lca(sending_location)
+    )
+  ]
+  data$person_id <- paste0(
+    data$sending_location,
+    "-",
+    data$social_care_id
+  )
 
   # Join with sc_demog_lookup
-  data = sc_demog_lookup[data, on = .(sending_location, social_care_id)]
+  data <- sc_demog_lookup[data, on = .(sending_location, social_care_id)]
 
   # Replace social_care_id with latest if needed (assuming replace_sc_id_with_latest is a custom function)
-  data = replace_sc_id_with_latest(data)
+  data <- replace_sc_id_with_latest(data)
 
   # Deal with episodes that have a package across quarters
-  data[, pkg_count := seq_len(.N), by = .(sending_location,
-                                          social_care_id,
-                                          record_keydate1,
-                                          smrtype,
-                                          period)]
+  data[, pkg_count := seq_len(.N), by = .(
+    sending_location,
+    social_care_id,
+    record_keydate1,
+    smrtype,
+    period
+  )]
 
   # Order data before summarizing
-  data = data %>% dplyr::group_by(
-    .data$sending_location,
-    .data$social_care_id,
-    .data$record_keydate1,
-    .data$smrtype,
-    .data$period
-  ) %>%
+  data <- data %>%
+    dplyr::group_by(
+      .data$sending_location,
+      .data$social_care_id,
+      .data$record_keydate1,
+      .data$smrtype,
+      .data$period
+    ) %>%
     # Sort prior to merging
     dplyr::arrange(.by_group = TRUE) %>%
     dplyr::ungroup() %>%
     data.table::as.data.table()
 
   # Summarize to merge episodes
-  qtr_merge = data[, .(
+  qtr_merge <- data[, .(
     sending_location = data.table::last(sending_location),
     social_care_id = data.table::last(social_care_id),
     sc_latest_submission = data.table::last(period),
@@ -103,14 +124,16 @@ process_sc_all_alarms_telecare <- function(
     recid = data.table::last(recid),
     person_id = data.table::last(person_id),
     sc_send_lca = data.table::last(sc_send_lca)
-  ), by = .(sending_location,
-            social_care_id,
-            record_keydate1,
-            smrtype,
-            pkg_count)]
+  ), by = .(
+    sending_location,
+    social_care_id,
+    record_keydate1,
+    smrtype,
+    pkg_count
+  )]
 
   # Convert back to data.frame if necessary
-  qtr_merge = as.data.frame(qtr_merge)
+  qtr_merge <- as.data.frame(qtr_merge)
 
 
   if (write_to_disk) {
diff --git a/R/process_sc_all_sds.R b/R/process_sc_all_sds.R
index 784ee3439..65d5bef4e 100644
--- a/R/process_sc_all_sds.R
+++ b/R/process_sc_all_sds.R
@@ -44,21 +44,29 @@ process_sc_all_sds <- function(
   )
   sds_full_clean[, (cols_sds_option) := lapply(.SD, function(x) {
     data.table::fifelse(x == 1L, TRUE, FALSE)
-  })
-  , .SDcols = cols_sds_option]
+  }),
+  .SDcols = cols_sds_option
+  ]
 
   # Derived SDS option 4 when a person receives more than one option
   sds_full_clean[,
-                 sds_option_4 := rowSums(.SD) > 1L,
-                 .SDcols = cols_sds_option]
+    sds_option_4 := rowSums(.SD) > 1L,
+    .SDcols = cols_sds_option
+  ]
 
   # If SDS start date or end date is missing, assign start/end of FY
-  sds_full_clean[,
-                 sds_start_date := fix_sc_start_dates(sds_start_date, sds_period_start_date)]
-  sds_full_clean[,
-                 sds_end_date := fix_sc_missing_end_dates(sds_end_date, sds_period_end_date)]
-  sds_full_clean[,
-                 sds_end_date := fix_sc_end_dates(sds_start_date, sds_end_date, period)]
+  sds_full_clean[
+    ,
+    sds_start_date := fix_sc_start_dates(sds_start_date, sds_period_start_date)
+  ]
+  sds_full_clean[
+    ,
+    sds_end_date := fix_sc_missing_end_dates(sds_end_date, sds_period_end_date)
+  ]
+  sds_full_clean[
+    ,
+    sds_end_date := fix_sc_end_dates(sds_start_date, sds_end_date, period)
+  ]
 
 
 
@@ -84,41 +92,56 @@ process_sc_all_sds <- function(
   )
   rm(sds_full_clean)
   sds_full_clean_long <- sds_full_clean_long[received == TRUE, ]
-  sds_full_clean_long[,
-                      sds_option := paste0("SDS-", sub("sds_option_", "", sds_option))]
+  sds_full_clean_long[
+    ,
+    sds_option := paste0("SDS-", sub("sds_option_", "", sds_option))
+  ]
 
   # Filter rows where they received a package and remove duplicates
   sds_full_clean_long <- unique(sds_full_clean_long)
 
   # Include source variables
-  sds_full_clean_long[, c("smrtype",
-                          "recid",
-                          "sc_send_lca") :=
-                        list(sds_option,
-                             "SDS",
-                             convert_sc_sending_location_to_lca(sending_location))]
-  sds_full_clean_long$person_id = paste0(sds_full_clean_long$sending_location,
-                                         "-",
-                                         sds_full_clean_long$social_care_id)
+  sds_full_clean_long[, c(
+    "smrtype",
+    "recid",
+    "sc_send_lca"
+  ) :=
+    list(
+      sds_option,
+      "SDS",
+      convert_sc_sending_location_to_lca(sending_location)
+    )]
+  sds_full_clean_long$person_id <- paste0(
+    sds_full_clean_long$sending_location,
+    "-",
+    sds_full_clean_long$social_care_id
+  )
 
   # Group, arrange and create flags for episodes
   sds_full_clean_long[,
-                      c("period_rank",
-                        "record_keydate1_rank") := list(rank(period),
-                                                        rank(record_keydate1)),
-                      by = .(sending_location, social_care_id, smrtype)]
+    c(
+      "period_rank",
+      "record_keydate1_rank"
+    ) := list(
+      rank(period),
+      rank(record_keydate1)
+    ),
+    by = .(sending_location, social_care_id, smrtype)
+  ]
   data.table::setorder(sds_full_clean_long, period_rank, record_keydate1_rank)
 
   sds_full_clean_long[,
-                      distinct_episode :=
-                        (data.table::shift(record_keydate2, type = "lag") < record_keydate1) %>%
-                        tidyr::replace_na(TRUE),
-                      by = .(sending_location, social_care_id, smrtype)]
+    distinct_episode :=
+      (data.table::shift(record_keydate2, type = "lag") < record_keydate1) %>%
+      tidyr::replace_na(TRUE),
+    by = .(sending_location, social_care_id, smrtype)
+  ]
 
   sds_full_clean_long[,
-                      episode_counter :=
-                        cumsum(distinct_episode),
-                      by = .(sending_location, social_care_id, smrtype)]
+    episode_counter :=
+      cumsum(distinct_episode),
+    by = .(sending_location, social_care_id, smrtype)
+  ]
 
   # Merge episodes by episode counter
   final_data <- sds_full_clean_long[, .(
diff --git a/Run_SLF_Files_manually/run_episode_file_1718.R b/Run_SLF_Files_manually/run_episode_file_1718.R
index 9be2eb9c6..ab75b94d7 100644
--- a/Run_SLF_Files_manually/run_episode_file_1718.R
+++ b/Run_SLF_Files_manually/run_episode_file_1718.R
@@ -4,7 +4,8 @@ library(createslf)
 year <- "1718"
 
 processed_data_list <- targets::tar_read("processed_data_list_1718",
-                                         store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets"))
+  store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets")
+)
 
 # Run episode file
 create_episode_file(processed_data_list, year = year) %>%
diff --git a/Run_SLF_Files_manually/run_episode_file_1819.R b/Run_SLF_Files_manually/run_episode_file_1819.R
index 7dec9e5c1..cd5a7435f 100644
--- a/Run_SLF_Files_manually/run_episode_file_1819.R
+++ b/Run_SLF_Files_manually/run_episode_file_1819.R
@@ -4,7 +4,8 @@ library(createslf)
 year <- "1819"
 
 processed_data_list <- targets::tar_read("processed_data_list_1819",
-                                         store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets"))
+  store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets")
+)
 
 # Run episode file
 create_episode_file(processed_data_list, year = year) %>%
diff --git a/Run_SLF_Files_manually/run_episode_file_1920.R b/Run_SLF_Files_manually/run_episode_file_1920.R
index 066bd27b7..a9dc591b1 100644
--- a/Run_SLF_Files_manually/run_episode_file_1920.R
+++ b/Run_SLF_Files_manually/run_episode_file_1920.R
@@ -4,7 +4,8 @@ library(createslf)
 year <- "1920"
 
 processed_data_list <- targets::tar_read("processed_data_list_1920",
-                                         store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets"))
+  store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets")
+)
 
 # Run episode file
 create_episode_file(processed_data_list, year = year) %>%
diff --git a/Run_SLF_Files_manually/run_episode_file_2021.R b/Run_SLF_Files_manually/run_episode_file_2021.R
index 8354f49ae..37708ee8b 100644
--- a/Run_SLF_Files_manually/run_episode_file_2021.R
+++ b/Run_SLF_Files_manually/run_episode_file_2021.R
@@ -4,7 +4,8 @@ library(createslf)
 year <- "2021"
 
 processed_data_list <- targets::tar_read("processed_data_list_2021",
-                                         store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets"))
+  store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets")
+)
 
 # Run episode file
 create_episode_file(processed_data_list, year = year) %>%
diff --git a/Run_SLF_Files_manually/run_episode_file_2122.R b/Run_SLF_Files_manually/run_episode_file_2122.R
index 4057770d1..47400e2d1 100644
--- a/Run_SLF_Files_manually/run_episode_file_2122.R
+++ b/Run_SLF_Files_manually/run_episode_file_2122.R
@@ -4,7 +4,8 @@ library(createslf)
 year <- "2122"
 
 processed_data_list <- targets::tar_read("processed_data_list_2122",
-                                         store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets"))
+  store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets")
+)
 
 # Run episode file
 create_episode_file(processed_data_list, year = year) %>%
diff --git a/Run_SLF_Files_manually/run_episode_file_2223.R b/Run_SLF_Files_manually/run_episode_file_2223.R
index 5df7b5db6..e64a57f32 100644
--- a/Run_SLF_Files_manually/run_episode_file_2223.R
+++ b/Run_SLF_Files_manually/run_episode_file_2223.R
@@ -4,7 +4,8 @@ library(createslf)
 year <- "2223"
 
 processed_data_list <- targets::tar_read("processed_data_list_2223",
-                      store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets"))
+  store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets")
+)
 
 # Run episode file
 create_episode_file(processed_data_list, year = year) %>%
diff --git a/Run_SLF_Files_manually/run_episode_file_2324.R b/Run_SLF_Files_manually/run_episode_file_2324.R
index af9a3efe5..4a7f0ad29 100644
--- a/Run_SLF_Files_manually/run_episode_file_2324.R
+++ b/Run_SLF_Files_manually/run_episode_file_2324.R
@@ -4,7 +4,8 @@ library(createslf)
 year <- "2324"
 
 processed_data_list <- targets::tar_read("processed_data_list_2324",
-                                         store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets"))
+  store = fs::path("/conf/sourcedev/Source_Linkage_File_Updates/", "_targets")
+)
 
 # Run episode file
 create_episode_file(processed_data_list, year = year) %>%

From c1a6764d786bb5022aa057d73079a9a8483e8335 Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Wed, 28 Feb 2024 10:41:28 +0000
Subject: [PATCH 4/7] changes in line with new process_sc_all_sds dplyr version

---
 R/process_sc_all_sds.R | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/R/process_sc_all_sds.R b/R/process_sc_all_sds.R
index 65d5bef4e..23e542efd 100644
--- a/R/process_sc_all_sds.R
+++ b/R/process_sc_all_sds.R
@@ -15,14 +15,17 @@ process_sc_all_sds <- function(
     sc_demog_lookup,
     write_to_disk = TRUE) {
   # Match on demographics data (chi, gender, dob and postcode)
-  matched_sds_data <- all_sds_extract %>%
-    dplyr::left_join(
+  matched_sds_data <- data %>%
+    dplyr::filter(.data$sds_start_date_after_period_end_date != 1) %>%
+    dplyr::right_join(
       sc_demog_lookup,
       by = c("sending_location", "social_care_id")
     ) %>%
     # when multiple social_care_id from sending_location for single CHI
     # replace social_care_id with latest
     replace_sc_id_with_latest() %>%
+    dplyr::select(-sds_start_date_after_period_end_date) %>%
+    dplyr::distinct() %>%
     # sds_options may contain only a few NA, replace NA by 0
     dplyr::mutate(
       sds_option_1 = tidyr::replace_na(sds_option_1, 0),
@@ -65,10 +68,12 @@ process_sc_all_sds <- function(
   ]
   sds_full_clean[
     ,
-    sds_end_date := fix_sc_end_dates(sds_start_date, sds_end_date, period)
+    sds_end_date := fix_sc_end_dates(sds_start_date, sds_end_date, sds_period_end_date)
   ]
 
-
+  sds_full_clean[, c("sds_period_start_date",
+                     "sds_period_end_date",
+                     "sds_start_date_after_end_date") := NULL]
 
   # Rename for matching source variables
   data.table::setnames(
@@ -77,6 +82,8 @@ process_sc_all_sds <- function(
     c("record_keydate1", "record_keydate2")
   )
 
+  sds_full_clean = unique(sds_full_clean)
+
   cols_sds_option <- grep(
     "^sds_option_",
     names(sds_full_clean),
@@ -121,14 +128,19 @@ process_sc_all_sds <- function(
   sds_full_clean_long[,
     c(
       "period_rank",
-      "record_keydate1_rank"
+      "record_keydate1_rank",
+      "record_keydate2_rank"
     ) := list(
       rank(period),
-      rank(record_keydate1)
+      rank(record_keydate1),
+      rank(record_keydate2)
     ),
     by = .(sending_location, social_care_id, smrtype)
   ]
-  data.table::setorder(sds_full_clean_long, period_rank, record_keydate1_rank)
+  data.table::setorder(sds_full_clean_long,
+                       period_rank,
+                       record_keydate1_rank,
+                       record_keydate2_rank)
 
   sds_full_clean_long[,
     distinct_episode :=
@@ -138,8 +150,7 @@ process_sc_all_sds <- function(
   ]
 
   sds_full_clean_long[,
-    episode_counter :=
-      cumsum(distinct_episode),
+    episode_counter := cumsum(distinct_episode),
     by = .(sending_location, social_care_id, smrtype)
   ]
 

From dc08189eee7bb301274f277685225a941f8d5d33 Mon Sep 17 00:00:00 2001
From: lizihao-anu <lizihao-anu@users.noreply.github.com>
Date: Wed, 28 Feb 2024 10:44:57 +0000
Subject: [PATCH 5/7] Style code

---
 R/process_sc_all_sds.R | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/R/process_sc_all_sds.R b/R/process_sc_all_sds.R
index 23e542efd..a1a1db24a 100644
--- a/R/process_sc_all_sds.R
+++ b/R/process_sc_all_sds.R
@@ -71,9 +71,11 @@ process_sc_all_sds <- function(
     sds_end_date := fix_sc_end_dates(sds_start_date, sds_end_date, sds_period_end_date)
   ]
 
-  sds_full_clean[, c("sds_period_start_date",
-                     "sds_period_end_date",
-                     "sds_start_date_after_end_date") := NULL]
+  sds_full_clean[, c(
+    "sds_period_start_date",
+    "sds_period_end_date",
+    "sds_start_date_after_end_date"
+  ) := NULL]
 
   # Rename for matching source variables
   data.table::setnames(
@@ -82,7 +84,7 @@ process_sc_all_sds <- function(
     c("record_keydate1", "record_keydate2")
   )
 
-  sds_full_clean = unique(sds_full_clean)
+  sds_full_clean <- unique(sds_full_clean)
 
   cols_sds_option <- grep(
     "^sds_option_",
@@ -137,10 +139,12 @@ process_sc_all_sds <- function(
     ),
     by = .(sending_location, social_care_id, smrtype)
   ]
-  data.table::setorder(sds_full_clean_long,
-                       period_rank,
-                       record_keydate1_rank,
-                       record_keydate2_rank)
+  data.table::setorder(
+    sds_full_clean_long,
+    period_rank,
+    record_keydate1_rank,
+    record_keydate2_rank
+  )
 
   sds_full_clean_long[,
     distinct_episode :=

From da3989f5b986695d9b469db0cdf31af4125472c0 Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Wed, 28 Feb 2024 11:03:08 +0000
Subject: [PATCH 6/7] remove duplicate columns

---
 R/process_sc_all_alarms_telecare.R | 1 -
 1 file changed, 1 deletion(-)

diff --git a/R/process_sc_all_alarms_telecare.R b/R/process_sc_all_alarms_telecare.R
index bc417a8cd..08275c396 100644
--- a/R/process_sc_all_alarms_telecare.R
+++ b/R/process_sc_all_alarms_telecare.R
@@ -113,7 +113,6 @@ process_sc_all_alarms_telecare <- function(
     sending_location = data.table::last(sending_location),
     social_care_id = data.table::last(social_care_id),
     sc_latest_submission = data.table::last(period),
-    record_keydate1 = data.table::last(record_keydate1),
     record_keydate2 = data.table::last(record_keydate2),
     smrtype = data.table::last(smrtype),
     pkg_count = data.table::last(pkg_count),

From 1d81977e054188402c59062ae87458b7a52f9677 Mon Sep 17 00:00:00 2001
From: Zihao Li <lizihao_anu@outlook.com>
Date: Wed, 28 Feb 2024 11:20:23 +0000
Subject: [PATCH 7/7] remove duplicated columns

---
 R/process_sc_all_alarms_telecare.R | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/R/process_sc_all_alarms_telecare.R b/R/process_sc_all_alarms_telecare.R
index 08cb9faa7..0c24892d0 100644
--- a/R/process_sc_all_alarms_telecare.R
+++ b/R/process_sc_all_alarms_telecare.R
@@ -110,12 +110,8 @@ process_sc_all_alarms_telecare <- function(
 
   # Summarize to merge episodes
   qtr_merge <- data[, .(
-    sending_location = data.table::last(sending_location),
-    social_care_id = data.table::last(social_care_id),
     sc_latest_submission = data.table::last(period),
     record_keydate2 = data.table::last(record_keydate2),
-    smrtype = data.table::last(smrtype),
-    pkg_count = data.table::last(pkg_count),
     chi = data.table::last(chi),
     gender = data.table::last(gender),
     dob = data.table::last(dob),