last minute fixes and cleanup, r cmd check pass

rOpenSpain · Aug 27, 2024 · 4d1ad29 · 4d1ad29
1 parent 6b93fee
commit 4d1ad29
Show file tree

Hide file tree

Showing 27 changed files with 176 additions and 297 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -34,7 +34,9 @@ Imports:
     glue,
     here,
     lubridate,
+    memuse,
     parallelly,
+    purrr,
     readr,
     rlang,
     sf,

diff --git a/R/connect_to_converted_data.R b/R/connect_to_converted_data.R
@@ -14,7 +14,7 @@ spod_connect <- function(
   data_path,
   target_table_name = NULL,
   quiet = FALSE,
-  max_mem_gb = 4, # later increase that to be 4GB or perhaps 60% of available RAM, as for analysis the amount of memory that can be used can significanly affect the speed of aggregations.
+  max_mem_gb = max(4, spod_available_ram() - 4),
   max_n_cpu = parallelly::availableCores() - 1,
   temp_path = spod_get_temp_dir()
 ){

diff --git a/R/convert_data.R b/R/convert_data.R
@@ -27,7 +27,7 @@ spod_convert <- function(
   type = c(
     "od", "origin-destination",
     "os", "overnight_stays",
-    "tpp", "trips_per_person"
+    "nt", "number_of_trips"
   ),
   zones = c(
     "districts", "dist", "distr", "distritos",
@@ -39,8 +39,8 @@ spod_convert <- function(
   overwrite = FALSE,
   data_dir = spod_get_data_dir(),
   quiet = FALSE,
-  max_mem_gb = 4, # fails on full v1 data with less than 4 GB. 4GB is reasonable.  Eventually I would set this to be either 4 as minumum, or about 60% of available RAM autodetected with `as.numeric(unclass(memuse::Sys.meminfo())[1][['totalram']])` or `unclass(benchmarkme::get_ram())`, whichever is bigger. 
-  max_n_cpu = parallelly::availableCores() - 1, # Overall, the conversion is not so much memory-bounded, but rather the speed depends on available cores. E.g. with 6 cores it is almost exactly twice as fast as with 3 cores. So using as many cores as possible is very beneficial.
+  max_mem_gb = max(4, spod_available_ram() - 4),
+  max_n_cpu = parallelly::availableCores() - 1,
   max_download_size_gb = 1
 ) {
 

diff --git a/R/download_data.R b/R/download_data.R
@@ -1,7 +1,7 @@
 #' Download the data files of specified type, zones, and dates
 #'
 #' This function downloads the data files of the specified type, zones, dates and data version.
-#' @param type The type of data to download. Can be `"origin-destination"` (or ust `"od"`), or `"trips_per_person"` (or just `"tpp"`) for v1 data. For v2 data `"overnight_stays"` (or just `"os"`) is also available. More data types to be supported in the future. See respective codebooks for more information. **ADD CODEBOOKS! to the package**
+#' @param type The type of data to download. Can be `"origin-destination"` (or just `"od"`), or `"number_of_trips"` (or just `"nt"`) for v1 data. For v2 data `"overnight_stays"` (or just `"os"`) is also available. More data types to be supported in the future. See respective codebooks for more information. **ADD CODEBOOKS! to the package**
 #' @param zones The zones for which to download the data. Can be `"districts"` (or `"dist"`, `"distr"`, or the original Spanish `"distritos"`) or `"municipalities"` (or `"muni"`, `"municip"`, or the original Spanish `"municipios"`) for both data versions. Additionally, these can be `"large_urban_areas"` (or `"lau"`, or the original Spanish `"grandes_areas_urbanas"`, or `"gau"`) for v2 data (2022 onwards).
 #' @inheritParams spod_dates_argument_to_dates_seq
 #' @param data_dir The directory where the data is stored. Defaults to the value returned by `spod_get_data_dir()` which returns the value of the environment variable `SPANISH_OD_DATA_DIR` or a temporary directory if the variable is not set.
@@ -37,7 +37,7 @@ spod_download_data <- function(
     type = c(
       "od", "origin-destination",
       "os", "overnight_stays",
-      "tpp", "trips_per_person"
+      "nt", "number_of_trips"
     ),
     zones = c(
       "districts", "dist", "distr", "distritos",

diff --git a/R/duckdb_helpers.R b/R/duckdb_helpers.R
@@ -179,18 +179,18 @@ spod_duckdb_od <- function(
   return(con)
 }
 
-#' Create a duckdb trips per person table
+#' Create a duckdb number of trips table
 #' 
 #' @description
-#' This function creates a duckdb connection to the trips per person data stored in a folder of CSV.gz files.
+#' This function creates a duckdb connection to the number of trips data stored in a folder of CSV.gz files.
 #' @inheritParams spod_duckdb_od
 #' @inheritParams spod_available_data
 #' @inheritParams spod_download_data
 #' 
 #' @return A duckdb connection with 2 views.
 #' 
 #' @keywords internal
-spod_duckdb_trips_per_person <- function(
+spod_duckdb_number_of_trips <- function(
   con = DBI::dbConnect(duckdb::duckdb(), dbdir = ":memory:", read_only = FALSE),
   zones = c(
     "districts", "dist", "distr", "distritos",
@@ -397,10 +397,10 @@ spod_sql_where_dates <- function(dates) {
 #' @param max_mem_gb The maximum memory to use in GB. Defaults to `max(4, spod_available_ram() - 4)`, i.e. never less than 4 GB, which should be enough for resaving the data to DuckDB from a folder of CSV.gz files. For data analysis using the already converted data (in DuckDB or Parquet format) or with the raw CSV.gz data, it is recommended to increase it according to available resources.
 #' @param max_n_cpu The maximum number of threads to use. Defaults to the number of available cores minus 1.
 spod_duckdb_limit_resources <- function(
-    con,
-    max_mem_gb = 3, # in GB, default to 3 GB, should be enough to resave the data and small enough to fit in memory of most even old computers
-    max_n_cpu = parallelly::availableCores() - 1 # leave one core for other tasks by default
-    ) {
+  con,
+  max_mem_gb = max(4, spod_available_ram() - 4),
+  max_n_cpu = parallelly::availableCores() - 1
+) {
   DBI::dbExecute(
     con,
     dplyr::sql(

diff --git a/R/get.R b/R/get.R
@@ -203,7 +203,7 @@ spod_available_data_v2 <- function(
   }
 
   # add known file sizes from cached data
-  file_sizes <- file_sizes <- readr::read_csv(system.file("extdata", "url_file_sizes_v2.txt.gz", package = "spanishoddata"), show_col_types = FALSE)
+  file_sizes <- readr::read_csv(system.file("extdata", "url_file_sizes_v2.txt.gz", package = "spanishoddata"), show_col_types = FALSE)
   files_table <- dplyr::left_join(files_table, file_sizes, by = "target_url")
 
   # if there are files with missing sizes, impute them

diff --git a/R/get_v1_data.R b/R/get_v1_data.R
@@ -341,10 +341,9 @@ spod_clean_zones_v1 <- function(zones_path, zones) {
   # now we have some duplicate ids, we need to remove them
   # here's a function for that
   unique_separated_ids <- function(column) {
-    # Split the string by semicolon, remove duplicates, and join them back with semicolons
-    sapply(column, function(x) {
-      unique_ids <- unique(stringr::str_split(x, ";\\s*")[[1]])  # Split by semicolon and remove duplicates
-      paste(unique_ids, collapse = "; ")  # Join them back with semicolons
+    purrr::map_chr(column, ~ {
+      unique_ids <- unique(stringr::str_split(.x, ";\\s*")[[1]])  # Split by semicolon and remove duplicates
+      stringr::str_c(unique_ids, collapse = "; ")  # Join them back with semicolons
     })
   }
 
@@ -480,7 +479,7 @@ spod_get_od <- function(
     dates = NULL,
     data_dir = spod_get_data_dir(),
     quiet = FALSE,
-    max_mem_gb = 3,
+    max_mem_gb = max(4, spod_available_ram() - 4),
     max_n_cpu = parallelly::availableCores() - 1
 ) {
   # hardcode od as this is a wrapper to get origin-destination data using spod_get() function
@@ -531,7 +530,7 @@ spod_get <- function(
   type = c(
     "od", "origin-destination",
     "os", "overnight_stays",
-    "tpp", "trips_per_person"
+    "nt", "number_of_trips"
   ),
   zones = c(
     "districts", "dist", "distr", "distritos",
@@ -540,7 +539,7 @@ spod_get <- function(
   dates = NULL,
   data_dir = spod_get_data_dir(),
   quiet = FALSE,
-  max_mem_gb = 3,
+  max_mem_gb = max(4, spod_available_ram() - 4),
   max_n_cpu = parallelly::availableCores() - 1,
   max_download_size_gb = 1,
   duckdb_target = ":memory:",
@@ -595,8 +594,8 @@ spod_get <- function(
       ver = ver,
       data_dir = data_dir
     )
-  } else if (type == "tpp") {
-    con <- spod_duckdb_trips_per_person(
+  } else if (type == "nt") {
+    con <- spod_duckdb_number_of_trips(
       con = con,
       zones = zones,
       ver = ver,

diff --git a/R/help.R b/R/help.R
@@ -16,7 +16,7 @@ spod_codebook = function(ver = 1) {
       topic = "v1-2020-2021-mitma-data-codebook",
       package = "spanishoddata"
     )
-    if( class(help) == "vignette" ){
+    if( inherits(help, what = "vignette") ){
       return(help)
     } else {
       message("For some reason the codebook was not installed with the package. Please refer to the online version at: https://robinlovelace.github.io/spanishoddata/articles/codebook-v1.html")
@@ -26,7 +26,7 @@ spod_codebook = function(ver = 1) {
       topic = "v2-2022-onwards-mitma-data-codebook.qmd",
       package = "spanishoddata"
     ) 
-    if( class(help) == "vignette" ){
+    if( inherits(help, what = "vignette") ){
       return(help)
     } else {
       message("For some reason the codebook was not installed with the package. Please refer to the online version at: https://robinlovelace.github.io/spanishoddata/articles/codebook-v2.html")

diff --git a/R/internal_utils.R b/R/internal_utils.R
@@ -210,7 +210,7 @@ spod_match_data_type_for_local_folders <- function(
     type = c(
       "od", "origin-destination",
       "os", "overnight_stays",
-      "tpp", "trips_per_person"
+      "nt", "number_of_trips"
     ),
     ver = c(1, 2)) {
   if (!ver %in% c(1, 2)) {
@@ -223,7 +223,7 @@ spod_match_data_type_for_local_folders <- function(
   if (ver == 1) {
     if (type %in% c("od", "origin-destination")) {
       return("maestra1")
-    } else if (type %in% c("tpp", "trips_per_person")) {
+    } else if (type %in% c("nt", "number_of_trips")) {
       return("maestra2")
     }
   }
@@ -233,7 +233,7 @@ spod_match_data_type_for_local_folders <- function(
       return("viajes")
     } else if (type %in% c("os", "overnight_stays")) {
       return("pernoctaciones")
-    } else if (type %in% c("tpp", "trips_per_person")) {
+    } else if (type %in% c("nt", "number_of_trips")) {
       return("personas")
     }
   }
@@ -244,14 +244,14 @@ spod_match_data_type_for_local_folders <- function(
 
 
 #' Match data types for normalisation
-#' @param type The type of data to match. Can be "od", "origin-destination", "os", "overnight_stays", or "tpp", "trips_per_person".
+#' @param type The type of data to match. Can be "od", "origin-destination", "os", "overnight_stays", or "nt", "number_of_trips".
 
 #' @keywords internal
 spod_match_data_type <- function(
     type = c(
       "od", "origin-destination", "viajes",
       "os", "overnight_stays", "pernoctaciones",
-      "tpp", "trips_per_person", "personas"
+      "nt", "number_of_trips", "personas"
     )
 ) {
 
@@ -262,10 +262,19 @@ spod_match_data_type <- function(
     return("od")
   } else if (type %in% c("os", "overnight_stays", "pernoctaciones")) {
     return("os")
-  } else if (type %in% c("tpp", "trips_per_person", "personas")) {
-    return("tpp")
+  } else if (type %in% c("nt", "number_of_trips", "personas")) {
+    return("nt")
   }
 
   # need to add a warning here that the type is not recognized
   return(NULL)
 }
+
+#' Get available RAM (reads the total RAM value reported by `memuse`)
+#' @keywords internal
+#' @return A `numeric` amount of RAM in GB: the `totalram` element of `memuse::Sys.meminfo()` converted with `as.numeric()` and divided by 1024 three times.
+spod_available_ram <- function(){
+  return(
+    as.numeric(unclass(memuse::Sys.meminfo())[1][['totalram']])/1024/1024/1024
+  )
+}
diff --git a/README.md b/README.md
@@ -11,10 +11,13 @@ experimental](https://img.shields.io/badge/lifecycle-experimental-red.svg)
 <!-- badges: end -->
 
 **spanishoddata** is an R package that provides functions for
-downloading and formatting Spanish origin-destination (OD) data from the
-Ministry of Transport and Sustainable Mobility of Spain.
+downloading and formatting Spanish open mobility data released by the
+Ministry of Transport and Sustainable Mobility of Spain (Secretary of
+State for Transport, Mobility and Urban Agenda (Secretaría de Estado de
+Transportes, Movilidad y Agenda Urbana) 2024).
 
-It supports the two versions of the Spanish OD data. [The first
+It supports the two versions of the Spanish mobility data that consist
+of origin-destination matrices and some additional data sets. [The first
 version](https://www.transportes.gob.es/ministerio/proyectos-singulares/estudios-de-movilidad-con-big-data/estudios-de-movilidad-anteriores/covid-19/opendata-movilidad)
 covers data from 2020 and 2021, including the period of the COVID-19
 pandemic. [The second
@@ -23,7 +26,13 @@ contains data from January 2022 onwards and is updated monthly on the
 fifteenth of each month. Both versions of the data primarily consist of
 mobile phone positioning data, and include matrices for overnight stays,
 individual movements, and trips of Spanish residents at different
-geographical levels.
+geographical levels. See the [package
+website](https://robinlovelace.github.io/spanishoddata/) and vignettes
+for
+[v1](https://robinlovelace.github.io/spanishoddata/articles/v1-2020-2021-mitma-data-codebook)
+and
+[v2](https://robinlovelace.github.io/spanishoddata/articles/v2-2022-onwards-mitma-data-codebook)
+data for more details.
 
 **spanishoddata** is designed to save people time by providing the data
 in analysis-ready formats. Automating the process of downloading,
@@ -51,7 +60,8 @@ Install the development version of the package as follows:
 
 ``` r
 if (!require("remotes")) install.packages("remotes")
-remotes::install_github("Robinlovelace/spanishoddata")
+remotes::install_github("Robinlovelace/spanishoddata",
+  force = TRUE, dependencies = TRUE)
 ```
 
 Load it as follows:
@@ -411,4 +421,15 @@ Origin-Destination Data,” August.
 
 </div>
 
+<div id="ref-mitma-mobility-2024-v6" class="csl-entry">
+
+Secretary of State for Transport, Mobility and Urban Agenda (Secretaría
+de Estado de Transportes, Movilidad y Agenda Urbana). 2024. “Estudio de
+movilidad de viajeros de ámbito nacional aplicando la tecnología Big
+Data. Informe metodológico (Study of National Traveler Mobility Using
+Big Data Technology. Methodological Report).”
+<https://www.transportes.gob.es/ministerio/proyectos-singulares/estudio-de-movilidad-con-big-data>.
+
+</div>
+
 </div>
diff --git a/README.qmd b/README.qmd
@@ -16,9 +16,9 @@ eval: false
 [![R-CMD-check](https://github.com/Robinlovelace/spanish_od_data/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/Robinlovelace/spanish_od_data/actions/workflows/R-CMD-check.yaml)
 <!-- badges: end -->
 
-**spanishoddata** is an R package that provides functions for downloading and formatting Spanish origin-destination (OD) data from the Ministry of Transport and Sustainable Mobility of Spain.
+**spanishoddata** is an R package that provides functions for downloading and formatting Spanish open mobility data released by the Ministry of Transport and Sustainable Mobility of Spain [@mitma-mobility-2024-v6].
 
-It supports the two versions of the Spanish OD data. [The first version](https://www.transportes.gob.es/ministerio/proyectos-singulares/estudios-de-movilidad-con-big-data/estudios-de-movilidad-anteriores/covid-19/opendata-movilidad) covers data from 2020 and 2021, including the period of the COVID-19 pandemic. [The second version](https://www.transportes.gob.es/ministerio/proyectos-singulares/estudios-de-movilidad-con-big-data/opendata-movilidad) contains data from January 2022 onwards and is updated monthly on the fifteenth of each month. Both versions of the data primarily consist of mobile phone positioning data, and include matrices for overnight stays, individual movements, and trips of Spanish residents at different geographical levels.
+It supports the two versions of the Spanish mobility data that consist of origin-destination matrices and some additional data sets. [The first version](https://www.transportes.gob.es/ministerio/proyectos-singulares/estudios-de-movilidad-con-big-data/estudios-de-movilidad-anteriores/covid-19/opendata-movilidad) covers data from 2020 and 2021, including the period of the COVID-19 pandemic. [The second version](https://www.transportes.gob.es/ministerio/proyectos-singulares/estudios-de-movilidad-con-big-data/opendata-movilidad) contains data from January 2022 onwards and is updated monthly on the fifteenth of each month. Both versions of the data primarily consist of mobile phone positioning data, and include matrices for overnight stays, individual movements, and trips of Spanish residents at different geographical levels. See the [package website](https://robinlovelace.github.io/spanishoddata/) and vignettes for [v1](https://robinlovelace.github.io/spanishoddata/articles/v1-2020-2021-mitma-data-codebook) and [v2](https://robinlovelace.github.io/spanishoddata/articles/v2-2022-onwards-mitma-data-codebook) data for more details.
 
 **spanishoddata** is designed to save people time by providing the data in analysis-ready formats. Automating the process of downloading, cleaning, and importing the data can also reduce the risk of errors in the laborious process of data preparation. It also reduces computational resources by using computationally efficient packages behind the scenes. To effectively work with multiple data files, it’s recommended you set up a data directory where the package can search for the data and download only the files that are not already present.
 
@@ -33,7 +33,8 @@ Install the development version of the package as follows:
 
 ```{r}
 if (!require("remotes")) install.packages("remotes")
-remotes::install_github("Robinlovelace/spanishoddata")
+remotes::install_github("Robinlovelace/spanishoddata",
+  force = TRUE, dependencies = TRUE)
 ```
 
 Load it as follows:

diff --git a/man/spod_available_ram.Rd b/man/spod_available_ram.Rd
diff --git a/man/spod_connect.Rd b/man/spod_connect.Rd
diff --git a/man/spod_convert.Rd b/man/spod_convert.Rd
-Original file line number
+Diff line change
@@ Expand Up / @@ -34,7 +34,9 @@ Imports: @@
         glue,
         here,
         lubridate,
+        memuse,
         parallelly,
+        purrr,
         readr,
         rlang,
         sf,
@@ Expand Down @@