-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path05_benchmarking.R
120 lines (102 loc) · 3.79 KB
/
05_benchmarking.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# Load Libraries ----------------------------------------------------------
# library() stops immediately when a package is not installed, whereas
# require() only returns FALSE and lets the script fail later on.
# The attach order is kept as-is because it determines which package's
# verbs mask the others.
library(here)
library(arrow)
library(dplyr)
library(stringr)
library(tictoc)
library(duckplyr)
library(polars)
library(ggplot2)
# Download Data -----------------------------------------------------------
# Download 40GB (1.1 billion rows) of NYC Taxi rides
# NOTE: This may take several hours
data_path <- here::here("data/nyc-taxi")

# Only download when no local copy exists, so that re-running the script to
# iterate on the benchmarks below does not repeat the multi-hour transfer.
# Delete `data_path` to force a fresh download (e.g. after an interrupted run
# that left a partial directory behind).
if (!dir.exists(data_path)) {
  arrow::open_dataset("s3://voltrondata-labs-datasets/nyc-taxi") |>
    dplyr::filter(year %in% 2012:2021) |>
    arrow::write_dataset(data_path, partitioning = c("year", "month"))
}
# Subset & Benchmark ------------------------------------------------------
# Manually iterated over the code below to benchmark
# and compare performance on 1 million, 10 million, 100
# million, and 1.1 billion rows

# Number of rows sampled for this benchmark run; edit between iterations.
n_rows <- 50000000

# Read from `data_path` (defined in the download section) instead of a path
# relative to the working directory, so the dataset is found wherever the
# script is launched from within the project.
nyc_taxi_arrow <- arrow::open_dataset(data_path) |>
  dplyr::select(year, passenger_count)

# Materialise the two columns in memory, then draw the random sample that
# every backend below is benchmarked on.
nyc_taxi_tibble <- nyc_taxi_arrow |>
  dplyr::collect() |>
  dplyr::slice_sample(n = n_rows)

# Pre-converted copies of the sample, one per backend, so the benchmark can
# time the query alone as well as conversion + query.
nyc_taxi <- nyc_taxi_tibble |>
  arrow::as_arrow_table()

nyc_taxi_duckplyr_df <- nyc_taxi_tibble |>
  duckplyr::as_duckplyr_df()

nyc_taxi_polars <- pl$DataFrame(nyc_taxi_tibble)$lazy()
# Benchmark the same aggregation on each backend: trips per year and the
# percentage of them with more than one passenger. tic()/toc() report the
# wall-clock time of the whole benchmark run.
tic()
bnch <- bench::mark(
  min_iterations = 50,
  # Tibble -> Arrow table conversion is part of the timed expression.
  tibble_to_arrow = nyc_taxi_tibble |>
    arrow::as_arrow_table() |>
    dplyr::group_by(year) |>
    dplyr::summarise(
      all_trips = n(),
      shared_trips = sum(passenger_count > 1, na.rm = TRUE)
    ) |>
    dplyr::mutate(pct_shared = shared_trips / all_trips * 100) |>
    dplyr::collect(),
  # Tibble -> duckplyr data frame conversion is part of the timed expression.
  tibble_to_duckplyr = nyc_taxi_tibble |>
    duckplyr::as_duckplyr_df() |>
    duckplyr::mutate(all_trips = n(), .by = year) |>
    duckplyr::filter(passenger_count > 1) |>
    duckplyr::summarise(
      shared_trips = n(),
      .by = c(year, all_trips)
    ) |>
    duckplyr::mutate(pct_shared = shared_trips / all_trips * 100) |>
    duckplyr::collect(),
  # Plain dplyr on the in-memory tibble.
  tibble_to_dplyr = nyc_taxi_tibble |>
    dplyr::group_by(year) |>
    dplyr::summarise(
      all_trips = n(),
      shared_trips = sum(passenger_count > 1, na.rm = TRUE)
    ) |>
    dplyr::mutate(pct_shared = shared_trips / all_trips * 100),
  # Query on the pre-built Arrow table (no conversion cost).
  arrow_table = nyc_taxi |>
    dplyr::group_by(year) |>
    dplyr::summarise(
      all_trips = n(),
      shared_trips = sum(passenger_count > 1, na.rm = TRUE)
    ) |>
    dplyr::mutate(pct_shared = shared_trips / all_trips * 100) |>
    dplyr::collect(),
  # arrow = nyc_taxi_tibble |>
  #   dplyr::group_by(year) |>
  #   dplyr::summarise(all_trips = n(),
  #                    shared_trips = sum(passenger_count > 1, na.rm = TRUE)) |>
  #   dplyr::mutate(pct_shared = shared_trips / all_trips * 100) |>
  #   dplyr::collect(),
  # Arrow table handed to DuckDB, queried through dplyr verbs.
  arrow_to_duckdb = nyc_taxi |>
    arrow::to_duckdb() |>
    dplyr::mutate(all_trips = n(), .by = year) |>
    dplyr::filter(passenger_count > 1) |>
    dplyr::group_by(year, all_trips) |>
    dplyr::summarise(
      shared_trips = n(),
      .groups = "drop"
    ) |>
    dplyr::mutate(pct_shared = shared_trips / all_trips * 100) |>
    dplyr::collect(),
  # Query on the pre-built duckplyr data frame (no conversion cost).
  duckplyr_df = nyc_taxi_duckplyr_df |>
    duckplyr::mutate(all_trips = n(), .by = year) |>
    duckplyr::filter(passenger_count > 1) |>
    duckplyr::summarise(
      shared_trips = n(),
      .by = c(year, all_trips)
    ) |>
    duckplyr::mutate(pct_shared = shared_trips / all_trips * 100) |>
    duckplyr::collect(),
  # Polars lazy frame; the final percentage is computed after the result is
  # converted back to an R data frame.
  polars = nyc_taxi_polars$
    select(pl$col(c("year", "passenger_count")))$
    with_columns(
      pl$count()$
        over("year")$
        alias("all_trips")
    )$
    filter(pl$col("passenger_count") > 1)$
    group_by(c("year", "all_trips"))$
    agg(
      pl$count()$alias("shared_trips")
    )$
    collect()$
    to_data_frame() |>
    mutate(pct_shared = shared_trips / all_trips * 100),
  # Results are not compared across expressions: the backends return
  # different object types, and the filter-based variants only return years
  # that have at least one row with passenger_count > 1.
  check = FALSE
)
toc()

# Plot the timing distribution of each benchmarked expression.
autoplot(bnch)
# Session Info ------------------------------------------------------------
# Record the R version and the attached/loaded package versions so the
# benchmark results can be tied to the environment that produced them.
sessionInfo()