# 01_NYC_Taxi.R
library(arrow)
library(dplyr)
library(tictoc)
# Open the NYC taxi dataset stored under data/nyc-taxi; here::here() builds
# the path so the script does not depend on the current working directory.
nyc_taxi <- open_dataset(here::here("data/nyc-taxi"))

# Inspect the columns, then count the rows of the whole dataset.
dplyr::glimpse(nyc_taxi)
nrow(nyc_taxi)
# Benchmark the per-year shared-trip summary, running it at least 10 times.
# For each year the query counts all trips, counts trips with more than one
# passenger, and derives the percentage of shared trips.
bnch <- bench::mark(
  min_iterations = 10,
  arrow = nyc_taxi |>
    dplyr::group_by(year) |>
    dplyr::summarise(
      all_trips = n(),
      # TRUE rather than T: T is an ordinary variable that can be reassigned.
      shared_trips = sum(passenger_count > 1, na.rm = TRUE)
    ) |>
    dplyr::mutate(pct_shared = shared_trips / all_trips * 100) |>
    dplyr::collect()
)

# ggplot2 is attached here so that autoplot() is available for the
# benchmark result.
library(ggplot2)
autoplot(bnch)
# Time a single run of the per-year shared-trip summary with tictoc.
tic()
nyc_taxi |>
  group_by(year) |>
  summarise(
    all_trips = n(),
    # TRUE rather than T: T is an ordinary variable that can be reassigned.
    shared_trips = sum(passenger_count > 1, na.rm = TRUE)
  ) |>
  mutate(pct_shared = shared_trips / all_trips * 100) |>
  collect()
### Custom binding request below
# |> mutate(pct_shared = scales::label_percent(accuracy = 0.1)(shared_trips / all_trips)) |>
toc()
# Longest trip distance for every month in 2019.
# The pipeline ends in collect() so the 12-row monthly table is returned to R
# and the timer started by tic() is always stopped by toc().
tic()
nyc_taxi |>
  filter(year == 2019) |>
  group_by(month) |>
  summarise(max_trip_dist = max(trip_distance, na.rm = TRUE)) |>
  arrange(month) |>
  collect()
toc()
# Longest trip duration for every month in 2019.
# The duration is derived per trip and then reduced to one maximum per month,
# so only the monthly summary (not every 2019 trip) is pulled into R.
tic()
nyc_taxi |>
  filter(year == 2019) |>
  group_by(month) |>
  ### Custom binding request below
  # NOTE(review): dividing by 60 assumes the datetime difference is expressed
  # in seconds -- confirm against the Arrow timestamp unit.
  mutate(
    duration_minutes = as.numeric(dropoff_datetime - pickup_datetime) / 60
  ) |>
  summarise(max_duration_minutes = max(duration_minutes, na.rm = TRUE)) |>
  arrange(month) |>
  collect()
toc()
# How many taxi fares in the dataset had a total amount greater than $100?
# Re-check the column names first, then count the rows matching the filter.
dplyr::glimpse(nyc_taxi)
nrow(dplyr::filter(nyc_taxi, total_amount > 100))
# How many distinct pickup locations (distinct combinations of the
# pickup_latitude and pickup_longitude columns) are in the dataset since 2016?

# Variant 1: using pickup and dropoff location IDs.
# Counts the distinct (pickup_location_id, dropoff_location_id) pairs; the
# query is evaluated with compute() before the rows are counted.
nyc_taxi |>
  dplyr::filter(year >= 2016) |>
  dplyr::distinct(pickup_location_id, dropoff_location_id) |>
  dplyr::compute() |>
  nrow()

# Variant 2: using pickup location lat/long pairs.
# Counts the distinct (pickup_latitude, pickup_longitude) pairs; the result is
# collected into R before the rows are counted.
nyc_taxi |>
  dplyr::filter(year >= 2016) |>
  dplyr::distinct(pickup_latitude, pickup_longitude) |>
  dplyr::collect() |>
  nrow()
# Adjust amounts into a different currency: every column whose name ends with
# "amount" is multiplied by 0.79 and stored alongside the original.
#
# Open question: why does the ~.x lambda work here when the anonymous
# function syntax \(x) does not?
#
# Note on list(): it is the .fns argument of across(); each output column is
# named by combining the column name and the function name ("pounds") using
# the glue specification in .names.
taxi_gbp <- nyc_taxi |>
  dplyr::mutate(
    across(
      ends_with("amount"),
      list(pounds = ~ .x * 0.79)
    )
  ) |>
  head() |>
  dplyr::select(contains("amount")) |>
  dplyr::collect()
# na_if() is used here as an example of the error raised when Arrow has not
# implemented a binding for a function.
nyc_taxi |>
  dplyr::mutate(vendor_name = na_if(vendor_name, "CMT")) |>
  head() |>
  dplyr::collect()

# The same replacement written with if_else(): "CMT" becomes NA and every
# other vendor name is left untouched.
nyc_taxi |>
  dplyr::mutate(
    vendor_name = if_else(vendor_name == "CMT", NA, vendor_name)
  ) |>
  head() |>
  dplyr::collect()
# Use dplyr::filter() and stringr::str_ends() to return the subset of the data
# that is a) from September 2020 and b) has a vendor_name ending in the
# letter "S".
nyc_taxi |>
  dplyr::filter(
    year == 2020 &
      month == 9 &
      stringr::str_ends(vendor_name, "S")
  ) |>
  dplyr::collect()
# Try to use stringr::str_replace_na() to replace any NA values in the
# vendor_name column with the string "No vendor". What happens, and why?
dplyr::mutate(
  nyc_taxi,
  vendor_name = stringr::str_replace_na(vendor_name, "No vendor")
)

# The console reports the following for the call above:
# Error: Expression stringr::str_replace_na(vendor_name, "No vendor") not supported in Arrow
# Call collect() first to pull data into R.

# The same replacement expressed with if_else() and is.na() instead.
dplyr::mutate(
  nyc_taxi,
  vendor_name = if_else(is.na(vendor_name), "No vendor", vendor_name)
)
# Count the number of NA values in every column of the dataset and bring the
# one-row result into R.
nyc_taxi |>
  dplyr::summarise(
    across(everything(), ~ sum(is.na(.x)))
  ) |>
  dplyr::collect()