-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path03-predictions-and-model-checking.R
212 lines (134 loc) · 6.69 KB
/
03-predictions-and-model-checking.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
# example and practical scripts for part 04!
# load requisite packages
library('mgcv')
library('gratia')
library('ggplot2')
library('dplyr')
# load the data
trawls <- read.csv("data/trawl_nl.csv")
trawls2010 <- filter(trawls, year==2010)
flu_data <- read.csv("data/california-flu-data.csv") %>%
mutate(season = factor(season))
#filter to just one year for now
flu2010_data <- flu_data %>%
filter(season=="2010-2011")
# Following along with the slides ####
flu2010_posmod <- gam(tests_pos ~ s(week_centered, k = 15) + offset(log(tests_total)),
data= flu2010,
family = poisson,
method = "REML")
ggplot(flu2010, aes(x =week_centered, y=tests_pos/tests_total)) +
geom_point()+
labs(x="Week (centered)", y="Fraction of positive tests")
plot(flu2010_posmod)
draw(flu2010_posmod)
sm_fit <- evaluate_smooth(flu2010_posmod, 's(week_centered)') # tidy data on smooth
sm_post <- smooth_samples(flu2010_posmod, 's(week_centered)', n = 20, seed = 42) # more on this later
draw(sm_fit) + geom_line(data = sm_post, aes(x = .x1, y = value, group = draw),
alpha = 0.3, colour = 'red')
new_flu <- with(flu2010, tibble(week_centered = seq(min(week_centered), max(week_centered), length.out = 100),
tests_total = 1))
pred <- predict(flu2010_posmod, newdata = new_flu, se.fit = TRUE, type = 'link')
pred <- bind_cols(new_flu, as_tibble(as.data.frame(pred)))
pred
ilink <- inv_link(flu2010_posmod) # inverse link function
crit <- qnorm((1 - 0.89) / 2, lower.tail = FALSE) # or just `crit <- 2`
pred <- mutate(pred, pos_rate = ilink(fit),
lwr = ilink(fit - (crit * se.fit)), # lower...
upr = ilink(fit + (crit * se.fit))) # upper credible interval
pred
ggplot(pred, aes(x = week_centered)) +
geom_ribbon(aes(ymin = lwr, ymax = upr), alpha = 0.2) +
geom_line(aes(y = pos_rate)) + labs(y = "Test postivity rate", x = NULL)
## Exercise 1
# Let's explore some of the plotting options for GAMs and how we can control what kinds of intervals are drawn
# First we'll re-fit a model of species richness in the trawl data from the last session
rich_depthyear <- gam(richness~s(log10(depth), k=30) +s(year, k=10),
family=poisson,
method="REML",
data=trawls)
# Here's how to plot this model on a single page using the `plot` function:
plot(rich_depthyear, pages=1)
# 1. Draw the estimated smooth again, this time using `seWithMean = TRUE`. Does it change the plot much? Think why this is the case.
# 2. Using `draw()`, draw the estimated smooth
# 3. Repeat 2 but this time account for the additional uncertainty due to estimating the smoothness parameters using `unconditional = TRUE`
# Now let's explore working with smooths in a little more detail. First evaluate the smooth over a range of values for `year`
depth_sm <- evaluate_smooth(rich_depthyear, "s(log10(depth))")
depth_sm
# We can add a confidence interval to `year_sm` by hand or use the `confint()` methof
depth_sm <- confint(rich_depthyear, "s(log10(depth))")
depth_sm
# Posterior simulation code ####
sm_week <- get_smooth(flu2010_posmod, "s(week_centered)") # extract the smooth object from model
idx <- gratia:::smooth_coefs(sm_week) # indices of the coefs for this smooth
idx
beta <- coef(flu2010_posmod) # vector of model parameters
#Extracting the variance-covariance matrix
Vb <- vcov(flu2010_posmod) # default is the bayesian covariance matrix
new_weeks <- with(flu2010, tibble(week_centered = seq_min_max(week_centered, n = 100),
tests_total = 1))
Xp <- predict(flu2010_posmod, newdata = new_weeks, type = 'lpmatrix')
dim(Xp)
#Take only the columns of Xp that are involved in the smooth of `year`
Xp <- Xp[, idx, drop = FALSE]
dim(Xp)
# Simulate from the posterior:
set.seed(42)
beta_sim <- rmvn(n = 20, beta[idx], Vb[idx, idx, drop = FALSE])
dim(beta_sim)
sm_draws <- Xp %*% t(beta_sim)
dim(sm_draws)
matplot(sm_draws, type = 'l')
# Using smooth_samples to draw from the posterior:
sm_post <- smooth_samples(flu2010_posmod, 's(week_centered)', n = 20, seed = 42)
draw(sm_post)
# Drawing posteriors for the predicted response values:
beta <- coef(flu2010_posmod) # vector of model parameters
Vb <- vcov(flu2010_posmod) # default is the bayesian covariance matrix
Xp <- predict(flu2010_posmod, type = 'lpmatrix')
set.seed(42)
beta_sim <- rmvn(n = 1000, beta, Vb) # simulate parameters
eta_p <- Xp %*% t(beta_sim) # form linear predictor values
mu_p <- inv_link(flu2010_posmod)(eta_p) # apply inverse link function
mean(mu_p[1, ]) # mean of posterior for the first observation in the data
quantile(mu_p[1, ], probs = c(0.025, 0.975))
# Exercise 2:
# 1. Use the smooth_samples function to calculate 20 posterior draws from the
# `rich_depthyear` model for the log10(depth) term and the year term
# 2. Plot those samples using the draw function.
# 3. Visually evaluate where the uncertainty is the greatest for each term
# Viewing concurvity ####
# load the data
trawls <- read.csv(here("data","trawl_nl.csv"))
trawls2010 <- filter(trawls,year==2010)
rich_tempdepth <- gam(richness ~ s(log10(depth),k=30)+s(temp_bottom) + lat,
data= trawls2010,
family = poisson, method ="REML")
draw(rich_tempdepth)
concurvity(rich_tempdepth)
# Comparing models - alternative approaches ####
# Compare rich_tempdepth vs. rich_depth
rich_tempdepth <- gam(richness ~ s(log10(depth),k=30)+s(temp_bottom) + lat,
data= trawls2010,
family = poisson, method ="REML")
summary(rich_tempdepth)
rich_depth <- gam(richness ~ s(log10(depth),k=30) + lat,
data= trawls2010,
family = poisson, method ="REML")
anova(rich_depth, rich_tempdepth,test = "Chisq")
AIC(rich_depth, rich_tempdepth)
# Select=TRUE
rich_tempdepth_nonull <- gam(richness ~ s(log10(depth),k=30)+s(temp_bottom),
data= trawls2010,
family = poisson, method ="REML", select = TRUE)
summary(rich_tempdepth_nonull)
draw(rich_tempdepth_nonull)
# Exercise 3: ####
# Use the full trawls data set, fit a model of richness as a function of
# depth,year, and temperature, like this:
rich_yeartempdepth <- gam(richness ~ s(log10(depth),k=30)+s(year, k= 10) + s(temp_bottom),
data= trawls,
family = poisson, method ="REML")
# 1. Compare this model to reduced models: one bottom temperature, and one
# excluding year and bottom temperature, using anova and AIC
# 2. Refit the above model using select =TRUE and view it using draw