diff --git a/GP.qmd b/GP.qmd index 61e6d25..2dc8dea 100644 --- a/GP.qmd +++ b/GP.qmd @@ -1,6 +1,6 @@ --- -title: "VectorByte Methods Training" -subtitle: "Introduction to Gaussian Processes for Time Dependent Data" +subtitle: "VectorByte Methods Training" +title: "Introduction to Gaussian Processes for Time Dependent Data" editor: source author: "The VectorByte Team (Parul Patil, Virginia Tech)" title-slide-attributes: diff --git a/GP_Notes.qmd b/GP_Notes.qmd index b8f393b..aeb1ce7 100644 --- a/GP_Notes.qmd +++ b/GP_Notes.qmd @@ -1,11 +1,15 @@ --- -title: "VectorByte Methods Training" -subtitle: "Introduction to Gaussian Processes for Time Dependent Data" -author: "The VectorByte Team (Parul Patil, Virginia Tech)" +title: "VectorByte Methods Training: Introduction to Gaussian Processes for Time Dependent Data (notes)" +author: + - name: Parul Patil + affiliation: Virginia Tech and VectorByte title-slide-attributes: data-background-image: VectorByte-logo_lg.png data-background-size: contain data-background-opacity: "0.2" +citation: true +date: 2024-07-21 +date-format: long format: html: toc: true diff --git a/GP_Practical.qmd b/GP_Practical.qmd index d7f70ba..23bf0f2 100644 --- a/GP_Practical.qmd +++ b/GP_Practical.qmd @@ -1,7 +1,11 @@ --- -title: "VectorByte Methods Training" -subtitle: "Practical: Introduction to Gaussian Processes for Time Dependent Data" -author: "The VectorByte Team (Parul Patil, Virginia Tech)" +title: "VectorByte Methods Training: Introduction to Gaussian Processes for Time Dependent Data (Practical)" +author: + - name: Parul Patil + affiliation: Virginia Tech and VectorByte +citation: true +date: 2024-07-21 +date-format: long format: html: toc: true diff --git a/Stats_review.qmd b/Stats_review.qmd index 2e736e0..c863679 100644 --- a/Stats_review.qmd +++ b/Stats_review.qmd @@ -1,7 +1,12 @@ --- -title: VectorByte Methods Training -subtitle: Probability and Statistics Fundamentals -author: The VectorByte Team (Leah R. Johnson, Virginia Tech) +title: "VectorByte Methods Training: Probability and Statistics Fundamentals" +author: + - name: Leah R. Johnson + url: https://lrjohnson0.github.io/QEDLab/leahJ.html + affiliation: Virginia Tech and VectorByte +citation: true +date: 2024-07-01 +date-format: long format: html: toc: true diff --git a/VB_RegDiagTrans.qmd b/VB_RegDiagTrans.qmd index ef1b52a..84ebd00 100644 --- a/VB_RegDiagTrans.qmd +++ b/VB_RegDiagTrans.qmd @@ -1,6 +1,6 @@ --- -title: "VectorByte Methods Training" -subtitle: "Review of Diagnostics and Transformations for Regression Models" +subtitle: "VectorByte Methods Training" +title: "Review of Diagnostics and Transformations for Regression Models" author: "The VectorByte Team (Leah R. Johnson, Virginia Tech)" title-slide-attributes: data-background-image: VectorByte-logo_lg.png diff --git a/VB_RegDiagTrans_practical.qmd b/VB_RegDiagTrans_practical.qmd index ffd5687..1a64b10 100644 --- a/VB_RegDiagTrans_practical.qmd +++ b/VB_RegDiagTrans_practical.qmd @@ -1,7 +1,12 @@ --- -title: "VectorByte Methods Training" -subtitle: "Practical: Diagnostics and Transformations" -author: "The VectorByte Team (Leah R. Johnson, Virginia Tech)" +title: "VectorByte Methods Training: Regression Diagnostics and Transformations (practical)" +author: + - name: Leah R. Johnson + url: https://lrjohnson0.github.io/QEDLab/leahJ.html + affiliation: Virginia Tech and VectorByte +citation: true +date: 2024-07-01 +date-format: long format: html: toc: true diff --git a/VB_RegRev.qmd b/VB_RegRev.qmd index d73a8b4..628eb2f 100644 --- a/VB_RegRev.qmd +++ b/VB_RegRev.qmd @@ -1,11 +1,12 @@ --- -title: "VectorByte Methods Training" -subtitle: "Regression Review" -author: "The VectorByte Team (Leah R. Johnson, Virginia Tech)" -title-slide-attributes: - data-background-image: VectorByte-logo_lg.png - data-background-size: contain - data-background-opacity: "0.2" +title: "VectorByte Methods Training: Regression Review" +author: + - name: Leah R. Johnson + url: https://lrjohnson0.github.io/QEDLab/leahJ.html + affiliation: Virginia Tech and VectorByte +citation: true +date: 2024-07-01 +date-format: long format: html: toc: true diff --git a/VB_TimeDepData.qmd b/VB_TimeDepData.qmd index dd96516..5f75af3 100644 --- a/VB_TimeDepData.qmd +++ b/VB_TimeDepData.qmd @@ -1,6 +1,6 @@ --- -title: "VectorByte Methods Training" -subtitle: "Regression Methods for Time Dependent Data" +subtitle: "VectorByte Methods Training" +title: "Regression Methods for Time Dependent Data" author: "The VectorByte Team (Leah R. Johnson, Virginia Tech)" title-slide-attributes: data-background-image: VectorByte-logo_lg.png diff --git a/VB_TimeDepData_practical.qmd b/VB_TimeDepData_practical.qmd index 7c56f31..8761f32 100644 --- a/VB_TimeDepData_practical.qmd +++ b/VB_TimeDepData_practical.qmd @@ -1,7 +1,12 @@ --- -title: "VectorByte Methods Training" -subtitle: "Practical: Intro to Time Dependent Data" -author: "The VectorByte Team (Leah R. Johnson, Virginia Tech)" +title: "VectorByte Methods Training: Regression Methods for Time Dependent Data (practical)" +author: + - name: Leah R. Johnson + url: https://lrjohnson0.github.io/QEDLab/leahJ.html + affiliation: Virginia Tech and VectorByte +citation: true +date: 2024-07-01 +date-format: long format: html: toc: true diff --git a/docs/Stats_review.html b/docs/Stats_review.html index c986886..3c78bc3 100644 --- a/docs/Stats_review.html +++ b/docs/Stats_review.html @@ -6,9 +6,10 @@ - + + -VectorByte Training 2024 - VectorByte Methods Training +VectorByte Training 2024 - VectorByte Methods Training: Probability and Statistics Fundamentals - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
- - -
- - - -
- -
-
-

VectorByte Methods Training

-

Practical: Intro to Time Dependent Data

-
- - - -
- -
-
Author
-
-

The VectorByte Team (Leah R. Johnson, Virginia Tech)

-
-
- - - -
- - - -
- - -


-
-

Overview and Instructions

-

The goal of this practical is to practice building models for time-dependent data using simple regression based techniques. This includes incorporated possible transformations, trying out different time dependent predictors (including lagged variables) and assessing model fit using diagnostic plots.

-


-
-
-

Guided example: Monthly average mosquito counts in Walton County, FL

-

The file Culex_erraticus_walton_covariates_aggregated.csv on the course website contains data on average monthly counts of mosquitos (sample_value) in Walton, FL, together with monthly average maximum temperature (MaxTemp in C) and precipitation (Precip in inches) for each month from January 2015 through December 2017 (Month_Yr).

-
-

Exploring the Data

-

As always, we first want to take a look at the data, to make sure we understand it, and that we don’t have missing or weird values.

-
-
mozData<-read.csv("data/Culex_erraticus_walton_covariates_aggregated.csv")
-summary(mozData)
-
-
   Month_Yr          sample_value        MaxTemp          Precip      
- Length:36          Min.   :0.00000   Min.   :16.02   Min.   : 0.000  
- Class :character   1st Qu.:0.04318   1st Qu.:22.99   1st Qu.: 2.162  
- Mode  :character   Median :0.73001   Median :26.69   Median : 4.606  
-                    Mean   :0.80798   Mean   :26.23   Mean   : 5.595  
-                    3rd Qu.:1.22443   3rd Qu.:30.70   3rd Qu.: 7.864  
-                    Max.   :3.00595   Max.   :33.31   Max.   :18.307  
-
-
-

We can see that the minimum observed average number of mosquitoes it zero, and max is only 3 (there are likely many zeros averaged over many days in the month). There don’t appear to be any NAs in the data. In this case the dataset itself is small enough that we can print the whole thing to ensure it’s complete:

-
-
mozData
-
-
   Month_Yr sample_value  MaxTemp       Precip
-1   2015-01  0.000000000 17.74602  3.303991888
-2   2015-02  0.018181818 17.87269 16.544265802
-3   2015-03  0.468085106 23.81767  2.405651215
-4   2015-04  1.619047619 26.03559  8.974406168
-5   2015-05  0.821428571 30.01602  0.567960943
-6   2015-06  3.005952381 31.12094  4.841342729
-7   2015-07  2.380952381 32.81130  3.849010353
-8   2015-08  1.826347305 32.56245  5.562845324
-9   2015-09  0.648809524 30.55155 10.409724627
-10  2015-10  0.988023952 27.22605  0.337750269
-11  2015-11  0.737804878 24.86768 18.306749680
-12  2015-12  0.142857143 22.46588  5.621475377
-13  2016-01  0.000000000 16.02406  3.550622029
-14  2016-02  0.020202020 19.42057 11.254680803
-15  2016-03  0.015151515 23.13610  4.785664728
-16  2016-04  0.026143791 24.98082  4.580424519
-17  2016-05  0.025252525 28.72884  0.053057634
-18  2016-06  0.833333333 30.96990  6.155417473
-19  2016-07  1.261363636 33.30509  4.496368193
-20  2016-08  1.685279188 32.09633 11.338749182
-21  2016-09  2.617142857 31.60575  2.868288451
-22  2016-10  1.212121212 29.14275  0.000000000
-23  2016-11  1.539772727 24.48482  0.005462681
-24  2016-12  0.771573604 20.46054 11.615521725
-25  2017-01  0.045454545 18.35473  0.000000000
-26  2017-02  0.036363636 23.65584  3.150710053
-27  2017-03  0.194285714 22.53573  1.430094952
-28  2017-04  0.436548223 26.15299  0.499381616
-29  2017-05  1.202020202 28.00173  6.580562663
-30  2017-06  0.834196891 29.48951 13.333939858
-31  2017-07  1.765363128 32.25135  7.493927035
-32  2017-08  0.744791667 31.86476  6.082113434
-33  2017-09  0.722222222 30.60566  4.631037395
-34  2017-10  0.142131980 27.73453 11.567112214
-35  2017-11  0.289772727 23.23140  1.195760473
-36  2017-12  0.009174312 18.93603  4.018254442
-
-
-
-
-

Plotting the data

-

First we’ll examine the data itself, including the predictors:

-
-
months<-dim(mozData)[1]
-t<-1:months ## counter for months in the data set
-par(mfrow=c(3,1))
-plot(t, mozData$sample_value, type="l", lwd=2, 
-     main="Average Monthly Abundance", 
-     xlab ="Time (months)", 
-     ylab = "Average Count")
-plot(t, mozData$MaxTemp, type="l",
-     col = 2, lwd=2, 
-     main="Average Maximum Temp", 
-     xlab ="Time (months)", 
-     ylab = "Temperature (C)")
-plot(t, mozData$Precip, type="l",
-     col="dodgerblue", lwd=2,
-     main="Average Monthly Precip", 
-     xlab ="Time (months)", 
-     ylab = "Precipitation (in)")
-
-
-
-

-
-
-
-
-

Visually we noticed that there may be a bit of clumping in the values for abundance (this is subtle) – in particular, since we have a lot of very small/nearly zero counts, a transform, such as a square root, may spread things out for the abundances. It also looks like both the abundance and temperature data are more cyclical than the precipitation, and thus more likely to be related to each other. There’s also not visually a lot of indication of a trend, but it’s usually worthwhile to consider it anyway. Replotting the abundance data with a transformation:

-
-
months<-dim(mozData)[1]
-t<-1:months ## counter for months in the data set
-plot(t, sqrt(mozData$sample_value), type="l", lwd=2, 
-     main="Sqrt Average Monthly Abundance", 
-     xlab ="Time (months)", 
-     ylab = "Average Count")
-
-
-
-

-
-
-
-
-

That looks a little bit better. I suggest we go with this for our response.

-
-
-

Building a data frame

-

Before we get into model building, we always want to build a data frame to contain all of the predictors that we want to consider, at the potential lags that we’re interested in. In the lecture we saw building the AR, sine/cosine, and trend predictors:

-
-
t <- 2:months ## to make building the AR1 predictors easier
-
-mozTS <- data.frame(
-  Y=sqrt(mozData$sample_value[t]), # transformed response
-  Yl1=sqrt(mozData$sample_value[t-1]), # AR1 predictor
-  t=t, # trend predictor
-  sin12=sin(2*pi*t/12), 
-  cos12=cos(2*pi*t/12) # periodic predictors
-  )
-
-

We will also put in the temperature and precipitation predictors. But we need to think about what might be an appropriate lag. If this were daily or weekly data, we’d probably want to have a fairly sizable lag – mosquitoes take a while to develop, so the number we see today is not likely related to the temperature today. However, since these data are agregated across a whole month, as is the temperature/precipitaion, the current month values are likely to be useful. However, it’s even possible that last month’s values may be so we’ll add those in as well:

-
-
mozTS$MaxTemp<-mozData$MaxTemp[t] ## current temps
-mozTS$MaxTempl1<-mozData$MaxTemp[t-1] ## previous temps
-mozTS$Precip<-mozData$Precip[t] ## current precip
-mozTS$Precipl1<-mozData$Precip[t-1] ## previous precip
-
-

Thus our full dataframe:

-
-
summary(mozTS)
-
-
       Y               Yl1               t            sin12         
- Min.   :0.0000   Min.   :0.0000   Min.   : 2.0   Min.   :-1.00000  
- 1st Qu.:0.2951   1st Qu.:0.2951   1st Qu.:10.5   1st Qu.:-0.68301  
- Median :0.8590   Median :0.8590   Median :19.0   Median : 0.00000  
- Mean   :0.7711   Mean   :0.7684   Mean   :19.0   Mean   :-0.01429  
- 3rd Qu.:1.1120   3rd Qu.:1.1120   3rd Qu.:27.5   3rd Qu.: 0.68301  
- Max.   :1.7338   Max.   :1.7338   Max.   :36.0   Max.   : 1.00000  
-     cos12             MaxTemp        MaxTempl1         Precip      
- Min.   :-1.00000   Min.   :16.02   Min.   :16.02   Min.   : 0.000  
- 1st Qu.:-0.68301   1st Qu.:23.18   1st Qu.:23.18   1st Qu.: 1.918  
- Median : 0.00000   Median :27.23   Median :27.23   Median : 4.631  
- Mean   :-0.02474   Mean   :26.47   Mean   :26.44   Mean   : 5.660  
- 3rd Qu.: 0.50000   3rd Qu.:30.79   3rd Qu.:30.79   3rd Qu.: 8.234  
- Max.   : 1.00000   Max.   :33.31   Max.   :33.31   Max.   :18.307  
-    Precipl1     
- Min.   : 0.000  
- 1st Qu.: 1.918  
- Median : 4.631  
- Mean   : 5.640  
- 3rd Qu.: 8.234  
- Max.   :18.307  
-
-
-
-
head(mozTS)
-
-
          Y       Yl1 t         sin12         cos12  MaxTemp MaxTempl1
-1 0.1348400 0.0000000 2  8.660254e-01  5.000000e-01 17.87269  17.74602
-2 0.6841675 0.1348400 3  1.000000e+00  6.123234e-17 23.81767  17.87269
-3 1.2724180 0.6841675 4  8.660254e-01 -5.000000e-01 26.03559  23.81767
-4 0.9063270 1.2724180 5  5.000000e-01 -8.660254e-01 30.01602  26.03559
-5 1.7337683 0.9063270 6  1.224647e-16 -1.000000e+00 31.12094  30.01602
-6 1.5430335 1.7337683 7 -5.000000e-01 -8.660254e-01 32.81130  31.12094
-      Precip   Precipl1
-1 16.5442658  3.3039919
-2  2.4056512 16.5442658
-3  8.9744062  2.4056512
-4  0.5679609  8.9744062
-5  4.8413427  0.5679609
-6  3.8490104  4.8413427
-
-
-
-
-

Building a first model

-

We will first build a very simple model – just a trend – to practice building the model, checking diagnostics, and plotting predictions.

-
-
mod1<-lm(Y ~ t, data=mozTS)
-summary(mod1)
-
-

-Call:
-lm(formula = Y ~ t, data = mozTS)
-
-Residuals:
-     Min       1Q   Median       3Q      Max 
--0.81332 -0.47902  0.03671  0.37384  0.87119 
-
-Coefficients:
-             Estimate Std. Error t value Pr(>|t|)    
-(Intercept)  0.904809   0.178421   5.071  1.5e-05 ***
-t           -0.007038   0.008292  -0.849    0.402    
----
-Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-
-Residual standard error: 0.4954 on 33 degrees of freedom
-Multiple R-squared:  0.02136,   Adjusted R-squared:  -0.008291 
-F-statistic: 0.7204 on 1 and 33 DF,  p-value: 0.4021
-
-
-

The model output indicates that this model is not useful – the trend is not significant and it only explains about 2% of the variability. Let’s plot the predictions:

-
-
## plot points and fitted lines
-plot(Y~t, data=mozTS, col=1, type="l")
-lines(t, mod1$fitted, col="dodgerblue", lwd=2)
-
-
-
-

-
-
-
-
-

Not good – we’ll definitely need to try something else! Remember that since we’re using a linear model for this, that we should check our residual plots as usual, and then also plot the acf of the residuals:

-
-
par(mfrow=c(1,3), mar=c(4,4,2,0.5))   
-
-## studentized residuals vs fitted
-plot(mod1$fitted, rstudent(mod1), col=1,
-     xlab="Fitted Values", 
-     ylab="Studentized Residuals", 
-     pch=20, main="AR 1 only model")
-
-## qq plot of studentized residuals
-qqnorm(rstudent(mod1), pch=20, col=1, main="" )
-abline(a=0,b=1,lty=2, col=2)
-
-## histogram of studentized residuals
-hist(rstudent(mod1), col=1, 
-     xlab="Studentized Residuals", 
-     main="", border=8)
-
-
-
-

-
-
-
-
-

This doesn’t look really bad, although the histogram might be a bit weird. Finally the acf

-
-
acf(mod1$residuals)
-
-
-
-

-
-
-
-
-

This is where we can see that we definitely aren’t able to capture the pattern. There’s substantial autocorrelation left at a 1 month lag, and around 6 months.

-

Finally, for moving forward, we can extract the BIC for this model so that we can compare with other models that you’ll build next.

-
-
n<-length(t)
-extractAIC(mod1, k=log(n))[2]
-
-
[1] -44.11057
-
-
-
-
-
-

Build and compare your own models

-

Follow the procedure I showed for the model with a simple trend, and build at least 4 more models:

-
    -
  1. one that contains an AR term
  2. -
  3. one with the sine/cosine terms
  4. -
  5. one with the environmental predictors
  6. -
  7. one with a combination
  8. -
-

Check diagnostics/model assumptions as you go. Then at the end compare all of your models via BIC. What is your best model by that metric? We’ll share among the group what folks found to be good models.

-
-
-

Extra Practice

-

Imagine that you are missing a few months at random – how would you need to modify the analysis. Try it out by removing about 5 months not at the beginning or end of the time series.

- - -
- -
- -
- - - - - \ No newline at end of file diff --git a/docs/VB_IntroTimeDepData_practical_files/figure-html/unnamed-chunk-10-1.png b/docs/VB_IntroTimeDepData_practical_files/figure-html/unnamed-chunk-10-1.png deleted file mode 100644 index 92d91fb..0000000 Binary files a/docs/VB_IntroTimeDepData_practical_files/figure-html/unnamed-chunk-10-1.png and /dev/null differ diff --git a/docs/VB_IntroTimeDepData_practical_files/figure-html/unnamed-chunk-11-1.png b/docs/VB_IntroTimeDepData_practical_files/figure-html/unnamed-chunk-11-1.png deleted file mode 100644 index 021a990..0000000 Binary files a/docs/VB_IntroTimeDepData_practical_files/figure-html/unnamed-chunk-11-1.png and /dev/null differ diff --git a/docs/VB_IntroTimeDepData_practical_files/figure-html/unnamed-chunk-12-1.png b/docs/VB_IntroTimeDepData_practical_files/figure-html/unnamed-chunk-12-1.png deleted file mode 100644 index d0e16aa..0000000 Binary files a/docs/VB_IntroTimeDepData_practical_files/figure-html/unnamed-chunk-12-1.png and /dev/null differ diff --git a/docs/VB_IntroTimeDepData_practical_files/figure-html/unnamed-chunk-3-1.png b/docs/VB_IntroTimeDepData_practical_files/figure-html/unnamed-chunk-3-1.png deleted file mode 100644 index a1b87b6..0000000 Binary files a/docs/VB_IntroTimeDepData_practical_files/figure-html/unnamed-chunk-3-1.png and /dev/null differ diff --git a/docs/VB_IntroTimeDepData_practical_files/figure-html/unnamed-chunk-4-1.png b/docs/VB_IntroTimeDepData_practical_files/figure-html/unnamed-chunk-4-1.png deleted file mode 100644 index 4c166ec..0000000 Binary files a/docs/VB_IntroTimeDepData_practical_files/figure-html/unnamed-chunk-4-1.png and /dev/null differ diff --git a/docs/VB_RegDiagTrans.html b/docs/VB_RegDiagTrans.html index 8140768..a8c26d4 100644 --- a/docs/VB_RegDiagTrans.html +++ b/docs/VB_RegDiagTrans.html @@ -11,7 +11,7 @@ - VectorByte Training 2024 - VectorByte Methods Training + VectorByte Training 2024 - Review of Diagnostics and Transformations for Regression Models @@ -391,8 +391,8 @@
-

VectorByte Methods Training

-

Review of Diagnostics and Transformations for Regression Models

+

Review of Diagnostics and Transformations for Regression Models

+

VectorByte Methods Training

diff --git a/docs/VB_RegDiagTrans_files/figure-revealjs/unnamed-chunk-15-1.png b/docs/VB_RegDiagTrans_files/figure-revealjs/unnamed-chunk-15-1.png index c66cd13..1ca3a45 100644 Binary files a/docs/VB_RegDiagTrans_files/figure-revealjs/unnamed-chunk-15-1.png and b/docs/VB_RegDiagTrans_files/figure-revealjs/unnamed-chunk-15-1.png differ diff --git a/docs/VB_RegDiagTrans_files/figure-revealjs/unnamed-chunk-19-1.png b/docs/VB_RegDiagTrans_files/figure-revealjs/unnamed-chunk-19-1.png index 80391b8..6d6fb87 100644 Binary files a/docs/VB_RegDiagTrans_files/figure-revealjs/unnamed-chunk-19-1.png and b/docs/VB_RegDiagTrans_files/figure-revealjs/unnamed-chunk-19-1.png differ diff --git a/docs/VB_RegDiagTrans_files/figure-revealjs/unnamed-chunk-20-1.png b/docs/VB_RegDiagTrans_files/figure-revealjs/unnamed-chunk-20-1.png index 0c0f7b9..be08848 100644 Binary files a/docs/VB_RegDiagTrans_files/figure-revealjs/unnamed-chunk-20-1.png and b/docs/VB_RegDiagTrans_files/figure-revealjs/unnamed-chunk-20-1.png differ diff --git a/docs/VB_RegDiagTrans_files/figure-revealjs/unnamed-chunk-7-1.png b/docs/VB_RegDiagTrans_files/figure-revealjs/unnamed-chunk-7-1.png index 90582a6..d36c052 100644 Binary files a/docs/VB_RegDiagTrans_files/figure-revealjs/unnamed-chunk-7-1.png and b/docs/VB_RegDiagTrans_files/figure-revealjs/unnamed-chunk-7-1.png differ diff --git a/docs/VB_RegDiagTrans_practical.html b/docs/VB_RegDiagTrans_practical.html index 69d4966..0238dc0 100644 --- a/docs/VB_RegDiagTrans_practical.html +++ b/docs/VB_RegDiagTrans_practical.html @@ -6,9 +6,10 @@ - + + -VectorByte Training 2024 - VectorByte Methods Training +VectorByte Training 2024 - VectorByte Methods Training: Regression Diagnostics and Transformations (practical)