Data

Import the data from a CSV file. This data is from the Google Flu trends dataset which estimates influenza-like illness from online search activity (weekly).

The data can be found here: https://www.google.org/flutrends/about/data/flu/ca/data.txt.

flu_data <- read.csv('datafilecsv.csv')

# drop Canada since we are just dealing with provinces
flu_data = subset(flu_data, select = -c(2)) 
names(flu_data)<-c('Date','AB','BC','MT','NB','NL','NS','ON','SK','QC')
col_order <- c("Date","BC","AB","SK","MT","ON","QC","NB","NS","NL")
flu_data <- flu_data[, col_order]

# Example Plot of Canad and New Brunswick Data
plot(x=as.Date(flu_data$Date),y=flu_data$ON,type='l',
     main='Google Flu Trends for Canada',
     xlab='Time',
     ylab='Estimate of Flu Basedon Search Patterns')
lines(x=as.Date(flu_data$Date),y=flu_data$NB,type='l',col='red')
legend('topleft', legend=c("Ontario", "New Brunswick"),
       col=c("black", "red"), lty=1,cex=0.8)

Transform the Data

Since the data exhibits a strong seasonality (52-week) and also appears to have higher variance for higher values, we will take the log and take a 52 week difference.

flu_data_transformed <- flu_data

# take log
flu_data_transformed[2:10] <- log(flu_data[2:10])

# take 52 week difference
for(j in seq(2,10)){
  flu_data_transformed[53:dim(flu_data_transformed)[1],j] <- diff(flu_data_transformed[,j],52)
}

flu_data_transformed <- flu_data_transformed[53:dim(flu_data_transformed)[1],]

plot(x=as.Date(flu_data_transformed$Date),y=flu_data_transformed$ON,type='l',main="Ontario 52 Week Change in Log Flu",
     xlab='',ylab='')

Fit a Standard VAR Model

Below we fit a VAR model and then print out the AR(1) matrix as a heatmap. Note that the model should be read ‘by row’, i.e. the coefficients used to model BC cases are found in the first row (labelled BC on the right).

library(MTS)
## Warning: package 'MTS' was built under R version 3.6.3
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.3
## -- Attaching packages ---------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1     v purrr   0.3.4
## v tibble  3.0.1     v dplyr   0.8.3
## v tidyr   1.0.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## Warning: package 'tibble' was built under R version 3.6.3
## Warning: package 'purrr' was built under R version 3.6.3
## -- Conflicts ------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(pheatmap)
## Warning: package 'pheatmap' was built under R version 3.6.3
# create a training set
flu_train <- flu_data_transformed[1:541,]

# Note we use flu_data[2:11] because we don't need to fit the
# model to the date column
flu_var_model = VAR(as.matrix(flu_train[2:10]),
    p=1,
    output=F, 
    include.mean=T, 
    fixed=NULL)

pheatmap(flu_var_model$Phi,display_numbers = T,
         cluster_rows = F, cluster_cols = F,legend=F,
         main='VAR Coefficient Matrix',
         labels_row=names(flu_data[2:10]),labels_col=names(flu_data[2:10]))

Here we predict 1-12 week ahead flu numbers for each province.

flu_predictions = VARpred(flu_var_model,h=12,orig=0, output=T)
par(mfrow=c(3,1))

flu_test_baseline <- flu_data[542:553,]
flu_test_pred <- exp(log(flu_test_baseline[,2:10])+flu_predictions$pred)
flu_test_pred[,'Date'] <- flu_data[594:605,'Date']


plot(x=as.Date(flu_data$Date[1:605]),y=flu_data$BC[1:605],type='l',
     main='BC',
     xlab='',
     ylab='')
lines(x=as.Date(flu_test_pred$Date),y=flu_test_pred$BC,col='Red')

plot(x=as.Date(flu_data$Date[1:605]),y=flu_data$ON[1:605],type='l',
     main='ON',
     xlab='',
     ylab='')
lines(x=as.Date(flu_test_pred$Date),y=flu_test_pred$ON,col='Red')

plot(x=as.Date(flu_data$Date[1:605]),y=flu_data$NB[1:605],type='l',
     main='NB',
     xlab='',
     ylab='')
lines(x=as.Date(flu_test_pred$Date),y=flu_test_pred$NB,col='Red')