Import the data from a CSV file. This data is from the Google Flu trends dataset which estimates influenza-like illness from online search activity (weekly).
The data can be found here: https://www.google.org/flutrends/about/data/flu/ca/data.txt.
flu_data <- read.csv('datafilecsv.csv')
# drop Canada since we are just dealing with provinces
flu_data = subset(flu_data, select = -c(2))
names(flu_data)<-c('Date','AB','BC','MT','NB','NL','NS','ON','SK','QC')
col_order <- c("Date","BC","AB","SK","MT","ON","QC","NB","NS","NL")
flu_data <- flu_data[, col_order]
# Example Plot of Canad and New Brunswick Data
plot(x=as.Date(flu_data$Date),y=flu_data$ON,type='l',
main='Google Flu Trends for Canada',
xlab='Time',
ylab='Estimate of Flu Basedon Search Patterns')
lines(x=as.Date(flu_data$Date),y=flu_data$NB,type='l',col='red')
legend('topleft', legend=c("Ontario", "New Brunswick"),
col=c("black", "red"), lty=1,cex=0.8)
Since the data exhibits a strong seasonality (52-week) and also appears to have higher variance for higher values, we will take the log and take a 52 week difference.
flu_data_transformed <- flu_data
# take log
flu_data_transformed[2:10] <- log(flu_data[2:10])
# take 52 week difference
for(j in seq(2,10)){
flu_data_transformed[53:dim(flu_data_transformed)[1],j] <- diff(flu_data_transformed[,j],52)
}
flu_data_transformed <- flu_data_transformed[53:dim(flu_data_transformed)[1],]
plot(x=as.Date(flu_data_transformed$Date),y=flu_data_transformed$ON,type='l',main="Ontario 52 Week Change in Log Flu",
xlab='',ylab='')
Below we fit a VAR model and then print out the AR(1) matrix as a heatmap. Note that the model should be read ‘by row’, i.e. the coefficients used to model BC cases are found in the first row (labelled BC on the right).
library(MTS)
## Warning: package 'MTS' was built under R version 3.6.3
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.3
## -- Attaching packages ---------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1 v purrr 0.3.4
## v tibble 3.0.1 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## Warning: package 'tibble' was built under R version 3.6.3
## Warning: package 'purrr' was built under R version 3.6.3
## -- Conflicts ------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(pheatmap)
## Warning: package 'pheatmap' was built under R version 3.6.3
# create a training set
flu_train <- flu_data_transformed[1:541,]
# Note we use flu_data[2:11] because we don't need to fit the
# model to the date column
flu_var_model = VAR(as.matrix(flu_train[2:10]),
p=1,
output=F,
include.mean=T,
fixed=NULL)
pheatmap(flu_var_model$Phi,display_numbers = T,
cluster_rows = F, cluster_cols = F,legend=F,
main='VAR Coefficient Matrix',
labels_row=names(flu_data[2:10]),labels_col=names(flu_data[2:10]))
Here we predict 1-12 week ahead flu numbers for each province.
flu_predictions = VARpred(flu_var_model,h=12,orig=0, output=T)
par(mfrow=c(3,1))
flu_test_baseline <- flu_data[542:553,]
flu_test_pred <- exp(log(flu_test_baseline[,2:10])+flu_predictions$pred)
flu_test_pred[,'Date'] <- flu_data[594:605,'Date']
plot(x=as.Date(flu_data$Date[1:605]),y=flu_data$BC[1:605],type='l',
main='BC',
xlab='',
ylab='')
lines(x=as.Date(flu_test_pred$Date),y=flu_test_pred$BC,col='Red')
plot(x=as.Date(flu_data$Date[1:605]),y=flu_data$ON[1:605],type='l',
main='ON',
xlab='',
ylab='')
lines(x=as.Date(flu_test_pred$Date),y=flu_test_pred$ON,col='Red')
plot(x=as.Date(flu_data$Date[1:605]),y=flu_data$NB[1:605],type='l',
main='NB',
xlab='',
ylab='')
lines(x=as.Date(flu_test_pred$Date),y=flu_test_pred$NB,col='Red')