library(COVID19)
# Retrieve COVID-19 data for the United States and prepare a tsibble
<- covid19(country = "US", level = 1, verbose = FALSE) %>%
covid_raw as_tsibble(index = date) %>%
select(date, confirmed, deaths, people_vaccinated) %>%
rename(vaccines = people_vaccinated)
# Convert to NEW weekly cases
<- covid_raw %>%
covid_weekly mutate(across(c(confirmed, deaths, vaccines), ~. - lag(.))) %>%
index_by(week = yearweek(date)) %>%
summarize(
cases = sum(confirmed, na.rm = TRUE),
deaths = sum(deaths, na.rm = TRUE),
vaccines = sum(vaccines, na.rm = TRUE)
%>%
) filter(cases > 0, deaths > 0, vaccines >= 0) %>%
mutate(across(c(cases, deaths, vaccines), ~ log(. + 1)))
<- breakpoints(cases ~ 1 + vaccines + deaths, data = covid_weekly, h = 0.2)
bp_cases <- covid_weekly$week[bp_cases$breakpoints]
break_dates
# Create regime dummy
<- covid_weekly %>%
covid_weekly mutate(
intervention_period = case_when(
< break_dates[1] ~ "pre",
week >= break_dates[1] & week < break_dates[2] ~ "phase1",
week >= break_dates[2] & week < break_dates[3] ~ "phase2",
week >= break_dates[3] ~ "post"
week
) )
Activity44
VAR Modeling and Forecast
Let’s model weekly COVID-19 trajectories using logged differenced time series (\(\Delta y_t = \log(y_t - y_{t-1} + 1)\)) to stabilize variance.
<- nrow(covid_weekly)
n_total <- round(0.2 * n_total)
h <- covid_weekly[1:(n_total - h), ]
train <- covid_weekly[(n_total - h + 1):n_total, ] test
# Initialize forecast storage
<- tibble(
forecast_df week = test$week,
actual_cases = exp(test$cases) - 1,
actual_deaths = exp(test$deaths) - 1,
actual_vaccines = exp(test$vaccines) - 1
)
Model Specifications
- Benchmark: Fit a univariate ARIMA model to weekly cases using automated order selection
- ARIMAX: Extend ARIMA with vaccines as exogenous regressor (\(\text{cases}_t \sim \text{vaccines}_t\))
- VAR(2): Model bidirectional relationships between cases and vaccines with 2-lag structure:
\(Y_t = \begin{bmatrix}\text{cases}_t\\ \text{vaccines}_t\end{bmatrix} = A_1Y_{t-1} + A_2Y_{t-2} + \epsilon_t\)
- VAR(3): Tri-variate system incorporating deaths with 3-lag structure
- ETS: Automated exponential smoothing state-space model
# Model 1: Univariate ARIMA
<- train %>%
fit_arima model(ARIMA(cases))
<- forecast(fit_arima, h = h) %>%
fc_arima mutate(arima_cases = exp(.mean) - 1)
# Model 2: ARIMAX with Vaccines
<- train %>%
fit_arimax model(ARIMA(cases ~ vaccines))
<- forecast(fit_arimax, new_data = test) %>%
fc_arimax mutate(arimax_cases = exp(.mean) - 1)
# Model 3: VAR(2) - Cases & Vaccines
<- train %>% as_tibble() %>%
var2_data select(cases, vaccines) %>%
mutate(across(everything(), difference)) %>%
drop_na()
<- VAR(var2_data, p = 2)
var2_model <- predict(var2_model, n.ahead = h)
fc_var2
<- tail(train$cases, 1)
last_val_cases <- cumsum(fc_var2$fcst$cases[,1]) + last_val_cases
var2_fc_cases $var2_cases <- exp(var2_fc_cases) - 1
forecast_df
<- tail(train$vaccines, 1)
last_val_vaccines <- cumsum(fc_var2$fcst$vaccines[,1]) + last_val_vaccines
var2_fc_vaccines $var2_vaccines <- exp(var2_fc_vaccines) - 1 forecast_df
# Model 4: VAR(3) - Cases, Deaths, Vaccines
<- train %>% as_tibble() %>%
var3_data select(cases, deaths, vaccines) %>%
mutate(across(everything(), difference)) %>%
drop_na()
<- VAR(var3_data, p = 3)
var3_model <- predict(var3_model, n.ahead = h)
fc_var3
<- tail(train$cases, 1)
last_val_cases <- cumsum(fc_var3$fcst$cases[,1]) + last_val_cases
var3_fc_cases $var3_cases <- exp(var3_fc_cases) - 1
forecast_df
<- tail(train$vaccines, 1)
last_val_vaccines <- cumsum(fc_var3$fcst$vaccines[,1]) + last_val_vaccines
var3_fc_vaccines $var3_vaccines <- exp(var3_fc_vaccines) - 1
forecast_df
<- tail(train$deaths, 1)
last_val_deaths <- cumsum(fc_var3$fcst$deaths[,1]) + last_val_deaths
var3_fc_deaths $var3_deaths <- exp(var3_fc_deaths) - 1 forecast_df
# Model 5: ETS
<- train %>%
fit_ets model(ETS(cases))
<- forecast(fit_ets, h = h) %>%
fc_ets mutate(ets_cases = exp(.mean) - 1)
library(Metrics)
library(MLmetrics)
# Accuracy Metrics Calculation
<- tibble(
metrics Model = c("ARIMA", "ARIMAX", "VAR(2)", "VAR(3)", "ETS"),
MAE = c(
mae(forecast_df$actual_cases, fc_arima$arima_cases),
mae(forecast_df$actual_cases, fc_arimax$arimax_cases),
mae(forecast_df$actual_cases, forecast_df$var2_cases),
mae(forecast_df$actual_cases, forecast_df$var3_cases),
mae(forecast_df$actual_cases, fc_ets$ets_cases)
),MAPE = c(
mape(forecast_df$actual_cases, fc_arima$arima_cases),
mape(forecast_df$actual_cases, fc_arimax$arimax_cases),
mape(forecast_df$actual_cases, forecast_df$var2_cases),
mape(forecast_df$actual_cases, forecast_df$var3_cases),
mape(forecast_df$actual_cases, fc_ets$ets_cases)
),RMSE = c(
rmse(forecast_df$actual_cases, fc_arima$arima_cases),
rmse(forecast_df$actual_cases, fc_arimax$arimax_cases),
rmse(forecast_df$actual_cases, forecast_df$var2_cases),
rmse(forecast_df$actual_cases, forecast_df$var3_cases),
rmse(forecast_df$actual_cases, fc_ets$ets_cases)
),Bias = c(
mean(fc_arima$arima_cases - forecast_df$actual_cases),
mean(fc_arimax$arimax_cases - forecast_df$actual_cases),
mean(forecast_df$var2_cases - forecast_df$actual_cases),
mean(forecast_df$var3_cases - forecast_df$actual_cases),
mean(fc_ets$ets_cases - forecast_df$actual_cases)
),R2 = c(
R2_Score(fc_arima$arima_cases, forecast_df$actual_cases),
R2_Score(fc_arimax$arimax_cases, forecast_df$actual_cases),
R2_Score(forecast_df$var2_cases, forecast_df$actual_cases),
R2_Score(forecast_df$var3_cases, forecast_df$actual_cases),
R2_Score(fc_ets$ets_cases, forecast_df$actual_cases)
)
)%>% knitr::kable() metrics
Model | MAE | MAPE | RMSE | Bias | R2 |
---|---|---|---|---|---|
ARIMA | 181316.8 | 0.7059227 | 211070.5 | 169885.8 | -1.7278961 |
ARIMAX | 136716.9 | 0.3821726 | 164800.2 | -133689.6 | -0.6629847 |
VAR(2) | 182251.2 | 0.7037307 | 207695.5 | 180974.8 | -1.6413571 |
VAR(3) | 317627.4 | 1.2251102 | 365140.8 | 313386.1 | -7.1638232 |
ETS | 172758.6 | 0.5315084 | 218620.2 | -171756.7 | -1.9265338 |
# Visualization
ggplot(forecast_df, aes(x = week)) +
geom_line(aes(y = actual_cases, color = "Actual"), size = 1) + # Map to "Actual"
geom_line(aes(y = var2_cases, color = "VAR2")) + # Map to "VAR2"
geom_line(aes(y = var3_cases, color = "VAR3")) + # Map to "VAR3"
geom_line(data = fc_ets, aes(y = ets_cases, color = "ETS")) + # Map to "ETS"
geom_line(data = fc_arima, aes(y = arima_cases, color = "ARIMA")) + # Map to "ARIMA"
geom_line(data = fc_arimax, aes(y = arimax_cases, color = "ARIMAX")) + # Map to "ARIMAX"
labs(
title = "Model Forecast Comparison - Weekly COVID Cases",
subtitle = "Colors represent models",
y = "Cases",
color = "Model"
+
) scale_color_manual(
values = c(
"Actual" = "black",
"VAR2" = "#E69F00",
"VAR3" = "#56B4E9",
"ETS" = "#009E73",
"ARIMA" = "#CC79A7",
"ARIMAX" = "#0072B2"
)+
) theme_minimal() +
theme(legend.position = "bottom")
Conceptual Discussion Questions
Q1: Why difference the series before VAR estimation?
Q2: What advantage does VAR have over ARIMAX for policy analysis?
Q3: What diagnostic checks would you perform to assess VAR model stability, and why might differencing alone be insufficient?
Q4: When would you prefer ETS/ARIMA over VAR despite their lower forecast accuracy in this study?
Q5: Why might automated ARIMA selection fail to detect regime-dependent dynamics?