# Path to the cleaned HFMD data directory.
# NOTE(review): absolute, machine-specific path — consider here::here() or a
# config value so the script is portable. The single-argument paste0() wrapper
# was redundant and has been removed.
path2data <-
  "/Users/nguyenpnt/Library/CloudStorage/OneDrive-OxfordUniversityClinicalResearchUnit/Data/HFMD/cleaned/"
# Name of cleaned HFMD data file
cases_file <- "hfmd_hcdc.rds"
# Case-level HFMD data, read once and reused throughout the analysis
df_cases <- paste0(path2data, cases_file) |>
  readRDS()
5 Forecasting model
5.1 Data path
5.2 Library
Required packages:
# Packages needed for the forecasting analysis
required <- c("BART",
              "tidyverse",
              "forecast",
              "MASS",
              "magrittr",
              "parallel",
              "lubridate",
              "tseries")
Installing those that are not installed yet:
to_install <- required[!required %in% installed.packages()[, "Package"]]
# Explicit length comparison rather than relying on integer truthiness
if (length(to_install) > 0)
  install.packages(to_install)
Loading the packages for interactive use:
# lapply (not sapply) for pure side effects; invisible() hides the return list
invisible(lapply(required, library, character.only = TRUE))
5.3 Function
Plotting function for the weekly forecast results (used for both the BART and ARIMA output)
# Plot observed weekly counts against forecasts, faceted by the week the
# forecast was issued (`week_run`), with shaded 80% and 95% interval ribbons.
#
# `data` must contain: adm_week (Date), n, pred_t, y_lower80, y_upper80,
# y_lower95, y_upper95, week_run.
model_plot_w <- function(data) {
  data |>
    ggplot(aes(x = adm_week)) +
    # Observed counts; dropping week_run repeats the full series in each facet
    geom_point(
      data = data |> dplyr::select(-week_run),
      aes(y = n),
      alpha = 0.4,
      size = 0.2
    ) +
    # Point forecast
    geom_line(aes(y = pred_t)) +
    # Shaded 95% interval
    geom_ribbon(
      aes(ymin = y_lower95, ymax = y_upper95),
      fill = "#4A90D9",
      alpha = 0.15,
      na.rm = TRUE
    ) +
    # Shaded 80% credible interval
    geom_ribbon(
      aes(ymin = y_lower80, ymax = y_upper80),
      fill = "#4A90D9",
      alpha = 0.25,
      na.rm = TRUE
    ) +
    scale_x_date(name = "Admission week") +
    scale_y_continuous(name = "Total cases") +
    labs(title = "Prediction for 3 weeks ahead") +
    theme_bw() +
    facet_wrap(~ week_run, ncol = 6)
}
function to run bart model to forecast 2023 data
# Use all but one core for parallel fitting; max() guards against the value
# becoming 0 on a single-core machine (mc.cores = 0 is an error).
cores <- max(1, detectCores() - 1)
# NOTE(review): commented-out BART forecaster kept for reference; the cached
# results it produced are load()-ed in section 5.5. Before re-enabling, note
# that its training filter uses `week == w` where run_arima_week() uses
# `week < w` — confirm which window is intended.
# run_bart_month <- function(w, hfmd_data,y,lambda) {
#
# # hfmd_data <- hfmd_1324
# # y = 2023
# # w = 39
#
# train_data <- hfmd_data |>
# filter(year < y | (year == y & week == w)) |>
# data.frame() |>
# na.omit()
#
# test_data <- hfmd_data |>
# filter(year == y & week %in% c(w+1,w+2,w+3)) |>
# data.frame()
#
# if (nrow(test_data) == 0)
# return(NULL)
#
# model <- wbart(
# x.train = train_data[, c("adm_week", "lag_t.1","lag_t.2","lag_t.3")],
# y.train = train_data$y_transformed,
# # x.test = test_data[, c("adm_week", "lag_t.1", "lag_t.2")],
# ntree = 1000,
# ndpost = 10000,
# nskip = 1000,
# a = 0.95,
# b = 2
# )
#
# pred_draws <- predict(model,test_data[, c("adm_week", "lag_t.1", "lag_t.2", "lag_t.3")])
# pred_ori <- InvBoxCox(pred_draws,lambda)
#
# result <- test_data |>
# mutate(
# pred_t = colMeans(pred_ori),
# y_lower95 = apply(pred_ori, 2, quantile, 0.025),
# y_upper95 = apply(pred_ori, 2, quantile, 0.975),
# y_lower80 = apply(pred_ori, 2, quantile, 0.10),
# y_upper80 = apply(pred_ori, 2, quantile, 0.90),
# week_run = w,
# year_run = y
# )
#
# return(result)
# }
# Function to run ARIMA model
# Fit an ARIMA model on an expanding training window and forecast up to
# 3 weeks ahead from week w of year y.
#
# w, y      : forecast origin; training uses all rows strictly before it
# hfmd_data : weekly series with columns year, week, y_transformed, ...
# lambda    : Box-Cox lambda used to back-transform forecasts to counts
#
# Returns the test rows augmented with the point forecast and 80/95% interval
# bounds on the original count scale, or NULL when there is nothing to fit.
run_arima_week <- function(w, y, hfmd_data, lambda) {
  # Expanding window training data
  train_data <- hfmd_data |>
    dplyr::filter(year < y | (year == y & week < w)) |>
    data.frame() |>
    na.omit()
  # Test = 3 weeks ahead from week w
  test_data <- hfmd_data |>
    dplyr::filter(year == y & week >= w & week <= w + 2) |>
    data.frame()
  # Scalar guard: use short-circuit ||, not the vectorized |
  if (nrow(test_data) == 0 || nrow(train_data) < 10) return(NULL)
  # Convert training series to a weekly ts object
  ts_train <- ts(train_data$y_transformed, frequency = 52)
  # Fit auto ARIMA; tryCatch so a failed fit skips this forecast origin
  # instead of aborting a whole grid run
  model <- tryCatch(auto.arima(ts_train), error = function(e) NULL)
  if (is.null(model)) return(NULL)
  # Forecast exactly nrow(test_data) steps, not always 3
  h <- nrow(test_data)
  fc <- forecast(model, h = h, level = c(80, 95))
  # Back-transform with InvBoxCox, which also handles the lambda = 0 (log)
  # edge case, rather than a manual power formula
  test_data |>
    mutate(
      pred_t    = as.numeric(InvBoxCox(fc$mean, lambda)),
      y_lower95 = as.numeric(InvBoxCox(fc$lower[, "95%"], lambda)),
      y_upper95 = as.numeric(InvBoxCox(fc$upper[, "95%"], lambda)),
      y_lower80 = as.numeric(InvBoxCox(fc$lower[, "80%"], lambda)),
      y_upper80 = as.numeric(InvBoxCox(fc$upper[, "80%"], lambda)),
      horizon   = row_number(),
      week_run  = w,
      year_run  = y
    )
}
5.4 Analysis
# Aggregate case-level admissions to weekly counts and add calendar columns
hfmd_1324 <- df_cases %>%
  mutate(adm_week = as.Date(floor_date(admission_date, "week"))) %>%
  group_by(adm_week) %>%
  dplyr::count() %>%
  ungroup() %>%
  mutate(
    # row_number() instead of 1:nrow(.) (safe for zero rows), and the column
    # is named `time` directly so the extra rename() step is unnecessary
    time = row_number(),
    week = week(adm_week),
    month = month(adm_week),
    year = year(adm_week)
  )
I applied the Box-Cox transformation to normalize the count data
# Profile-likelihood search for the Box-Cox lambda of the weekly counts
bc <- boxcox(n ~ adm_week, data = hfmd_1324)
# Lambda that maximizes the profile log-likelihood
lambda <- bc$x[which.max(bc$y)]
Add transformed cases and lag 1-week term
# Add Box-Cox-transformed counts plus 1-3 week lags.
# BoxCox() (forecast) equals the manual (n^lambda - 1) / lambda for
# lambda != 0 and correctly falls back to log() when lambda == 0, matching
# the InvBoxCox() used everywhere else for back-transformation. The manual
# formula would divide by zero if the estimated lambda were 0.
hfmd_1324 <- hfmd_1324 |>
  mutate(
    y_transformed = BoxCox(n, lambda),
    lag_t.1 = lag(y_transformed, 1),
    lag_t.2 = lag(y_transformed, 2),
    lag_t.3 = lag(y_transformed, 3)
  )
Before transform:
# Histogram of the raw weekly admission counts
ggplot(hfmd_1324, aes(n)) +
  geom_histogram(binwidth = 100)
After transform:
# Histogram of the Box-Cox-transformed weekly counts
ggplot(hfmd_1324, aes(y_transformed)) +
  geom_histogram(binwidth = 0.5)
5.5 BART model
First, I used data from 2013 to 2022 for training and 2023 for testing. Then, starting from each week of 2023, I combined the 2013-2022 data with the three lagged terms to predict 3 weeks ahead.
# NOTE(review): this commented call passes no `lambda`, although
# run_bart_month() declares one — re-check before re-running.
# year_week_grid <- expand.grid(y = c(2023), w = 1:52)
#
# results_list <- mclapply(
#   X = 1:nrow(year_week_grid),
#   FUN = function(i) {
#     run_bart_month(
#       w = year_week_grid$w[i],
#       y = year_week_grid$y[i],
#       hfmd_data = hfmd_1324
#     )
#   },
#   mc.cores = cores
# )
# pred_3w <- bind_rows(results_list)
# save(pred_3w,file = "pred_3w.RData")
# Load the cached BART predictions instead of re-fitting
load("./data/pred_3w.RData")
week 1-26, 2023
# BART forecasts issued in weeks 1-26 of 2023
model_plot_w(dplyr::filter(pred_3w, week_run <= 26))
week 27-52, 2023
# BART forecasts issued in weeks 27-52 of 2023
model_plot_w(dplyr::filter(pred_3w, week_run > 26))
week 1-24, 2024
# year_week_grid <- expand.grid(y = 2024, w = 1:26)
#
# results_list <- mclapply(
#   X = 1:nrow(year_week_grid),
#   FUN = function(i) {
#     run_bart_month(
#       w = year_week_grid$w[i],
#       y = year_week_grid$y[i],
#       hfmd_data = hfmd_1324,
#       lambda = lambda
#     )
#   },
#   mc.cores = cores
# )
# pred_3w_24 <- bind_rows(results_list)
# save(pred_3w_24, file = "pred_3w_24.RData")
# Fix: the load() call had been fused onto the comment line above and so
# never executed, leaving pred_3w_24 undefined for the plot below.
load("./data/pred_3w_24.RData")
# BART forecasts issued in weeks 1-24 of 2024
model_plot_w(dplyr::filter(pred_3w_24, week_run <= 24))
5.6 ARIMA model
ARIMA assumes the data are stationary. A time series is considered stationary if its statistical properties, such as the mean and variance, remain constant over time.
I used the ADF and KPSS tests to assess the stationarity of the HFMD time series. For the ARIMA model, I used data from 2012-2022 for training.
# ARIMA training window: all data before 2023.
# (The original clause `year == 2023 & week < 1` could never be TRUE because
# week() is always >= 1, so the filter reduces to year < 2023.)
train_data <- hfmd_1324 |>
  dplyr::filter(year < 2023) |>
  data.frame() |>
  na.omit()
First I used the number of admissions for stationarity testing. There are two tests for stationarity:
ADF test — null hypothesis: the time series has a unit root, meaning it is non-stationary.
KPSS test — null hypothesis: the series is stationary.
Create a time-series
# Weekly time series of the raw admission counts
ts_train <- ts(train_data$n, frequency = 52)
adf test
adf.test(ts_train) ## p = 0.01 < 0.05: reject the unit-root null -> evidence of stationarity
Augmented Dickey-Fuller Test
data: ts_train
Dickey-Fuller = -4.5366, Lag order = 8, p-value = 0.01
alternative hypothesis: stationary
kpss test
kpss.test(ts_train) ## p = 0.019 < 0.05: reject the stationarity null -> evidence of NON-stationarity
KPSS Test for Level Stationarity
data: ts_train
KPSS Level = 0.64106, Truncation lag parameter = 6, p-value = 0.0189
For a stationary series, the two tests should agree in opposite directions: the series is considered stationary when the ADF p-value is below 0.05 (reject the unit-root null) and the KPSS p-value is above 0.05 (fail to reject stationarity). The results above instead indicate non-stationarity (the KPSS test rejects its null), so I applied the Box-Cox transformation to the series and retested.
# Weekly series of the Box-Cox-transformed counts
ts_train_t <- ts(train_data$y_transformed, frequency = 52)
adf.test(ts_train_t) # p = 0.01: reject the unit-root null -> stationary
Augmented Dickey-Fuller Test
data: ts_train_t
Dickey-Fuller = -4.3587, Lag order = 8, p-value = 0.01
alternative hypothesis: stationary
kpss.test(ts_train_t) # p = 0.1 > 0.05: cannot reject stationarity -> consistent with a stationary series
KPSS Test for Level Stationarity
data: ts_train_t
KPSS Level = 0.30265, Truncation lag parameter = 6, p-value = 0.1
So I used the Box-Cox-transformed series in the model.
# # Explicit cluster setup
# cl <- makeCluster(cores)
#
# # Export everything the workers need
# clusterExport(cl, varlist = c("run_arima_week", "hfmd_1324",
# "lambda", "year_week_grid"))
#
# # Load packages on each worker
# clusterEvalQ(cl, {
# library(forecast)
# library(dplyr)
# })
#
# # Run
# sarima_results_list <- parLapply(
# cl = cl,
# X = 1:nrow(year_week_grid),
# fun = function(i) {
# tryCatch(
# run_arima_week(
# w = year_week_grid$w[i],
# y = year_week_grid$y[i],
# hfmd_data = hfmd_1324,
# lambda = lambda
# ),
# error = function(e) return(NULL)
# )
# }
# )
#
# # Always stop cluster when done
# stopCluster(cl)
#
# results_arima <- bind_rows(sarima_results_list)
# save(results_arima,file = "results_arima.RData")
# NOTE(review): `year_week_grid` above comes from a commented-out chunk in
# section 5.5 — define it explicitly before re-running this block.
# Load the cached ARIMA results instead of re-fitting
load("./data/results_arima.RData")
Week 1-26, 2023
# ARIMA forecasts issued in weeks 1-26 of 2023
model_plot_w(dplyr::filter(results_arima, week_run <= 26))
Week 27-52, 2023
# ARIMA forecasts issued in weeks 27-52 of 2023
model_plot_w(dplyr::filter(results_arima, week_run > 26))
Week 1-24, 2024
# results_arima_24 <- bind_rows(sarima_results_list)
#
# save(results_arima_24,file = "results_arima_24.RData")
# Cached 2024 ARIMA results
load(file = "./data/results_arima_24.RData")
# ARIMA forecasts issued in weeks 1-24 of 2024
model_plot_w(dplyr::filter(results_arima_24, week_run <= 24))