path2data <- paste0("C:/Users/ongph/github/data")Chapter 1
Constant
Path to data:
Files names:
# Case notification
cases_file <- "measles/measles_cases_2019-12.2025.xlsx"
# Serology
sero_file <- "sero/cleaned/measles_elisa_hcmc_children.rds"
# Vaccination coverage
## HCMC
vax_hcmc_file <- "vax/vaxreg_hcmc_measles.rds"
## Vietnam
vax_vn_file <- "vax/vaxvn_2023.xlsx"
# Population metadata
map_hcmc_file <- "map/gadm41_hcmc_district.rds"
map_vn_file <- "map/gadm41_vn.rds"
census_file <- "census2019_hcm.rds"Packages
Required packages:
required <- c("openxlsx", "janitor", "tidyverse", "scales", "mgcv", "stringi", "sf", "binom", "ggsci", "ggspatial", "patchwork", "zoo")
# Installing those that are not installed yet:
to_install <- required[! required %in% installed.packages()[, "Package"]]
if (length(to_install)) install.packages(to_install)
# Loading the packages for interactive use:
invisible(sapply(required, library, character.only = TRUE))Functions
Clean Vietnamese district and province name
clean_district <- function(x) {
x |>
stri_trans_general("Latin-ASCII") |>
str_to_lower() |>
str_squish() |>
str_replace_all("đ", "d") |>
str_remove("^(quan |huyen |thanh pho |tp )")
}
clean_province <- function(x) {
x |>
stri_trans_general("Latin-ASCII") |>
str_to_lower() |> # Fix case mismatches (e.g., Ha Noi vs ha noi)
str_squish() |> # Remove trailing, leading, or double spaces
str_replace_all("đ", "d") # Catch the Vietnamese 'đ' if the ASCII conversion misses it
}Calculate vaccination coverage
calc_cov <- function(data, age_expr, date_col, dose_lbl) {
data |>
filter({{ age_expr }}) |>
group_by(district_blank) |>
summarise(
pos = sum({{ date_col }} <= "2024-04-01", na.rm = TRUE),
n = n(),
.groups = "drop"
) |>
mutate(
cov = pos / n,
dose = dose_lbl,
ci_low = binom.wilson(pos, n)$lower,
ci_high = binom.wilson(pos, n)$upper
)
}Plot vaccination coverage
plot_cov <- function(data) {
ggplot(data) +
geom_sf(
aes(fill = cov),
colour = "grey30",
linewidth = 0.15
) +
scale_fill_gsea(
reverse = TRUE,
labels = scales::percent_format(accuracy = 1),
na.value = "grey90"
) +
theme_void(base_family = "sans", base_size = 11) +
theme(
legend.position = "right",
legend.key.height = unit(1.2, "cm"),
legend.key.width = unit(0.3, "cm"),
legend.title = element_text(face = "bold", size = 9),
plot.margin = margin(5, 5, 5, 5)
) +
labs(fill = "Coverage\n", title = NULL)
}Correct seroprevalence by sensitivity and specificity
prev_correct <- function(est_val, sens, spec) {
(est_val + spec - 1) / (sens + spec - 1)
}Predict with confidence interval
predict_ci <- function(model, data, var, ci = 0.95, length = 1000) {
p <- (1 - ci) / 2
link_inv <- model$family$linkinv
n <- nrow(data) - length(model$coefficients)
vals <- seq(from = min(data[[var]], na.rm = T), to = max(data[[var]], na.rm = T), length.out = length)
df <- data.frame(col = vals)
colnames(df) <- var
pred <- data.frame(predict(model, df, se.fit = TRUE))
pred[[var]] <- df[[var]]
pred$lwr <- link_inv(pred$fit + qt(p, n) * pred$se.fit)
pred$upr <- link_inv(pred$fit + qt(1 - p, n) * pred$se.fit)
pred$fit <- link_inv(pred$fit)
pred
}Data
Map:
df_map_hcmc <- file.path(path2data, map_hcmc_file) |>
readRDS() |>
mutate(district_blank = clean_district(district))
df_map_vn <- file.path(path2data, map_vn_file) |>
readRDS() |>
mutate(province_blank = clean_province(province))Census:
df_census <- file.path(path2data, census_file) |>
readRDS()
df_census_age <- df_census |>
group_by(age) |>
summarise(n_census = sum(n), .groups = "drop")
df_census_district <- df_census |>
group_by(district) |>
summarise(n_census = sum(n), .groups = "drop") |>
mutate(
district = clean_district(district)
)Case report data:
df_case <- file.path(path2data, cases_file) |>
read.xlsx(sheet = 4) |>
clean_names() |>
mutate(
ngay_nv = as.Date(as.numeric(ngay_nv), origin = "1899-12-30"),
ngay_sinh = as.Date(as.numeric(ngay_sinh), origin = "1899-12-30"),
age = time_length(interval(ngay_sinh, ngay_nv), "year"),
district = clean_district(quan_huyen)
)Serology:
df_sero <- file.path(path2data, sero_file) |>
readRDS()
# Get children in vaccination age in 3 hospitals, sampling between 01-08-2022 to 30-04-2024
df_sero_3ch <- df_sero |>
filter(
exact_age >= 9/12,
hospital %in% c("Children's hospital 1", "Children's hospital 2", "City Children's Hospital"),
date_collection <= dmy("30-04-2024"),
date_collection >= dmy("01-08-2022")
) |>
mutate(
age_5y = if_else(exact_age < 5, "<5", ">=5"),
age_5y = fct_relevel(age_5y, "<5", ">=5"),
age_ceiling = ceiling(exact_age)
) |>
left_join(df_census, by = join_by(age_ceiling == age)) |>
mutate(
weight = n_census/mean(n_census)
)
# Children < 9 month-old
df_sero_infant <- df_sero |>
filter(exact_age < 9/12)Vaccination coverage
df_vax_hcmc <- file.path(path2data, vax_hcmc_file) |>
readRDS() |>
mutate(district_blank = clean_district(district))
df_vax_vn <- file.path(path2data, vax_vn_file) |>
read.xlsx() |>
mutate(province_blank = clean_province(province))Seroprevalence vs coverage
Calculate daily incidence rate by district for the 2024 epidemic (starts from 1 April 2024)
df_incidence_district <- df_case |>
filter(
ngay_nv >= dmy("01-04-2024"),
district %in% df_census_district$district
) |>
group_by(district, ngay_nv) |>
summarise(n_cases = n(), .groups = "drop") |>
# Complete the time series so days with 0 cases are included
complete(district,
ngay_nv = seq(min(ngay_nv), max(ngay_nv), by = "1 day"),
fill = list(n_cases = 0)) |>
# Join with census data to get population
left_join(df_census_district, by = "district") |>
# Calculate daily and cumulative attack rates (per 100,000)
group_by(district) |>
mutate(
cum_cases = cumsum(n_cases),
cum_ar = (cum_cases / n_census) * 100000,
cum_ar_7day = rollmean(cum_ar, k = 7, fill = NA, align = "right")
) |>
ungroup() |>
mutate(region = case_when(
str_detect(district, "binh chanh|binh tan") ~ "west",
str_detect(district, "hoc mon|12|cu chi") ~ "north",
str_detect(district, "thu duc") ~ "east",
str_detect(district, "nha be|7|can gio") ~ "south",
.default = "central"), # the rest is central
region = factor(region, levels = c("west", "north", "east", "south", "central"))
)df_incidence_district |>
# Handle 0s before logging, as log(0) is -Inf
filter(cum_ar_7day > 0) |>
ggplot(aes(x = ngay_nv, y = cum_ar_7day, color = region, group = district)) +
geom_line(alpha = 0.6) +
scale_y_log10() +
scale_color_startrek() +
labs(
x = "Date of admission",
y = "Cumulative AR (log10 scale)",
color = "District"
) +
theme_classic() +
theme(
legend.position = "bottom"
)Compute seroprevalence by district
df_seroprev_district <- df_sero_3ch |>
mutate(
# equivocal is considered negative
pos = if_else(result == "positive", 1, 0)
) |>
group_by(district) |>
summarise(
pos = sum(pos),
n = n(),
cov_sero = pos / n,
ci_low_sero = binom::binom.wilson(pos, n)$lower,
ci_high_sero = binom::binom.wilson(pos, n)$upper
)Compute coverage by district in December 2023.
1st dose:
df_cov_m1 <- df_vax_hcmc |>
mutate(age = decimal_date(dmy("01-04-2024")) - decimal_date(dob)) |>
filter(age >= 9 / 12, age <= 5) |>
mutate(is_m1 = date_m1 <= dmy("01-04-2024")) |>
group_by(district) |>
summarise(
pos = sum(is_m1, na.rm = T),
n = n(),
cov = pos / n,
ci_low = binom::binom.wilson(pos, n)$lower,
ci_high = binom::binom.wilson(pos, n)$upper,
dose = "1st"
)2nd dose:
df_cov_m2 <- df_vax_hcmc |>
mutate(age = decimal_date(dmy("01-04-2024")) - decimal_date(dob)) |>
filter(age >= 18/12, age <= 5) |>
mutate(is_m2 = date_m2 <= dmy("01-04-2024")) |>
group_by(district) |>
summarise(
pos = sum(is_m2, na.rm = T),
n = n(),
cov = pos / n,
ci_low = binom::binom.wilson(pos, n)$lower,
ci_high = binom::binom.wilson(pos, n)$upper,
dose = "2nd"
)Plot the inconsistency:
df_plot <- rbind(df_cov_m1, df_cov_m2) |>
left_join(df_seroprev_district, by = "district")
df_plot |>
ggplot(aes(x = cov, y = cov_sero)) +
# Reference line for perfect consistency
geom_abline(slope = 1, intercept = 0, linetype = "dashed", colour = "grey50") +
# Error bars for Y (seroprevalence) and X (reported coverage)
geom_errorbar(aes(ymin = ci_low_sero, ymax = ci_high_sero), width = 0, alpha = 0.5) +
geom_errorbarh(aes(xmin = ci_low, xmax = ci_high), height = 0, alpha = 0.5) +
# Facet by dose type
facet_wrap(~dose, labeller = as_labeller(c(`1st` = "1st dose", `2nd` = "2nd dose")), scale = "free") +
labs(
x = "Vaccination coverage",
y = "Seroprevalence"
) +
theme_bw()
ggsave("figs/chap1/seroprev_vs_cov.png", width = 5, height = 3, dpi = 300)Calculate expected seroprevalence from vaccination coverage
Compare empirical seroprevalence with this expected seroprevalence
Age distribution among cases
df_case |>
filter(ngay_nv >= dmy("01-07-2018") & ngay_nv < dmy("01-04-2020")) |>
mutate(
week = floor_date(ngay_nv, "week"),
age_group = case_when(
age < 5 ~ "<5",
age >= 5 & age <= 15 ~ "5-15",
age > 15 ~ ">15"
),
age_group = fct_relevel(age_group, "<5", "5-15", ">15")
) |>
drop_na(week, age_group) |>
count(week, age_group) |>
mutate(percentage = n / sum(n), .by = week) |>
ggplot(aes(x = week, y = percentage)) +
geom_col(
aes(fill = age_group),
width = 7,
# This keeps <5 at the bottom of the bars
position = position_stack(reverse = TRUE),
colour = "white",
linewidth = 0.2
) +
scale_fill_manual(
values = c("<5" = "#56B4E9", "5-15" = "#E69F00", ">15" = "#009E73")
) +
# This flips the legend to match the bars
guides(fill = guide_legend(reverse = TRUE)) +
annotate(
"rect",
xmin = dmy("01-12-2018"),
xmax = dmy("31-05-2019"),
ymin = 0,
ymax = 1,
fill = "grey90",
alpha = 0.5
) +
geom_vline(
xintercept = c(dmy("01-12-2018"), dmy("31-05-2019")),
linetype = "dashed",
colour = "black",
linewidth = 0.6
) +
scale_y_continuous(
labels = label_percent(),
limits = c(0, 1),
breaks = seq(0, 1, by = 0.25),
expand = c(0, 0)
) +
scale_x_date(
date_breaks = "2 months",
date_labels = "%b\n%Y",
expand = expansion(mult = c(0.01, 0.01))
) +
labs(
x = "Admission week",
y = "Proportion of cases",
fill = "Age group"
) +
theme_minimal(base_size = 12) +
theme(
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank(),
panel.grid.major.y = element_line(colour = "grey90", linewidth = 0.4),
axis.line.x = element_line(colour = "black", linewidth = 0.6),
axis.ticks.x = element_line(colour = "black"),
axis.text = element_text(colour = "black"),
axis.title = element_text(face = "bold"),
legend.position = "right", # Moved to the right (optional, see note below)
legend.title = element_text(face = "bold"),
plot.margin = margin(t = 15, r = 15, b = 10, l = 10)
)
# ggsave("figs/measles_age_distr_2018.png", width = 7, height = 2.5, dpi = 300, bg = "white")Vaccination coverage
HCMC coverage
vax_base <- df_vax_hcmc |>
mutate(age_y = time_length(interval(dob, ymd("2024-04-01")), "years"))
map_m1 <- df_map_hcmc |>
left_join(calc_cov(vax_base, age_y >= 9/12 & age_y <= 5, date_m1, "1st"), by = "district_blank")
map_m2 <- df_map_hcmc |>
left_join(calc_cov(vax_base, age_y >= 18/12 & age_y < 5, date_m2, "2nd"), by = "district_blank")
plot_m1 <- plot_cov(map_m1) + labs(fill = NULL)
plot_m2 <- plot_cov(map_m2) + labs(fill = NULL)
# ggsave("figs/hcmc_coverage_dose1.png", plot = plot_m1, width = 2.5, height = 2.5, bg = "white")
# ggsave("figs/hcmc_coverage_dose2.png", plot = plot_m2, width = 2.5, height = 2.5, bg = "white")Coverage in the south
# 1. Define the raw list
province_south_raw <- c("Bình Dương", "Bình Phước", "Tây Ninh", "Bà Rịa Vũng Tàu", "Đồng Nai", "Long An", "Tiền Giang", "Bến Tre", "Vĩnh Long", "Trà Vinh", "Đồng Tháp", "Hậu Giang", "An Giang", "Kiên Giang", "Bạc Liêu", "Sóc Trăng", "Cà Mau", "Hồ Chí Minh", "Cần Thơ")
# 2. Clean the list using the exact same logic applied to your dataframe
province_south_blank <- province_south_raw |>
stri_trans_general("Latin-ASCII") |>
str_to_lower() |>
str_squish() |>
str_replace_all("đ", "d")
# 3. Join the data and filter for only the Southern provinces
map_south <- df_map_vn |>
left_join(df_vax_vn, by = "province_blank") |>
filter(province_blank %in% province_south_blank)
# 4. Generate the publication-grade plot
plot_measles_south_highlight <- ggplot() +
# 1. Base map of all Southern provinces
geom_sf(
data = map_south,
aes(fill = measles_pct),
colour = "grey30",
linewidth = 0.15
) +
# 2. Highlight layer specifically for Ho Chi Minh
geom_sf(
data = map_south |> filter(province_blank == "ho chi minh"),
fill = NA, # Keeps the map colour visible underneath
colour = "yellow", # High-contrast border colour
linewidth = 0.8 # Thicker line to make it pop
) +
scale_fill_gsea(
reverse = TRUE,
labels = scales::percent_format(scale = 1, accuracy = 1),
na.value = "grey90"
) +
theme_void(base_family = "sans", base_size = 11) +
theme(
legend.position = "right",
legend.key.height = unit(1.2, "cm"),
legend.key.width = unit(0.3, "cm"),
legend.title = element_text(face = "bold", size = 9),
plot.margin = margin(5, 5, 5, 5)
) +
labs(fill = NULL, title = NULL)
# ggsave("figs/south_vn_measles.png", plot = plot_measles_south_highlight, width = 3, height = 3, dpi = 300, bg = "white")Hospital catchment
sero_summary <- df_sero |>
mutate(district_blank = clean_district(district)) |>
count(hospital, district_blank) |> # Shortcut for group_by() + summarise(n = n())
group_by(hospital) |>
mutate(pct = n / sum(n)) |> # Calculate proportion within each hospital
ungroup()
# 3. Split the data into a list of dataframes (one per hospital)
sero_list <- split(sero_summary, sero_summary$hospital)
# 4. Define the plotting function
# imap() passes the dataframe first, and the list name (hospital) second
plot_hospital_map <- function(hosp_data, hosp_name) {
map_data <- df_map_hcmc |>
left_join(hosp_data, by = "district_blank") |>
mutate(pct = replace_na(pct, 0))
ggplot(map_data) +
geom_sf(
aes(fill = pct),
colour = "grey30",
linewidth = 0.15
) +
# Replace the GSEA scale with distiller
scale_fill_distiller(
palette = "Reds",
direction = 1, # 1 forces light colors for low values, dark for high
labels = scales::percent_format(scale = 100, accuracy = 1),
na.value = "grey90"
) +
theme_void(base_family = "sans", base_size = 11) +
theme(
legend.position = "right",
legend.key.height = unit(1.2, "cm"),
legend.key.width = unit(0.3, "cm"),
legend.title = element_text(face = "bold", size = 9),
plot.margin = margin(5, 5, 5, 5)
) +
labs(
fill = NULL,
title = NULL
)
}
# 5. Apply the function across all hospitals
hospital_plots <- imap(sero_list, plot_hospital_map)
ggsave("figs/catchment_ch1.png", plot = hospital_plots[[1]], width = 2.5, height = 2.5, dpi = 300, bg = "white")
ggsave("figs/catchment_ch2.png", plot = hospital_plots[[2]], width = 2.5, height = 2.5, dpi = 300, bg = "white")
ggsave("figs/catchment_cch.png", plot = hospital_plots[[3]], width = 2.5, height = 2.5, dpi = 300, bg = "white")Seroprevalence per hospital
Sensitivity, specificity and critical vaccination threshold
sens <- 0.99
spec <- 0.95
critp <- 0.95Aggregate seroprevalence with confidence intervals
agg_sero <- df_sero |>
group_by(hospital, age_5y, period_collect) |>
summarise(
npos = sum(pos, na.rm = TRUE),
ntot = n(),
prev = npos / ntot,
# Safety: prop.test fails if ntot is 0
ci_lwr = if_else(ntot > 0, prop.test(npos, ntot, conf.level = 0.95)$conf.int[1], NA_real_),
ci_upr = if_else(ntot > 0, prop.test(npos, ntot, conf.level = 0.95)$conf.int[2], NA_real_),
.groups = "drop"
) |>
mutate(
prev_true = prev_correct(prev, sens, spec),
lwr = prev_correct(ci_lwr, sens, spec),
upr = prev_correct(ci_upr, sens, spec)
)Plot
# Set color palette
pals <- hue_pal()(3)
l1 <- map(unique(df_sero$hospital), \(current_hosp) {
# Filter data for the current hospital in the loop
df_tmp <- df_sero |> filter(hospital == current_hosp)
tmp_agg_sero <- agg_sero |> filter(hospital == current_hosp)
# Helper function to fit GAM and compute adjusted predictions
fit_and_adjust_gam <- function(data_subset) {
mod <- gam(
pos ~ s(as.numeric(date_collect)),
family = binomial,
weights = weight,
data = data_subset,
method = "REML"
)
predict_ci(model = mod, data = data_subset, var = "date_collect") |>
mutate(
fit_true = prev_correct(fit, sens, spec),
lwr_true = prev_correct(lwr, sens, spec),
upr_true = prev_correct(upr, sens, spec)
)
}
# Fit the two age groups
pred1 <- fit_and_adjust_gam(df_tmp |> filter(age_5y == "<5"))
pred2 <- fit_and_adjust_gam(df_tmp |> filter(age_5y == ">=5"))
# Define the blue rectangle for the latest month
rect_plot <- tibble(
xmin = floor_date(max(df_tmp$date_collect, na.rm = TRUE), "month"),
xmax = ceiling_date(max(df_tmp$date_collect, na.rm = TRUE), "month") - days(1),
ymin = -0.05,
ymax = 1.05
)
# Define dynamic x-axis breaks based strictly on this hospital's collection periods
ym_lab <- tibble(
date_collect = sort(unique(df_tmp$period_collect))
) |>
mutate(
year = year(date_collect),
add_year = !duplicated(year)
)
# Build the plot
ggplot(df_tmp) +
geom_hline(yintercept = critp, col = "darkred", linetype = "dashed") +
geom_point(aes(x = date_collect, y = pos), shape = 124, alpha = 0.3) +
geom_line(data = pred1, aes(x = date_collect, y = fit_true, color = "a")) +
geom_line(data = pred2, aes(x = date_collect, y = fit_true, color = "b")) +
geom_pointrange(
data = tmp_agg_sero |> filter(age_5y == "<5"),
aes(x = period_collect + days(15), y = prev_true, ymin = lwr, ymax = upr, color = "a"),
size = 0.3, alpha = 0.5
) +
geom_pointrange(
data = tmp_agg_sero |> filter(age_5y == ">=5"),
aes(x = period_collect + days(15), y = prev_true, ymin = lwr, ymax = upr, color = "b"),
size = 0.3, alpha = 0.5
) +
geom_ribbon(data = pred1, aes(x = date_collect, ymin = lwr_true, ymax = upr_true, fill = "a"), alpha = 0.15) +
geom_ribbon(data = pred2, aes(x = date_collect, ymin = lwr_true, ymax = upr_true, fill = "b"), alpha = 0.15) +
geom_rect(data = rect_plot, aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax), fill = "transparent", color = "blue") +
scale_y_continuous(
breaks = seq(0, 1, 0.2),
limits = c(-0.1, 1.1),
labels = scales::label_percent(suffix = "")
) +
scale_x_date(
breaks = ym_lab$date_collect,
labels = format(ym_lab$date_collect, ifelse(ym_lab$add_year, "%b\n%Y", "%b")),
expand = expansion(mult = c(0.02, 0.02))
) +
scale_color_manual(values = pals[c(2, 3)], breaks = c("a", "b"), labels = c("<5", ">=5")) +
scale_fill_manual(values = pals[c(2, 3)], breaks = c("a", "b"), labels = c("<5", ">=5")) +
labs(
x = NULL,
y = "Seroprevalence",
fill = "Age",
color = "Age",
title = current_hosp
) +
theme_light() +
theme(
strip.placement = "outside",
strip.clip = "off",
strip.text = element_text(color = "grey30"),
strip.background = element_rect(fill = NA, color = NA),
panel.spacing = unit(0, "cm"),
panel.border = element_rect(color = NA),
legend.position = "none",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(color = "black")
)
})
# ggsave(plot = l1[[1]], "figs/ch1.png", width = 5.5, height = 1.6, dpi = 300, bg = "white")
# ggsave(plot = l1[[2]], "figs/cch.png", width = 5.5, height = 1.6, dpi = 300, bg = "white")
# ggsave(plot = l1[[3]], "figs/ch2.png", width = 5.5, height = 1.6, dpi = 300, bg = "white")
library(patchwork)
# Turn the legend back on for the very first plot in your list
l1[[1]] <- l1[[1]] + theme(legend.position = "right")
# Wrap the entire list into a grid, and collect the guide (legend) to the bottom
l1[[1]]
# ggsave(plot = l1[[1]], "figs/seroprev_legend.png", width = 5.5, height = 1.6, dpi = 300, bg = "white")All sero
# 1. Fit the models
mod1 <- gam(
pos ~ s(as.numeric(date_collection)),
family = binomial,
data = df_sero_all,
method = "REML"
)
mod_u5 <- gam(
pos ~ s(as.numeric(date_collection)),
family = binomial,
data = df_sero_all |> filter(exact_age < 5 | age_max < 5),
method = "REML"
)
mod_o5 <- gam(
pos ~ s(as.numeric(date_collection)),
family = binomial,
data = df_sero_all |> filter(exact_age >= 5 | age_max >= 5),
method = "REML"
)
# 2. Generate predictions
# Assuming your predict_ci function returns columns like: date_collection, fit, lwr, upr
pred1 <- predict_ci(model = mod1, data = df_sero_all, var = "date_collection")
pred_u5 <- predict_ci(model = mod_u5, data = df_sero_all, var = "date_collection")
pred_o5 <- predict_ci(model = mod_o5, data = df_sero_all, var = "date_collection")
# Combine into one dataframe for clean ggplot mapping
pred_combined <- bind_rows(
pred1 |> mutate(age_group = "All ages"),
pred_u5 |> mutate(age_group = "Under 5"),
pred_o5 |> mutate(age_group = "5 and older")
) |>
mutate(
age_group = factor(age_group, levels = c("All ages", "Under 5", "5 and older")),
date_collection = as.Date(date_collection) # Converts POSIXct to Date
)
# 3. Aggregate case data
df_case_weekly <- df_case |>
mutate(week_nv = floor_date(ngay_nv, unit = "week", week_start = 1)) |>
count(week_nv)
# 4. Calculate the scaling factor for the dual axis
# We scale the 0-1 probabilities up to match the height of the maximum case count
scale_factor <- max(df_case_weekly$n, na.rm = TRUE)
# 5. Plot
ggplot() +
# Layer 1: Case counts (bars) without alpha
geom_col(
data = df_case_weekly,
aes(x = week_nv, y = n),
fill = "#374F77",
width = 7
) +
# Layer 2: GAM Confidence Intervals (ribbons)
geom_ribbon(
data = pred_combined,
aes(x = date_collection, ymin = lwr * scale_factor, ymax = upr * scale_factor, fill = age_group),
alpha = 0.3
) +
# Layer 3: GAM Fits (lines)
geom_line(
data = pred_combined,
aes(x = date_collection, y = fit * scale_factor, colour = age_group)
) +
scale_colour_manual(values = c(pals[[1]], pals[[2]], pals[[3]])) +
scale_fill_manual(values = c(pals[[1]], pals[[2]], pals[[3]])) +
geom_hline(
yintercept = 0.95 * scale_factor,
linetype = "dashed",
colour = "darkred",
linewidth = 0.8
) +
scale_x_date(
name = "Week of admission",
date_breaks = "1 year",
date_labels = "%Y",
expand = expansion(mult = c(0.02, 0.02))
) +
scale_y_continuous(
name = "Weekly case counts",
sec.axis = sec_axis(~ . / scale_factor, name = "Seroprevalence", labels = scales::percent)
) +
theme_classic(base_family = "sans") +
theme(
axis.text = element_text(size = 11, colour = "black"),
legend.position = "top",
legend.title = element_blank(),
plot.margin = margin(t = 5, r = 15, b = 5, l = 5, unit = "pt")
)
ggsave("figs/sero_2017_2024.png", width = 5, height = 2.5, dpi = 300, bg = "white")Case notification
df_case |>
mutate(week_nv = floor_date(ngay_nv, unit = "week", week_start = 1)) |>
count(week_nv) |>
ggplot(aes(x = week_nv, y = n)) +
# Using a professional dark blue-grey; width = 7 makes the weekly bars touch
geom_col(fill = "#374F77", width = 7) +
# Set the x-axis to show yearly breaks
scale_x_date(
date_breaks = "1 year",
date_labels = "%Y",
expand = expansion(mult = c(0.02, 0.02)) # Slight padding so bars don't clip the edges
) +
labs(
x = "Week of admission",
y = "Weekly case counts"
) +
theme_classic() +
theme(
# Optional: adjust axis text size if needed
axis.text = element_text(size = 11, colour = "black"),
plot.margin = margin(t = 5, r = 15, b = 5, l = 5, unit = "pt")
)df_case_summary <- df_case |>
mutate(
# 1. Clean the hospital names
ten_bv = ten_bv |>
stri_trans_general("Latin-ASCII") |>
str_to_lower() |>
str_squish() |>
str_replace_all("đ", "d") |>
str_replace_all("\\bbv\\b", "benh vien"), # Expands standalone "bv" only
# 2. Recode to specific hospital acronyms
ten_bv = case_match(
ten_bv,
"benh vien nhi dong 1" ~ "CH1",
"benh vien nhi dong 2" ~ "CH2",
"benh vien nhi dong thanh pho" ~ "CCH",
.default = NA_character_
),
# 3. Clean the district column
district_blank = clean_district(quan_huyen),
# 4. Extract year and group into periods
year_adm = year(ngay_nv),
year_group = case_when(
year_adm %in% 2018:2020 ~ "2018-2020",
year_adm %in% 2024:2025 ~ "2024-2025",
.default = NA_character_ # Tags 2021-2023 or missing years as NA
)
) |>
# 5. Filter out unwanted hospitals and years
filter(
!is.na(ten_bv),
!is.na(year_group)
) |>
# 6. Count rows per hospital, district, and year group
count(ten_bv, district_blank, year_group, name = "n_cases") |>
# Enforce your requirement: only keep districts that match the map
filter(district_blank %in% df_map_hcmc$district_blank) |>
# Generate a complete matrix so 0-count districts are explicitly recorded as 0
complete(
district_blank = unique(df_map_hcmc$district_blank),
ten_bv,
year_group,
fill = list(n_cases = 0)
) |>
# Drop any NA combinations if they slipped through
filter(!is.na(ten_bv), !is.na(year_group)) |>
mutate(
ten_bv = factor(ten_bv, levels = c("CH1", "CH2", "CCH"), labels = c("Children's Hospital 1", "Children's Hospital 2", "City Children's Hospital"))
)
map_cases <- df_map_hcmc |>
inner_join(df_case_summary, by = "district_blank")
# 3. Plot the faceted map grid
plot_cases <- ggplot(map_cases) +
geom_sf(
aes(fill = n_cases),
colour = "grey30",
linewidth = 0.15
) +
# Use ColorBrewer's Blues palette (direction = 1 makes 0 light, high counts dark)
scale_fill_distiller(
palette = "Blues",
direction = 1,
na.value = "grey90"
) +
# Create a matrix layout: Hospitals on the y-axis, Year Groups on the x-axis
facet_grid(ten_bv ~ year_group, switch = "y") +
theme_void(base_family = "sans", base_size = 11) +
theme(
legend.position = "left",
legend.key.width = unit(0.3, "cm"),
legend.title = element_text(face = "bold", size = 9, vjust = 0.8),
# Add spacing and bolding to the row/column labels
strip.text = element_text(face = "bold", size = 10, margin = margin(t = 5, b = 5, l = 5, r = 5)),
plot.margin = margin(10, 10, 10, 10)
) +
labs(fill = "Case counts", title = NULL)
ggsave(plot = plot_cases, "figs/hospital_case_counts.png", width = 5, height = 5, dpi = 300, bg = "white")df_case |>
filter(ngay_nv > dmy("01-04-2024") & ngay_nv < dmy("15-07-2024")) |>
mutate(week_nv = floor_date(ngay_nv, unit = "week", week_start = 1)) |>
count(week_nv) |>
ggplot(aes(x = week_nv, y = n)) +
# Using a professional dark blue-grey; width = 7 makes the weekly bars touch
geom_col(fill = "#374F77", width = 7) +
# Set the x-axis to show yearly breaks
scale_x_date(
date_breaks = "1 year",
date_labels = "%Y",
expand = expansion(mult = c(0.02, 0.02)) # Slight padding so bars don't clip the edges
) +
labs(
x = "Week of admission",
y = NULL
) +
theme_classic() +
theme(
# Optional: adjust axis text size if needed
axis.text = element_text(size = 11, colour = "black"),
plot.margin = margin(t = 5, r = 15, b = 5, l = 5, unit = "pt")
)
# ggsave("figs/peak_2025.png", width = 3, height = 3, dpi = 300, bg = "transparent")Sero of infants
df_sero_infant |>
# 1. Convert age inline
mutate(age_month = age * 12) |>
ggplot(aes(x = age_month, y = titre)) +
geom_hline(yintercept = 200, linetype = "dashed") +
# Added alpha to prevent overplotting with your raw points
geom_point(alpha = 0.4) +
# 2. Fit and plot the GLM in one step
geom_smooth(
method = "glm",
method.args = list(family = Gamma(link = "log")),
level = 0.95,
colour = "blue"
) +
# Vaccination milestones
geom_vline(xintercept = 9, linetype = "dashed", colour = "#D55E00", linewidth = 0.7) +
geom_vline(xintercept = 12, linetype = "dashed", colour = "#0072B2", linewidth = 0.7) +
scale_y_continuous(breaks = c(200, 500, 1000, 1500, 2000)) +
scale_x_continuous(breaks = seq(0, 12, 2)) +
labs(
x = "Age (months)",
y = "Titer (mIU/mL)"
) +
theme_classic()
# ggsave("figs/infant.png", width = 5, height = 2, dpi = 300)