2  Manuscript figures

2.1 Library

Required packages:

required <- c(
  "readxl",
  "magrittr",
  "lubridate",
  "janitor",
  "tidyverse",
  "stringi",
  "stringr",
  "gtsummary",
  "zoo",
  "patchwork",
  "cluster",
  "factoextra",
  "fitdistrplus",
  "mgcv",
  "scam"
)

Installing those that are not installed yet:

to_install <- required[!required %in% installed.packages()[, "Package"]]
if (length(to_install))
  install.packages(to_install)

Loading the packages for interactive use:

invisible(sapply(required, library, character.only = TRUE))

2.2 Constant

user <- Sys.getenv("USER")

if (user == "nguyenpnt") {
  path2data <- paste0(
    "/Users/nguyenpnt/Library/CloudStorage/OneDrive-OxfordUniversityClinicalResearchUnit/Data/HFMD/cleaned/"
  )
}

# Name of cleaned HFMD data file
cases_file <- "hfmd_hcdc.rds"

# Name of cleaned sero file
sero_file <- "sero_EV-A71_1222-1223.rds"

# Name of cleaned viro file
viro_file <- "virology_cleaned.rds"

# Name of outpatient data file
ch1_outpatient <- "CH1_outpatient_HCM_cleaned.rds"

2.3 Functions

Function to read data

readRDS2 <- function(x) readRDS(paste0(path2data, x))

Function for table 1

calculate_prop_test <- function(data, variable, by, ...) {
  # subset to non-missing
  data <- tidyr::drop_na(data, dplyr::all_of(c(variable, by)))

  prop.trend.test(
    x = table(data[[variable]], data[[by]])[2, ], # get the second row (the positive row)
    n = table(data[[by]])
  ) |>
    broom::tidy()
}

Overide select funtion

select <- dplyr::select

Function to create the cohort lines

cohort_lines <- function(cohort_ages = seq(0, 5, by = 1),viro = FALSE){
  start_date <- as.Date("2023-01-01")
  end_date   <- as.Date("2023-12-31")

  ## generate the cohort line
  tseq <- seq(start_date, end_date, by = "day")
  cohort_lines <- lapply(cohort_ages, function(a0){
    data.frame(
      date = tseq,
      age  = a0 + as.numeric(tseq - start_date) / 365,
      cohort = as.factor(a0)
    )
  }) |> dplyr::bind_rows()
  
  color_cohort <- ifelse(viro == TRUE,"black","#80FFFFFF")

  ## create mid-opacity white of cohort lines before June
  cohort_lines <- cohort_lines |>
    mutate(trend = case_when(
      date < as.Date("2023-06-01") ~ color_cohort,
      date >= as.Date("2023-06-01") ~ "#FFFFFFFF",
    ))
  cohort_lines$group <- consecutive_id(cohort_lines$trend)
  cohort_lines <- head(do.call(rbind, by(cohort_lines, cohort_lines$group, rbind, NA)), -1)
  cohort_lines[, c("trend", "group")] <- lapply(cohort_lines[, c("trend", "group")], na.locf)
  cohort_lines[] <- lapply(cohort_lines, na.locf, fromLast = T)

  cohort_lines$trend <- factor(cohort_lines$trend,
                               levels = c(color_cohort,"#FFFFFFFF"))
  cohort_lines
}

Constrained algorithm for serological data

age_profile_constrained_cohort2 <- function(data,
                                            age_values = seq(0, 15, le = 512),
                                            ci = .95,
                                            n = 100) {
  predict2 <- function(...)
    predict(..., type = "response") |> as.vector()
  
  age_profile <- function(data,
                          age_values = seq(0, 15, le = 512),
                          ci = .95) {
    model <- gam(pos ~ s(age), binomial, data)
    link_inv <- family(model)$linkinv
    df <- nrow(data) - length(coef(model))
    p <- (1 - ci) / 2
    model |>
      predict(list(age = age_values), se.fit = TRUE) %>%
      c(list(age = age_values), .) |>
      as_tibble() |>
      mutate(
        lwr = link_inv(fit + qt(p, df) * se.fit),
        upr = link_inv(fit + qt(1 - p, df) * se.fit),
        fit = link_inv(fit)
      ) |>
      dplyr::select(-se.fit)
  }
  
  
  age_profile_unconstrained <- function(data,
                                        age_values = seq(0, 15, le = 512),
                                        ci = .95) {
    data |>
      group_by(collection) |>
      group_map( ~ age_profile(.x, age_values, ci))
  }
  
  shift_right <- function(n, x) {
    if (n < 1)
      return(x)
    c(rep(NA, n), head(x, -n))
  }
  
  dpy <- 365 # number of days per year
  
  mean_collection_times <<- data |>
    group_by(collection) |>
    summarise(mean_col_date = mean(col_date2)) |>
    with(setNames(mean_col_date, collection))
  
  cohorts <- cumsum(c(0, diff(mean_collection_times))) |>
    divide_by(dpy * mean(diff(age_values))) |>
    round() |>
    map(shift_right, age_values)
  
  age_time <- map2(mean_collection_times,
                   cohorts,
                   ~ tibble(collection_time = .x, cohort = .y))
  
  age_time_inv <- age_time |>
    map( ~ cbind(.x, age = age_values)) |>
    bind_rows() |>
    na.exclude()
  
  data |>
    # Step 1:
    group_by(collection) |>
    group_modify( ~ .x |>
                    age_profile(age_values, ci) |>
                    mutate(across(c(fit, lwr, upr), ~ map(
                      .x, ~ rbinom(n, 1, .x)
                    )))) |>
    group_split() |>
    map2(age_time, bind_cols) |>
    bind_rows() |>
    unnest(c(fit, lwr, upr)) |>
    pivot_longer(c(fit, lwr, upr), names_to = "line", values_to = "seropositvty") |>
    # Step 2a:
    filter(cohort < max(age) - diff(range(mean_collection_times)) / dpy) |>
    group_by(cohort, line) |>
    group_modify(
      ~ .x %>%
        scam(seropositvty ~ s(collection_time, bs = "mpi"), binomial, .) |>
        predict2(list(collection_time = mean_collection_times)) %>%
        tibble(collection_time = mean_collection_times, seroprevalence  = .)
    ) |>
    ungroup() |>
    group_by(collection_time, line) |>
    group_modify( ~ .x |>
                    mutate(across(
                      seroprevalence, ~ gam(.x ~ s(cohort), betar) |>
                        predict2()
                    ))) |>  ### modified
    ungroup() |>
    pivot_wider(names_from = line, values_from = seroprevalence) |>
    group_by(collection_time) |>
    group_split()
}

2.4 Data

Serological data:

df_sero <- readRDS2(sero_file)

Virological data:

df_viro <- readRDS2(viro_file) %>%
  mutate(pos = case_when(
    sero_gr == "EV-A71" ~ TRUE,
    sero_gr != "EV-A71" ~ FALSE),
    pos_cv = case_when(
      sero_gr == "CV-A6" ~ TRUE,
      sero_gr != "CV-A6" ~ FALSE),
    peak = case_when(
      month(admission_date) >= 6 & month(admission_date) <= 8 ~ "6/2023 - 8/2023",
      month(admission_date) >= 9 ~ "9/2023 - 12/2023"
    ),
    adm_week = as.Date(floor_date(admission_date, "week")),
    adm_month = as.Date(floor_date(admission_date, "month")),
    time = as.numeric(admission_date)
  )

CH1 outpatient data:

ch1_outpatient <- readRDS2(ch1_outpatient)

Case report data:

df23 <- cases_file |>
  readRDS2() |> 
  dplyr::select(dob, admission_date, district, inout, severity, medi_cen) |> 
  filter(year(admission_date) == 2023) |> 
  filter_out(is.na(dob)) |> 
  unique() |> 
  mutate(across(severity, ~ case_when(is.na(.x) ~ "Undefined",
                                      severity %in% c("3", "4") ~ "3 or 4",
                                      .default = severity) |> as.factor()),
         across(medi_cen, ~ tolower(stri_trans_general(.x, "Latin-ASCII"))),
         across(medi_cen,
                ~ case_when(.x == "benh vien nhi dong 1" ~ "Children's hospital 1",
                            .x == "benh vien benh nhiet doi tphcm" ~
                              "Hospital of Tropical Diseases",
                            .x == "benh vien nhi dong 2" ~ "Children's hospital 2",
                            .x == "benh vien nhi dong thanh pho" ~
                              "City Children's hospital", .default = .x)),
         age1     = interval(dob, admission_date) / years(1),
         time     = as.numeric(admission_date),
         adm_week = as.Date(floor_date(admission_date, "week")),
         age      = round(age1))

2.5 Analysis

2.5.1 K-mean

Using k-mean clustering algorithm to separate 2 waves

set.seed(123)

kmeans_result_1 <- kmeans(df23[, c("age1", "time")], centers = 2, nstart = 25)

df23$wave <- factor(kmeans_result_1$cluster,
                    labels = c("1st", "2nd"),
                    levels = c(2, 1))

## show the time point that seperate two peaks

time_cut_1st <- df23 |>
  filter(wave == "1st") |>
  pull(time) |>
  max()

time_cut_2nd <- df23 |>
  filter(wave == "2nd") |>
  pull(time) |>
  min()

time_cut <- mean(time_cut_1st, time_cut_2nd) |> as.numeric()
time_cut
[1] 19605

So I selected time points at 1.9605^{4} to seperate two peaks

df23 |>
  filter(age1 <= 15) |>
  ggplot(aes(x = admission_date, y = age1, color = wave)) +
  geom_point(size = .5) +
  scale_x_datetime(date_breaks = "1 month" , date_labels = "%b") +
  theme_minimal() +
  geom_vline(xintercept = as.Date(time_cut), linewidth = .2) +
  labs(x = "Admission date (2023)", y = "Age (years)") +
  guides(color = guide_legend(override.aes = list(size = 8), title = "Wave")) +
  theme(
    axis.title.x = element_text(size = 18),
    axis.text.x = element_text(size = 18),
    axis.ticks.x = element_blank(),
    legend.position = "bottom",
    axis.title.y = element_text(size = 18),
    axis.text.y = element_text(size = 18),
    legend.text = element_text(size = 18),
    legend.title = element_text(size = 18)
  )

ggsave(
  "./fig_manuscript/sup2_new.svg",
  width = 15,
  height = 9,
  dpi = 500
)
ggsave(
  "./fig_manuscript/sup2_new.png",
  width = 15,
  height = 9,
  dpi = 500
)

2.5.2 Description table

### table1

tab_age <-
  df23 |>
  tbl_summary(by = wave,
              label = list(age1 ~ "Age (Years)"),
              digits = list(all_continuous() ~ c(2,2),
                            all_categorical() ~ c(0,2)),
              include = c(age1)) |>
  remove_row_type(type = "missing") 

tab_sev <-
  df23 |> 
  filter(severity != "Undefined") |> 
  dplyr::select(wave, severity)  |> 
  mutate(rn = row_number()) |> 
  tidyr::spread(severity, severity) |> 
  mutate(across(-c(wave, rn), ~ ifelse(is.na(.), 0, 1))) |>
  dplyr::select(-rn) |>
  tbl_summary(by = wave,
              statistic = list(
                all_categorical() ~ "{n} / {N} ({p}%)"
              ),
              digits = all_categorical() ~ c(0,0,2)) |>
  modify_table_body(
    ~ .x |>
      mutate(row_type = "level",
             variable = "wave") %>%
      {bind_rows(
        tibble(variable = "wave", row_type = "label", label = "Severity"),
        .
      )}
  )

tab_inout <-
  df23 |>
  filter(inout %in% c("Inpatient","Outpatient")) |> 
  select(wave, inout) |>
  mutate(rn = row_number()) |>
  tidyr::spread(inout, inout) |>
  mutate(across(-c(wave, rn), ~ ifelse(is.na(.), 0, 1))) |>
  select(-rn) |>
  tbl_summary(by = wave,
              statistic = list(
                all_categorical() ~ "{n} / {N} ({p}%)"
              ),
              digits = all_categorical() ~ c(0,0,2)) |>
  modify_table_body(
    ~ .x |>
      mutate(row_type = "level",
             variable = "wave") %>%
      {bind_rows(
        tibble(variable = "wave", row_type = "label", label = "Treatment"),
        .
      )}
  )

final_table <- tbl_stack(list(tab_age, tab_sev, tab_inout),quiet = TRUE) 

final_table
Characteristic 1st
N = 23,2151
2nd
N = 19,9581
Age (Years) 2.66 (1.78, 3.75) 2.35 (1.51, 3.68)
Severity

    1 5,200 / 8,191 (63.48%) 4,168 / 5,787 (72.02%)
    2A 2,445 / 8,191 (29.85%) 1,454 / 5,787 (25.13%)
    2B 415 / 8,191 (5.07%) 128 / 5,787 (2.21%)
    3 or 4 131 / 8,191 (1.60%) 37 / 5,787 (0.64%)
Treatment

    Inpatient 3,151 / 22,424 (14.05%) 2,425 / 19,510 (12.43%)
    Outpatient 19,273 / 22,424 (85.95%) 17,085 / 19,510 (87.57%)
1 Median (Q1, Q3)

Table 1: Demographic and clinical characteristics of reported HFMD cases in Ho Chi Minh City in 2023.

2.5.3 Geography

df23 %>%
  group_by(adm_week, district) %>%
  count() %>%
  ggplot(aes(x = adm_week, y = n)) +
  geom_col() +
  scale_x_date(
    date_breaks = "2 months",
    date_labels = "%b",
    name = "Admission week",
    limits = c(as.Date("2023-01-01"), as.Date("2023-12-31"))
  ) +
  theme_bw() +
  labs(y = "Number of admission") +
  facet_wrap( ~ district, ncol = 4) +
  theme(
    axis.title.x = element_text(size = 15),
    axis.text.x = element_text(size = 15),
    axis.ticks.x = element_blank(),
    legend.position = "none",
    plot.tag = element_text(face = "bold", size = 18),
    axis.title.y = element_text(size = 18),
    axis.text.y = element_text(size = 18),
    strip.text.x = element_text(size = 18)
  )

ggsave(
  "./fig_manuscript/sup1_new.svg",
  width = 12,
  height = 12,
  dpi = 500
)

2.5.4 Age distribution

Code to plot time series cases

## figure 1a: time series cases
f1a <- df23 |>
  group_by(adm_week,wave) |> 
  count() |> 
  ungroup() |> 
  ggplot(aes(x = adm_week, y = n,fill = wave))+
  geom_col(alpha = 0.6,color="black")+
  scale_x_date(date_breaks = "1 month",
               date_labels = "%b",
               name = "Admission date (2023)",
               limits = c(as.Date("2023-01-01"),as.Date("2023-12-31")))+
  # geom_vline(xintercept = as.Date(time_cut))+
  scale_fill_manual(values = c("#582C83FF","#FFC72CFF"))+
  theme_minimal()+
  labs(y = "Cases",tag = "A")+
  theme(axis.title.y = element_text(size = 18),
        axis.title.x = element_blank(),
        axis.ticks.x = element_blank(),
        legend.position = "bottom",
        plot.tag = element_text(face = "bold", size = 18),
        axis.text.x = element_blank(),
        axis.text.y = element_text(size = 18),
        legend.text = element_text(size = 15),
        legend.title = element_text(size = 18))+
  guides(fill = guide_colourbar(barwidth=20),
         color = "none")

Code to fit GAM to age distribution

## aggregate number of admission cases overtime
df_smooth <- df23 |>
  mutate(age3 = floor(age1)) |>
  group_by(admission_date, age3) |>
  summarize(n = n(), .groups = "drop") |>
  complete(age3, admission_date, fill = list(n = 0)) |>
  mutate(
    age = as.numeric(age3),
    time = as.numeric(admission_date)   # convert to numeric scale
  )


## gam formular with tensor-product
fit_gau_bs <- gam(
  n ~ s(age, bs = "bs") +
    s(time, bs = "bs") +
    ti(age, time, bs = c("bs", "bs")) ,
  data = df_smooth,
  family = gaussian(link = "log"),
  method = "REML"
)

## create data for predict
age_val <- seq(0, 6, le = 365)

collection_date_val <- seq(min(df23$adm_week), max(df23$adm_week), le = 365)

new_data <- expand.grid(age = age_val, time = as.numeric(collection_date_val))
## predict
new_data$pred_gau_bs <- predict(fit_gau_bs, newdata = new_data, type = "response")

f1b_new <- ggplot(new_data) +
  geom_raster(aes(x = as.Date(time), y = age, fill = pred_gau_bs), interpolate = TRUE) +
  geom_line(
    data = cohort_lines(),
    aes(
      x = date,
      y = age,
      group = cohort,
      color = trend
    ),
    linewidth = 0.3,
    alpha = 0.4
  ) +
  scale_x_date(date_breaks = "1 month", date_labels = "%b") +
  scale_fill_viridis_c(option = "turbo", n.breaks = 10) +
  theme_minimal() +
  labs(x = "Admission date",
       y = "Age (years)",
       fill = "Number of\nadmissions",
       tag = "B") +
  theme_minimal() +
  scale_y_reverse(name = "Age (years)",
                  lim = rev(c(-0.5, 6.5)),
                  breaks = seq(-1, 7)) +
  theme(
    axis.title.y = element_text(size = 18),
    axis.title.x = element_blank(),
    axis.ticks.x = element_blank(),
    legend.position = "bottom",
    plot.tag = element_text(face = "bold", size = 18),
    axis.text.x = element_text(size = 18),
    axis.text.y = element_text(size = 18),
    legend.text = element_text(size = 15),
    legend.title = element_text(size = 18)
  ) +
  guides(fill = guide_colourbar(barwidth = 25), color = "none")

Fiting age distribution as a function of binary time variable

fit1 <- fitdist(df23 %>% filter(age1 > 0 &
                                  wave == "1st") %>% pull(age1), "lnorm")
fit2 <- fitdist(df23 %>% filter(age1 > 0 &
                                  wave == "2nd") %>% pull(age1), "lnorm")

x_vals <- seq(0, 7, le = 512)


fit_1p <- data.frame(
  wave = "1st",
  age = x_vals,
  density = dlnorm(
    x_vals,
    meanlog = fit1$estimate[1],
    sdlog = fit1$estimate[2]
  )
)

fit_2p <- data.frame(
  wave = "2nd",
  age = x_vals,
  density = dlnorm(
    x_vals,
    meanlog = fit2$estimate[1],
    sdlog = fit2$estimate[2]
  )
)

f1c_new <- ggplot(data = df23) +
  geom_histogram(aes(x = age1, fill = wave),
                 color = "white",
                 alpha = .5) +
  scale_fill_manual(values = c("#582C83FF", "#FFC72CFF")) +
  geom_line(data = rbind(fit_1p, fit_2p), aes(x = age, y = density * 5000)) +
  facet_wrap( ~ wave) +
  scale_x_reverse(
    limit = c(6, 0),
    breaks = seq(0, 6, by = 1),
    minor_breaks = NULL
  ) +
  scale_y_continuous(
    minor_breaks = NULL,
    name = "Cases count",
    sec.axis = sec_axis(trans =  ~ . / 5000, name = "Density")
  ) +
  coord_flip() +
  theme_bw() +
  labs(x = "Age", y = "Density", tag = "C") +
  theme(
    axis.title.y = element_blank(),
    axis.ticks.x = element_blank(),
    # legend.position = "top",
    plot.tag = element_text(face = "bold", size = 18),
    axis.title.x = element_text(size = 18),
    axis.text.y = element_blank(),
    axis.text.x = element_text(size = 18),
    legend.text = element_text(size = 18),
    strip.text = element_text(size = 18),
    legend.title = element_text(size = 18)
  )
((f1a | plot_spacer()) /
   (f1b_new | wrap_elements(full = f1c_new  +
                              theme(
                                plot.margin = margin(-40, 0, 67, 0)
                              )))
)+
  plot_layout(widths = c(1,1))

ggsave("./fig_manuscript/fig1_new2.svg",
       width = 15,height = 9,dpi = 500)

2.5.5 Age distribution of CH1

df_cases_ch1_23 <- df23 |>
  filter(medi_cen == "Children's hospital 1")

## fig 2C
ts_ch1 <- df_cases_ch1_23 %>%
  group_by(adm_week, wave) %>%
  count() %>%
  ggplot(aes(x = adm_week, y = n, fill = wave)) +
  geom_col(alpha = 0.6, color = "black") +
  scale_x_date(
    date_breaks = "1 month",
    date_labels = "%b %Y",
    name = "Collection date",
    limits = c(as.Date("2023-01-01"), as.Date("2023-12-31"))
  ) +
  scale_y_continuous(limits = c(0, 900), breaks = seq(0, 900, le = 4)) +
  scale_fill_manual(values = c("#582C83FF", "#FFC72CFF")) +
  theme_minimal() +
  annotate(
    "rect",
    fill = "grey",
    xmin = as.Date(c(
      "2023-01-01", "2023-04-05", "2023-08-02", "2023-12-06"
    )),
    xmax = as.Date(c(
      "2023-01-15", "2023-04-26", "2023-08-30", "2023-12-27"
    )),
    ymin = 0,
    ymax = Inf,
    alpha = .5
  ) +
  labs(y = "Cases") +
  theme(
    axis.title.y = element_text(size = 18),
    axis.title.x = element_blank(),
    axis.ticks.x = element_blank(),
    legend.position = "bottom",
    plot.tag = element_text(face = "bold", size = 18),
    axis.text.x = element_blank(),
    axis.text.y = element_text(size = 18),
    legend.text = element_text(size = 15),
    legend.title = element_text(size = 18)
  ) +
  guides(fill = guide_colourbar(barwidth = 20), color = "none")

2.5.6 Outpatient from CH1

expected_age_cm <- ch1_outpatient |> 
  group_by(period,age_gr) |> 
  summarise(total_adm = sum(n),.groups = "drop") |>  
  group_by(period) |> 
  group_modify(~ gam(total_adm~s(age_gr),method = "REML",data = .) %>% 
                 predict(list(age_gr = seq(0,15,le = 512)),type="response") %>%
                 as.tibble() %>% 
                 mutate(age = seq(0,15,le = 512)))

2.5.7 Serological data from CH1

## calculate atk rate as a function of time
df_sero <- df_sero |>
  as_tibble() |>
  mutate(
    collection = id |>
      str_remove(".*-") |>
      as.numeric() |>
      divide_by(1e4) |>
      round(),
    col_date2 = as.numeric(col_date),
    across(pos, ~ .x > 0)
  )

outttt <- age_profile_constrained_cohort2(df_sero)

## fig 2B
age_profile_sp_cm <- outttt |>
  bind_rows() |>
  mutate(
    col_date = as.Date(collection_time),
    col_lab = format(col_date, format = "%b %Y"),
    col_lab = factor(col_lab, levels = c(
      "Dec 2022", "Apr 2023", "Aug 2023", "Dec 2023"
    ))
  ) |>
  ggplot(aes(x = cohort, y = fit)) +
  geom_line() +
  geom_ribbon(aes(ymin = lwr, ymax = upr),
              fill = "blue",
              alpha = .3) +
  geom_point(data = df_sero |>
               mutate(col_lab = factor(
                 col_time, levels = c("Dec 2022", "Apr 2023", "Aug 2023", "Dec 2023")
               )),
             aes(x = age, y = pos),
             shape = "|") +
  facet_wrap(~ col_lab, nrow = 1) +
  labs(y = "Seroprevalence (%)", x = "Age (years)", tag = "B") +
  scale_y_continuous(labels = scales::label_percent(scale = 100),
                     limits = c(0, 1)) +
  theme_bw() +
  theme(
    axis.title.x = element_text(size = 18),
    axis.text.x = element_text(size = 18),
    legend.position = "none",
    plot.tag = element_text(face = "bold", size = 18),
    axis.title.y = element_text(size = 18),
    axis.text.y = element_text(size = 18),
    strip.text.x = element_text(size = 18)
  )

## fig 2A

ouut <- list()
dat <- list()

for (i in 1:3) {
  dat[[i]] <- df_cases_ch1_23 |>
    filter(
      admission_date >= as.Date(mean_collection_times[i]) &
        admission_date <= as.Date(mean_collection_times[i + 1])
    )
}


expected_incidence <- map2(
  head(outttt, -1),
  tail(outttt, -1),
  ~ left_join(na.exclude(.x), na.exclude(.y), "cohort") |>
    mutate(
      attack = (fit.y - fit.x) / (1 - fit.x),
      date = as.Date(collection_time.y)
    )
) |>
  bind_rows() |>
  mutate(
    period = case_when(
      date <= as.Date("2023-04-30") ~ "12/2022 - 4/2023",
      date > as.Date("2023-04-30") &
        date <= as.Date("2023-08-31") ~ "4/2023 - 8/2023",
      date > as.Date("2023-08-31") ~ "8/2023 - 12/2023"
    )
  ) |>
  group_by(period) |>
  group_split() |>
  map( ~ left_join(.x, expected_age_cm, by = join_by(cohort == age, period))) %>%
  bind_rows(.id = "id") |>
  mutate(sp_gap = fit.y - fit.x, exp_inc = sp_gap * value)

exp_in_ch1 <- expected_incidence |>
  ggplot() +
  geom_line(aes(x = cohort, y = exp_inc),linewidth = 2) +
  geom_col(
    data = dat %>%
      bind_rows(.id = "id") %>%
      filter(age1 > 0) |>
      group_by(id, age) |>
      count(),
    aes(x = age, y = n),
    color = "white",
    fill = "black",
    alpha = 0.2
  ) +
  facet_wrap( ~ factor(
    id,
    labels = c("12/2022 - 4/2023", "4/2023 - 8/2023", "8/2023 - 12/2023")
  )) +
  labs(y = "Expected infections", x = "Age (years)", tag = "A") +
  theme_bw() +
  theme(
    axis.text.x = element_text(size = 15),
    axis.text.y = element_text(size = 18),
    legend.text = element_text(size = 18),
    plot.tag = element_text(face = "bold", size = 18),
    axis.title.x = element_text(size = 18),
    axis.title.y = element_text(size = 18),
    legend.title = element_text(size = 18),
    strip.text = element_text(size = 18),
    panel.grid.minor.x = element_blank()
  )

## fig 2D
df_cases_count_ch1_23 <- df_cases_ch1_23 |>
  group_by(age, time) |>
  count() |>
  ungroup()

fit_gau_bs_ch1 <- gam(
  n ~ s(age, bs = "bs") +
    s(time, bs = "bs") +
    ti(age, time, bs = c("bs", "bs")) ,
  data = df_cases_count_ch1_23,
  family = gaussian(link = "log"),
  method = "REML"
)

age_val <- seq(0, 6, le = 365)

collection_date_val <- seq(min(df23$adm_week), max(df23$adm_week), le = 365)

new_data2 <- expand.grid(age = age_val, time = as.numeric(collection_date_val))

new_data2$pred_gau_bs_ch1 <- predict(fit_gau_bs_ch1, newdata = new_data2, type = "response")

hm_ch1 <- ggplot(new_data2) +
  geom_raster(aes(x = as.Date(time), y = age, fill = pred_gau_bs_ch1), interpolate = TRUE) +
  geom_line(
    data = cohort_lines(),
    aes(
      x = date,
      y = age,
      group = cohort,
      color = trend
    ),
    linewidth = 0.3,
    alpha = 0.4
  ) +
  scale_x_date(date_breaks = "1 month", date_labels = "%b") +
  scale_fill_viridis_c(option = "turbo", n.breaks = 10) +
  theme_minimal() +
  labs(x = "Admission date",
       y = "Age (years)",
       fill = "Number of\nadmissions",
       tag = "D") +
  theme_minimal() +
  scale_y_reverse(name = "Age (years)",
                  lim = rev(c(-0.5, 6.5)),
                  breaks = seq(-1, 7)) +
  theme(
    axis.title.y = element_text(size = 18),
    axis.title.x = element_text(size = 18),
    axis.ticks.x = element_blank(),
    legend.position = "bottom",
    plot.tag = element_text(face = "bold", size = 18),
    axis.text.x = element_blank(),
    axis.text.y = element_text(size = 18),
    legend.text = element_text(size = 15),
    legend.title = element_text(size = 18)
  ) +
  guides(fill = guide_colourbar(barwidth = 25), color = "none")
fig2 <- exp_in_ch1/
  age_profile_sp_cm/
  (ts_ch1+labs(tag="C"))/
  (hm_ch1+labs(tag="D"))

ggsave("./fig_manuscript/fig2_new.svg",plot = fig2, 
       width = 15,height = 20,dpi = 500)

2.5.8 Virological data

Percentage per serotype

per_serotype <- df_viro %>%
  group_by(adm_month) %>%
  count(sero_gr) %>%
  mutate(total = sum(n),
         per = n/total) %>%
  ggplot(aes(x = adm_month,y=per,fill = sero_gr))+
  geom_bar(position="fill", stat="identity")+
  scale_x_date(date_breaks = "1 month", date_labels = "%b",
               limits = c(as.Date("2023-01-01"),as.Date("2023-12-31")),
               name = "Collection month")+
  scale_y_continuous(labels = scales::label_percent())+
  labs(fill = "Serotype group",y = "Percentage (%)",tag = "B")+
  theme_minimal()+
  theme(axis.title.y = element_text(size = 18),
        axis.title.x = element_text(size = 18),
        axis.ticks.x = element_blank(),
        legend.position = "bottom",
        plot.tag = element_text(face = "bold", size = 18),
        axis.text.x = element_text(size = 18),
        axis.text.y = element_text(size = 18),
        legend.text = element_text(size = 15),
        legend.title = element_text(size = 18))

Model perecentage of EV-A71 as a function of age and time

## model ev-a71 percentage as a function of time and age

model_viro_age_time <- gam(pos ~ s(age_adm) + s(time) + ti(age_adm,time),
                           family = binomial,method = "REML",data = df_viro)

collection_date_val <- seq(min(df_viro$time),
                           max(df_viro$time), le = 365)

new_data2 <- expand.grid(age_adm = age_val,
                         time = as.numeric(collection_date_val))

new_data2$pred <- predict(model_viro_age_time, newdata = new_data2, type = "response")


ev_a71_age_time_p <- ggplot() +
  geom_raster(data = new_data2, 
              aes(x = as.Date(time), y = age_adm, fill = pred), interpolate = TRUE) +
  scale_fill_viridis_c(option = "turbo",n.breaks = 10) +
  geom_line(
    data = cohort_lines(viro = TRUE),
    aes(x = date, y = age, group = cohort,color = trend),
    linewidth = 0.25,
    alpha = 0.3
  ) +
  labs(x = "Week", y = "Age", fill = "Smoothed EV-A71 percentage")+
  scale_x_date(date_breaks = "1 month", date_labels = "%b", name = "Collection date in 2023",
               limits = c(as.Date("2023-01-01"),as.Date("2023-12-31")))+
  theme_classic()+
  scale_y_reverse(name = "Age (years)",lim= rev(c(-0.5,6.5)),breaks = seq(-1,7))+
  theme(axis.title.y = element_text(size = 18),
        axis.title.x = element_text(size = 18),
        legend.position = "bottom",
        plot.tag = element_text(face = "bold", size = 18),
        axis.text.x = element_text(size = 18),
        axis.text.y = element_text(size = 18),
        legend.text = element_text(size = 15),
        legend.title = element_text(size = 18))+
  guides(fill=guide_colourbar(barwidth=25),
         color = "none")

Reconstruct epicurve

cases_fit <- fit_gau_bs_ch1 |>  
    predict(newdata = df_cases_count_ch1_23, se.fit = TRUE) |> 
    as.tibble() |> 
    cbind(df_cases_count_ch1_23) |>
    mutate(
      # c_lwr = fit_gau_bs_ch1$family$linkinv(fit - 1.96 * se.fit),
      # c_upr = fit_gau_bs_ch1$family$linkinv(fit + 1.96 * se.fit),
      c_fit = fit_gau_bs_ch1$family$linkinv(fit)
           ) |> 
    dplyr::select(-c(se.fit,fit))

recontructed_df <- model_viro_age_time |>
  predict(newdata = df_cases_count_ch1_23 |> rename(age_adm = age), se.fit = TRUE) |>
  as.tibble() |>
  cbind(df_cases_count_ch1_23) |>
  mutate(v_lwr = model_viro_age_time$family$linkinv(fit - 1.96 * se.fit),
         v_upr = model_viro_age_time$family$linkinv(fit + 1.96 * se.fit),
         v_fit = model_viro_age_time$family$linkinv(fit)) |>
  dplyr::select(-c(se.fit,fit)) |>
  full_join(cases_fit, by = join_by(age, time,n)) |>
  mutate(ev71_c = c_fit*v_fit,
         ev71_u = c_fit*v_upr,
         ev71_l = c_fit*v_lwr)

recontructed_cases_count_df <- recontructed_df |>
  mutate(day = as.Date(time),
         week = as.Date(floor_date(day, "week"))) |> 
  group_by(week) |>
  summarise(cases = sum(c_fit),
            # cases_u = sum(c_upr),
            # cases_l = sum(c_lwr),
            ev71_cases = sum(ev71_c),
            ev71_cases_u = sum(ev71_u),
            ev71_cases_l = sum(ev71_l)) 

f3a <- recontructed_cases_count_df |> 
  ggplot(aes(x = week))+
  geom_line(aes(y = cases),linetype = "dashed")+
  # geom_ribbon(aes(ymin = cases_u,ymax = cases_l),fill = "blue",alpha = .3)+
  geom_line(aes(y = ev71_cases))+
  geom_ribbon(aes(ymin = ev71_cases_u,ymax = ev71_cases_l),fill = "red",alpha = .3)+
  geom_col(data = df_cases_count_ch1_23 |>
             mutate(day = as.Date(time),
                    week = as.Date(floor_date(day, "week")))|>
             group_by(week) |>
             summarise(cases = sum(n)),
           aes(x = week, y  = cases),alpha = .3)+
  geom_point(data = df_viro, aes(x = admission_date, y = pos*1000),shape = "|",size = 2)+
  scale_x_date(date_breaks = "1 month",
               date_labels = "%b",
               name = "Collection date",
               limits = c(as.Date("2023-01-01"),as.Date("2023-12-31"))) +
  labs(x = "Admission day (2023)",y = "Total number of cases",tag = "A") +
  theme_minimal()+
  theme(axis.title.y = element_text(size = 18),
        axis.title.x = element_text(size = 18),
        axis.ticks.x = element_blank(),
        legend.position = "bottom",
        plot.tag = element_text(face = "bold", size = 18),
        axis.text.x = element_blank(),
        axis.text.y = element_text(size = 18),
        legend.text = element_text(size = 15),
        legend.title = element_text(size = 18))+
  guides(fill=guide_colourbar(barwidth=20),
         color = "none")

Reconstruct histogram for EV-A71

compare_ev71 <- recontructed_df |>
  mutate(
    day = as.Date(time),
    week = as.Date(floor_date(day, "week")),
    period = case_when(
      week <= as.Date("2023-04-30") ~ "12/2022 - 4/2023",
      week > as.Date("2023-04-30") &
        week <= as.Date("2023-08-31") ~ "4/2023 - 8/2023",
      week > as.Date("2023-08-31") ~ "8/2023 - 12/2023"
    )
  ) |>
  group_by(period, age) |>
  summarise(
    ev_a71_cases = sum(ev71_c),
    raw_n = sum(n),
    .groups = "drop"
  ) |>
  ggplot(aes(x = age, y = ev_a71_cases)) +
  geom_col(alpha = 0.3) +
  geom_line(data = expected_incidence, aes(x = cohort, y = exp_inc)) +
  scale_x_continuous(limits = c(0, 15), breaks = seq(0, 15, by = 5)) +
  facet_wrap( ~ period) +
  labs(y = "Expected infections", x = "Age (years)", tag = "A") +
  theme_bw() +
  theme(
    axis.text.x = element_text(size = 18),
    axis.text.y = element_text(size = 18),
    legend.text = element_text(size = 18),
    plot.tag = element_text(face = "bold", size = 18),
    axis.title.x = element_text(size = 18),
    axis.title.y = element_text(size = 18),
    legend.title = element_text(size = 18),
    strip.text = element_text(size = 18),
    panel.grid.minor.x = element_blank()
  )
fig3_new <- (compare_ev71+labs(tag="A"))/
  (f3a+labs(tag="B"))/
  (per_serotype+labs(tag="C"))

ggsave("./fig_manuscript/fig3_new2.svg", plot = fig3_new,
       width = 13,height = 15,dpi = 500)

2.6 Confidence interval of expected EV-A71 infections

First I estimate the standard error of difference in seroprevalence between 2 successive collection time points by the following formula:

\[ SE_{diff} = \sqrt{SE_1^2 + SE_2^2 - 2\rho . SE_1 . SE_2} \]

\(\rho\) is the assumed correlation of seroprevalence between two successive collection times points. Because we applied the constraint algorithm, I think we assumed two collection time points are dependence. So I choose \(\rho = 1\)

rho = 1

Then estimate the point estimate and se of seroprevalence different between two successive collection time points. Then estimate the expected EV-A71 infections

expected_incidence <- expected_incidence |>
  mutate(
    se1 = (upr.x - lwr.x) / (2 * 1.96),
    se2 = (upr.y - lwr.y) / (2 * 1.96),
    se.diff = sqrt(se1^2 + se2^2 - 2 * rho * se1 * se2),
    lo_diff = sp_gap - 1.96 * se.diff,
    hi_diff = sp_gap + 1.96 * se.diff,
    hi_exp_inc = hi_diff * value,
    lo_exp_inc = lo_diff * value
  ) 

Then plot it

expected_incidence |>
  ggplot(aes(x = cohort)) +
  geom_line(aes(y = exp_inc)) +
  geom_ribbon(
    aes(y = exp_inc, ymin = lo_exp_inc, ymax = hi_exp_inc),
    alpha = 0.2,
    fill = "blue"
  ) +
  coord_cartesian(ylim = c(0, 2500)) +
  geom_col(
    data = dat %>%
      bind_rows(.id = "id") %>%
      filter(age1 > 0) |>
      group_by(id, age) |>
      count(),
    aes(x = age, y = n),
    color = "white",
    fill = "black",
    alpha = 0.2
  ) +
  facet_wrap(~ factor(
    id,
    labels = c("12/2022 - 4/2023", "4/2023 - 8/2023", "8/2023 - 12/2023")
  )) +
  theme_bw()