4  Reframe ms figure

4.1 Import data

Code
## file path
path2data <- paste0("D:/OUCRU-Onedrive/OneDrive - Oxford University Clinical Research Unit/Data/HFMD/cleaned/")

# Name of cleaned HFMD data file
cases_file <- "hfmd_hcdc.rds"

# Name of cleaned sero file
sero_file <- "sero_EV-A71_1222-1223.rds"

# Name of cleaned viro file

viro_file <- "virology_cleaned.rds"

## import data

df_cases <- paste0(path2data, cases_file) |>
  readRDS()

df_sero <- paste0(path2data, sero_file) |>
  readRDS()

df_viro <- paste0(path2data, viro_file) |>
  readRDS()

4.2 Library

Code
library(readxl)
library(tidyverse)
library(gtsummary)
library(patchwork)
library(lubridate)
library(stringr)
library(mgcv)
library(janitor)
library(predtools)
library(magrittr)
library(slider)
library(scam)
library(epitools)
library(sf)
library(stringi)
library(gtsummary)
library(zoo)
invisible(Sys.setlocale("LC_TIME", "English"))

4.2.1 Filtering 2023 cases and cleaning nessessary variables

Code
## import and check whether duplicated cases

df23 <- df_cases  |>  filter(year(admission_date) == 2023)

df23 |> 
  get_dupes()
# A tibble: 0 × 16
# ℹ 16 variables: dob <date>, age <dbl>, gender <chr>, commune <chr>,
#   district <chr>, reported_date <date>, onset_date <date>,
#   admission_date <date>, medi_cen <chr>, inout <chr>, severity <chr>,
#   year_dob <dbl>, year_reported <dbl>, year_onset <dbl>,
#   year_admission <dbl>, dupe_count <int>
Code
df23 |> 
  summary()
      dob                  age              gender            commune         
 Min.   :1952-05-26   Min.   :   0.000   Length:43304       Length:43304      
 1st Qu.:2019-12-02   1st Qu.:   2.000   Class :character   Class :character  
 Median :2021-02-07   Median :   2.000   Mode  :character   Mode  :character  
 Mean   :2020-09-28   Mean   :   2.958                                        
 3rd Qu.:2021-12-26   3rd Qu.:   4.000                                        
 Max.   :2023-11-22   Max.   :2023.000                                        
 NA's   :8                                                                    
   district         reported_date          onset_date        
 Length:43304       Min.   :2023-01-02   Min.   :2020-11-03  
 Class :character   1st Qu.:2023-07-24   1st Qu.:2023-07-21  
 Mode  :character   Median :2023-08-27   Median :2023-08-24  
                    Mean   :2023-09-03   Mean   :2023-08-31  
                    3rd Qu.:2023-10-24   3rd Qu.:2023-10-21  
                    Max.   :2024-03-07   Max.   :2023-12-31  
                                         NA's   :8482        
 admission_date         medi_cen            inout             severity        
 Min.   :2023-01-01   Length:43304       Length:43304       Length:43304      
 1st Qu.:2023-07-22   Class :character   Class :character   Class :character  
 Median :2023-08-25   Mode  :character   Mode  :character   Mode  :character  
 Mean   :2023-09-01                                                           
 3rd Qu.:2023-10-22                                                           
 Max.   :2023-12-31                                                           
                                                                              
    year_dob    year_reported    year_onset   year_admission
 Min.   :1952   Min.   :2023   Min.   :2020   Min.   :2023  
 1st Qu.:2019   1st Qu.:2023   1st Qu.:2023   1st Qu.:2023  
 Median :2021   Median :2023   Median :2023   Median :2023  
 Mean   :2020   Mean   :2023   Mean   :2023   Mean   :2023  
 3rd Qu.:2021   3rd Qu.:2023   3rd Qu.:2023   3rd Qu.:2023  
 Max.   :2023   Max.   :2024   Max.   :2029   Max.   :2023  
 NA's   :8                     NA's   :8481                 

In this analysis, we only care about dob, admission date, district, inout, severity, medi_cen

4.2.2 NA in Dob

Code
df23 |> 
  filter(is.na(dob)) 
# A tibble: 8 × 15
  dob      age gender commune         district   reported_date onset_date
  <date> <dbl> <chr>  <chr>           <chr>      <date>        <date>    
1 NA         3 Female vinh loc b      binh chanh 2023-03-10    NA        
2 NA         2 Male   an phu tay      binh chanh 2023-03-10    NA        
3 NA         3 Female binh tri dong b binh tan   2023-03-21    NA        
4 NA         3 Male   long phuoc      thu duc    2023-07-19    NA        
5 NA         1 Female 4               10         2023-11-02    NA        
6 NA         6 Male   truong tho      thu duc    2023-10-13    NA        
7 NA         2 Male   tan phu         thu duc    2023-08-16    NA        
8 NA         2 Male   linh trung      thu duc    2023-08-17    NA        
# ℹ 8 more variables: admission_date <date>, medi_cen <chr>, inout <chr>,
#   severity <chr>, year_dob <dbl>, year_reported <dbl>, year_onset <dbl>,
#   year_admission <dbl>

To facilitate analysis, I excluded those cases had NA of dob

Code
df23 %<>% 
  filter(!is.na(dob)) 

4.2.3 Merge severity

Code
df23 %<>%
  mutate(
    severity2 = case_when(
      is.na(severity) ~ "Undefined",
      severity %in% c("3","4") ~ "3 or 4",
      .default = severity
    ) |> as.factor()
  ) 

4.2.4 Change name of hospitals

I only consider 4 hospital: Children’s hospital 1, Children’s hospital 2, City Children’s hospital, Hospital of Tropical Diseases

Code
df23  %<>%  
  mutate(
    medi_cen = case_when(
      medi_cen %in% c("Bệnh viện Nhi đồng 1","Bênh viện Nhi Đồng 1","Bệnh viện Nhi Đồng 1") ~ "Children's hospital 1",
      medi_cen == "Bênh viện Bệnh nhiệt đới TPHCM" ~ "Hospital of Tropical Diseases",
      medi_cen %in% c("Bệnh viện Nhi đồng 2","Bệnh viện Nhi Đồng 2") ~ "Children's hospital 2",
      medi_cen == "Bệnh viện Nhi đồng thành phố" ~ "City Children's hospital",
      .default = medi_cen
    )
  )

4.2.5 Adding additional variables

Code
df23  %<>%  
  mutate(
    age1 = interval(dob, admission_date) / years(1),
    time = as.numeric(admission_date),
    adm_week = as.Date(floor_date(admission_date, "week")),
    age = round(age1)
  )

4.3 Using k-means to find the time point that separate two peaks

Code
library(cluster)
library(factoextra)

set.seed(123)

kmeans_result_1 <- kmeans(df23[,c("age1","time")], centers = 2, nstart = 25)

df23$wave <- factor(kmeans_result_1$cluster,labels = c("1st","2nd"),levels = c(2,1))

## show the time point that seperate two peaks

time_cut_1st <- df23 |> 
  filter(wave == "1st") |> 
  pull(time) |> 
  max()

time_cut_2nd <- df23 |> 
  filter(wave == "2nd") |> 
  pull(time) |> 
  min()

time_cut <- mean(time_cut_1st,time_cut_2nd) |> as.numeric()
time_cut
[1] 19605

So I selected time points at 1.9605^{4} to seperate two peaks

Code
df23 |> 
  filter(age1 <= 15) |> 
  ggplot(aes(x = admission_date, y = age1, color = wave)) +
  geom_point(size = .5) + 
  scale_x_datetime(date_breaks = "1 month" , date_labels = "%b")+
  theme_minimal()+
  geom_vline(xintercept = as.Date(time_cut),linewidth = .2)+
  labs(x = "Admission date (2023)", y = "Age (years)")+
  guides(color=guide_legend(override.aes=list(size=8), title = "Wave"))+
  theme(
    axis.title.x = element_text(size = 18),
    axis.text.x = element_text(size = 18),
    axis.ticks.x = element_blank(),
    legend.position = "bottom",
    axis.title.y = element_text(size = 18),
    axis.text.y = element_text(size = 18),
    legend.text = element_text(size = 18),
    legend.title = element_text(size = 18)
  )

Code
ggsave("./fig_manuscript/sup2_new.svg",
       width = 15,height = 9,dpi = 500)
ggsave("./fig_manuscript/sup2_new.png",
       width = 15,height = 9,dpi = 500)

4.4 Table 1

Code
### table1

tab_age <-
  df23 |>
  tbl_summary(by = wave,
              label = list(age1 ~ "Age (Years)",
                           gender ~ "Gender"),
              digits = list(all_continuous() ~ c(2,2),
                            all_categorical() ~ c(0,2)),
              include = c(age1,gender)) |>
  remove_row_type(type = "missing") |>
  add_p()


calculate_prop_test <- function(data, variable, by, ...) {
  # subset to non-missing
  data <- tidyr::drop_na(data, dplyr::all_of(c(variable, by)))

  prop.trend.test(
    x = table(data[[variable]], data[[by]])[2, ], # get the second row (the positive row)
    n = table(data[[by]])
  ) |>
    broom::tidy()
}

tab_sev <-
  df23 |> 
  filter(severity2 != "Undefined") |> 
  select(wave, severity2)  |> 
  mutate(rn = row_number()) |> 
  tidyr::spread(severity2, severity2) |> 
  mutate(across(-c(wave, rn), ~ ifelse(is.na(.), 0, 1))) |>
  select(-rn) |>
  tbl_summary(by = wave,
              statistic = list(
                all_categorical() ~ "{n} / {N} ({p}%)"
              ),
              digits = all_categorical() ~ c(0,0,2)) |>
  add_p() |>
  modify_table_body(
    ~ .x |>
      mutate(row_type = "level",
             variable = "wave") %>%
      {bind_rows(
        tibble(variable = "wave", row_type = "label", label = "Severity"),
        .
      )}
  )

tab_inout <-
  df23 |>
  filter(inout %in% c("Inpatient","Outpatient")) |> 
  select(wave, inout) |>
  mutate(rn = row_number()) |>
  tidyr::spread(inout, inout) |>
  mutate(across(-c(wave, rn), ~ ifelse(is.na(.), 0, 1))) |>
  select(-rn) |>
  tbl_summary(by = wave,
              statistic = list(
                all_categorical() ~ "{n} / {N} ({p}%)"
              ),
              digits = all_categorical() ~ c(0,0,2)) |>
  add_p() |>
  modify_table_body(
    ~ .x |>
      mutate(row_type = "level",
             variable = "wave") %>%
      {bind_rows(
        tibble(variable = "wave", row_type = "label", label = "Treatment"),
        .
      )}
  )

final_table <- tbl_stack(list(tab_age, tab_sev, tab_inout),quiet = TRUE) 

final_table
Characteristic 1st
N = 23,2851
2nd
N = 20,0111
p-value2
Age (Years) 2.66 (1.78, 3.75) 2.35 (1.51, 3.68) <0.001
Gender

0.040
    Female 10,016 (43.01%) 8,412 (42.04%)
    Male 13,269 (56.99%) 11,599 (57.96%)
Severity


    1 5,214 / 8,212 (63.49%) 4,181 / 5,801 (72.07%) <0.001
    2A 2,451 / 8,212 (29.85%) 1,455 / 5,801 (25.08%) <0.001
    2B 415 / 8,212 (5.05%) 128 / 5,801 (2.21%) <0.001
    3 or 4 132 / 8,212 (1.61%) 37 / 5,801 (0.64%) <0.001
Treatment


    Inpatient 3,160 / 22,493 (14.05%) 2,432 / 19,563 (12.43%) <0.001
    Outpatient 19,333 / 22,493 (85.95%) 17,131 / 19,563 (87.57%) <0.001
1 Median (Q1, Q3); n (%)
2 Wilcoxon rank sum test; Pearson’s Chi-squared test

Table 1: Demographic and clinical characteristics of reported HFMD cases in Ho Chi Minh City in 2023.

4.5 Figure 1

Code
cohort_lines <- function(cohort_ages = seq(0, 5, by = 1),viro = FALSE){
  start_date <- as.Date("2023-01-01")
  end_date   <- as.Date("2023-12-31")

  ## generate the cohort line
  tseq <- seq(start_date, end_date, by = "day")
  cohort_lines <- lapply(cohort_ages, function(a0){
    data.frame(
      date = tseq,
      age  = a0 + as.numeric(tseq - start_date) / 365,
      cohort = as.factor(a0)
    )
  }) |> dplyr::bind_rows()
  
  color_cohort <- ifelse(viro == TRUE,"black","#80FFFFFF")

  ## create mid-opacity white of cohort lines before June
  cohort_lines <- cohort_lines |>
    mutate(trend = case_when(
      date < as.Date("2023-06-01") ~ color_cohort,
      date >= as.Date("2023-06-01") ~ "#FFFFFFFF",
    ))
  cohort_lines$group <- consecutive_id(cohort_lines$trend)
  cohort_lines <- head(do.call(rbind, by(cohort_lines, cohort_lines$group, rbind, NA)), -1)
  cohort_lines[, c("trend", "group")] <- lapply(cohort_lines[, c("trend", "group")], na.locf)
  cohort_lines[] <- lapply(cohort_lines, na.locf, fromLast = T)

  cohort_lines$trend <- factor(cohort_lines$trend,
                               levels = c(color_cohort,"#FFFFFFFF"))
  cohort_lines
}

## figure 1a: time series cases

f1a <- df23 |>
  group_by(adm_week,wave) |> 
  count() |> 
  ungroup() |> 
  ggplot(aes(x = adm_week, y = n,fill = wave))+
  geom_col(alpha = 0.6,color="black")+
  scale_x_date(date_breaks = "1 month",
               date_labels = "%b",
               name = "Admission date (2023)",
               limits = c(as.Date("2023-01-01"),as.Date("2023-12-31")))+
  # geom_vline(xintercept = as.Date(time_cut))+
  scale_fill_manual(values = c("#582C83FF","#FFC72CFF"))+
  theme_minimal()+
  labs(y = "Cases",tag = "A")+
  theme(axis.title.y = element_text(size = 18),
        axis.title.x = element_blank(),
        axis.ticks.x = element_blank(),
        legend.position = "bottom",
        plot.tag = element_text(face = "bold", size = 18),
        axis.text.x = element_blank(),
        axis.text.y = element_text(size = 18),
        legend.text = element_text(size = 15),
        legend.title = element_text(size = 18))+
  guides(fill = guide_colourbar(barwidth=20),
         color = "none")


## figure 1b
f1b <- df23 |>
  select(adm_week,age1) |>
  ggplot(aes(x = adm_week, y = age1)) +
  stat_density_2d(
    aes(fill = after_stat(count)),
    geom = "raster",
    contour = FALSE,
    interpolate = TRUE
  ) +
  geom_line(
    data = cohort_lines(seq(0,5,by=1)),
    aes(x = date, y = age, group = cohort,color = trend),
    linewidth = 0.25,
    alpha = 0.3
  )+
  scale_x_date(date_breaks = "1 month", date_labels = "%b") +
  scale_fill_viridis_c(option = "inferno", n.breaks = 10) +
  theme_minimal() +
  labs(
    x = "Admission week (2023)",
    y = "Age (years)",
    fill = "Number of\nadmissions",
    tag = "B"
  )+
  theme_minimal()+
  scale_y_reverse(name = "Age (years)",lim= rev(c(0,6)),breaks = seq(0,6))+
  theme(axis.title.y = element_text(size = 18),
        axis.title.x = element_text(size = 18),
        axis.ticks.x = element_blank(),
        legend.position = "bottom",
        plot.tag = element_text(face = "bold", size = 18),
        axis.text.x = element_text(size = 18),
        axis.text.y = element_text(size = 18),
        legend.text = element_text(size = 15),
        legend.title = element_text(size = 18))+
  guides(fill=guide_colourbar(barwidth=25),
         color = "none")

f1c <- ggplot(data=df23, aes(x=age1, group=wave, fill=wave)) +
  geom_density(alpha = 0.6) +
  scale_fill_manual(values = c("#582C83FF","#FFC72CFF")) +
  scale_x_reverse(limit = c(6,0),
                  breaks = seq(0,6,by=1),
                  minor_breaks = NULL)+
  scale_y_continuous(minor_breaks = NULL)+
  coord_flip()+
  theme_minimal()+
  labs(x = "Age", 
       y = "Density",
       fill = "Wave",
       tag = "C")+
  theme(axis.title.y = element_blank(),
        axis.ticks.x = element_blank(),
        # legend.position = "top",
        plot.tag = element_text(face = "bold", size = 18),
        axis.title.x = element_text(size = 18),
        axis.text.x = element_text(size = 18),
        axis.text.y = element_blank(),
        legend.text = element_text(size = 18),
        legend.title = element_text(size = 18))


((f1a/f1b)|(plot_spacer()/f1c))+
  plot_layout(widths = c(2.5,1))

Code
ggsave("./fig_manuscript/fig1_new.svg",
       width = 15,height = 9,dpi = 500)
NoteExplain how to generate figure 1B

I just used the admission week and age (float form), and use geom_density_2d function to plot it. The function performed a 2D kernel density estimation and display results with contours

4.5.1 Trying to using gam to smooth the raw data

In this step, I just used the admission week and age (integer form), i.e I counted the number of admission at each age in each week and plot it by heatmap below.

Code
df_smooth <- df23 |>
  mutate(age3 = floor(age1)) |>
  group_by(admission_date, age3) |>
  summarize(n = n(), .groups = "drop") |>
  complete(age3, admission_date, fill = list(n = 0)) |>
  mutate(
    age = as.numeric(age3),
    time = as.numeric(admission_date)   # convert to numeric scale
  )

df_smooth |>
  filter(age < 7) |>
  ggplot() +
  geom_tile(aes(x = as.Date(time), y = age, fill = n))+
  labs(x = "Week", y = "Age", fill = "Smoothed cases")+
  scale_x_date(date_breaks = "1 month", date_labels = "%b") +
  scale_fill_viridis_c(option = "inferno", n.breaks = 10) +
  theme_minimal() +
  labs(
    x = "Admission date",
    y = "Age (years)",
    fill = "Number of admissions"
  )+
  theme_minimal()+
  scale_y_reverse(name = "Age (years)",lim= rev(c(-0.5,6.5)),breaks = seq(-1,7))+
  theme(axis.title.y = element_text(size = 18),
        axis.title.x = element_blank(),
        axis.ticks.x = element_blank(),
        legend.position = "bottom",
        plot.tag = element_text(face = "bold", size = 18),
        axis.text.x = element_text(size = 18),
        axis.text.y = element_text(size = 18),
        legend.text = element_text(size = 15),
        legend.title = element_text(size = 18))+
  guides(fill=guide_colourbar(barwidth=25),
         color = "none")

The total number admission from the raw data

In this step, I tried to use GAM to model the raw data, I tried to used gaussian family with log link function, and bs smoothing terms, with tensor-product

Code
## data for GAM fitting, using n ~ s(age) + s(time)
df_smooth |> 
  select(-age3) |> 
  head()
# A tibble: 6 × 4
  admission_date     n   age  time
  <date>         <int> <dbl> <dbl>
1 2023-01-01         0     0 19358
2 2023-01-02         3     0 19359
3 2023-01-03         0     0 19360
4 2023-01-04         3     0 19361
5 2023-01-05         0     0 19362
6 2023-01-06         1     0 19363
Code
## create data for predict
age_val <- seq(0,6,le = 365)

collection_date_val <- seq(min(df23$adm_week),
                           max(df23$adm_week), le = 365)

new_data <- expand.grid(age = age_val,
                        time = as.numeric(collection_date_val))
Code
## gam formular with tensor-product
fit_gau_bs <- gam(n ~ s(age,bs = "bs") + 
                      s(time,bs = "bs") + 
                      ti(age, time, bs = c("bs", "bs")) , data = df_smooth,
                  family = gaussian(link = "log"),method = "REML")

new_data$pred_gau_bs <- predict(fit_gau_bs, newdata = new_data, type = "response")

f1b_new <- ggplot(new_data) +
  geom_raster(aes(x = as.Date(time), y = age, fill = pred_gau_bs),interpolate = TRUE) +
  geom_line(
    data = cohort_lines(),
    aes(x = date, y = age, group = cohort,color = trend),
    linewidth = 0.3,
    alpha = 0.4
  )+
  scale_x_date(date_breaks = "1 month", date_labels = "%b") +
  scale_fill_viridis_c(option = "turbo", n.breaks = 10) +
  theme_minimal() +
  labs(
    x = "Admission date",
    y = "Age (years)",
    fill = "Number of\nadmissions",
    tag = "B"
  )+
  theme_minimal()+
  scale_y_reverse(name = "Age (years)",lim= rev(c(-0.5,6.5)),breaks = seq(-1,7))+
  theme(axis.title.y = element_text(size = 18),
        axis.title.x = element_blank(),
        axis.ticks.x = element_blank(),
        legend.position = "bottom",
        plot.tag = element_text(face = "bold", size = 18),
        axis.text.x = element_text(size = 18),
        axis.text.y = element_text(size = 18),
        legend.text = element_text(size = 15),
        legend.title = element_text(size = 18))+
  guides(fill=guide_colourbar(barwidth=25),
         color = "none")

f1b_new

Code
library(plotly)

plot_ly(new_data, x = ~sort(unique(as.Date(time))),
        y = ~sort(unique(age)),
        z = ~matrix(pred_gau_bs, 365),
        showscale = F) |> 
  add_surface() |> 
  layout(autosize = F, width = 700, height = 700)

4.5.2 tensor-product plot

Code
ggplot(new_data, aes(x = as.Date(time), y = age, z = pred_gau_bs)) +
  geom_raster(aes(fill = pred_gau_bs), interpolate = TRUE) +
  geom_contour(color = "white", alpha = 0.7) +
  scale_fill_viridis_c(option = "turbo",n.breaks = 10) +  
  labs(x = "Week", y = "Age", fill = "Smoothed cases")+
  scale_x_date(date_breaks = "1 month", date_labels = "%b") +
  labs(
    title = "Tensor-product smooth of age × time",
    x = "Age",
    y = "Time",
    fill = "Smooth (fit)"
  ) +
  theme_minimal()+
  scale_y_reverse(name = "Age (years)",lim= rev(c(-0.5,6.5)),breaks = seq(-1,7))+
  theme(axis.title.y = element_text(size = 18),
        axis.title.x = element_blank(),
        axis.ticks.x = element_blank(),
        legend.position = "bottom",
        plot.tag = element_text(face = "bold", size = 18),
        axis.text.x = element_text(size = 18),
        axis.text.y = element_text(size = 18),
        legend.text = element_text(size = 15),
        legend.title = element_text(size = 18))+
  guides(fill=guide_colourbar(barwidth=25),
         color = "none")

In this plot, I fitted log normal distribution to age distribution of 2 wave, the histograms are the raw data and the lines are the fitted lognormal distribution

Code
library(fitdistrplus)
fit1 <- fitdist(df23 %>% filter(age1 >0 & wave == "1st") %>% pull(age1),"lnorm")
fit2 <- fitdist(df23 %>% filter(age1 >0 & wave == "2nd") %>% pull(age1),"lnorm")

x_vals <- seq(0, 7, le = 512)


fit_1p <- data.frame(wave = "1st",age = x_vals, density = dlnorm(x_vals,
                                                                 meanlog = fit1$estimate[1],
                                                                 sdlog = fit1$estimate[2]))

fit_2p <- data.frame(wave = "2nd",age = x_vals, density = dlnorm(x_vals,
                                                                 meanlog = fit2$estimate[1],
                                                                 sdlog = fit2$estimate[2]))

f1c_new <- ggplot(data=df23) +
  geom_histogram(aes(x=age1, fill = wave),
                 color = "white",
                 alpha = .5) +
  scale_fill_manual(values = c("#582C83FF","#FFC72CFF"))+
  geom_line(data = rbind(fit_1p,fit_2p),aes(x = age,y = density*5000))+
  facet_wrap(~wave)+
  scale_x_reverse(limit = c(6,0),
                  breaks = seq(0,6,by=1),
                  minor_breaks = NULL)+
  scale_y_continuous(minor_breaks = NULL,
                     name = "Cases count",
                     sec.axis = sec_axis( trans=~./5000, name="Density"))+
  coord_flip()+
  theme_bw()+
  labs(x = "Age",
       y = "Density",
       tag = "C")+
  theme(axis.title.y = element_blank(),
        axis.ticks.x = element_blank(),
        # legend.position = "top",
        plot.tag = element_text(face = "bold", size = 18),
        axis.title.x = element_text(size = 18),
        axis.text.y = element_blank(),
        axis.text.x = element_text(size = 18),
        legend.text = element_text(size = 18),
        strip.text = element_text(size = 18),
        legend.title = element_text(size = 18))


((f1a|plot_spacer())/
    (f1b_new|wrap_elements(full = f1c_new  + 
                             theme(plot.margin = margin(-40,0,67,0)))))+
  plot_layout(widths = c(1,1))

Code
ggsave("./fig_manuscript/fig1_new2.svg",
       width = 15,height = 9,dpi = 500)

4.6 Figure 2

Code
age_profile_constrained_cohort2 <- function(data, age_values = seq(0, 15, le = 512),
                                            ci = .95, n = 100) {
  
  predict2 <- function(...) predict(..., type = "response") |> as.vector()
  
  age_profile <- function(data, age_values = seq(0, 15, le = 512), ci = .95) {
    model <- gam(pos ~ s(age), binomial, data)
    link_inv <- family(model)$linkinv
    df <- nrow(data) - length(coef(model))
    p <- (1 - ci) / 2
    model |>
      predict(list(age = age_values), se.fit = TRUE) %>%
      c(list(age = age_values), .) |>
      as_tibble() |>
      mutate(lwr = link_inv(fit + qt(    p, df) * se.fit),
             upr = link_inv(fit + qt(1 - p, df) * se.fit),
             fit = link_inv(fit)) |>
      dplyr::select(- se.fit)
  }
  
  
  age_profile_unconstrained <- function(data, age_values = seq(0, 15, le = 512),
                                        ci = .95) {
    data |>
      group_by(collection) |>
      group_map(~ age_profile(.x, age_values, ci))
  }
  
  shift_right <- function(n, x) {
    if (n < 1) return(x)
    c(rep(NA, n), head(x, -n))
  }
  
  dpy <- 365 # number of days per year

  mean_collection_times <<- data |>
    group_by(collection) |>
    summarise(mean_col_date = mean(col_date2)) |>
    with(setNames(mean_col_date, collection))

  cohorts <- cumsum(c(0, diff(mean_collection_times))) |>
    divide_by(dpy * mean(diff(age_values))) |>
    round() |>
    map(shift_right, age_values)

  age_time <- map2(mean_collection_times, cohorts,
                   ~ tibble(collection_time = .x, cohort = .y))

  age_time_inv <- age_time |>
    map(~ cbind(.x, age = age_values)) |>
    bind_rows() |>
    na.exclude()

  data |>
    # Step 1:
    group_by(collection) |>
    group_modify(~ .x |>
                   age_profile(age_values, ci) |>
                   mutate(across(c(fit, lwr, upr), ~ map(.x, ~ rbinom(n, 1, .x))))) |>
    group_split() |>
    map2(age_time, bind_cols) |>
    bind_rows() |>
    unnest(c(fit, lwr, upr)) |>
    pivot_longer(c(fit, lwr, upr), names_to = "line", values_to = "seropositvty") |>
    # Step 2a:
    filter(cohort < max(age) - diff(range(mean_collection_times)) / dpy) |>
    group_by(cohort, line) |>
    group_modify(~ .x %>%
                   scam(seropositvty ~ s(collection_time, bs = "mpi"), binomial, .) |>
                   predict2(list(collection_time = seq(mean_collection_times[1],19722,le=52))) %>%
                   tibble(collection_time = seq(mean_collection_times[1],19722,le=52),
                          seroprevalence  = .)) |>
    ungroup() |>
    group_by(collection_time, line) |>
    group_modify(~ .x |>
                   mutate(across(seroprevalence, ~ gam(.x ~ s(cohort), betar) |>
                                   predict2()))) |>  ### modified
    ungroup() |>
    pivot_wider(names_from = line, values_from = seroprevalence) |>
    group_by(collection_time) |>
    group_split()
}

First I show how I calculate the expected admission of CH1. I used outpatient cases from CH1: group number of outpatient admitted to CH1 by age and district. I used the 2019 census as the denominator, also group number of populatiion by age and district. Then the probability of CH1 admission is calculated by divided \(\frac{CH1 \ outpatients_{[a,d]}}{HCMC \ population \ in \ 2019_{[a,d]}}\). After that at each age group, I weighted the probability of CH1 admission, then multiply it again to hcmc population at age and district d to get the expected CH1 admission.

Code
ch1_outpatient <- paste0(path2data, "CH1_outpatient_HCM_cleaned.rds") |>
  readRDS()

# census2019 <- paste0(path2data, "census2019.rds") |>
#   readRDS() 

## generate denominator

# N_ad <- census2019 |>
#   filter(province == "Thành phố Hồ Chí Minh") |>
#   mutate(district = district %>%
#            str_replace_all(
#              c("Quận 2" = "Thủ Đức",
#                "Quận 9" = "Thủ Đức")) %>%
#            str_remove("Quận|Huyện") %>%
#            trimws(which = "both") %>%
#            stri_trans_general("latin-ascii") %>%
#            tolower()) %>%
#   group_by(district,age) |>
#   summarise(n = sum(n),.groups = "drop") |>
#   mutate(across(age, ~ stringr::str_remove(.x, " tuổi| \\+") |> as.integer())) |>
#   arrange(age) |>
#   filter(age < 17)

## probability of CH1 admission
# expected_catchment_CH1 <- left_join(ch1_outpatient,
#           N_ad,
#           by = join_by(district,age_gr == age)) %>% 
#   na.omit(n.y) |> 
#   mutate(p_hos = n.x/n.y) %>%  
#   arrange(age_gr) %>% 
#   group_by(age_gr) %>% 
#   mutate(weight = p_hos/sum(p_hos),
#          expected_cm = n.y*weight) %>% 
#   ungroup() |> 
#   group_by(period,age_gr) |> 
#   summarise(e_admission = sum(expected_cm),.groups = "drop")  

## age profile of expected CH1 catchment
# expected_age_cm <- expected_catchment_CH1 |> 
#   group_by(period) |> 
#   group_modify(~ gam(e_admission~s(age_gr),method = "REML",data = .) %>% 
#                  predict(list(age_gr = seq(0,15,le = 512)),type="response") %>%
#                  as.tibble() %>% 
#                  mutate(age = seq(0,15,le = 512)))

expected_age_cm <- ch1_outpatient |> 
  group_by(period,age_gr) |> 
  summarise(total_adm = sum(n),.groups = "drop") |>  
  group_by(period) |> 
  group_modify(~ gam(total_adm~s(age_gr),method = "REML",data = .) %>% 
                 predict(list(age_gr = seq(0,15,le = 512)),type="response") %>%
                 as.tibble() %>% 
                 mutate(age = seq(0,15,le = 512)))

expected_age_cm |> 
  ggplot(aes(x = age,y = value))+
  geom_line()+
  # geom_ribbon(aes(ymin = lwr,ymax=upr),fill = "blue",alpha = .3)+
  geom_col(aes(x = age_gr, y = total_adm),alpha = .3,
             data = ch1_outpatient |> 
               group_by(period,age_gr) |> 
               summarise(total_adm = sum(n),.groups = "drop"))+
  theme_bw()+
  facet_wrap(~period)+
  labs(y = "Expected admission",x = "Age (years)")+
  theme(axis.title.y = element_text(size = 18),
        axis.title.x = element_text(size = 18),
        legend.position = "bottom",
        plot.tag = element_text(face = "bold", size = 18),
        axis.text.x = element_text(size = 18),
        axis.text.y = element_text(size = 18),
        legend.text = element_text(size = 15),
        legend.title = element_text(size = 18),
        strip.text = element_text(size = 18))

Point is the data and line is the fitted GAM model
Code
ggsave("./fig_manuscript/sup3_new.svg",
       width = 12,height = 8,dpi = 500)
Code
df_cases_ch1_23 <- df23 |> 
  filter(medi_cen == "Children's hospital 1") 

## fig 2C
ts_ch1 <- df_cases_ch1_23 %>%
  group_by(adm_week,wave) %>%
  count() %>%
  ggplot(aes(x = adm_week, y = n,fill = wave))+
  geom_col(alpha = 0.6,color="black")+
  scale_x_date(date_breaks = "1 month",
               date_labels = "%b %Y",
               name = "Collection date",
               limits = c(as.Date("2023-01-01"),as.Date("2023-12-31")))+
  scale_y_continuous(limits = c(0,900),breaks = seq(0,900,le = 4))+
  scale_fill_manual(values = c("#582C83FF","#FFC72CFF"))+
  theme_minimal()+
  annotate("rect", fill = "grey",
           xmin = as.Date(c("2023-01-01","2023-04-05","2023-08-02","2023-12-06")),
           xmax = as.Date(c("2023-01-15","2023-04-26","2023-08-30","2023-12-27")),
           ymin = 0, ymax = Inf, alpha = .5)+
  labs(y = "Cases")+
  theme(axis.title.y = element_text(size = 18),
        axis.title.x = element_blank(),
        axis.ticks.x = element_blank(),
        legend.position = "bottom",
        plot.tag = element_text(face = "bold", size = 18),
        axis.text.x = element_blank(),
        axis.text.y = element_text(size = 18),
        legend.text = element_text(size = 15),
        legend.title = element_text(size = 18))+
  guides(fill=guide_colourbar(barwidth=20),
         color = "none")

## fig 2D

df_cases_count_ch1_23 <- df_cases_ch1_23 |> 
  group_by(age,time) |> 
  count() |> 
  ungroup()
  
fit_gau_bs_ch1 <- gam(n ~ s(age,bs = "bs") + 
                    s(time,bs = "bs") + 
                    ti(age, time, bs = c("bs", "bs")) , data = df_cases_count_ch1_23,
                  family = gaussian(link = "log"),method = "REML")

age_val <- seq(0,6,le = 365)

collection_date_val <- seq(min(df23$adm_week),
                           max(df23$adm_week), le = 365)

new_data2 <- expand.grid(age = age_val,
                        time = as.numeric(collection_date_val))

new_data2$pred_gau_bs_ch1 <- predict(fit_gau_bs_ch1, newdata = new_data2, type = "response")

hm_ch1 <- ggplot(new_data2) +
  geom_raster(aes(x = as.Date(time), y = age, fill = pred_gau_bs_ch1),interpolate = TRUE) +
  geom_line(
    data = cohort_lines(),
    aes(x = date, y = age, group = cohort,color = trend),
    linewidth = 0.3,
    alpha = 0.4
  )+
  scale_x_date(date_breaks = "1 month", date_labels = "%b") +
  scale_fill_viridis_c(option = "turbo", n.breaks = 10) +
  theme_minimal() +
  labs(
    x = "Admission date",
    y = "Age (years)",
    fill = "Number of\nadmissions",
    tag = "D"
  )+
  theme_minimal()+
  scale_y_reverse(name = "Age (years)",lim= rev(c(-0.5,6.5)),breaks = seq(-1,7))+
  theme(axis.title.y = element_text(size = 18),
        axis.title.x = element_blank(),
        axis.ticks.x = element_blank(),
        legend.position = "bottom",
        plot.tag = element_text(face = "bold", size = 18),
        axis.text.x = element_blank(),
        axis.text.y = element_text(size = 18),
        legend.text = element_text(size = 15),
        legend.title = element_text(size = 18))+
  guides(fill=guide_colourbar(barwidth=25),
         color = "none")

## calculate atk rate as a function of time
df_sero <- df_sero |>
  as_tibble() |>
  mutate(collection = id |>
           str_remove(".*-") |>
           as.numeric() |>
           divide_by(1e4) |>
           round(),
         col_date2 = as.numeric(col_date),
         across(pos, ~ .x > 0))

outttt <- age_profile_constrained_cohort2(df_sero)


## fig 2E
atk_sero <- map2(head(outttt, -15),
     tail(outttt,-15),
     ~ left_join(na.exclude(.x), na.exclude(.y), "cohort")|>
       mutate(attack = (fit.y - fit.x) / (1 - fit.x),
              date = as.Date(collection_time.y))) |> 
  bind_rows() |> 
  ggplot() +
  geom_raster(aes(x=date, y=cohort,fill = attack),interpolate = TRUE)+
  geom_line(
    data = cohort_lines(seq(0,15,by=1)),
    aes(x = date, y = age, group = cohort,color = trend),
    linewidth = 0.25,
    alpha = 0.3
  )+
  scale_fill_viridis_c(option = "turbo",name = "Attack rate")+
  theme_classic()+
  scale_y_reverse(name = "Age (years)",
                  lim= rev(c(0,14)),
                  breaks = seq(0,14,by=2),
                  minor_breaks = NULL)+
  scale_x_date(date_breaks = "1 month",
               date_labels = "%b",
               name = "Collection date in 2023",
               limits = c(as.Date("2023-01-01"),as.Date("2023-12-31")),
               minor_breaks = NULL)+
  theme(axis.title.y = element_text(size = 18),
        axis.title.x = element_text(size = 18),
        plot.tag = element_text(face = "bold", size = 18),
        axis.text.x = element_text(size = 18),
        axis.text.y = element_text(size = 18),
        legend.text = element_text(size = 15),
        legend.title = element_text(size = 18))+
  guides(color = "none")


## fig 2B
age_profile_sp_cm <- outttt |> 
  bind_rows() |> 
  mutate(col_date = as.Date(collection_time)) |> 
  filter(col_date %in% as.Date(c("2022-12-29","2023-04-11","2023-08-13","2023-12-31"))) |> 
  mutate(col_lab = format(col_date, format = "%b %Y"),
         col_lab = factor(col_lab,levels = c("Dec 2022","Apr 2023","Aug 2023","Dec 2023"))) |> 
  ggplot(aes(x = cohort,y = fit))+
  geom_line()+
  geom_ribbon(aes(ymin = lwr,ymax = upr),fill = "blue", alpha = .3)+
  geom_point(data = df_sero |> 
               mutate(col_lab = factor(col_time,levels = c("Dec 2022","Apr 2023",
                                                          "Aug 2023","Dec 2023"))),
             aes(x = age, y = pos),shape = "|")+
  facet_wrap(~ col_lab,nrow = 1)+
  labs(y = "Seroprevalence (%)",x = "Age (years)",tag="B")+
  scale_y_continuous(labels = scales::label_percent(scale = 100),limits = c(0,1))+
  theme_bw()+
  theme(axis.title.x = element_text(size = 18),
        axis.text.x = element_text(size = 18),
        legend.position = "none",
        plot.tag = element_text(face = "bold", size = 18),
        axis.title.y = element_text(size = 18),
        axis.text.y = element_text(size = 18),
        strip.text.x = element_text(size = 18))


## fig 2A 

ouut <- list()
dat <- list()

for (i in 1:3){
  dat[[i]] <- df_cases_ch1_23 |> 
    filter(admission_date >= as.Date(mean_collection_times[i]) &
             admission_date <= as.Date(mean_collection_times[i+1]))
}


expected_incidence <- map2(head(outttt, -15),
                           tail(outttt,-15),
                           ~ left_join(na.exclude(.x), na.exclude(.y), "cohort")|>
                             mutate(attack = (fit.y - fit.x) / (1 - fit.x),
                                    date = as.Date(collection_time.y))) |> 
  bind_rows() |> 
  filter(date %in% as.Date(c("2023-04-11","2023-08-13","2023-12-31"))) |> 
  mutate(period = case_when(
    date <= as.Date("2023-04-30") ~ "12/2022 - 4/2023",
    date > as.Date("2023-04-30") & date <= as.Date("2023-08-31") ~ "4/2023 - 8/2023",
    date > as.Date("2023-08-31") ~ "8/2023 - 12/2023"
  )) |> 
  group_by(period) |> 
  group_split() |> 
  map(~ left_join(.x,expected_age_cm, by = join_by(cohort == age,period))) %>% 
  bind_rows(.id = "id") |> 
  mutate(sp_gap = fit.y-fit.x,
         exp_inc = sp_gap*value)

exp_in_ch1 <- expected_incidence |> 
  ggplot()+
  geom_line(aes(x = cohort,y = exp_inc))+
  geom_col(data = dat %>% 
                   bind_rows(.id = "id") %>% 
                   filter(age1 >0) |> 
                   group_by(id,age) |> 
                   count(),
                 aes(x = age, y = n),
                 color = "white",fill = "black",alpha = 0.2)+
  facet_wrap(~factor(id,labels = c("12/2022 - 4/2023",
                                   "4/2023 - 8/2023",
                                   "8/2023 - 12/2023")))+
  labs(y = "Expected infections",x = "Age (years)",tag = "A")+
  # coord_cartesian(ylim = c(0,1200))+
  theme_bw()+
  theme(
    axis.text.x = element_text(size = 15),
    axis.text.y = element_text(size = 18),
    legend.text = element_text(size = 18),
    plot.tag = element_text(face = "bold", size = 18),
    axis.title.x = element_text(size = 18),
    axis.title.y = element_text(size = 18),
    legend.title = element_text(size = 18),
    strip.text = element_text(size = 18),
    panel.grid.minor.x = element_blank())
Code
fig2 <- exp_in_ch1/
  age_profile_sp_cm/
  (ts_ch1+labs(tag="C"))/
  (hm_ch1+labs(tag="D"))/
  (atk_sero+labs(tag="E"))

ggsave("./fig_manuscript/fig2_new.svg",plot = fig2, 
       width = 15,height = 20,dpi = 500)

Explain how I generated figure 2E. I used constrainted algorithm. I mentioned the algorithm overhere: Using GAMs to model seroprevalence as a function of both age and time using a 2-step algorithm.

-* Step 1 (age profile): for each time point (i.e. samples collection):

  • fit an unconstrained binomial GAM to seropositivity as a function of age

  • convert population seroprevalence into individual seropositivity realizations: for each value of a large vector of age values:

    • generate the prediction + confidence interval

    • from each of the rate values of the prediction and confidence interaval lower and upper bounds, generate random realizations of a Bernoulli process to convert population seroprevalence into individual seropositivity

  • Step 2a (epidemiological time): for each value of age:

    • fit a monotonically increasing binomial GAM to the Bernoulli realizations as a function of time
    • for each time point: generate the prediction + confidence interval
  • Step 2b (smoothing out the stochasticity introduced by the seroprevalence to seropositivity conversion): for each time point fit an unconstrained beta GAMs to the predictions and confidence interval bounds of step 2a as a function of age

But in the step 2b, instead of fitting 4 collection time points, I fitted 52 colection time point as 52 week, then I exclude the age profile of seroprevalence before April 2023. And I calculated attack rate followed \(attack \ racte = \frac{seroprevalence_{a,t}-seroprevalence_{a,t-1}}{1 - seroprevalence_{a,t-1}}\). And plot it by ggplot

4.7 Figure 3

Try to model virology as a function of age and time

In this section, I used the GAM fitted as a function of age and time of CH1 reported cases (used to generate heatmap in section 9.5.1 but for CH1 cases only) (1), combined with GAM fitted as a function of age and time of virological data (used to generate figure 3C)(2). Then I multiplied \(reported \ cases (1) \times EV-A71 \ percentage (2)\) at each age and time then I had number of ev-a71 of each age and time. Then I group by each time and sum ev-a71 cases at all age group to generate the plot below.

Code
time_f <- df_cases_ch1_23 %>%
  group_by(adm_week) %>%
  count() %>%
  ungroup()

df_viro <- df_viro %>%
  mutate(pos = case_when(
    sero_gr == "EV-A71" ~ TRUE,
    sero_gr != "EV-A71" ~ FALSE),
    pos_cv = case_when(
      sero_gr == "CV-A6" ~ TRUE,
      sero_gr != "CV-A6" ~ FALSE),
    peak = case_when(
      month(admission_date) >= 6 & month(admission_date) <= 8 ~ "6/2023 - 8/2023",
      month(admission_date) >= 9 ~ "9/2023 - 12/2023"
    ),
    adm_week = as.Date(floor_date(admission_date, "week")),
    adm_month = as.Date(floor_date(admission_date, "month")),
    time = as.numeric(admission_date)
  )

# dt_vr <- df_viro %>%
#   group_by(adm_month) %>%
#   count(pos) %>%
#   ungroup() %>%
#   pivot_wider(names_from = pos,values_from = n,names_prefix = "pos") %>%
#   replace(is.na(.),0) %>%
#   mutate(total = posTRUE+posFALSE,
#          per = posTRUE/total,
#          time = adm_month %>% as.numeric()
#          )%>%
#   rowwise() %>%                          # needed for prop.test()
#   mutate(
#     lwr = prop.test(posTRUE, total)$conf.int[1],   # 95% CI lower bound
#     upr = prop.test(posTRUE, total)$conf.int[2]    # optional upper bound
#   )


# modelll <- gam(pos~s(time),family = binomial,method = "REML",data = df_viro)
# 
# link_inv <- modelll$family$linkinv
# 
# recon_epicurve_dt <- modelll %>%
#   predict(list(time = time_f %>%
#                  filter(adm_week >= min(df_viro$admission_date)) %>%
#                  pull(adm_week) %>%
#                  as.numeric()
#   ),se.fit = TRUE) %>%
#   as_tibble() %>%
#   mutate(time = time_f %>%
#            filter(adm_week >= min(df_viro$admission_date)) %>%
#            pull(adm_week),
#          lwr = link_inv(fit - 1.96*se.fit),
#          upr = link_inv(fit + 1.96*se.fit),
#          fit = link_inv(fit)
#          ) %>%
#   dplyr::select(-se.fit) %>%
#   left_join(time_f,., by = join_by(adm_week == time)) %>%
#   mutate(e_ev71 = fit*n,
#          e_ev71_lwr = lwr*n,
#          e_ev71_upr = upr*n) %>%
#   replace(is.na(.),-1) 
# 
# recon_epicurve <- recon_epicurve_dt %>%
#   ggplot(aes(x = adm_week))+
#   geom_line(aes(y = fit*1000),linetype = "dashed")+
#   geom_ribbon(aes(y = fit*1000,ymin = lwr*1000,ymax = upr*1000),fill = "blue",alpha = .3)+
#   geom_line(aes(y = e_ev71))+
#   geom_ribbon(aes(y = e_ev71_upr,ymin = e_ev71_lwr,ymax = e_ev71_upr),fill = "red",alpha = .5)+
#   scale_y_continuous(
#     name = "CH1 admission",
#     sec.axis = sec_axis(~ . /1000, name = "EV-A71 percentage"),
#     limits = c(0,1000)
#   ) +
#   geom_col(aes(y=n),alpha = 0.3)+
#   geom_point(data = df_viro, aes(x = admission_date, y = pos*1000),shape = "|")+
#   scale_x_date(date_breaks = "1 month",
#                date_labels = "%b",
#                limits = c(as.Date("2023-01-01"),as.Date("2023-12-31")))+
#   theme_minimal()+
#   theme(axis.title.y = element_text(size = 18),
#         axis.title.x = element_blank(),
#         axis.ticks.x = element_blank(),
#         legend.position = "bottom",
#         plot.tag = element_text(face = "bold", size = 18),
#         axis.text.x = element_blank(),
#         axis.text.y = element_text(size = 18),
#         legend.text = element_text(size = 15),
#         legend.title = element_text(size = 18))

per_serotype <- df_viro %>%
  group_by(adm_month) %>%
  count(sero_gr) %>%
  mutate(total = sum(n),
         per = n/total) %>%
  ggplot(aes(x = adm_month,y=per,fill = sero_gr))+
  geom_bar(position="fill", stat="identity")+
  scale_x_date(date_breaks = "1 month", date_labels = "%b",
               limits = c(as.Date("2023-01-01"),as.Date("2023-12-31")))+
  scale_y_continuous(labels = scales::label_percent())+
  labs(fill = "Serotype group",y = "Percentage (%)",tag = "B")+
  theme_minimal()+
  theme(axis.title.y = element_text(size = 18),
        axis.title.x = element_blank(),
        axis.ticks.x = element_blank(),
        legend.position = "bottom",
        plot.tag = element_text(face = "bold", size = 18),
        axis.text.x = element_blank(),
        axis.text.y = element_text(size = 18),
        legend.text = element_text(size = 15),
        legend.title = element_text(size = 18))

## model ev-a71 percentage as a function of time and age

model_viro_age_time <- gam(pos ~ s(age_adm) + s(time) + ti(age_adm,time),
                           family = binomial,method = "REML",data = df_viro)

collection_date_val <- seq(min(df_viro$time),
                           max(df_viro$time), le = 365)

new_data2 <- expand.grid(age_adm = age_val,
                         time = as.numeric(collection_date_val))

new_data2$pred <- predict(model_viro_age_time, newdata = new_data2, type = "response")


ev_a71_age_time_p <- ggplot() +
  geom_raster(data = new_data2, 
              aes(x = as.Date(time), y = age_adm, fill = pred), interpolate = TRUE) +
  scale_fill_viridis_c(option = "turbo",n.breaks = 10) +
  geom_line(
    data = cohort_lines(viro = TRUE),
    aes(x = date, y = age, group = cohort,color = trend),
    linewidth = 0.25,
    alpha = 0.3
  ) +
  labs(x = "Week", y = "Age", fill = "Smoothed EV-A71 percentage")+
  scale_x_date(date_breaks = "1 month", date_labels = "%b", name = "Collection date in 2023",
               limits = c(as.Date("2023-01-01"),as.Date("2023-12-31")))+
  theme_classic()+
  scale_y_reverse(name = "Age (years)",lim= rev(c(-0.5,6.5)),breaks = seq(-1,7))+
  theme(axis.title.y = element_text(size = 18),
        axis.title.x = element_text(size = 18),
        legend.position = "bottom",
        plot.tag = element_text(face = "bold", size = 18),
        axis.text.x = element_text(size = 18),
        axis.text.y = element_text(size = 18),
        legend.text = element_text(size = 15),
        legend.title = element_text(size = 18))+
  guides(fill=guide_colourbar(barwidth=25),
         color = "none")

# fig3 <- (recon_epicurve+labs(tag="A"))/
#   (per_serotype+labs(tag="B"))/
#   (ev_a71_age_time_p+labs(tag="C"))
# 
# ggsave("./fig_manuscript/fig3_new.svg", plot = fig3,
#        width = 13,height = 12,dpi = 500)
Code
# df_cases_count_ch1_23 <- df_cases_ch1_23 |> 
#   group_by(age,time) |> 
#   count() |> 
#   ungroup()
#   
# fit_gau_bs_ch1 <- gam(n ~ s(age,bs = "bs") + 
#                     s(time,bs = "bs") + 
#                     ti(age, time, bs = c("bs", "bs")) , data = df_cases_count_ch1_23,
#                   family = gaussian(link = "log"),method = "REML")

cases_fit <- fit_gau_bs_ch1 |>  
    predict(newdata = df_cases_count_ch1_23, se.fit = TRUE) |> 
    as.tibble() |> 
    cbind(df_cases_count_ch1_23) |>
    mutate(
      # c_lwr = fit_gau_bs_ch1$family$linkinv(fit - 1.96 * se.fit),
      # c_upr = fit_gau_bs_ch1$family$linkinv(fit + 1.96 * se.fit),
      c_fit = fit_gau_bs_ch1$family$linkinv(fit)
           ) |> 
    dplyr::select(-c(se.fit,fit))
    
recontructed_df <- model_viro_age_time |>
  predict(newdata = df_cases_count_ch1_23 |> rename(age_adm = age), se.fit = TRUE) |>
  as.tibble() |>
  cbind(df_cases_count_ch1_23) |>
  mutate(v_lwr = model_viro_age_time$family$linkinv(fit - 1.96 * se.fit),
         v_upr = model_viro_age_time$family$linkinv(fit + 1.96 * se.fit),
         v_fit = model_viro_age_time$family$linkinv(fit)) |>
  dplyr::select(-c(se.fit,fit)) |>
  full_join(cases_fit, by = join_by(age, time,n)) |>
  mutate(ev71_c = c_fit*v_fit,
         ev71_u = c_fit*v_upr,
         ev71_l = c_fit*v_lwr)

recontructed_cases_count_df <- recontructed_df |>
  mutate(day = as.Date(time),
         week = as.Date(floor_date(day, "week"))) |> 
  group_by(week) |>
  summarise(cases = sum(c_fit),
            # cases_u = sum(c_upr),
            # cases_l = sum(c_lwr),
            ev71_cases = sum(ev71_c),
            ev71_cases_u = sum(ev71_u),
            ev71_cases_l = sum(ev71_l)) 

f3a <- recontructed_cases_count_df |> 
  ggplot(aes(x = week))+
  geom_line(aes(y = cases),linetype = "dashed")+
  # geom_ribbon(aes(ymin = cases_u,ymax = cases_l),fill = "blue",alpha = .3)+
  geom_line(aes(y = ev71_cases))+
  geom_ribbon(aes(ymin = ev71_cases_u,ymax = ev71_cases_l),fill = "red",alpha = .3)+
  geom_col(data = df_cases_count_ch1_23 |>
             mutate(day = as.Date(time),
                    week = as.Date(floor_date(day, "week")))|>
             group_by(week) |>
             summarise(cases = sum(n)),
           aes(x = week, y  = cases),alpha = .3)+
  geom_point(data = df_viro, aes(x = admission_date, y = pos*1000),shape = "|",size = 2)+
  scale_x_date(date_breaks = "1 month",
               date_labels = "%b",
               name = "Collection date",
               limits = c(as.Date("2023-01-01"),as.Date("2023-12-31"))) +
  labs(x = "Admission day (2023)",y = "Total number of cases",tag = "A") +
  theme_minimal()+
  theme(axis.title.y = element_text(size = 18),
        axis.title.x = element_text(size = 18),
        axis.ticks.x = element_blank(),
        legend.position = "bottom",
        plot.tag = element_text(face = "bold", size = 18),
        axis.text.x = element_blank(),
        axis.text.y = element_text(size = 18),
        legend.text = element_text(size = 15),
        legend.title = element_text(size = 18))+
  guides(fill=guide_colourbar(barwidth=20),
         color = "none")

f3a

The bar show the raw data of reported cases admitted to CH1 at each date. The dased line is the fitted GAM model. The solid line is the reconstructed EV-A71 cases.

And compare with the expected infectious estimated from serological data

Code
compare_ev71 <- recontructed_df |> 
  mutate(day = as.Date(time),
         week = as.Date(floor_date(day, "week")),
         period = case_when(
           week <= as.Date("2023-04-30") ~ "12/2022 - 4/2023",
           week > as.Date("2023-04-30") & week <= as.Date("2023-08-31") ~ "4/2023 - 8/2023",
           week > as.Date("2023-08-31") ~ "8/2023 - 12/2023"
         )) |> 
  group_by(period,age) |> 
  summarise(ev_a71_cases = sum(ev71_c),
            raw_n = sum(n),
            .groups = "drop") |> 
  ggplot(aes(x = age, y = ev_a71_cases))+
  geom_col(alpha = 0.3)+
  geom_line(data = expected_incidence,
            aes(x = cohort,y=exp_inc))+
  scale_x_continuous(limits = c(0,15),
                     breaks = seq(0,15,by=5))+
  facet_wrap(~period)+
  labs(y = "Expected infections",x = "Age (years)",tag = "A")+
  theme_bw()+
  theme(
    axis.text.x = element_text(size = 18),
    axis.text.y = element_text(size = 18),
    legend.text = element_text(size = 18),
    plot.tag = element_text(face = "bold", size = 18),
    axis.title.x = element_text(size = 18),
    axis.title.y = element_text(size = 18),
    legend.title = element_text(size = 18),
    strip.text = element_text(size = 18),
    panel.grid.minor.x = element_blank())


compare_ev71

Combined all analysis in this section

Code
fig3_new <- (compare_ev71+labs(tag="A"))/
  (f3a+labs(tag="B"))/
  (per_serotype+labs(tag="C"))/
  (ev_a71_age_time_p+labs(tag="D"))

ggsave("./fig_manuscript/fig3_new2.svg", plot = fig3_new,
       width = 13,height = 15,dpi = 500)

4.8 Supplementary plot

Code
df23 %>%
  group_by(adm_week, district) %>%
  count() %>%
  ggplot(aes(x = adm_week, y = n)) +
  geom_col() +
  scale_x_date(
    date_breaks = "2 months",
    date_labels = "%b",
    name = "Admission week",
    limits = c(as.Date("2023-01-01"), as.Date("2023-12-31"))
  ) +
  theme_bw() +
  labs(y = "Number of admission") +
  facet_wrap(~district, ncol = 4) +
  theme(
    axis.title.x = element_text(size = 15),
    axis.text.x = element_text(size = 15),
    axis.ticks.x = element_blank(),
    legend.position = "none",
    plot.tag = element_text(face = "bold", size = 18),
    axis.title.y = element_text(size = 18),
    axis.text.y = element_text(size = 18),
    strip.text.x = element_text(size = 18)
  )

Code
ggsave("./fig_manuscript/sup1_new.svg",
       width = 12,height = 12,dpi = 500)

4.8.1 compare the age distributions in the hospital data with the age distribution in the districts where the hospital patients are coming from

Code
df_cases_ch1_23 |> 
  mutate(age3 = ceiling(age1)) |> 
  group_by(age3,district) |> 
  count() |> 
  left_join(ch1_outpatient |> 
              group_by(district,age_gr) |> 
              summarise(n = sum(n)),
            by = join_by(district,age3 == age_gr)) |> 
  ungroup() |> 
  set_colnames(c("age","district","hfmd_cases","outpatient")) |> 
  ggplot(aes(x = age))+
  geom_line(aes(y = outpatient))+
  geom_point(aes(y = hfmd_cases*6))+
  scale_y_continuous(
    name = "Outpatients admissions",
    # Define the secondary axis
    sec.axis = sec_axis(
      trans = ~ . / 6, 
      name = "HFMD cases")
  )+
  facet_wrap(~district)

The lines show age distribution of outpatients cases admitted to CH1 in 2023. The points show age distribution of HFMD cases admitted to CH1 in 2023