2 HFMD Seroprevalence

Code

library(readxl)
library(dplyr)
library(stringr)
library(purrr)
library(tidyr)
library(lubridate)
library(magrittr)
library(mgcv)
library(tidyverse)
library(patchwork)
library(sf)
library(janitor)
library(ggsci)
library(cowplot)
invisible(Sys.setlocale("LC_TIME", "English"))

3 Import data

3.1 Serology data

Code

apr_2023 <- read_excel("D:/OUCRU/hfmd/data/4_2023.xlsx")
aug_2023 <- read_excel("D:/OUCRU/hfmd/data/08_2023.xlsx")
dec_2022 <- read_excel("D:/OUCRU/hfmd/data/12_2022.xls")
dec_2023 <- read_excel("D:/OUCRU/hfmd/data/12_2023.xlsx")

t423 <- data.frame(apr_2023[-c(1,2),c(6,8,10:14)])
t423$pos <- replace(t423$...14,is.na(t423$...14),0) %>%
  str_detect(regex(paste(2^(4:10), collapse = "|"))) %>%
  as.integer(as.logical())
colnames(t423) <- c("id","age_gr","age","col_day","col_month","col_year","neutralization","pos")
t423$age <- as.numeric(t423$age)
t423$col_time <- rep("Apr 2023",nrow(t423))


t823 <- data.frame(aug_2023[-c(1,2),c(6,8,9,14:17)])
t823$pos <- str_detect(t823$...17,regex(paste(2^(4:10), collapse = "|"))) %>%
  as.integer(as.logical())
colnames(t823) <- c("id","age_gr","age","col_day","col_month","col_year","neutralization","pos")
t823$age <- as.numeric(t823$age)
t823$col_time <- rep("Aug 2023",nrow(t823))

t1222 <- data.frame(dec_2022[-c(1,2),c(6,8,10:14)])
t1222$pos <- replace(t1222$...14,is.na(t1222$...14),0) %>%
  str_detect(regex(paste(2^(4:10), collapse = "|"))) %>%
  as.integer(as.logical())
colnames(t1222) <- c("id","age_gr","age","col_day","col_month","col_year","neutralization","pos")
t1222$age <- as.numeric(t1222$age)
t1222$col_time <- rep("Dec 2022",nrow(t1222))


t1223 <- data.frame(dec_2023[-c(1,2),c(6,8,9,14:17)])
t1223$pos <- replace(t1223$...17,is.na(t1223$...17),0) %>%
  str_detect(regex(paste(2^(4:10), collapse = "|"))) %>%
  as.integer(as.logical())
colnames(t1223) <- c("id","age_gr","age","col_day","col_month","col_year","neutralization","pos")
t1223$age <- as.numeric(t1223$age)
t1223$col_time <- rep("Dec 2023",nrow(t1223))

3.2 Case

Code

df1 <- read_excel("D:/OUCRU/hfmd/data/TCM_full.xlsx", 
                  col_types = c("date", "numeric", "text", 
                                "text", "text", "date", "date", "date", 
                                "text", "text", "text"))
colnames(df1) <- c("dob", "age", "gender", "commune", "district",
                   "reported_date", "onset_date","adm_date",
                   "medi_cen","inout","severity")
df1$dob <- df1$dob %>% as_date()
df1$adm_date <- df1$adm_date %>% as_date()

df1$age1 <- interval(df1$dob, df1$adm_date) / years(1)
df1$adm_week <- as.Date(floor_date(df1$adm_date, "week"))
df1$district <- df1$district %>% str_replace_all(
  c( "Quận Gò vấp"  = "Quận Gò Vấp"))
df1$district <- df1$district %>% 
  str_remove("Quận|Huyện|Thành phố") %>%
  trimws(which = "both") 

df_plot <- df1 %>% filter(year(adm_week) == "2023") %>%
  filter(!is.na(adm_week) ) %>%
  count(adm_week) %>% as.data.frame()

ts <- ggplot()+
  geom_bar(data = df_plot, aes(x = as.Date(adm_week), y = n),stat = "identity",
           alpha = 0.5) +
  labs(x = "Admission week",y = "Cases")+
  geom_vline(xintercept = as.Date("2022-12-01"),
             alpha = 0.4,col = "#0808cf")+
  geom_vline(xintercept = as.Date("2022-12-30"),
             alpha = 0.4,col = "#0808cf")+
  geom_vline(xintercept = as.Date("2023-04-01"),
             alpha = 0.4,col = "#ed097b")+
  geom_vline(xintercept = as.Date("2023-04-30"),
             alpha = 0.4,col = "#ed097b")+
  geom_vline(xintercept = as.Date("2023-08-01"),
             alpha = 0.4,col = "#ed6b00")+
  geom_vline(xintercept = as.Date("2023-08-30"),
             alpha = 0.4,col = "#ed6b00")+
  geom_vline(xintercept = as.Date("2023-12-01"),
             alpha = 0.4,col = "#33516b")+
  geom_vline(xintercept = as.Date("2023-12-30"),
             alpha = 0.4,col = "#33516b")+
  xlim(as.Date("2022-11-24"),as.Date("2024-01-01"))+theme_classic()

4 Function

Code

predict2 <- function(x, ci = .95, le = 512, m = 100) {
  p <- (1 - ci) / 2

  link_inv <- x$family$linkinv
  dataset <- x$data
  n <- nrow(dataset) - length(x$coefficients)
  age_range <- range(dataset$age)

  ages <- seq(age_range[1], age_range[2], le = le)

  x |>
    predict(data.frame(age = ages), se.fit = TRUE) |>
    extract(c("fit", "se.fit")) %>%
    c(age = list(ages), .) |>
    as_tibble() |>
    mutate(lwr = m * link_inv(fit + qt(    p, n) * se.fit),
           upr = m * link_inv(fit + qt(1 - p, n) * se.fit),
           fit = m * link_inv(fit)) |>
    select(- se.fit)
}

5 Seroprevalence by age

5.1 Fit by GLM

Code

m <- 100
eps <- 1

plot1222 <-  glm(pos ~ age, binomial, data = t1222) |>
  predict2() %>% as.data.frame() %>%
  ggplot(aes(x = age,y = fit))+
  geom_line(aes(col = "Dec 2022"))+
  geom_ribbon(aes(x = age,y = fit,
                  ymin=lwr, ymax=upr),alpha = 0.5)+
  ylim(0,101)+
  theme_minimal()+
  scale_color_manual(name = "Y series",
                     values = c("Dec 2022" = "#0808cf"))+
  labs(y = "Seroprevalence(%)")+
  geom_point(data= t1222, aes(x = age,y = m * pos + eps),shape = "|",size = 3,
             col = "#0808cf")+
  theme(
    axis.text.x = element_text(size = 15),
    axis.text.y = element_text(size = 15),
    legend.title= element_blank(),
    legend.position = "inside",
    legend.position.inside =  c(0.15,0.80),
    legend.text = element_text(size = 15))

Code

plot1222 <-  glm(pos ~ age, binomial, data = t1222) |>
  predict2() %>% as.data.frame() %>%
  ggplot(aes(x = age,y = fit))+
  geom_line(aes(col = "Dec 2022"))+
  geom_ribbon(aes(x = age,y = fit,
                  ymin=lwr, ymax=upr),alpha = 0.5,fill = "#0808cf")+
  ylim(0,101)+
  theme_minimal()+
  scale_color_manual(name = "Y series",
                     values = c("Dec 2022" = "#0808cf"))+
  labs(y = "Seroprevalence (%)")+
  geom_point(data= t1222, aes(x = age,y = m * pos + eps),shape = "|",
             col = "#0808cf")+
  theme(
    axis.text.x = element_text(size = 15),
    axis.text.y = element_text(size = 15),
    legend.position = "hide",
    legend.text = element_text(size = 15),
    axis.title.x = element_text(size = 15),
    axis.title.y = element_text(size = 15))+
  annotate("text", x = 3, y = 90, label = c("Dec 2022"),size = 6)

plot0423 <-  glm(pos ~ age + I(age ^2), binomial, data = t423) |>
  predict2() %>% as.data.frame() %>%
  ggplot(aes(x = age,y = fit))+
  geom_line(aes(col = "Apr 2023"))+
  geom_ribbon(aes(x = age,y = fit,
                  ymin=lwr, ymax=upr),alpha = 0.5,fill = "#ed097b")+
  ylim(0,101)+
  theme_minimal()+
  scale_color_manual(name = "Y series",
                     values = c("Apr 2023" = "#ed097b"))+
  labs(y = "Seroprevalence (%)")+
  geom_point(data= t423, aes(x = age, m * pos + eps),
             shape = "|",
             col = "#ed097b")+
  theme(
    axis.text.x = element_text(size = 15),
    axis.text.y = element_text(size = 15),
    legend.position = "hide",
    legend.text = element_text(size = 15),
    axis.title.x = element_text(size = 15),
    axis.title.y = element_text(size = 15))+
  annotate("text", x = 3, y = 90, label = c("Apr 2023"),size = 6)

plot0823 <-  glm(pos ~ age + I(age^2) + I(age^3), binomial, data = t823) |>
  predict2() %>% as.data.frame() %>%
  ggplot(aes(x = age,y = fit))+
  geom_line(aes(col = "Aug 2023"))+
  geom_ribbon(aes(x = age,y = fit,
                  ymin=lwr, ymax=upr),alpha = 0.5,fill = "#ed6b00")+
  ylim(0,101)+
  theme_minimal()+
  scale_color_manual(name = "Y series",
                     values = c("Aug 2023" = "#ed6b00"))+
  labs(y = "Seroprevalence (%)")+
  geom_point(data= t823, aes(x = age, m * pos + eps),
             shape = "|",
             col = "#ed6b00")+
  theme(
    axis.text.x = element_text(size = 15),
    axis.text.y = element_text(size = 15),
    legend.position = "hide",
    legend.text = element_text(size = 15),
    axis.title.x = element_text(size = 15),
    axis.title.y = element_text(size = 15))+
  annotate("text", x = 3, y = 90, label = c("Aug 2023"),size = 6)

plot1223 <-  glm(pos ~ age + I(age^2) + I(age^3), binomial, data = t1223) |>
  predict2() %>% as.data.frame() %>%
  ggplot(aes(x = age,y = fit))+
  geom_line(aes(col = "Dec 2023"))+
  geom_ribbon(aes(x = age,y = fit,
                  ymin=lwr, ymax=upr),alpha = 0.5,fill = "#33516b")+
  ylim(0,101)+
  theme_minimal()+
  scale_color_manual(name = "Y series",
                     values = c("Dec 2023" = "#33516b"))+
  labs(y = "Seroprevalence (%)")+
  geom_point(data= t1223, aes(x = age, m * pos + eps),
             shape = "|",
             col = "#33516b")+
  theme(
    axis.text.x = element_text(size = 15),
    axis.text.y = element_text(size = 15),
    legend.position = "hide",
    legend.text = element_text(size = 15),
    axis.title.x = element_text(size = 15),
    axis.title.y = element_text(size = 15))+
  annotate("text", x = 3, y = 90, label = c("Dec 2023"),size = 6)

Code

plot1222 + plot0423 + plot0823 + plot1223

5.2 Fit by GAM

Code

## 
predictg <- function(x, ci = .95, le = 512, m = 100) {
  p <- (1 - ci) / 2
  
  link_inv <- x$family$linkinv
  dataset <- x$model
  n <- nrow(dataset) - length(x$coefficients)
  age_range <- range(dataset$age)
  
  ages <- seq(age_range[1], age_range[2], le = le)
  
  x |>
    predict(data.frame(age = ages), se.fit = TRUE) |>
    extract(c("fit", "se.fit")) %>%
    c(age = list(ages), .) |>
    as_tibble() |>
    mutate(lwr = m * link_inv(fit + qt(    p, n) * se.fit),
           upr = m * link_inv(fit + qt(1 - p, n) * se.fit),
           fit = m * link_inv(fit)) |>
    select(- se.fit)
}

## plot

plot_gam <- function(x,date){
  
  clo <- case_when(date == 1222 ~ "Dec 2022",
                   date == 423 ~ "Apr 2023",
                   date == 823 ~ "Aug 2023",
                   date == 1223 ~ "Dec 2023")
  clo2 <- case_when(date == 1222 ~ "#0808cf",
                    date == 423 ~ "#ed097b",
                    date == 823 ~ "#ed6b00",
                    date == 1223 ~ "#33516b")
  
  dtaa <- case_when(date == 1222 ~ t1222,
                    date == 423 ~ t423,
                    date == 823 ~ t823,
                    date == 1223 ~ t1223)
  
  x %>% as.data.frame() %>%
    ggplot(aes(x = age,y = fit))+
    geom_line(aes(col = clo))+
    geom_ribbon(aes(x = age,y = fit,
                    ymin=lwr, ymax=upr),alpha = 0.5,fill = clo2)+
    ylim(0,101)+
    theme_minimal() +
  scale_color_manual(name = "Y series",
                     values = c(clo = clo2))+
    labs(y = "Seroprevalence (%)")+
    geom_point(data= dtaa, aes(x = age, m * pos + 1),
               shape = "|",
               col = clo2)+
    theme(
      axis.text.x = element_text(size = 15),
      axis.text.y = element_text(size = 15),
      legend.position = "hide",
      legend.text = element_text(size = 15),
      axis.title.x = element_text(size = 15),
      axis.title.y = element_text(size = 15))+
    annotate("text", x = 3, y = 90, label = c(clo),size = 6) 
}  

g1222 <- gam(pos~s(age,bs = "bs"),method = "REML",
    family = "binomial",data = t1222) %>% 
  predictg() %>% plot_gam(date = 1222)  

g423 <- gam(pos~s(age,bs = "bs"),method = "REML",
    family = "binomial",data = t423) %>% 
  predictg() %>% plot_gam(date = 423)  

g823 <-gam(pos~s(age,bs = "bs"),method = "REML",
    family = "binomial",data = t823) %>% 
  predictg() %>% plot_gam(date = 823)  

g1223 <-gam(pos~s(age,bs = "bs"),method = "REML",
    family = "binomial",data = t1223) %>% 
  predictg() %>% plot_gam(date = 1223)

Code

g1222 | g423 | g823 | g1223

5.3 Constrain in each time point

Code

## Constrain function
contrain <- function(data1,data2){
  new_data <- data.frame(age = data2$age,
                         fit = rep(0,nrow(data2)),
                         lwr = rep(0,nrow(data2)),
                         up = rep(0,nrow(data2)))
  for (i in 1:512){
    if(data2$fit[i] < data1$fit[i]){
      new_data$fit[i] <- data1$fit[i]
      new_data$lwr[i] <- data1$lwr[i]
      new_data$upr[i] <- data1$upr[i]
    } else{
      new_data$fit[i] <- data2$fit[i]
      new_data$lwr[i] <- data2$lwr[i]
      new_data$upr[i] <- data2$upr[i]
    }
  }
  
  new_data$fit <- gam(fit ~ s(age),data = new_data)$fitted.values
  new_data$lwr <- gam(lwr ~ s(age),data = new_data)$fitted.values
  new_data$upr <- gam(upr ~ s(age),data = new_data)$fitted.values
  
  return(new_data)
}

tes1222 <- gam(pos~s(age,bs = "bs"),method = "REML",
    family = "binomial",data = t1222) %>% 
  predictg() 
tes0423 <- gam(pos~s(age,bs = "bs"),method = "REML",
    family = "binomial",data = t423) %>% 
  predictg() 
tes0823 <- gam(pos~s(age,bs = "bs"),method = "REML",
               family = "binomial",data = t823) %>% 
  predictg() 
tes1223 <- gam(pos~s(age,bs = "bs"),method = "REML",
               family = "binomial",data = t1223) %>% 
  predictg() 

## contrain 
con423 <- contrain(tes1222,tes0423)
con823 <- contrain(con423,tes0823)
con1223 <- contrain(con823,tes1223)


c423 <- con423 %>% plot_gam(date = 423) 
c823 <- con823 %>% plot_gam(date = 823) 
c1223 <- con1223 %>% plot_gam(date = 1223)

Code

gam_contrain <- g1222 | c423 | c823 | c1223

gam_contrain/
  ts

6 Model age and time at the same time

Code

library(plotly)
library(scam)

Code

atdf <- rbind(t1222,t423,t823,t1223) %>%
  mutate(col_date = make_date(year = col_year,
                              month = col_month,
                              day = col_day))

Code

head(atdf)

               id  age_gr        age col_day col_month col_year neutralization
1 20FL-001-060001 0≤ & <1 0.36986301      28        12     2022           <NA>
2 20FL-001-060002 0≤ & <1 0.33150685      27        12     2022           <NA>
3 20FL-001-060003 0≤ & <1 0.07671233       4         1     2023           <NA>
4 20FL-001-060004 0≤ & <1 0.43835616      27        12     2022           <NA>
5 20FL-001-060005 0≤ & <1 0.93150685       3         1     2023           <NA>
6 20FL-001-060006 1≤ & <2 1.95616438      26        12     2022           1:16
  pos col_time   col_date
1   0 Dec 2022 2022-12-28
2   0 Dec 2022 2022-12-27
3   0 Dec 2022 2023-01-04
4   0 Dec 2022 2022-12-27
5   0 Dec 2022 2023-01-03
6   1 Dec 2022 2022-12-26

6.1 Fit with SCAM model

mpi is monotone increasing SCOP-splines: bs=“mpi”. To achieve monotone increasing smooths this reparameterizes the coefficients so that they form an increasing sequence.

Model

Code

s1 <- scam(pos~s(age)+s(col_date,bs = "mpi"),family=binomial,
           mutate(atdf, across(col_date, as.numeric)))

6.2 3D plot

Code

age_val <- c(.1,1:14)

collection_date_val <- seq(min(atdf$col_date),
                           max(atdf$col_date), le = 512)

new_data <- expand.grid(age = age_val,
                        col_date = as.numeric(collection_date_val))

scamf <- cbind(new_data,
               fit = 100 * predict(s1, new_data,"response"))

scamf$col_date <- as.Date(scamf$col_date)

Code

plot_ly(scamf, x = ~sort(unique(as.Date(col_date))),
        y = ~sort(unique(age)),
        z = ~matrix(fit, 15),
        showscale = F) %>%
  add_surface()%>%
  layout(scene = list(
    xaxis = list(title = "Collection date"),
    yaxis = list(title = "Age"),
    zaxis = list(title = "Seroprevalence",range = c(0,100))
  ))%>% layout(autosize = F, width = 890, height = 1000, margin = m)

6.3 2-D plot

Code

predict2 <- function(x, ci = .95, le = 512, m = 100){
  p <- (1 - ci) / 2
  link_inv <- x$family$linkinv
  dataset <- x$model
  n <- nrow(dataset) - length(x$coefficients)
  age_range <- range(dataset$age)
  ages <- seq(age_range[1], age_range[2], le = le)
  date_range <- range(dataset$col_date)
  dates <- seq(date_range[1], date_range[2], le = le)

out <- x |>
  predict(expand.grid(age = ages,
                        col_date = dates), se.fit = TRUE) |>
  extract(c("fit", "se.fit")) %>%
  c(expand.grid(age = ages,
               date = dates),
    .) |>
  as_tibble() |>
  mutate(lwr = m * link_inv(fit + qt(    p, n) * se.fit),
           upr = m * link_inv(fit + qt(1 - p, n) * se.fit),
           fit = m * link_inv(fit)) |>
  select(-se.fit)

return(out)
}

Code

out <- predict2(s1)

out$date <- as.Date(out$date)

Code

s1222 <- out %>% filter(month(date) == 12 & year(date) == 2022) %>%
  ggplot(aes(x = age,y = fit))+
  geom_line(aes(col = "Dec 2022"))+
  geom_ribbon(aes(x = age,y = fit,
                  ymin=lwr, ymax=upr),alpha = 0.5,fill = "#0808cf")+
  ylim(0,101)+
  theme_minimal()+
  scale_color_manual(name = "Y series",
                     values = c("Dec 2022" = "#0808cf"))+
  labs(y = "Seroprevalence (%)")+
  # geom_point(data= t1222, aes(x = age, m * pos + 1),
  #            shape = "|",
  #            col = "#0808cf")+
  theme(
    axis.text.x = element_text(size = 15),
    axis.text.y = element_text(size = 15),
    legend.position = "hide",
    legend.text = element_text(size = 15),
    axis.title.x = element_text(size = 15),
    axis.title.y = element_text(size = 15))+
  annotate("text", x = 3, y = 90, label = c("Dec 2022"),size = 6)

s423 <- out %>% filter(month(date) == 4 & year(date) == 2023) %>%
  ggplot(aes(x = age,y = fit))+
  geom_line(aes(col = "Apr 2023"))+
  geom_ribbon(aes(x = age,y = fit,
                  ymin=lwr, ymax=upr),alpha = 0.5,fill = "#ed097b")+
  ylim(0,101)+
  theme_minimal()+
  scale_color_manual(name = "Y series",
                     values = c("Apr 2023" = "#ed097b"))+
  labs(y = "Seroprevalence (%)")+
  # geom_point(data= t1222, aes(x = age, m * pos + 1),
  #            shape = "|",
  #            col = "#0808cf")+
  theme(
    axis.text.x = element_text(size = 15),
    axis.text.y = element_text(size = 15),
    legend.position = "hide",
    legend.text = element_text(size = 15),
    axis.title.x = element_text(size = 15),
    axis.title.y = element_text(size = 15))+
  annotate("text", x = 3, y = 90, label = c("Apr 2023"),size = 6)

s823 <- out %>% filter(month(date) == 8 & year(date) == 2023) %>%
  ggplot(aes(x = age,y = fit))+
  geom_line(aes(col = "Aug 2023"))+
  geom_ribbon(aes(x = age,y = fit,
                  ymin=lwr, ymax=upr),alpha = 0.5,fill = "#ed6b00")+
  ylim(0,101)+
  theme_minimal()+
  scale_color_manual(name = "Y series",
                     values = c("Aug 2023" = "#ed6b00"))+
  labs(y = "Seroprevalence (%)")+
  # geom_point(data= t1222, aes(x = age, m * pos + 1),
  #            shape = "|",
  #            col = "#0808cf")+
  theme(
    axis.text.x = element_text(size = 15),
    axis.text.y = element_text(size = 15),
    legend.position = "hide",
    legend.text = element_text(size = 15),
    axis.title.x = element_text(size = 15),
    axis.title.y = element_text(size = 15))+
  annotate("text", x = 3, y = 90, label = c("Aug 2023"),size = 6)

s1223 <- out %>% filter(month(date) == 12 & year(date) == 2023) %>%
  ggplot(aes(x = age,y = fit))+
  geom_line(aes(col = "Dec 2023"))+
  geom_ribbon(aes(x = age,y = fit,
                  ymin=lwr, ymax=upr),alpha = 0.5,fill = "#33516b")+
  ylim(0,101)+
  theme_minimal()+
  scale_color_manual(name = "Y series",
                     values = c("Dec 2023" = "#33516b"))+
  labs(y = "Seroprevalence (%)")+
  # geom_point(data= t1222, aes(x = age, m * pos + 1),
  #            shape = "|",
  #            col = "#0808cf")+
  theme(
    axis.text.x = element_text(size = 15),
    axis.text.y = element_text(size = 15),
    legend.position = "hide",
    legend.text = element_text(size = 15),
    axis.title.x = element_text(size = 15),
    axis.title.y = element_text(size = 15))+
  annotate("text", x = 3, y = 90, label = c("Dec 2023"),size = 6)

Code

result_sero <- s1222 | s423 | s823 | s1223

result_sero/
  ts

6.4 GLM with PAVA algorithm

In chapter 9 of 2012 serobook, there are an argument that following Friedman and Tibshirani (1984) and Mammen et al. (2001), Shkedy et al. (2003) suggested to estimate π(a) and λ(a) using local polynomials and smoothing splines and, if necessary, a posteriori apply the PAVA to isotonize the resulting estimate.

Without loss of generality they assume \(\pi(a_{1}) \leq \pi(a_{2}) \leq ...\leq \pi(a_{i})\). The PAVA states that if \(\pi(a_{i}) \leq \pi(a_{i - 1})\) these values need to be “pooled.” In other words \(\hat{\pi}(a_{i})\) and \(\hat{\pi}(a_{i-1})\) are both replaced by

\[\frac{\hat{\pi}(a_{i})+\hat{\pi}(a_{i-1})}{2}\]

In my case, I fitted the glm to seropositive by age and time, then applied PAVA for fitted seroprevalence and 95% CI of each age group.

Model

mod1 <- glm(pos ~ age * col_date +
              I(age ^ 2) * col_date, binomial,
            mutate(atdf, across(col_date, as.numeric)))

Code

age_val <- c(.1, seq(1,15,0.5))
collection_date_val <- seq(min(atdf$col_date),
                           max(atdf$col_date))

new_data <- expand.grid(age = age_val, col_date = as.numeric(collection_date_val))

prdcts <- cbind(new_data, fit = 100 * predict(mod1, new_data, "response")) |> 
  as_tibble() |> 
  arrange(col_date) |> 
  mutate(across(col_date, as_date))

6.4.1 Unconstrained plot

Code

plot_ly(prdcts, x = ~sort(unique(as.Date(col_date))),
        y = ~sort(unique(age)),
        z = ~matrix(fit, length(age_val)),
        showscale = F) %>%
  add_surface()%>%
  layout(scene = list(
    xaxis = list(title = "Collection date"),
    yaxis = list(title = "Age"),
    zaxis = list(title = "Seroprevalence",range = c(0,100))
  ))%>% layout(autosize = F, width = 700, height = 1000, margin = m)

6.4.2 Applying PAVA algorithm

Code

library(Iso)
out_pava <- prdcts %>% 
  mutate(time_numeric = as.numeric(col_date)) %>% 
  group_by(age) %>% 
  arrange(time_numeric) %>%
  mutate(
    seroprev_monotonic = pava(fit, time_numeric, decreasing = FALSE)
  )

Code

plot_ly(out_pava,x = ~sort(unique(as.Date(col_date))),
          y = ~sort(unique(age)),
          z = ~matrix(seroprev_monotonic, length(age_val)),
          showscale = F) %>%
  add_surface()%>%
  layout(scene = list(
    xaxis = list(title = "Collection date"),
    yaxis = list(title = "Age"),
    zaxis = list(title = "Seroprevalence",range = c(0,100))
  ))%>% layout(autosize = F, width = 700, height = 1000, margin = m)

Code

## boostrap 

bootstrap <- function(data, new_data) {
  # Step 1: resample
  boot_data <- data %>% slice_sample(n = nrow(data), replace = TRUE)
  
  # Step 2: refit model
  mod <- glm(pos ~ age * col_date + I(age^2) * col_date, 
             family = binomial, 
             data = mutate(boot_data, col_date = as.numeric(col_date)))
  
  # Step 3: predict
  preds <- predict(mod, new_data, type = "response") * 100  # seroprev %
  preds_df <- bind_cols(new_data, fit = preds) |> 
    mutate(col_date = as_date(col_date))
  
  # Step 4: apply PAVA per age
  preds_df %>%
    mutate(time_numeric = as.numeric(col_date)) %>%
    group_by(age) %>%
    arrange(time_numeric) %>%
    summarise(
      col_date = col_date,
      seroprev_monotonic = pava(fit, time_numeric, decreasing = FALSE),
      .groups = "keep"
    )
}


B <- 10  # number of bootstrap iterations

set.seed(42)  # for reproducibility
boot_results <- map_dfr(1:B, ~bootstrap(atdf, new_data) %>% 
                          mutate(bootstrap = .x))

point_est <- out_pava %>% 
  mutate(col_time = case_when(
    month(col_date) == 12 & year(col_date) == 2022 ~ "Dec 2022",
    month(col_date) == 04 & year(col_date) == 2023 ~ "Apr 2023",
    month(col_date) == 08 & year(col_date) == 2023 ~ "Aug 2023",
    month(col_date) == 12 & year(col_date) == 2023 ~ "Dec 2023"
  )) %>% na.omit() %>% 
  group_by(col_time,age) %>%
  summarise(
    sp = mean(seroprev_monotonic)
  )

sp_ci <- boot_results %>% 
  mutate(col_time = case_when(
    month(col_date) == 12 & year(col_date) == 2022 ~ "Dec 2022",
    month(col_date) == 04 & year(col_date) == 2023 ~ "Apr 2023",
    month(col_date) == 08 & year(col_date) == 2023 ~ "Aug 2023",
    month(col_date) == 12 & year(col_date) == 2023 ~ "Dec 2023"
  )) %>% na.omit() %>% 
  group_by(col_time,age) %>%
  summarise(
    lower = quantile(seroprev_monotonic, 0.025),
    upper = quantile(seroprev_monotonic, 0.975),
    .groups = "drop"
  )  

# df.list <- list(t1222,t423,t823,t1223)

ggplot() +
  geom_line(data = point_est,aes(x = age, y = sp), color = "blue") +
  geom_ribbon(data = sp_ci, aes(x = age,ymin = lower, ymax = upper), fill = "blue", alpha = 0.3) +
  geom_point(aes(x = age, pos*100),
             shape = "|",data = atdf)+
  labs(y = "Seroprevalence (%)", x = "Age (years)") +
  facet_wrap(~factor(col_time,levels = c("Dec 2022","Apr 2023","Aug 2023","Dec 2023")),
             nrow = 1) +
  scale_x_continuous(breaks = seq(0,15,by = 3), minor_breaks = NULL)+
  scale_y_continuous(limits = c(0,100),breaks = seq(0,100,by = 25),
                     labels = scales::label_percent(scale = 1), minor_breaks = NULL)+
  theme_bw()+ 
  theme(axis.title.x = element_text(size = 18),
        axis.text.x = element_text(size = 18),
        legend.position = "none",
        plot.tag = element_text(face = "bold", size = 18),
        axis.title.y = element_text(size = 18),
        axis.text.y = element_text(size = 18),
        strip.text.x = element_text(size = 18))

6.5 GAM with PAVA algorithm

Model

Code

gam1 <- gam(pos ~ s(age, bs = "bs") + s(col_date, bs = "bs"), method = "REML",
            family = "binomial",
            data = mutate(atdf, across(col_date, as.numeric)))

Code

gam_pred <- cbind(new_data, fit = 100 * predict(gam1, new_data, "response")) |> 
  as_tibble() |> 
  arrange(col_date) |> 
  mutate(across(col_date, as_date))

out <- gam_pred %>% 
mutate(time_numeric = as.numeric(col_date)) %>% 
  group_by(age) %>% 
  arrange(time_numeric) %>%
  mutate(
    seroprev_monotonic = pava(fit, time_numeric, decreasing = FALSE)
  ) 
  
sp1222 <- out %>% filter(month(col_date) == 12 & year(col_date) == 2022) %>% 
  group_by(age) %>% 
  summarise(
    sp = mean(seroprev_monotonic),
  ) %>% ggplot(aes(x = age,y = sp))+
  geom_line()+
  ylim(c(0,100))+
  labs(tag = "Dec 2022",y = "Seroprevalence (%)")+
  theme_minimal()



sp0423 <-out %>% filter(month(col_date) == 04 & year(col_date) == 2023) %>% 
  group_by(age) %>% 
  summarise(
    sp = mean(seroprev_monotonic),
  ) %>% ggplot(aes(x = age,y = sp))+
  geom_line()+
  ylim(c(0,100))+
  labs(tag = "Apr 2023",y = "Seroprevalence (%)")+
  theme_minimal()

sp0823 <- out %>% filter(month(col_date) == 08 & year(col_date) == 2023) %>% 
  group_by(age) %>% 
  summarise(
    sp = mean(seroprev_monotonic),
  ) %>% ggplot(aes(x = age,y = sp))+
  geom_line()+
  ylim(c(0,100))+
  labs(tag = "Aug 2023",y = "Seroprevalence (%)")+
  theme_minimal()

sp1223 <- out %>% filter(month(col_date) == 12 & year(col_date) == 2023) %>% 
  group_by(age) %>% 
  summarise(
    sp = mean(seroprev_monotonic),
  ) %>% ggplot(aes(x = age,y = sp))+
  geom_line()+
  ylim(c(0,100))+
  labs(tag = "Dec 2023",y = "Seroprevalence (%)")+
  theme_minimal()

Code

plot_ly(out, x = ~sort(unique(as.Date(col_date))),
        y = ~sort(unique(age)),
        z = ~matrix(seroprev_monotonic, length(age_val)),
        showscale = F) %>%
  add_surface()%>%
  layout(scene = list(
    xaxis = list(title = "Collection date"),
    yaxis = list(title = "Age"),
    zaxis = list(title = "Seroprevalence",range = c(0,100))
  ))%>% layout(autosize = F, width = 700, height = 1000, margin = m)

Code

sp1222 | sp0423 | sp0823 | sp1223

7 Spatial distribution of serum sample

Code

# cleaned <- read_csv("D:/OUCRU/HCDC/project phân tích sero quận huyện/cleaned.csv")
# 
# sero <- rbind(t1222,t1223,t423,t823)
# 
# sero_add <- full_join(cleaned,sero, by =  c("id" = "id"))
# 
# data_pt <- sero_add %>% filter(!is.na(age)&!is.na(qhchuan)) %>% 
#   select(qhchuan,age,col_month,col_year,pos) %>% 
#   as.data.frame()
# 
# # HCMC map
# map_path <- "D:/OUCRU/HCDC/project phân tích sero quận huyện/"
# vn_qh <- st_read(dsn = file.path(map_path, "gadm41_VNM.gpkg"), layer = "ADM_ADM_2")
# 
# vn_qh1 <- vn_qh %>%
#   clean_names() %>%     ## cho thành chữ thường
#   filter(
#     str_detect(
#       name_1,
#       "Hồ Chí Minh"
#     )
#   )
# qhtp <- vn_qh1[-c(14,21),]
# 
# qhtp$geom[qhtp$varname_2 == "Thu Duc"] <- vn_qh1[c("21","24","14"),] %>%
#   st_union()
# 
# qhtp <- qhtp %>% st_cast("MULTIPOLYGON")
# library(stringi)
# qhtp$varname_2 <- stri_trans_general(qhtp$varname_2, "latin-ascii") %>%
#   tolower() %>%
#   str_remove("district") %>%
#   trimws(which = "both")
# 
# qhtp$nl_name_2 <- c("BC","BTân","BT","CG","CC","GV",
#                     "HM","NB","PN","1","10","11","12",
#                     "3","4","5","6","7","8","TB",
#                     "TP","TĐ")

Code

# ## function
# get_sp <- function(month,year){
#   data_pt %>% filter(col_month == month & col_year == year) %>% 
#     select(-age) %>% group_by(qhchuan) %>% count()  %>% 
#     as.data.frame() %>%
#     dplyr::mutate(pre = n / sum(n))  
# }
# 
# plot_cover <- function(data,map){
#   output <- left_join(map, data.frame(data), by = join_by(varname_2 == qhchuan))
#   output %>% ggplot() +
#     geom_sf(aes(fill = pre),show.legend = T)+
#     # scale_fill_continuous(low="yellow", high="red",
#     #                       guide="colorbar",na.value="white",
#     #                       name = "Percentage",
#     #                       limits = c(0,20))+
#     # geom_sf_text(aes(label = nl_name_2),size=2.5)+
#     scale_fill_gsea(na.value = "white",
#                     breaks = seq(0,0.2,by = 0.05),
#                     limits = c(0, 0.2),
#                     name = "Percentage",
#                     labels = scales::label_percent())+
#     theme_void()  
# }

Code

# sp1222 <- get_sp(12,2022)
# sp0423 <- get_sp(4,2023)
# sp0823 <- get_sp(8,2023)
# sp1223 <- get_sp(12,2023)
# 
# p1222 <- plot_cover(data = sp1222,map = qhtp)
# p423 <- plot_cover(data = sp0423,map = qhtp)
# p823 <- plot_cover(data = sp0823,map = qhtp)
# p1223 <- plot_cover(data = sp1223,map = qhtp)
# 
# serum_dis <- plot_grid(p1222,p423,p823,p1223,
#           # labels = c("Dec 2022","Apr 2023","Aug 2023","Dec 2023"),
#           ncol = 4)

8 Final visualization

8.1 Unconstrained fit

Code

# #| fig-width: 15
# #| fig-height: 8
# #| out-width: "100%"
# glmfit <- plot1222 | plot0423 | plot0823 | plot1223
# 
# serum_dis/
#   glmfit/
#   ts
# 
# ggsave("D:/OUCRU/hfmd/figure/EV71 present/uncon.svg",dpi = 500,width = 15,height = 8,bg = "white")
# ggsave("D:/OUCRU/hfmd/figure/EV71 present/uncon.png",dpi = 500,width = 15,height = 8,bg = "white")

9 new method

Code

### data
atdf <- rbind(t1222,t423,t823,t1223) %>%
  mutate(col_date = make_date(year = col_year,
                              month = col_month,
                              day = col_day))

hfmd <- atdf %>% 
as_tibble() |> 
  mutate(collection = id |>
           str_remove(".*-") |> 
           as.numeric() |> 
           divide_by(1e4) |> 
           round(),
         col_date2 = as.numeric(col_date),
         across(pos, ~ .x > 0))

hfmd

# A tibble: 300 × 12
   id     age_gr    age col_day col_month col_year neutralization pos   col_time
   <chr>  <chr>   <dbl> <chr>   <chr>     <chr>    <chr>          <lgl> <chr>   
 1 20FL-… 0≤ & … 0.370  28      12        2022     <NA>           FALSE Dec 2022
 2 20FL-… 0≤ & … 0.332  27      12        2022     <NA>           FALSE Dec 2022
 3 20FL-… 0≤ & … 0.0767 4       1         2023     <NA>           FALSE Dec 2022
 4 20FL-… 0≤ & … 0.438  27      12        2022     <NA>           FALSE Dec 2022
 5 20FL-… 0≤ & … 0.932  3       1         2023     <NA>           FALSE Dec 2022
 6 20FL-… 1≤ & … 1.96   26      12        2022     1:16           TRUE  Dec 2022
 7 20FL-… 1≤ & … 1.40   26      12        2022     <NA>           FALSE Dec 2022
 8 20FL-… 1≤ & … 1.07   27      12        2022     <NA>           FALSE Dec 2022
 9 20FL-… 1≤ & … 1.71   6       12        2022     <NA>           FALSE Dec 2022
10 20FL-… 1≤ & … 1.92   27      12        2022     <NA>           FALSE Dec 2022
# ℹ 290 more rows
# ℹ 3 more variables: col_date <date>, collection <dbl>, col_date2 <dbl>

9.0.1 Function

Code

## function to calculate age profile
age_profile <- function(data, age_values = seq(0, 15, le = 512), ci = .95) {
  model <- gam(pos ~ s(age), binomial, data)
  link_inv <- family(model)$linkinv
  df <- nrow(data) - length(coef(model))
  p <- (1 - ci) / 2
  model |> 
    predict(list(age = age_values), se.fit = TRUE) %>%
    c(list(age = age_values), .) |> 
    as_tibble() |> 
    mutate(lwr = link_inv(fit + qt(    p, df) * se.fit),
           upr = link_inv(fit + qt(1 - p, df) * se.fit),
           fit = link_inv(fit)) |> 
    select(- se.fit)
}

predict2 <- function(...) predict(..., type = "response") |> as.vector()

Code

## extract collection time from data
col_date_ex <- hfmd  %>% 
  group_by(collection,age_gr) %>% 
  summarise(min_col_date = min(col_date2),
            max_col_date = max(col_date2))

col_date_ex

# A tibble: 60 × 4
# Groups:   collection [4]
   collection age_gr    min_col_date max_col_date
        <dbl> <chr>            <dbl>        <dbl>
 1          6 0≤ & <1          19353        19361
 2          6 10≤ & <11        19340        19354
 3          6 11≤ & <12        19333        19354
 4          6 12≤ & <13        19340        19354
 5          6 13≤ & <14        19332        19353
 6          6 14≤ & <15        19333        19342
 7          6 1≤ & <2          19332        19353
 8          6 2≤ & <3          19352        19354
 9          6 3≤ & <4          19340        19353
10          6 4≤ & <5          19352        19353
# ℹ 50 more rows

Code

## generate age label
age_label <- col_date_ex$age_gr %>%
  unique() %>%
  tibble(age_group = .) %>%
  mutate(age_start = as.numeric(str_extract(age_group, "^\\d+"))) %>%
  arrange(age_start) %>%
  pull(age_group)

age_label

 [1] "0≤ & <1"   "1≤ & <2"   "2≤ & <3"   "3≤ & <4"   "4≤ & <5"   "5≤ & <6"  
 [7] "6≤ & <7"   "7≤ & <8"   "8≤ & <9"   "9≤ & <10"  "10≤ & <11" "11≤ & <12"
[13] "12≤ & <13" "13≤ & <14" "14≤ & <15"

Code

age_values = seq(0, 15, le = 512)
ci = .95
n = 100

## Use extracted date assign to simulated individual 
set.seed(123)
sim_constr_tim <- hfmd |> 
  group_by(collection) |> 
  group_modify(~ .x |>
                 age_profile(age_values, ci) |> 
                 mutate(across(c(fit, lwr, upr), ~ map(.x, ~ rbinom(n, 1, .x))))) |> 
  ungroup() %>% 
  mutate(age2 = cut(age,breaks = c(seq(0,14),15.1),right = FALSE,labels = age_label))  %>%
  left_join(col_date_ex, join_by(collection == collection, age2 == age_gr)) %>% 
  unnest(c(fit, lwr, upr)) %>% 
  rowwise() %>% 
  # group_by(collection,age2) %>% 
  mutate(sim_col_time = runif(1, min_col_date, max_col_date)) %>%
  ungroup()

sim_constr_tim %>% select(-c("min_col_date","max_col_date"))

# A tibble: 204,800 × 7
   collection   age   fit   lwr   upr age2    sim_col_time
        <dbl> <dbl> <int> <int> <int> <chr>          <dbl>
 1          6     0     0     0     0 0≤ & <1       19353.
 2          6     0     0     0     1 0≤ & <1       19354.
 3          6     0     0     0     1 0≤ & <1       19361.
 4          6     0     0     0     1 0≤ & <1       19360.
 5          6     0     1     0     0 0≤ & <1       19361.
 6          6     0     0     0     1 0≤ & <1       19357.
 7          6     0     0     0     0 0≤ & <1       19356.
 8          6     0     0     0     1 0≤ & <1       19355.
 9          6     0     0     0     0 0≤ & <1       19359.
10          6     0     0     0     0 0≤ & <1       19356.
# ℹ 204,790 more rows

Code

## group age by 3 years
sim_constr <- sim_constr_tim %>% 
  select(-c(min_col_date,max_col_date)) %>% 
  mutate(age_gr = cut(age,breaks = c(seq(0, 15, by = 3),82),
                      labels = c("0< & ≤3 year-old",
                                 "3< & ≤6 year-old",
                                 "6< & ≤9 year-old",
                                 "9< & ≤12 year-old",
                                 "12< & ≤15 year-old",
                                 "16+ "),right = F)) %>% 
  select(fit,sim_col_time,age_gr) %>% filter(age_gr != "16+ ")
sim_constr

# A tibble: 204,400 × 3
     fit sim_col_time age_gr          
   <int>        <dbl> <fct>           
 1     0       19353. 0< & ≤3 year-old
 2     0       19354. 0< & ≤3 year-old
 3     0       19361. 0< & ≤3 year-old
 4     0       19360. 0< & ≤3 year-old
 5     1       19361. 0< & ≤3 year-old
 6     0       19357. 0< & ≤3 year-old
 7     0       19356. 0< & ≤3 year-old
 8     0       19355. 0< & ≤3 year-old
 9     0       19359. 0< & ≤3 year-old
10     0       19356. 0< & ≤3 year-old
# ℹ 204,390 more rows

Code

## model unconstrained gam as function of time

date_value <- seq(as.Date("2023-01-01"),as.Date("2023-12-31"))|> as.numeric()

age_profile_time <- function(data, date_value, ci = .95) {
  model <- gam(fit ~ s(sim_col_time),binomial,data = data) 
  link_inv <- family(model)$linkinv
  df <- nrow(data) - length(coef(model))
  p <- (1 - ci) / 2
  model |> 
    predict(list(sim_col_time = date_value), se.fit = TRUE) %>%
    c(list(date = date_value), .) |> 
    as_tibble() |> 
    mutate(lwr = link_inv(fit + qt(    p, df) * se.fit),
           upr = link_inv(fit + qt(1 - p, df) * se.fit),
           fit = link_inv(fit)) |> 
    select(- se.fit)
}


sp_time_agegr <- sim_constr %>% 
  group_by(age_gr) %>%
  group_modify(~ . |>
                 age_profile_time(date_value = date_value, ci = .95) |> 
                 mutate(across(c(fit, lwr, upr), ~ map(.x, ~ rbinom(n, 1, .x)))))|> 
  ungroup() |> 
  unnest(c(fit, lwr, upr)) |>
  pivot_longer(c(fit, lwr, upr), names_to = "line", values_to = "seropositvty")|> 
  group_by(age_gr, line) |> 
  ### usingg constrained gam to predict
  group_modify(~ .x %>%
                 scam(seropositvty ~ s(date, bs = "mpi"), binomial, .) |> 
                 predict2(list(date = date_value)) %>%
                 tibble(date = date_value,
                        seroprevalence  = .)) |> 
  ungroup() |>
  pivot_wider(names_from = line,values_from = seroprevalence)

sp_time_agegr

# A tibble: 1,825 × 5
   age_gr            date    fit     lwr    upr
   <fct>            <dbl>  <dbl>   <dbl>  <dbl>
 1 0< & ≤3 year-old 19358 0.0155 0.00786 0.0243
 2 0< & ≤3 year-old 19359 0.0155 0.00786 0.0243
 3 0< & ≤3 year-old 19360 0.0155 0.00786 0.0243
 4 0< & ≤3 year-old 19361 0.0155 0.00786 0.0243
 5 0< & ≤3 year-old 19362 0.0155 0.00786 0.0243
 6 0< & ≤3 year-old 19363 0.0155 0.00787 0.0243
 7 0< & ≤3 year-old 19364 0.0155 0.00787 0.0243
 8 0< & ≤3 year-old 19365 0.0155 0.00787 0.0243
 9 0< & ≤3 year-old 19366 0.0156 0.00788 0.0244
10 0< & ≤3 year-old 19367 0.0156 0.00789 0.0244
# ℹ 1,815 more rows

Code

sp_time_agegr %>% 
  ggplot(aes(x = as.Date(date) , y = fit))+
  geom_line(aes(x = as.Date(date) , y = fit))+
  geom_ribbon(aes(ymin=lwr,ymax = upr),alpha = 0.2)+
  facet_wrap(~age_gr,ncol = 5)+
  ylim(0,1)