The data are in this folder:
oned <- "~/Library/CloudStorage/OneDrive-OxfordUniversityClinicalResearchUnit/"
data_folder <- paste0(oned, "GitHub/OUCRU-Modelling/hfmd/")
Make sure that all the package listed below are installed. If a
package (e.g. tidyr
) is not installed, install it by typing
install.packages("tidyr")
at the command line.
library(readxl)
library(dplyr)
library(stringr)
library(purrr)
library(tidyr)
library(lubridate)
library(magrittr)
library(mgcv)
A function that reads the excel files:
read_excel2 <- function(x, ...) {
out <- readxl::read_excel(paste0(data_folder, x), skip = 1, ...)
out |>
names() |>
str_remove("^.*\r*\n\\(") |>
str_remove("\\)") |>
str_replace("...12", "month") |>
str_replace("...13", "year") %>%
setNames(out, .) |>
filter(! is.na(Participant_No))
}
A tuning of the polygon()
function:
polygon2 <- function(x, y1, y2, ...) {
polygon(c(x, rev(x)), c(y1, rev(y2)), ...)
}
The function that performs a likelihood ratio test:
lrt <- function(...) anova(..., test = "LRT")
A tuning of the points()
function:
points2 <- function(...) points(..., pch = "|", cex = .5)
A tuning of the mgcv::gam()
function:
gam2 <- function(formula, family = gaussian(), data = list(), ...){
out <- mgcv::gam(formula, family, data, ...)
out$data <- data
out
}
A function that extracts model quality metrics from a
gam
object:
quality <- function(x) {
tibble(deviance = deviance(x),
AIC = AIC(x),
GCV = x$gcv.ubre)
}
A function that adds a column (in first position) to a tibble to inform about the type of smoothing that was applied:
add_smooth_col <- function(x, s) {
bind_cols(tibble(smooth = s), x)
}
A function that extracts p values from an anova
object
(run on a gam
object):
p_values <- function(x) {
tibble(year = x$p.table[2, "Pr(>|z|)"],
"s(age)" = x$s.table[1, "p-value"])
}
sero <- data_folder |>
dir() |>
map_dfr(read_excel2) |>
mutate(across(Neutralization, ~ .x == "Yes")) |>
replace_na(list(Neutralization = FALSE)) |>
mutate(collection_date =
ymd(paste(year, month, `Day of taking blood sample`, sep = ":"))) |>
select(- `Day of taking blood sample`) |>
mutate(across(`Serum dilution`, ~ str_remove(.x, "1:") |> as.integer())) |>
rename(age = `Exact age calculated`, neutralization = Neutralization)
The data look like this:
sero
## # A tibble: 150 × 15
## Participant_No Province Hospital_name Site_code Collection_No Sample_ID
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 0001 HCMC BV Nhi đồng 1 001 06 20FL-001-060001
## 2 0002 HCMC BV Nhi đồng 1 001 06 20FL-001-060002
## 3 0003 HCMC BV Nhi đồng 1 001 06 20FL-001-060003
## 4 0004 HCMC BV Nhi đồng 1 001 06 20FL-001-060004
## 5 0005 HCMC BV Nhi đồng 1 001 06 20FL-001-060005
## 6 0006 HCMC BV Nhi đồng 1 001 06 20FL-001-060006
## 7 0007 HCMC BV Nhi đồng 1 001 06 20FL-001-060007
## 8 0008 HCMC BV Nhi đồng 1 001 06 20FL-001-060008
## 9 0009 HCMC BV Nhi đồng 1 001 06 20FL-001-060009
## 10 0010 HCMC BV Nhi đồng 1 001 06 20FL-001-060010
## # ℹ 140 more rows
## # ℹ 9 more variables: `Serum/plasma` <chr>, Age_group <chr>,
## # `Age labelled on tubes` <chr>, age <dbl>, month <chr>, year <chr>,
## # `Serum dilution` <int>, neutralization <lgl>, collection_date <date>
and the next 6 variables:
sero[-(1:7)]
## # A tibble: 150 × 8
## Age_group `Age labelled on tubes` age month year `Serum dilution`
## <chr> <chr> <dbl> <chr> <chr> <int>
## 1 0≤ & <1 0 0.370 12 2022 NA
## 2 0≤ & <1 0 0.332 12 2022 NA
## 3 0≤ & <1 0 0.0767 1 2023 NA
## 4 0≤ & <1 0 0.438 12 2022 NA
## 5 0≤ & <1 0 0.932 1 2023 NA
## 6 1≤ & <2 1 1.96 12 2022 16
## 7 1≤ & <2 1 1.40 12 2022 NA
## 8 1≤ & <2 1 1.07 12 2022 NA
## 9 1≤ & <2 1 1.71 12 2022 NA
## 10 1≤ & <2 1 1.92 12 2022 NA
## # ℹ 140 more rows
## # ℹ 2 more variables: neutralization <lgl>, collection_date <date>
and the last 2 variables:
sero[-(1:13)]
## # A tibble: 150 × 2
## neutralization collection_date
## <lgl> <date>
## 1 FALSE 2022-12-28
## 2 FALSE 2022-12-27
## 3 FALSE 2023-01-04
## 4 FALSE 2022-12-27
## 5 FALSE 2023-01-03
## 6 TRUE 2022-12-26
## 7 FALSE 2022-12-26
## 8 FALSE 2022-12-27
## 9 FALSE 2022-12-06
## 10 FALSE 2022-12-27
## # ℹ 140 more rows
A function that computes a model’s predictions:
predict2 <- function(x, ci = .95, le = 512, m = 100) {
p <- (1 - ci) / 2
link_inv <- x$family$linkinv
dataset <- x$data
n <- nrow(dataset) - length(x$coefficients)
age_range <- range(dataset$age)
ages <- seq(age_range[1], age_range[2], le = le)
x |>
predict(data.frame(age = ages), se.fit = TRUE) |>
extract(c("fit", "se.fit")) %>%
c(age = list(ages), .) |>
as_tibble() |>
mutate(lwr = m * link_inv(fit + qt( p, n) * se.fit),
upr = m * link_inv(fit + qt(1 - p, n) * se.fit),
fit = m * link_inv(fit)) |>
select(- se.fit)
}
A function that plots a model’s predictions:
plot_predictions <- function(x, add = FALSE, col = 4, alpha = .2, lwd = 2,
m = 100) {
with(x, {
if (! add) {
plot(NA, xlim = c(0, max(age)), ylim = c(0, m),
xlab = "age (year)", ylab = "seroprevalence (%)")
}
polygon2(age, lwr, upr, border = NA, col = adjustcolor(col, alpha))
lines(age, fit, col = col, lwd = lwd)
})
}
A function that generates a polynomial formula:
make_formula <- function(degree) {
2:degree |>
map_chr(~ paste("I(age ^", .x, ")")) |>
paste(collapse = " + ") %>%
paste("neutralization ~ age +", .) |>
as.formula()
}
A function that helps looking for the optimal degree of the polynomial:
test_degrees <- function(yr, degree) {
degree |>
make_formula() |>
glm(binomial, filter(sero, year == as.character(yr))) |>
lrt()
}
Looking for optimal degrees for the 2 years:
test_degrees(2022, 5)
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: neutralization
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 70 92.122
## age 1 13.1564 69 78.965 0.0002865 ***
## I(age^2) 1 0.7393 68 78.226 0.3898802
## I(age^3) 1 0.0001 67 78.226 0.9930642
## I(age^4) 1 0.4715 66 77.754 0.4922871
## I(age^5) 1 1.1410 65 76.613 0.2854348
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
test_degrees(2023, 5)
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: neutralization
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 78 108.490
## age 1 14.4660 77 94.024 0.0001427 ***
## I(age^2) 1 7.6539 76 86.370 0.0056650 **
## I(age^3) 1 1.1367 75 85.233 0.2863595
## I(age^4) 1 2.0231 74 83.210 0.1549192
## I(age^5) 1 0.0343 73 83.176 0.8531066
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Looking for year effects:
lrt(glm(neutralization ~ age * year + I(age ^2) * year, binomial, sero))
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: neutralization
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 149 201.90
## age 1 26.9284 148 174.97 2.111e-07 ***
## year 1 1.9556 147 173.02 0.161989
## I(age^2) 1 6.9248 146 166.09 0.008501 **
## age:year 1 0.2712 145 165.82 0.602554
## year:I(age^2) 1 1.2277 144 164.60 0.267857
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Best final model is then:
m <- 100
eps <- 1
glm(neutralization ~ age, binomial, filter(sero, year == "2022")) |>
predict2() |>
plot_predictions()
glm(neutralization ~ age + I(age ^2), binomial, filter(sero, year == "2023")) |>
predict2() |>
plot_predictions(add = TRUE, col = 2)
sero |>
filter(year == "2022") |>
with(points2(age, m * neutralization + eps, col = 4))
sero |>
filter(year == "2023") |>
with(points2(age, m * neutralization - eps, col = 2))
legend("left", legend = c("Dec 2022", "Apr 2023"), lty = 1, lwd = 2,
col = c(4, 2), bty = "n")
The list of smoothing options:
smooths <- c("tp", "ds", "cr", "cs", "cc", "bs", "ps", "cp", "re", "gp", "ad",
"sz", "fs")
Looking at year and smoothed age effects for all the different types of smoothing available:
smooths |>
map_dfr(~ p_values(anova(gam(neutralization ~ s(age, bs = .x) + year,
binomial, sero)))) |>
add_smooth_col(smooths)
## # A tibble: 13 × 3
## smooth year `s(age)`
## <chr> <dbl> <dbl>
## 1 tp 0.168 0.0000311
## 2 ds 0.168 0.0000322
## 3 cr 0.169 0.0000310
## 4 cs 0.171 0.00000231
## 5 cc 0.212 0.00000635
## 6 bs 0.168 0.0000304
## 7 ps 0.168 0.0000281
## 8 cp 0.156 0.000144
## 9 re 0.169 0.000000898
## 10 gp 0.170 0.0000297
## 11 ad 0.159 0.000610
## 12 sz 0.168 0.0000311
## 13 fs 0.168 0.0000311
Looking at the model qualities of the different smootings options:
smooths |>
map_dfr(~ quality(gam(neutralization ~ s(age, bs = .x) + year,
binomial, sero))) |>
add_smooth_col(smooths)
## # A tibble: 13 × 4
## smooth deviance AIC GCV
## <chr> <dbl> <dbl> <dbl>
## 1 tp 166. 174. 0.163
## 2 ds 166. 174. 0.162
## 3 cr 166. 174. 0.162
## 4 cs 166. 174. 0.163
## 5 cc 165. 178. 0.186
## 6 bs 166. 174. 0.163
## 7 ps 166. 174. 0.162
## 8 cp 153. 173. 0.155
## 9 re 173. 179. 0.193
## 10 gp 166. 175. 0.163
## 11 ad 152. 174. 0.158
## 12 sz 166. 174. 0.163
## 13 fs 166. 174. 0.163
Models qualities for the 2 years separately:
smooths |>
map_dfr(~ quality(gam(neutralization ~ s(age, bs = .x), binomial,
filter(sero, year == "2022")))) |>
add_smooth_col(smooths)
## Warning in newton(lsp = lsp, X = G$X, y = G$y, Eb = G$Eb, UrS = G$UrS, L = G$L,
## : Iteration limit reached without full convergence - check carefully
## # A tibble: 13 × 4
## smooth deviance AIC GCV
## <chr> <dbl> <dbl> <dbl>
## 1 tp 79.0 83.0 0.169
## 2 ds 79.0 83.0 0.169
## 3 cr 79.0 83.0 0.169
## 4 cs 79.0 83.2 0.171
## 5 cc 81.7 88.7 0.249
## 6 bs 79.0 83.0 0.169
## 7 ps 79.0 83.0 0.169
## 8 cp 80.6 88.5 0.246
## 9 re 79.0 82.9 0.167
## 10 gp 79.0 83.0 0.169
## 11 ad 9.08 60.1 -0.153
## 12 sz 79.0 83.0 0.169
## 13 fs 79.0 83.0 0.169
smooths |>
map_dfr(~ quality(gam(neutralization ~ s(age, bs = .x), binomial,
filter(sero, year == "2023")))) |>
add_smooth_col(smooths)
## # A tibble: 13 × 4
## smooth deviance AIC GCV
## <chr> <dbl> <dbl> <dbl>
## 1 tp 85.1 91.9 0.163
## 2 ds 85.1 91.9 0.163
## 3 cr 85.1 91.9 0.163
## 4 cs 85.1 91.9 0.163
## 5 cc 85.4 92.4 0.170
## 6 bs 85.2 91.9 0.163
## 7 ps 85.2 91.8 0.163
## 8 cp 85.7 92.4 0.169
## 9 re 94.1 97.9 0.240
## 10 gp 84.9 91.8 0.162
## 11 ad 83.1 90.9 0.151
## 12 sz 85.1 91.9 0.163
## 13 fs 85.1 91.9 0.163
The figure with a given smoothing:
gam_figure <- function(s = "bs") {
m <- 100
eps <- 1
gam2(neutralization ~ s(age, bs = s), binomial,
filter(sero, year == "2022")) |>
predict2() |>
plot_predictions()
gam2(neutralization ~ s(age, bs = s), binomial,
filter(sero, year == "2023")) |>
predict2() |>
plot_predictions(add = TRUE, col = 2)
sero |>
filter(year == "2022") |>
with(points2(age, m * neutralization + eps, col = 4))
sero |>
filter(year == "2023") |>
with(points2(age, m * neutralization - eps, col = 2))
legend("left", legend = c("Dec 2022", "Apr 2023"), lty = 1, lwd = 2,
col = c(4, 2), bty = "n")
}
With B-spline:
gam_figure(s = "bs")
sero |>
group_by(collection_date) |>
tally() |>
plot(type = "h", xlab = "date of sample collection",
ylab = "number of samples", col = 4)
sero |>
select(age, neutralization, collection_date)
## # A tibble: 150 × 3
## age neutralization collection_date
## <dbl> <lgl> <date>
## 1 0.370 FALSE 2022-12-28
## 2 0.332 FALSE 2022-12-27
## 3 0.0767 FALSE 2023-01-04
## 4 0.438 FALSE 2022-12-27
## 5 0.932 FALSE 2023-01-03
## 6 1.96 TRUE 2022-12-26
## 7 1.40 FALSE 2022-12-26
## 8 1.07 FALSE 2022-12-27
## 9 1.71 FALSE 2022-12-06
## 10 1.92 FALSE 2022-12-27
## # ℹ 140 more rows
mod1 <- glm(neutralization ~ age * collection_date +
I(age ^ 2) * collection_date, binomial,
mutate(sero, across(collection_date, as.numeric)))
mod2 <- update(mod1, . ~ . + age * I(collection_date ^ 2))
mod3 <- update(mod1, . ~ . + I(age ^2) * I(collection_date ^ 2))
mod4 <- update(mod2, . ~ . + I(age ^2) * I(collection_date ^ 2))
anova(mod1, mod2, mod4)
## Analysis of Deviance Table
##
## Model 1: neutralization ~ age * collection_date + I(age^2) * collection_date
## Model 2: neutralization ~ age + collection_date + I(age^2) + I(collection_date^2) +
## age:collection_date + collection_date:I(age^2) + age:I(collection_date^2)
## Model 3: neutralization ~ age + collection_date + I(age^2) + I(collection_date^2) +
## age:collection_date + collection_date:I(age^2) + age:I(collection_date^2) +
## I(age^2):I(collection_date^2)
## Resid. Df Resid. Dev Df Deviance
## 1 144 166.06
## 2 142 165.57 2 0.48713
## 3 141 165.52 1 0.05248
anova(mod1, mod3, mod4)
## Analysis of Deviance Table
##
## Model 1: neutralization ~ age * collection_date + I(age^2) * collection_date
## Model 2: neutralization ~ age + collection_date + I(age^2) + I(collection_date^2) +
## age:collection_date + collection_date:I(age^2) + I(age^2):I(collection_date^2)
## Model 3: neutralization ~ age + collection_date + I(age^2) + I(collection_date^2) +
## age:collection_date + collection_date:I(age^2) + age:I(collection_date^2) +
## I(age^2):I(collection_date^2)
## Resid. Df Resid. Dev Df Deviance
## 1 144 166.06
## 2 142 165.67 2 0.39438
## 3 141 165.52 1 0.14523
x <- mod1
dataset <- x$data
age_val <- c(.1, 1:14)
collection_date_val <- seq(min(dataset$collection_date),
max(dataset$collection_date))
new_data <- expand.grid(age = age_val, collection_date = collection_date_val)
prdcts <- cbind(new_data, fit = 100 * predict(x, new_data, "response")) |>
as_tibble() |>
arrange(collection_date) |>
mutate(across(collection_date, as_date))
prdcts |>
filter(age == .1) |>
with(plot(collection_date, fit, type = "l", lwd = 2, col = 4, ylim = c(0, 100),
xlab = "date of sample collection", ylab = "seroprevalence (%)"))
walk(1:14, ~ prdcts |>
filter(age == .x) |>
with(lines(collection_date, fit, lwd = 2, col = 4)))
abline(h = 10 * 0:10, col = "lightgrey")
A 3-D version of it:
x <- mod1
dataset <- x$data
age_val <- c(.1, 1:14)
collection_date_val <- seq(min(dataset$collection_date),
max(dataset$collection_date), le = 15)
new_data <- expand.grid(age = age_val, collection_date = collection_date_val)
prdcts <- cbind(new_data, fit = 100 * predict(x, new_data, "response")) |>
as_tibble() |>
arrange(collection_date) |>
mutate(across(collection_date, as_date))
with(prdcts,
persp(sort(unique(age)), sort(unique(collection_date)), matrix(fit, 15),
xlab = "age (years)", ylab = "time", zlab = "seroprevalence (%)",
zlim = c(0, 100), theta = 0, phi = 1 * 15, r = sqrt(3), d = 1,
ticktype = "detailed", border = 4, lwd = 2, axes = FALSE))
with(prdcts,
persp(sort(unique(age)), sort(unique(collection_date)), matrix(fit, 15),
xlab = "age (years)", ylab = "time", zlab = "seroprevalence (%)",
zlim = c(0, 100), theta = 0, phi = 1 * 15, r = sqrt(3), d = 1,
ticktype = "detailed", border = 4, lwd = 2, axes = TRUE))
sero |>
group_by(year, Age_group) |>
summarise(n = n(), mean_age = mean(age)) |>
arrange(year, mean_age) |>
select(- mean_age) |>
pivot_wider(names_from = year, values_from = n) |>
mutate(Age_group = paste0("[", str_replace(Age_group, "≤ & <", ", "), "["))
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
## # A tibble: 15 × 3
## Age_group `2022` `2023`
## <chr> <int> <int>
## 1 [0, 1[ 3 7
## 2 [1, 2[ 5 5
## 3 [2, 3[ 5 5
## 4 [3, 4[ 5 5
## 5 [4, 5[ 5 5
## 6 [5, 6[ 5 5
## 7 [6, 7[ 5 5
## 8 [7, 8[ 5 5
## 9 [8, 9[ 3 7
## 10 [9, 10[ 5 5
## 11 [10, 11[ 5 5
## 12 [11, 12[ 5 5
## 13 [12, 13[ 5 5
## 14 [13, 14[ 5 5
## 15 [14, 15[ 5 5
tmp <- sero |>
filter(neutralization) |>
select(age, year, collection_date, `Serum dilution`) |>
rename(dilution = `Serum dilution`) |>
filter(! is.na(dilution))
tmp |>
group_by(dilution) |>
tally() |>
mutate(dilution = log2(dilution)) |>
with(barplot(n, col = 4))
tmp2 <- tmp |>
mutate(dilution = log2(dilution))
with(tmp2, plot(age, dilution))
lrt(glm(dilution ~ age + I(age^2) + I(age^3) + I(age^4), gaussian, tmp2))
## Analysis of Deviance Table
##
## Model: gaussian, link: identity
##
## Response: dilution
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 57 174.62
## age 1 14.4096 56 160.21 0.012526 *
## I(age^2) 1 9.6905 55 150.52 0.040594 *
## I(age^3) 1 11.1150 54 139.41 0.028307 *
## I(age^4) 1 16.9144 53 122.49 0.006825 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrt(glm(dilution ~ age*as.factor(year) + I(age^2)*as.factor(year), gaussian, tmp2))
## Analysis of Deviance Table
##
## Model: gaussian, link: identity
##
## Response: dilution
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 57 174.62
## age 1 14.4096 56 160.21 0.01261 *
## as.factor(year) 1 21.7296 55 138.48 0.00219 **
## I(age^2) 1 7.0060 54 131.47 0.08197 .
## age:as.factor(year) 1 8.3773 53 123.10 0.05718 .
## as.factor(year):I(age^2) 1 2.6769 52 120.42 0.28231
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
model <- glm(dilution ~ age + as.factor(year), gaussian, tmp2)
lrt(model)
## Analysis of Deviance Table
##
## Model: gaussian, link: identity
##
## Response: dilution
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 57 174.62
## age 1 14.41 56 160.21 0.016744 *
## as.factor(year) 1 21.73 55 138.48 0.003306 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ages <- seq(min(tmp2$age), max(tmp2$age), le = 512)
new_data <- data.frame(age = ages, year = 2022:2023)
prdcts <- predict(model, new_data, se.fit = TRUE)
prdcts <- as_tibble(prdcts[-3]) |>
mutate(lwr = fit + qt(.025, nrow(tmp2) - 1) * se.fit,
upr = fit + qt(.975, nrow(tmp2) - 1) * se.fit)
prdcts <- cbind(new_data, prdcts) |>
as_tibble()
plot(NA, xlim = c(0, 15), ylim = c(0, 10), xlab = "age (years)", ylab = "dilution")
tmp2 |>
filter(year == 2022) |>
with(points(age, dilution, col = 4))
tmp2 |>
filter(year == 2023) |>
with(points(age, dilution, col = 2))
prdcts |>
filter(year == 2022) |>
with({
polygon2(age, lwr, upr, col = adjustcolor(4, .2), border = NA)
lines(age, fit, col = 4, lwd = 2)
})
prdcts |>
filter(year == 2023) |>
with({
polygon2(age, lwr, upr, col = adjustcolor(2, .2), border = NA)
lines(age, fit, col = 2, lwd = 2)
})
legend("topright", legend = c("Dec 2022", "Apr 2023"), lty = 1, lwd = 2,
col = c(4, 2), bty = "n")