The code below uses raster-based modelled data from WorldPop on age structure to generate a data frame of age structure by administrative divisions (here provinces). The GitHub repository of this code is https://github.com/OUCRU-Modelling/aes. To transform this code into an R script, use:
knitr::purl("age_structure.Rmd", "age_structure.R")
Specifying the input and output folders:
root <- "~/OneDrive - Oxford University Clinical Research Unit/data/"
gadm_data_folder <- paste0(root, "GADM/")
wpop_data_folder <- paste0(root, "WorldPop/raw/age structure/Vietnam")
wpop_data_processed <- paste0(root, "WorldPop/processed/age structure/Vietnam/age_data.rds")
Checking whether the WorldPop processed data already exist locally:
data_available <- file.exists(wpop_data_processed)
Packages needed for the analysis:
needed_package <- c("dplyr", "tidyr", "purrr", "terra", "stringr", "sf", "parallel", "magrittr")
Installing the packages that are not already installed:
to_install <- needed_package[which(!needed_package %in% installed.packages()[, "Package"])]
if (length(to_install)) install.packages(to_install)
Loading the packages:
dev_null <- sapply(needed_package, function(libname) library(libname, character.only = TRUE))
rm(dev_null)
GADM root URL:
gadm <- "https://geodata.ucdavis.edu/gadm/"
Version and format:
vers_form <- "gadm3.6/Rsf/"
File:
file_name <- "gadm36_VNM_1_sf.rds"
Creating the local folder structure if it does not exist:
local_folder <- paste0(gadm_data_folder, vers_form)
if (!dir.exists(local_folder)) dir.create(local_folder, recursive = TRUE)
Downloading the data if not already available locally:
local_file <- paste0(local_folder, file_name)
if (!file.exists(local_file)) download.file(paste0(gadm, vers_form, file_name), local_file)
Loading the polygons data:
provinces <- local_file %>%
readRDS() %>%
st_set_crs(4326) # setting the modern code of the WGS 84 projection
## old-style crs object detected; please recreate object with a recent sf::st_crs()
Checking the map of the provinces polygons:
provinces %>%
st_geometry() %>%
plot()
WorldPop URL:
worldpop <- "https://data.worldpop.org/"
All the combinations of years, ages and gender:
ages <- c(0, 1, seq(5, 80, 5))
combinations <- expand_grid(years = 2000:2020,
ages = ages,
sexes = c("f", "m"))
Creating the long files names:
files_names <- with(combinations,
paste0("GIS/AgeSex_structures/Global_2000_2020/",
years, "/VNM/vnm_", sexes, "_", ages, "_", years, ".tif"))
The corresponding local files names:
local_files_names <- paste0(wpop_data_folder, str_remove(files_names, "^GIS.*VNM"))
Checking for those that do not exist locally yet:
sel <- which(!map_lgl(local_files_names, file.exists))
Downloading the files that are missing:
if (length(sel) & !data_available) {
walk2(paste0(worldpop, files_names[sel]), local_files_names[sel], download.file)
}
A function that converts the polygon of a given province of
provinces
into a SpatVector
object:
prov2vect <- function(prov_name) {
provinces %>%
filter(VARNAME_1 == prov_name) %>%
vect()
}
A function that crops and masks a SpatRaster
object
rst
with a SpatVector
object
pol
:
crop_mask <- function(rst, pol) {
rst %>%
crop(pol) %>%
mask(pol)
}
A function that extracts the number of people of a raster
rst
inside a polygon pol
:
nb_people <- function(rst, pol) {
crop_mask(rst, pol) %>%
values() %>%
sum(na.rm = TRUE)
}
A function that converts a TIFF file (i.e. number of people for a given year, age, and sex) into a data frame with number of people per province:
tif2df <- function(tif_file) {
tmp <- first(str_split(str_remove(tif_file, ".tif"), "_"))
tif_file %>%
paste0(wpop_data_folder, "/", .) %>%
rast() %>%
map_dbl(prov_vect_list, nb_people, rst = .) %>%
tibble(year = tmp[4], province = provinces_names, sex = tmp[2], age = tmp[3], n = .)
}
The names of the provinces:
provinces_names <- provinces$VARNAME_1
A list of provinces polygons as SpatVector
:
prov_vect_list <- map(provinces_names, prov2vect)
Processing all the files to generate the age data frame (it takes about 30’ on 11 cores of a 3.2 GHz 6-Core Intel i7):
if (data_available) {
age_data <- readRDS(wpop_data_processed)
} else {
hash <- setNames(c(paste0("[", head(ages, -1), ";", tail(ages, -1), "["), "80+"), ages)
age_data <- wpop_data_folder %>%
dir() %>%
mclapply(tif2df, mc.cores = detectCores() - 1) %>%
bind_rows() %>%
mutate(age_class = hash[age],
sex = c(f = "female", m = "male")[sex],
age = as.integer(age))
saveRDS(age_data, wpop_data_processed)
}
The data look like this:
age_data
## # A tibble: 47,628 × 6
## year province sex age n age_class
## <chr> <chr> <chr> <int> <dbl> <chr>
## 1 2000 An Giang female 0 15375. [0;1[
## 2 2000 Bac Lieu female 0 4971. [0;1[
## 3 2000 Bac Giang female 0 11004. [0;1[
## 4 2000 Bac Kan female 0 2012. [0;1[
## 5 2000 Bac Ninh female 0 7648. [0;1[
## 6 2000 Ben Tre female 0 8263. [0;1[
## 7 2000 Ba Ria - Vung Tau female 0 6205. [0;1[
## 8 2000 Binh Dinh female 0 10658. [0;1[
## 9 2000 Binh Duong female 0 5318. [0;1[
## 10 2000 Binh Phuoc female 0 5835. [0;1[
## # … with 47,618 more rows
Let’s look at the change in the whole country population size over the years:
pop_in_a_year <- function(x) {
age_data %>%
filter(year == x) %>%
pull(n) %>%
sum()
}
plot2 <- function(...) {
plot(..., xlab = "", type = "o", col = 4)
}
years <- 2000:2020
pop_sizes <- map_dbl(years, pop_in_a_year)
plot2(years, pop_sizes, ylab = "population size")
Looks OK. Now, let’s see proportions of different age classes over the years, for the whole country again:
tmp <- age_data %>%
group_by(year, age) %>%
summarise(n = sum(n)) %>%
mutate(p = n / sum(n),
p = cumsum(p)) %>%
ungroup()
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
with(tmp, plot(year, p, ylim = 0:1, type = "n", xlab = "",
ylab = "proportions (from younger to older)"))
tmp %>%
group_by(age) %>%
group_walk(~ with(.x, lines(year, p, col = 4)))
Looks OK too, in the sense that the proportions of the age classes are not constant with time, and also that the population is getting older and older.