4  Cleaning data

4.1 Constants

user <- Sys.getenv("USER")

if (user == "nguyenpnt") {
  path2data <- paste0(
    "/Users/nguyenpnt/Library/CloudStorage/OneDrive-OxfordUniversityClinicalResearchUnit/OUCRU Modelling - HFMD HCMC 2023/data/"
  )
} else {
  path2data <- paste0(
    "D:/OUCRU-Onedrive/OneDrive - Oxford University Clinical Resear",
    "ch Unit/Data/HFMD/cleaned/"
  )
}

path_raw   <- file.path(path2data, "raw/")
path_clean <- file.path(path2data, "cleaned/")

Raw files name:

## cases data
cases_file <- "TCM_full.xlsx"

## sero
apr_2023 <- "4_2023.xlsx"
aug_2023 <- "08_2023.xlsx"
dec_2022 <- "12_2022.xls"
dec_2023 <- "12_2023.xlsx"

## sero address
sero_add <- "cleaned_address_sero.csv"

## virology data 
viro <- "03EI Data 2023 shared.xlsx"

## CH1 outpatient data

outpatient_ch1 <- "CH1_outpatient.xlsx"

## HCM new address

hcm_address <- "AP Transform Loc.xlsx"

4.2 Packages

Required packages:

required <- c("readxl", "magrittr", "lubridate", "janitor", "tidyverse","stringi")

Installing those that are not installed yet:

to_install <- required[!required %in% installed.packages()[, "Package"]]
if (length(to_install))
  install.packages(to_install)

Loading the packages for interactive use:

invisible(sapply(required, library, character.only = TRUE))

4.3 Functions

4.4 Data import

HCDC data

df_cases <- paste0(path_raw, cases_file) |>
  read_excel(
    col_types = c(
      "date",
      "numeric",
      "text",
      "text",
      "text",
      "date",
      "date",
      "date",
      "text",
      "text",
      "text"
    )
  )

CH1 serology data

df_apr2023 <- paste0(path_raw, apr_2023) |> read_excel()
df_aug2023 <- paste0(path_raw, aug_2023) |> read_excel()
df_dec2022 <- paste0(path_raw, dec_2022) |> read_excel()
df_dec2023 <- paste0(path_raw, dec_2023) |> read_excel()

df_sero_add <- paste0(path_raw, sero_add) |> read_csv()

CH1 virology data

viro_df <- paste0(path_raw, viro) |> read_excel()

CH1 outpatient data

CH1_outpatient_010623 <- paste0(path_raw, outpatient_ch1) |> read_excel(sheet = "0106 2023")

CH1_outpatient_071223 <- paste0(path_raw, outpatient_ch1) |> read_excel(sheet = "0712 2023")

px_qh_change <- paste0(path_raw, hcm_address) |> read_excel(sheet = "Mapping")

4.5 HCDC data

Columns translation

eng_cols <- c(
  dob = "Ngày tháng năm sinh",
  age = "Tuổi",
  gender = "Giới tính",
  commune = "Phường/xã cư trú",
  district = "Quận/huyện cư trú",
  reported_date = "Ngày báo cáo",
  onset_date = "Ngày khởi phát triệu chứng",
  admission_date = "Ngày đi khám hoặc ngày nhập viện",
  medi_cen = "Tên bệnh viện (cơ sở y tế) đi khám",
  inout = "Tình trạng hiện nay",
  severity = "Phân độ lâm sàng"
)

df_cases %<>%
  rename(!!!eng_cols)  

Remove duplication

df_cases %<>% distinct()

Convert date columns as same type and adding some variables

## change date columns from <dttm> to <date>
df_cases %<>%
  mutate(
    across(c(
      dob, reported_date, onset_date, admission_date
    ), .fns = as.Date),
    year_dob = year(dob),
    year_reported = year(reported_date),
    year_onset = year(onset_date),
    year_admission = year(admission_date)
  ) 

Remove out of age range observation

df_cases %<>%
  filter(!age %in% c(-2951, 117) |
           is.na(dob)) %>%  ## excluded cases related to dob error
  mutate(
    ## change information
    onset_date = ifelse(
      year_dob > 2024 |
        year_dob < 1924 | year_onset > 2024 | year_onset < 2013,
      update(onset_date, year = year(admission_date)),
      onset_date
    ),
    onset_date = as.Date(onset_date)
  ) 

4.5.0.1 Cleaning cases were entered wrong year

Wrong reported data

df_cases %<>%
  mutate(
    delay = reported_date - admission_date,
    reported_date2 = update(reported_date, year = year(admission_date)),
    delay2 = reported_date2 - admission_date,
    check = year(admission_date) == year(onset_date)
  )  %>%
  mutate(
    reported_date = case_when(
      ## those case match criteria will be changed
      delay > 365 &
        delay2 <= 14 &
        delay2 > 0 & check %in% c("TRUE", NA) ~ reported_date2,
      .default = reported_date
    )
  ) %>%
  select(-c(delay, reported_date2, delay2, check)) 

Wrong admission date

df_cases %<>%
  mutate(
    check = year(reported_date) == year(onset_date),
    check2 = year(admission_date) != year(onset_date)
  ) %>%
  mutate(admission_date = case_when(
    check == TRUE &
      check2 == TRUE ~ update(admission_date, year = year(onset_date)),
    .default = admission_date
  )) %>%
  select(-c(check, check2))

Wrong onset date

df_cases %<>%
  mutate(
    check = year(reported_date) == year(admission_date),
    check2 = year(admission_date) != year(onset_date),
    delay = reported_date - onset_date,
    onset_date2 = update(onset_date, year = year(admission_date)),
    delay2 = reported_date - onset_date2
  ) %>%
  mutate(
    onset_date = case_when(
      check == TRUE &
        check2 == TRUE &
        delay > 365 & delay2 > 0 & delay2 <= 14 & delay2 > 0 ~ onset_date2,
      .default = onset_date
    ),
  ) %>%
  select(-c(check, check2, delay, onset_date2, delay2))

4.5.0.2 Cleaning cases enter wrong with day and month swapped

Onset date

df_cases %<>%
  mutate(
    delay_adm = admission_date - onset_date,
    onset_date2 = ydm(onset_date),
    delay_adm2 = admission_date - onset_date2,
    check = onset_date > reported_date &
      day(onset_date) == month(reported_date)
  ) %>%
  mutate(onset_date = case_when(check == TRUE &
                                  delay_adm2 >= 0 ~ onset_date2, .default = onset_date),) %>%
  select(-c(delay_adm, onset_date2, delay_adm2, check)) 

Admission date

df_cases %<>%
  mutate(
    delay = reported_date - admission_date,
    admission_date2 = ydm(admission_date),
    delay2 = reported_date - admission_date2,
    check = admission_date > reported_date &
      day(admission_date) == month(reported_date)
  ) %>%
  mutate(admission_date = case_when(check == TRUE &
                                      delay2 >= 0 ~ admission_date2, .default = admission_date)) %>%
  select(-c(delay, admission_date2, delay2, check)) 

Reported date

df_cases %<>%
  mutate(
    delay = reported_date - admission_date,
    reported_date2 = ydm(reported_date),
    delay2 = reported_date2 - admission_date,
    check = month(admission_date) == day(reported_date) &
      delay > 30
  ) %>%
  mutate(reported_date = case_when(check == TRUE &
                                     delay2 < 30 ~ reported_date2, .default = reported_date)) %>%
  select(-c(delay, reported_date2, delay2, check))

String variable

# Define lookup vectors once, outside the mutate
severity_map <- c(
  "2a" = "2A",
  "2A" = "2A",
  "2" = "2A",
  "a" = "2A",
  "A" = "2A",
  "-2" = "2A",
  "1"  = "1",
  "11" = "1",
  "1a" = "1",
  "2b" = "2B",
  "2B" = "2B",
  "2b nhom2" = "2B",
  "B" = "2B",
  "b" = "2B"
)

inout_map <- c(
  "Điều trị nội trú"  = "Inpatient",
  "Điều trị ngoại trú" = "Outpatient",
  "Ra viện"           = "Discharge",
  "Chuyển viện"       = "Hospital transfer",
  "Tình trạng khác"   = "Other",
  "Tử vong"           = "Died"
)

# Reusable function for district/commune normalization
clean_admin <- function(x, remove_pattern) {
  x |>
    stri_trans_general("latin-ascii") |>
    tolower() |>
    str_remove_all(remove_pattern) |>
    trimws()
}

df_cases %<>%
  mutate(
    severity = coalesce(severity_map[severity], severity),
    district = clean_admin(district, "quan|huyen|thanh pho"),
    commune  = clean_admin(commune, "phuong|xa|thi tran"),
    inout    = inout_map[inout],
    gender   = if_else(tolower(
      stri_trans_general(gender, "latin-ascii")
    ) == "nu", "Female", "Male")
  )

Save

# saveRDS(df_cases,paste0(path_clean,"hfmd_hcdc.rds"))

4.6 CH1 Serology data

t423 <- data.frame(df_apr2023[-c(1, 2), c(6, 8, 10:14)])
t423$pos <- replace(t423$...14, is.na(t423$...14), 0) %>%
  str_detect(regex(paste(2^(4:10), collapse = "|"))) %>%
  as.integer(as.logical())
colnames(t423) <- c("id",
                    "age_gr",
                    "age",
                    "col_day",
                    "col_month",
                    "col_year",
                    "neutralization",
                    "pos")
t423$age <- as.numeric(t423$age)
t423$col_time <- rep("Apr 2023", nrow(t423))

t823 <- data.frame(df_aug2023[-c(1, 2), c(6, 8, 9, 14:17)])
t823$pos <- str_detect(t823$...17, regex(paste(2^(4:10), collapse = "|"))) %>%
  as.integer(as.logical())
colnames(t823) <- c("id",
                    "age_gr",
                    "age",
                    "col_day",
                    "col_month",
                    "col_year",
                    "neutralization",
                    "pos")
t823$age <- as.numeric(t823$age)
t823$col_time <- rep("Aug 2023", nrow(t823))

t1222 <- data.frame(df_dec2022[-c(1, 2), c(6, 8, 10:14)])
t1222$pos <- replace(t1222$...14, is.na(t1222$...14), 0) %>%
  str_detect(regex(paste(2^(4:10), collapse = "|"))) %>%
  as.integer(as.logical())
colnames(t1222) <- c("id",
                     "age_gr",
                     "age",
                     "col_day",
                     "col_month",
                     "col_year",
                     "neutralization",
                     "pos")
t1222$age <- as.numeric(t1222$age)
t1222$col_time <- rep("Dec 2022", nrow(t1222))


t1223 <- data.frame(df_dec2023[-c(1, 2), c(6, 8, 9, 14:17)])
t1223$pos <- replace(t1223$...17, is.na(t1223$...17), 0) %>%
  str_detect(regex(paste(2^(4:10), collapse = "|"))) %>%
  as.integer(as.logical())
colnames(t1223) <- c("id",
                     "age_gr",
                     "age",
                     "col_day",
                     "col_month",
                     "col_year",
                     "neutralization",
                     "pos")
t1223$age <- as.numeric(t1223$age)
t1223$col_time <- rep("Dec 2023", nrow(t1223))

sero <- rbind(t1222, t1223, t423, t823)

data_pt <- full_join(df_sero_add, sero, by =  c("id" = "id"))

data_pt %<>% filter(!is.na(age) & !is.na(qhchuan)) %>%
  select(-c(add_mod)) %>%
  as.data.frame() %>%
  rename(district = qhchuan, commune = pxchuan) %>%
  mutate(col_date = make_datetime(
    year  = as.integer(col_year),
    month = as.integer(col_month),
    day   = as.integer(col_day)
  ))

Save

# saveRDS(data_pt,paste0(path_clean,"sero_EV-A71_1222-1223.rds"))

4.7 CH1 Virology data

viro_df %<>%
  mutate(
    city = City %>%
      trimws() %>%
      stri_trans_general("latin-ascii") %>%
      tolower(),
    sero_gr = if_else(SeroGroup1 == "ENT", "EV", SeroGroup1),
    admission_date = as.Date(DateAdmission),
    age_adm    = interval(DateBirth, admission_date) / years(1),
    adm_month  = month(admission_date, label = TRUE),
    adm_month2 = as.character(as.Date(floor_date(
      admission_date, "month"
    ))),
    age_bin = cut(
      age_adm,
      breaks = seq(0, max(age_adm, na.rm = TRUE) + 0.5, by = 0.5),
      right = FALSE
    ),
    across(
      Dist,
      ~ .x %>% stri_trans_general("latin-ascii") %>% tolower() %>% str_remove("^q")
    ),
    across(
      Ward,
      ~ .x %>% stri_trans_general("latin-ascii") %>% tolower() %>% str_remove("^p")
    )
  ) %>% rename(district = Dist, commune = Ward) %>%
  filter(city == "tp hcm") %>%
  select(
    city,
    sero_gr,
    admission_date,
    adm_month,
    adm_month2,
    age_adm,
    age_bin,
    DateBirth,
    district,
    commune
  )

Save

# saveRDS(data_pt,paste0(path_clean,"virology_cleaned.rds"))

4.8 CH1 outpatient data

px_qh_change %<>%
  filter(tinhThanh_cu == "TP.Hồ Chí Minh") %>%
  mutate(
    district = quanHuyen_cu %>%
      stri_trans_general("latin-ascii") %>%
      tolower() %>% trimws(which = "both"),
    commune = tenXa_moi %>%
      stri_trans_general("latin-ascii") %>%
      tolower() %>% trimws(which = "both")
  )

new_dup_dis <- px_qh_change %>%
  group_by(commune) %>%
  summarise(n_district = n_distinct(district),
            districts = paste(unique(district), collapse = ", ")) %>%
  filter(n_district > 1) %>%
  pull(commune)

px_qh_pair <- px_qh_change %>%
  filter(!commune %in% new_dup_dis) %>%
  dplyr::select(district,commune) %>% unique()

CH1_outpatient_hcm_23 <- rbind(CH1_outpatient_010623,CH1_outpatient_071223) %>%
  mutate(dob = as.Date(ngaysinh),
         adm = as.Date(ngaykham)) %>%
  mutate(city = str_split_i(string = diaphuong,
                          pattern = ",",i=-1),
         district = str_split_i(string = diaphuong,
                                pattern = ",",i=-2) %>% trimws(which = "both"),
         district2 = district %>%
           stri_trans_general("latin-ascii") %>%
           tolower() %>% trimws(which = "both"),
         commune = str_split_i(string = diaphuong,
                               pattern = ",",i=-3) %>% trimws(which = "both"),
         commune2 = commune %>%
           stri_trans_general("latin-ascii") %>%
           tolower() %>% trimws(which = "both"),) %>%
  filter(city %in% c(" TP.Hồ Chí Minh"," Thành phố Hồ Chí Minh"),
         !grepl('a|A|B|b', MaICDChinh)) %>%
  distinct(mahoso,dob,PHAI, .keep_all = TRUE)

CH1_outpatient_hcm_23a <- CH1_outpatient_hcm_23 %>%
  filter(district != "Không xác định" &
           !commune2 %in% new_dup_dis &
           district == "") %>%
  mutate(commune2 = commune %>%
           stri_trans_general("latin-ascii") %>%
           tolower() %>% trimws(which = "both")) %>%
  left_join(.,px_qh_pair,by = join_by(commune2 == commune)) %>%
  na.omit(district.y) %>%
  dplyr::select(dob,adm,district.y,commune2) %>%
  set_colnames(c("dob","adm","district","commune"))


CH1_outpatient_hcm_23b <- CH1_outpatient_hcm_23 %>%
  filter(!district %in% c("Không xác định","")) %>%
  dplyr::select(dob,adm,district2,commune2) %>%
  set_colnames(c("dob","adm","district","commune"))

CH1_out_cleaned <- rbind(CH1_outpatient_hcm_23a,CH1_outpatient_hcm_23b) %>%
  mutate(
    district = district %>%
      str_remove_all("quan|huyen|thanh pho") %>%
      trimws(which = "both"),
    district = case_when(
      district %in% c(2,9) ~ "thu duc",
      !district %in% c(2,9) ~ district),
    commune = commune %>%
      str_remove_all("phuong|xa") %>%
      trimws(which = "both"),
    age = interval(dob,adm) / years(1),
  ) |>
  filter(age < 17 & age >= 0) %>%
  mutate(
    age_gr = cut(age,
                 breaks = seq(0,17),
                 right = FALSE,
                 labels = seq(0,16)) %>% as.character() %>% as.numeric(),
    period = case_when(
      adm <= as.Date("2023-04-30") ~ "12/2022 - 4/2023",
      adm > as.Date("2023-04-30") & adm <= as.Date("2023-08-31") ~ "4/2023 - 8/2023",
      adm > as.Date("2023-08-31") ~ "8/2023 - 12/2023"
    )
  ) %>%
  group_by(district,age_gr,period) %>%
  count() %>%
  ungroup()

Save

# saveRDS(data_pt,paste0(path_clean,"CH1_out_cleaned.rds"))