An example of data manipulation
The goal of this training course is to learn how to manipulate data using the tidyverse
syntax in R
.
Here is an extract of an R
script showing the import of clinical and biological Excel data bases, the cleaning of the variables names, the reshape of biological data, the re-coding of the types of variables and the computing of derived variables. This R
script could be refined in terms of efficiency, but it provides a good example of data manipulation with the tidyverse.
# IMPORT CLINICAL AND BIOLOGICAL DATA ----------------------------------------------
data_clinical_unilateral_raw <- readxl::read_xlsx(path = paste(file_data, "donnees_cliniques_bilateral.xlsx",
sep = ""),
sheet = "Unilat")
data_clinical_bilateral_raw <- readxl::read_xlsx(path = paste(file_data, "donnees_cliniques_bilateral.xlsx",
sep = ""),
sheet = "Bilat")
data_biological_unilateral_raw <- readxl::read_xlsx(path = paste(file_data, "biological_unilateral.xlsx", sep = ""))
data_biological_bilateral_raw <- readxl::read_xlsx(path = paste(file_data, "biological_bilateral.xlsx", sep = ""))
(...)
# CLEAN VARIABLES NAMES AND PIVOT BIOLOGICAL DATA ----------------------------------
data_clinical_unilateral <- data_clinical_unilateral_raw %>%
janitor::clean_names() %>%
dplyr::rename(traitement = type_de_traitement,
id_patient = numero_de_dossier_gr,
age = age_au_debut_du_traitement,
cancer_primitif = primitif_standardise,
histologie = histologie_definitive_standardise,
debut_radiotherapie = debut,
fin_radiotherapie = fin,
chimiotherapie_concomittante = chimio_conco,
recidive = recidivie,
radiotherapy_adjuvante = rt_adjuvante,
chimiotherapie_induction = chimio_induction,
recidive_post_traitement = recidivie_post_tt)
data_biological_unilateral_wide <- data_biological_unilateral_raw %>%
janitor::clean_names() %>%
tidyr::separate(col = noigr_datedeb_datefin,
into = c('id_patient', 'debut_radiotherapie', 'fin_radiotherapie'),
sep = "_") %>%
tidyr::separate_wider_regex(cols = j_avant,
patterns = c(biological_baseline = "\\d+\\.\\d.",
"\\s*-\\s*",
date_prelevement_baseline = "\\d{2}/\\d{2}/\\d{4}\\s+\\d{2}:\\d{2}"),
too_few = "align_start")
data_biological_unilateral <- data_biological_unilateral_wide %>%
tidyr::pivot_longer(cols = tidyselect::starts_with('j'),
names_to = "prelevement",
values_to = "biological_date") %>%
tidyr::separate_wider_regex(cols = biological_date,
patterns = c(biological = "\\d+\\.\\d.",
"\\s*-\\s*",
date_prelevement = "\\d{2}/\\d{2}/\\d{4}\\s+\\d{2}:\\d{2}",
"\\s*-\\s*",
jours_prelevement = "\\d*"),
too_few = "align_start")
(...)
# RECODE TYPE OF VARIABLES ---------------------------------------------------------
indicatrice <- function(x) {
# Input :
# x : a numeric vector which takes values 1 (for 'Yes') and 0 (for 'No').
# Output :
# a vector which is a factor with two levels 'Yes' and 'No'.
return(factor(case_when(x == 1 ~ 'Yes',
x == 0 ~ 'No',
is.na(x) ~ NA,
.default = NA),
levels = c('Yes', 'No')))
}
data_clinical_unilateral <- data_clinical_unilateral %>%
dplyr::mutate(oms = as.numeric(case_when(oms == 'NC' ~ NA,
.default = oms))) %>%
dplyr::mutate_if(is.character,
.funs = as.factor) %>%
dplyr::mutate_at(vars('id_patient'),
.funs = as.character) %>%
dplyr::mutate_at(vars(c('chimiotherapie_concomittante', 'recidive', 'reirradiation',
'radiotherapy_adjuvante', 'chimiotherapie_induction',
'recidive_post_traitement', 'deces')),
.funs = indicatrice)
(...)
# BIND DATA OF CONTROL AND EXPERIMENTAL ARM ----------------------------------------
data_clinical <- data_clinical_unilateral %>%
dplyr::bind_rows(data_clinical_bilateral) %>%
dplyr::select(- age) # age not complete : it will be (re)computed later
(...)
# DERIVATION OF VARIABLES -----------------------------------------------------------
data_clinical <- data_clinical %>%
dplyr::mutate(age = floor(lubridate::interval(date_de_naissance, debut_radiotherapie)/years(1)),
oms = case_when(oms == 0 ~ '0',
oms == 1 | oms == 2 ~ '1-2',
.default = NA),
t = case_when(t == 'Tx' ~ 'Tx',
t == 'T0' ~ 'T0',
t == 'T1' ~ 'T1',
t == 'T2' ~ 'T2',
t == 'T3' ~ 'T3',
t == 'T4a' | t == 'T4b' ~ 'T4',
.default = NA),
n = case_when(n == 'Nx' ~ 'Nx',
n == 'N0' ~ 'N0',
n == 'N1' ~ 'N1',
n == 'N2a' | n == 'N2b' | n == 'N2c' ~ 'N2',
n == 'N3' ~ 'N3',
.default = NA),
stade = case_when(m == 'M1' ~ 'Stage IV',
n == 'N3' | t == 'T4' ~ 'Stage III',
n == 'N2' | t == 'T3' ~ 'Stage II',
n == 'N1' | n == 'N0' | t == 'T2' | t == 'T1' | t == 'T0' ~ 'Stage I',
t == 'Tx' & n == 'Nx' ~ 'TxNx',
.default = NA),
stade_new = as.factor(case_when(stade == 'Stage I' | stade == 'Stage II' ~ 'Stage I-II',
stade == 'Stage III' | stade == 'Stage IV' ~ 'Stage III-IV',
.default = NA)),
duree_radiotherapie = floor(lubridate::interval(debut_radiotherapie, fin_radiotherapie)/days(1)),
date_derniere_nouvelle = case_when(!is.na(date_deces) ~ date_deces,
!is.na(date_derniere_consultation) ~ date_derniere_consultation,
.default = NA),
delai_OS = lubridate::interval(debut_radiotherapie, date_derniere_nouvelle)/months(1),
delai_PFS = case_when(!is.na(date_de_recidive) ~ lubridate::interval(debut_radiotherapie, date_de_recidive)/months(1),
.default = lubridate::interval(debut_radiotherapie, date_derniere_nouvelle)/months(1)),
duree_suivi_months = lubridate::interval(debut_radiotherapie, date_derniere_nouvelle)/months(1))
data_biological <- data_biological %>%
dplyr::mutate(normales = "[1.0 - 4.0] (10^9/L)",
prelevement_timing = as.factor(str_extract(prelevement, pattern = "(?<=_)(.*?)(?=_)")),
prelevement_number = as.integer(str_extract(prelevement, pattern = "\\d+")),
jours_prelevement_bis = case_when((!is.na(date_prelevement) & prelevement_timing == 'pendant') ~ floor(lubridate::interval(debut_radiotherapie, date_prelevement)/days(1)),
(!is.na(date_prelevement) & prelevement_timing == 'apres') ~ floor(lubridate::interval(debut_radiotherapie, date_prelevement)/days(1)),
.default = NA),
mois_prelevement = case_when((!is.na(date_prelevement) & prelevement_timing == 'pendant') ~ floor(lubridate::interval(debut_radiotherapie, date_prelevement)/months(1)),
(!is.na(date_prelevement) & prelevement_timing == 'apres') ~ floor(lubridate::interval(debut_radiotherapie, date_prelevement)/months(1)),
.default = NA))
(...)