Logo CESP
Logo IGR

Data Manipulation

Introduction to the Tidyverse

Charlotte Bargain

30/06/2025

Introduction

An example of data manipulation

The goal of this training course is to learn how to manipulate data using the tidyverse syntax in R.

Here is an extract of an R script showing the import of clinical and biological Excel data bases, the cleaning of the variables names, the reshape of biological data, the re-coding of the types of variables and the computing of derived variables. This R script could be refined in terms of efficiency, but it provides a good example of data manipulation with the tidyverse.


# IMPORT CLINICAL AND BIOLOGICAL DATA ----------------------------------------------
  
data_clinical_unilateral_raw <- readxl::read_xlsx(path = paste(file_data, "donnees_cliniques_bilateral.xlsx", 
                                                  sep = ""), 
                                                  sheet = "Unilat")
  
data_clinical_bilateral_raw <- readxl::read_xlsx(path = paste(file_data, "donnees_cliniques_bilateral.xlsx", 
                                                 sep = ""), 
                                                 sheet = "Bilat")
  
  
data_biological_unilateral_raw <- readxl::read_xlsx(path = paste(file_data, "biological_unilateral.xlsx", sep = ""))
  
data_biological_bilateral_raw <- readxl::read_xlsx(path = paste(file_data, "biological_bilateral.xlsx", sep = ""))
  
(...)
  
  

# CLEAN VARIABLES NAMES AND PIVOT BIOLOGICAL DATA ----------------------------------
  
data_clinical_unilateral <- data_clinical_unilateral_raw %>%
  janitor::clean_names() %>%
  dplyr::rename(traitement = type_de_traitement,
                id_patient = numero_de_dossier_gr,
                age = age_au_debut_du_traitement,
                cancer_primitif = primitif_standardise,
                histologie = histologie_definitive_standardise,
                debut_radiotherapie = debut,
                fin_radiotherapie = fin,
                chimiotherapie_concomittante = chimio_conco,
                recidive = recidivie,
                radiotherapy_adjuvante = rt_adjuvante,
                chimiotherapie_induction = chimio_induction,
                recidive_post_traitement = recidivie_post_tt) 
  
  
data_biological_unilateral_wide <- data_biological_unilateral_raw %>%
  janitor::clean_names() %>%
  tidyr::separate(col = noigr_datedeb_datefin,
                  into = c('id_patient', 'debut_radiotherapie', 'fin_radiotherapie'),
                  sep = "_") %>%
  tidyr::separate_wider_regex(cols = j_avant,
                              patterns = c(biological_baseline = "\\d+\\.\\d.", 
                                           "\\s*-\\s*",
                                           date_prelevement_baseline = "\\d{2}/\\d{2}/\\d{4}\\s+\\d{2}:\\d{2}"),
                              too_few = "align_start") 
  
data_biological_unilateral <- data_biological_unilateral_wide %>%
  tidyr::pivot_longer(cols = tidyselect::starts_with('j'),
                      names_to = "prelevement",
                      values_to = "biological_date") %>%
  tidyr::separate_wider_regex(cols = biological_date,
                              patterns = c(biological = "\\d+\\.\\d.", 
                                           "\\s*-\\s*",
                                           date_prelevement = "\\d{2}/\\d{2}/\\d{4}\\s+\\d{2}:\\d{2}",
                                           "\\s*-\\s*",
                                           jours_prelevement = "\\d*"),
                              too_few = "align_start")
  
(...)
  
  
  
# RECODE TYPE OF VARIABLES ---------------------------------------------------------
  
indicatrice <- function(x) {
  
  # Input : 
  # x : a numeric vector which takes values 1 (for 'Yes') and 0 (for 'No').
  # Output :
  # a vector which is a factor with two levels 'Yes' and 'No'.
  
  return(factor(case_when(x == 1 ~ 'Yes',
                          x == 0 ~ 'No',
                          is.na(x) ~ NA,
                          .default = NA),
                levels = c('Yes', 'No')))
}
  
data_clinical_unilateral <- data_clinical_unilateral %>%
  dplyr::mutate(oms = as.numeric(case_when(oms == 'NC' ~ NA,
                                           .default = oms))) %>%
  dplyr::mutate_if(is.character,
                   .funs = as.factor) %>% 
  dplyr::mutate_at(vars('id_patient'), 
                   .funs = as.character) %>%
  dplyr::mutate_at(vars(c('chimiotherapie_concomittante', 'recidive', 'reirradiation',
                          'radiotherapy_adjuvante', 'chimiotherapie_induction', 
                          'recidive_post_traitement', 'deces')),
                   .funs = indicatrice)
  
(...)
  
  
  
# BIND DATA OF CONTROL AND EXPERIMENTAL ARM ----------------------------------------
  
data_clinical <- data_clinical_unilateral %>%
  dplyr::bind_rows(data_clinical_bilateral) %>%
  dplyr::select(- age) # age not complete : it will be (re)computed later
  
(...)
  
  
  
# DERIVATION OF VARIABLES -----------------------------------------------------------
  
data_clinical <- data_clinical %>%

  dplyr::mutate(age = floor(lubridate::interval(date_de_naissance, debut_radiotherapie)/years(1)),
  oms = case_when(oms == 0 ~ '0',
                  oms == 1 | oms == 2 ~ '1-2',
                  .default = NA),
  t = case_when(t == 'Tx' ~ 'Tx',
                t == 'T0' ~ 'T0',
                t == 'T1' ~ 'T1',
                t == 'T2' ~ 'T2',
                t == 'T3' ~ 'T3',
                t == 'T4a' | t == 'T4b' ~ 'T4',
                .default = NA),
  n = case_when(n == 'Nx' ~ 'Nx',
                n == 'N0' ~ 'N0',
                n == 'N1' ~ 'N1',
                n == 'N2a' | n == 'N2b' | n == 'N2c' ~ 'N2',
                n == 'N3' ~ 'N3',
                .default = NA),
  stade = case_when(m == 'M1' ~ 'Stage IV',
                    n == 'N3' | t == 'T4' ~ 'Stage III',
                    n == 'N2' | t == 'T3' ~ 'Stage II',
                    n == 'N1' | n == 'N0' | t == 'T2' | t == 'T1' | t == 'T0' ~ 'Stage I',
                    t == 'Tx' &  n == 'Nx' ~ 'TxNx',
                    .default = NA),
  stade_new = as.factor(case_when(stade == 'Stage I' | stade == 'Stage II' ~ 'Stage I-II',
                                  stade == 'Stage III' | stade == 'Stage IV' ~ 'Stage III-IV',
                                  .default = NA)),
  
  duree_radiotherapie = floor(lubridate::interval(debut_radiotherapie, fin_radiotherapie)/days(1)),
  date_derniere_nouvelle = case_when(!is.na(date_deces) ~ date_deces,
                                     !is.na(date_derniere_consultation) ~ date_derniere_consultation,
                                     .default = NA),
  delai_OS = lubridate::interval(debut_radiotherapie, date_derniere_nouvelle)/months(1),
  delai_PFS = case_when(!is.na(date_de_recidive) ~ lubridate::interval(debut_radiotherapie, date_de_recidive)/months(1),
                        .default = lubridate::interval(debut_radiotherapie, date_derniere_nouvelle)/months(1)),
  duree_suivi_months = lubridate::interval(debut_radiotherapie, date_derniere_nouvelle)/months(1)) 
  
  
data_biological <- data_biological %>%
  
  dplyr::mutate(normales = "[1.0 - 4.0] (10^9/L)",
  prelevement_timing = as.factor(str_extract(prelevement, pattern = "(?<=_)(.*?)(?=_)")),
  prelevement_number = as.integer(str_extract(prelevement, pattern = "\\d+")),
  jours_prelevement_bis = case_when((!is.na(date_prelevement) & prelevement_timing == 'pendant') ~ floor(lubridate::interval(debut_radiotherapie, date_prelevement)/days(1)),
                                    (!is.na(date_prelevement) & prelevement_timing == 'apres') ~ floor(lubridate::interval(debut_radiotherapie, date_prelevement)/days(1)),
                                    .default = NA),
  mois_prelevement = case_when((!is.na(date_prelevement) & prelevement_timing == 'pendant') ~ floor(lubridate::interval(debut_radiotherapie, date_prelevement)/months(1)),
                               (!is.na(date_prelevement) & prelevement_timing == 'apres') ~ floor(lubridate::interval(debut_radiotherapie, date_prelevement)/months(1)),
                               .default = NA)) 
  
(...)

Import data bases

Data bases types

Data bases may have various formats :

  • CSV (Comma Separated Values) files .csv
  • Excel files .xls or .xlsx
  • SAS files .sas7bdat or xpt (Trialmaster)




To import data correctly, you must pay attention to :

  • Separator , or ; or tab
  • Decimal . or ,
  • Sheet in excel files

Data bases types

Import data bases

Many functions from multiple packages exist to import data bases from different formats.

data_csv <- read.csv(file = "./Data/data_clinique.csv", 
header = TRUE, sep = ",", dec = ".")

library(readxl)
data_excel <- readxl::read_xlsx(path = "./Data/data_clinique.xlsx", 
sheet = "Sheet 1", col_names = TRUE)

library(haven)
# data_sas <- haven::read_sas(data_file = "./Data/data_clinique.sas7bdat")
# data_sas_bis <- haven::read_xpt(file = "./Data/data_clinique.xpt")

Using package::function syntax

Prefixing a function with the name of its package using :: is not mandatory, but it can be good practice for beginners. It helps clarify which package a function comes from and can prevent conflicts when different packages have functions with the same name. This ensures you’re using the correct function from the right package.

Import data bases

Easier to use import function from rio package which import data bases from different formats, in your global environment.

library(rio)
data <- rio::import(file = "./Data/data_clinique.csv", setclass = "tibble")




Another solution is the Dan’s package EDCimport which allows to import several data sets from formats csv, sas or xpt files with functions read_all_csv, read_all_sas and read_all_xpt.

library(EDCimport)
# data_list <- EDCimport::read_all_sas(path = "./Data/data.sas7bdat",
#                                      format_file = "./Data/procformat.sas")

You can find a description of this package on CRAN EDCimport. It will also be presented by Dan on module 4 of this training course.

Visualize data frames

After importing a data base in your global environment, you can observe the number of rows, columns and the type of variables using str function or the blue arrow.

str(data)
tibble [50 × 7] (S3: tbl_df/tbl/data.frame)
 $ patient_id     : int [1:50] 1 2 3 4 5 6 7 8 9 10 ...
 $ sex            : chr [1:50] "Man" "Woman" "Woman" "Woman" ...
 $ age            : int [1:50] 74 66 63 85 41 37 28 68 58 44 ...
 $ bmi            : num [1:50] 21.4 23.3 25.7 19.9 21.4 ...
 $ diagnosis_date : chr [1:50] "2017-11-04" "2019-11-15" "2019-02-13" "2018-03-16" ...
 $ initial_therapy: chr [1:50] "Surgery" "Chemotherapy + Surgery" "Chemotherapy" "Chemotherapy + Surgery" ...
 $ chemotherapy   : chr [1:50] NA "Cisplatine" "Cisplatine" "Cisplatine" ...

Visualize data frames

You can visualize the complete data using View function. It opens a new window with your data. You can also click on the name of the data frame in the global environment.

View(data)

Visualize data frames

In a View, you can explore data values with the arrows and the filter option.

Visualize data frames

Manipulate data frame

Introduction to the tidyverse

The Tidyverse is a collection of R packages designed for working with data bases with a common design philosophy and grammar. When the package tidyverse is loaded, its extensions are loaded too.


The most common (useful) are :

janitor is another R package very useful when manipulating data.

Clean and rename column names

One of the first thing to do after importing data is cleaning variables names using clean_names function from janitor package.

library(janitor)
data <- janitor::clean_names(data)




We can also rename column/variable name using rename function from dplyr package with this syntax :

data <- dplyr::rename(data, new_name_col_1 = old_name_col_1, new_name_col_2 = old_name_col_2)


To apply one operation for each column/variable name, we can use a function inside the rename function, specifying the .fn option.

data <- dplyr::rename(data, .fn = toupper)

Filter lines

Keep rows that match one or several logical condition with filter function from dplyr package.

library(tidyverse)
new_data <- dplyr::filter(data, sex == "Man" & age >= 60)
head(new_data)
# A tibble: 6 × 7
  patient_id sex     age   bmi diagnosis_date initial_therapy       chemotherapy
       <int> <chr> <int> <dbl> <chr>          <chr>                 <chr>       
1          1 Man      74  21.4 2017-11-04     Surgery               <NA>        
2          8 Man      68  25.4 2017-10-28     Surgery               <NA>        
3         16 Man      73  24.2 2019-08-17     Chemotherapy          Carboplatine
4         17 Man      65  24.9 2018-03-14     Chemotherapy + Surge… Carboplatine
5         23 Man      71  20.6 2018-04-05     Chemotherapy + Surge… Cisplatine  
6         26 Man      69  24.9 2017-11-16     Chemotherapy + Surge… Carboplatine

Equivalent SAS code

data new_data; 
  set data; 
  if sex = "Man" AND age >= 60; 
run; 

Select columns

Extract columns/variables of interest as a table with select function from dplyr package.

new_data <- dplyr::select(data, patient_id, sex, age, bmi)
head(new_data)
# A tibble: 6 × 4
  patient_id sex     age   bmi
       <int> <chr> <int> <dbl>
1          1 Man      74  21.4
2          2 Woman    66  23.3
3          3 Woman    63  25.7
4          4 Woman    85  19.9
5          5 Man      41  21.4
6          6 Man      37  24.0

Equivalent SAS code

data new_data; 
  set data; 
  keep patient_id sex age bmi; 
run; 

Select columns

Keep all columns except one (or more) using - column_name.

dplyr::select(data, - initial_therapy)
new_data <- dplyr::select(data, - c(diagnosis_date, initial_therapy))
head(new_data)
# A tibble: 6 × 5
  patient_id sex     age   bmi chemotherapy
       <int> <chr> <int> <dbl> <chr>       
1          1 Man      74  21.4 <NA>        
2          2 Woman    66  23.3 Cisplatine  
3          3 Woman    63  25.7 Cisplatine  
4          4 Woman    85  19.9 Cisplatine  
5          5 Man      41  21.4 Carboplatine
6          6 Man      37  24.0 Cisplatine  

Equivalent SAS code

data new_data; 
  set data; 
  drop diagnosis_date initial_therapy; 
run; 

Sort columns

Sort a data frame in ascending order by one or more columns using arrange function from dplyr package.

new_data <- dplyr::arrange(data, age)
head(new_data)
# A tibble: 6 × 7
  patient_id sex     age   bmi diagnosis_date initial_therapy       chemotherapy
       <int> <chr> <int> <dbl> <chr>          <chr>                 <chr>       
1         34 Woman    19  20.0 2017-12-14     Surgery               <NA>        
2         28 Woman    20  21.4 2018-03-03     Surgery               <NA>        
3         18 Man      21  25.2 2019-10-26     Chemotherapy + Surge… Carboplatine
4         20 Man      22  25.0 2017-11-12     Surgery               <NA>        
5         30 Woman    25  20.8 2019-10-29     Chemotherapy + Surge… Carboplatine
6         31 Woman    25  23.1 2018-02-20     Surgery               <NA>        

Equivalent SAS code

proc sort data = data out = new_data; 
  by age; 
run;

Sort columns

Use desc within arrange to sort a data frame in descending order by one or more columns.

new_data <- dplyr::arrange(data, desc(age)) 
head(new_data)
# A tibble: 6 × 7
  patient_id sex     age   bmi diagnosis_date initial_therapy       chemotherapy
       <int> <chr> <int> <dbl> <chr>          <chr>                 <chr>       
1          4 Woman    85  19.9 2018-03-16     Chemotherapy + Surge… Cisplatine  
2         40 Man      85  24.3 2018-10-05     Chemotherapy          Cisplatine  
3         42 Woman    76  23.5 2018-10-18     Chemotherapy          Cisplatine  
4          1 Man      74  21.4 2017-11-04     Surgery               <NA>        
5         16 Man      73  24.2 2019-08-17     Chemotherapy          Carboplatine
6         23 Man      71  20.6 2018-04-05     Chemotherapy + Surge… Cisplatine  

Equivalent SAS code

proc sort data = data out = new_data;
  by descending age;
run;

Mutate columns

Create new column or modify existing column with mutate function from dplyr package.

new_data <- dplyr::mutate(data, bmi_integer = as.integer(bmi))
head( dplyr::select(new_data, patient_id, bmi, bmi_integer) )
# A tibble: 6 × 3
  patient_id   bmi bmi_integer
       <int> <dbl>       <int>
1          1  21.4          21
2          2  23.3          23
3          3  25.7          25
4          4  19.9          19
5          5  21.4          21
6          6  24.0          24

Equivalent SAS code

data new_data; 
  set data; 
  bmi_integer = int(bmi); 
run;

Mutate columns

Create new column or modify existing column with mutate function from dplyr package.

new_data <- dplyr::mutate(data, age_cat = if_else(condition = age < 60, 
                                                  true = 'Age < 60', false = 'Age >= 60'))
head( dplyr::select(new_data, patient_id, age, age_cat) )
# A tibble: 6 × 3
  patient_id   age age_cat  
       <int> <int> <chr>    
1          1    74 Age >= 60
2          2    66 Age >= 60
3          3    63 Age >= 60
4          4    85 Age >= 60
5          5    41 Age < 60 
6          6    37 Age < 60 

Equivalent SAS code

data new_data; 
  set data; 
  if age < 60 then age_cat = "Age < 60"; 
  else age_cat = "Age >= 60"; 
run; 

Grouping operations

Function group_by from dplyr package defines groups of rows using column(s) values.

dplyr::group_by(data, sex)
dplyr::group_by(data, sex, initial_therapy)

Then, we can apply one operation for each group of rows, separately.

Grouping operations

Function group_by from dplyr package defines groups of rows using column(s) values.

new_data <- mutate(group_by(data, sex),
                   age_mean_sex = mean(age),
                   age_above_mean_sex = age > age_mean_sex) 

new_data <- ungroup(new_data)

head( dplyr::select(new_data, patient_id, sex, age, age_mean_sex, age_above_mean_sex) )
# A tibble: 6 × 5
  patient_id sex     age age_mean_sex age_above_mean_sex
       <int> <chr> <int>        <dbl> <lgl>             
1          1 Man      74         49.8 TRUE              
2          2 Woman    66         44.9 TRUE              
3          3 Woman    63         44.9 TRUE              
4          4 Woman    85         44.9 TRUE              
5          5 Man      41         49.8 FALSE             
6          6 Man      37         49.8 FALSE             

Grouping operations


group_by

Using group_by from dplyr can be risky because if you forget to use ungroup, it might lead to unintended behavior in subsequent operations.

For example, any operation after group_by will continue to be applied to the groups, which can cause errors or incorrect results.


Tip

Now, it’s recommended to use the grouping argument of the operation function instead (example to come).

This allows you to perform grouped operations without the need to explicitly call group_by and worry about forgetting to ungroup, providing cleaner and safer code.

Summarise columns

summarise function from dplyr package return a new data frame with a single row summarizing all observations in the input.

new_data <- dplyr::summarise(data, 
                             age_mean = mean(age), 
                             age_min = min(age), 
                             age_max = max(age))
head(new_data)
# A tibble: 1 × 3
  age_mean age_min age_max
     <dbl>   <int>   <int>
1     47.7      19      85


Equivalent SAS code

proc summary data = data nway;
  var age;
  output out = new_data 
    mean = age_mean 
    min = age_min 
    max = age_max;
run;

Summarise columns

We can also compute aggregated statistics by dimensions/subsets of the initial table using summarise and its grouping argument .by, working as group_by but avoiding mistakes.

new_data <- data %>% 
  dplyr::summarise(age_mean = mean(age), .by = 'sex')
head(new_data)
# A tibble: 2 × 2
  sex   age_mean
  <chr>    <dbl>
1 Man       49.8
2 Woman     44.9

Equivalent SAS code

proc summary data = data nway; 
  class sex; 
  var age; 
  output out = new_data mean(age) = age_mean; 
run; 

Sequence of operations

When manipulating data, we can chain several operations (clean and rename column names, select columns, filter on values, add new columns…), using the

pipe %>% (CTrl + Maj + M).

new_data <- data %>% 
  select(patient_id, sex, age) %>% 
  filter(sex == "Man" & age >= 60)

head(new_data)
# A tibble: 6 × 3
  patient_id sex     age
       <int> <chr> <int>
1          1 Man      74
2          8 Man      68
3         16 Man      73
4         17 Man      65
5         23 Man      71
6         26 Man      69

CheatSheets dplyr

CheatSheets dplyr

Recode variables

case_when

case_when is a function from dplyr package used inside a mutate, allowing to create a new variable from values of one or more variable(s) of the data frame, by successively assessing logical conditions if {...} else {...} .

new_data <- data %>% 
  select(patient_id, sex, age) %>%
  mutate(sex_age = case_when(sex == "Man" & age < 60 ~ "Man under 60yo",
                             sex == "Man" & age >= 60 ~ "Man over 60yo",
                             sex == "Woman" & age < 60 ~ "Woman under 60yo",
                             sex == "Woman" & age >= 60 ~ "Woman over 60yo",
                             .default = NA))
head(new_data)
# A tibble: 6 × 4
  patient_id sex     age sex_age        
       <int> <chr> <int> <chr>          
1          1 Man      74 Man over 60yo  
2          2 Woman    66 Woman over 60yo
3          3 Woman    63 Woman over 60yo
4          4 Woman    85 Woman over 60yo
5          5 Man      41 Man under 60yo 
6          6 Man      37 Man under 60yo 

case_when

new_data <- data %>% 
  select(patient_id, sex, age) %>%
  mutate(sex_age = case_when(sex == "Man" & age < 60 ~ "Man under 60yo",
                             sex == "Man" & age >= 60 ~ "Man over 60yo",
                             sex == "Woman" & age < 60 ~ "Woman under 60yo",
                             sex == "Woman" & age >= 60 ~ "Woman over 60yo",
                             .default = NA))


Equivalent SAS code

data new_data; 
  set data; 
  if sex = "Man" and age < 60 then sex_age = "Man under 60yo"; 
  else if sex = "Man" and age >= 60 then sex_age = "Man over 60yo"; 
  else if sex = "Woman" and age < 60 then sex_age = "Woman under 60yo"; 
  else if sex = "Woman" and age >= 60 then sex_age = "Woman over 60yo"; 
  else sex_age = .; 
  keep patient_id sex age sex_age; 
run; 

case_when

The new variable created by case_when and mutate can reference a variable instead of a static value depending on conditions. You can make conditional changes on one variable while keeping the rest of the data intact.

library(glue)

new_data <- data %>% 
  select(patient_id, initial_therapy, chemotherapy) %>%
  mutate(complete_therapy = case_when(initial_therapy == "Chemotherapy" ~ chemotherapy,
                                      initial_therapy == "Chemotherapy + Surgery" ~ glue("{chemotherapy}", "Surgery", .sep = " + "),
                                      .default = initial_therapy))

Note

Here is an example with the function glue from the package glue : expressions enclosed by braces {} are evaluated as R code.

glue("We have clinical data for {nrow(data)} patients on this data set.")
We have clinical data for 50 patients on this data set.

case_when

The new variable created by case_when and mutate can reference a variable instead of a static value depending on conditions. You can make conditional changes on one variable while keeping the rest of the data intact.

new_data <- data %>% 
  select(patient_id, initial_therapy, chemotherapy) %>%
  mutate(complete_therapy = case_when(initial_therapy == "Chemotherapy" ~ chemotherapy,
                                      initial_therapy == "Chemotherapy + Surgery" ~ glue("{chemotherapy}", "Surgery", .sep = " + "),
                                      .default = initial_therapy))
head(new_data)
# A tibble: 6 × 4
  patient_id initial_therapy        chemotherapy complete_therapy      
       <int> <chr>                  <chr>        <glue>                
1          1 Surgery                <NA>         Surgery               
2          2 Chemotherapy + Surgery Cisplatine   Cisplatine + Surgery  
3          3 Chemotherapy           Cisplatine   Cisplatine            
4          4 Chemotherapy + Surgery Cisplatine   Cisplatine + Surgery  
5          5 Chemotherapy + Surgery Carboplatine Carboplatine + Surgery
6          6 Chemotherapy           Cisplatine   Cisplatine            

stringr

stringr is an package from tidyverse collection, allowing to manipulate strings. Regular expressions can especially be used in stringr functions, to detect some patterns in strings.

data %>% 
  select(patient_id, initial_therapy) %>% 
  mutate(surgery_detect = str_detect(string = initial_therapy, pattern = "Surgery"), 
         surgery_extract = str_extract(string = initial_therapy, pattern = "Surgery")) %>%
  head()
# A tibble: 6 × 4
  patient_id initial_therapy        surgery_detect surgery_extract
       <int> <chr>                  <lgl>          <chr>          
1          1 Surgery                TRUE           Surgery        
2          2 Chemotherapy + Surgery TRUE           Surgery        
3          3 Chemotherapy           FALSE          <NA>           
4          4 Chemotherapy + Surgery TRUE           Surgery        
5          5 Chemotherapy + Surgery TRUE           Surgery        
6          6 Chemotherapy           FALSE          <NA>           

CheatSheets stringr

forcats

forcats is a package from tidyverse collection, allowing to manipulate categorical variables named factors. Factors are variables with several modalities, that can be ordered and named. These factors can be sorted using relevel function.

data %>% 
  select(patient_id, initial_therapy) %>%
  mutate(init_therapy_fct = as_factor(initial_therapy),
         init_therapy_recode = fct_recode(init_therapy_fct,
                                          chimio_chirurgie = "Chemotherapy + Surgery", 
                                          chimio = "Chemotherapy",
                                          chirurgie = "Surgery")) %>%
  head()
# A tibble: 6 × 4
  patient_id initial_therapy        init_therapy_fct       init_therapy_recode
       <int> <chr>                  <fct>                  <fct>              
1          1 Surgery                Surgery                chirurgie          
2          2 Chemotherapy + Surgery Chemotherapy + Surgery chimio_chirurgie   
3          3 Chemotherapy           Chemotherapy           chimio             
4          4 Chemotherapy + Surgery Chemotherapy + Surgery chimio_chirurgie   
5          5 Chemotherapy + Surgery Chemotherapy + Surgery chimio_chirurgie   
6          6 Chemotherapy           Chemotherapy           chimio             

CheatSheets forcats

lubridate

lubridate is a package from tidyverse collection, allowing to manipulate dates. We can get and set components of a date, compare two dates using logical operators (\(date_{1} > date_{2}\) return TRUE or FALSE), add or subtract component or a date from another date.

data %>% 
  select(patient_id, diagnosis_date) %>%
  mutate(diagnosis_date_bis = ymd(diagnosis_date),
         diagnosis_year = year(diagnosis_date_bis),
         diagnosis_today = today() - diagnosis_date_bis) %>%
  head()
# A tibble: 6 × 5
  patient_id diagnosis_date diagnosis_date_bis diagnosis_year diagnosis_today
       <int> <chr>          <date>                      <dbl> <drtn>         
1          1 2017-11-04     2017-11-04                   2017 2795 days      
2          2 2019-11-15     2019-11-15                   2019 2054 days      
3          3 2019-02-13     2019-02-13                   2019 2329 days      
4          4 2018-03-16     2018-03-16                   2018 2663 days      
5          5 2017-05-01     2017-05-01                   2017 2982 days      
6          6 2017-11-10     2017-11-10                   2017 2789 days      

CheatSheets lubridate

CheatSheets lubridate

Pivot data frame

Pivot from wide to long

tidyr package allows to arrange data, particularly to pivot data. With function pivot_longer, we will obtain one row for each observation.

Columns selected with option cols = c(A, B) A and B are gathered into one column containing the double of rows. The names of the columns gathered are reported in new column N with option names_to = "N". The values of the columns gathered are reported in new column V with option values_to = "V".

Pivot from wide to long

data_wide <- data %>%
  summarise(age_min = min(age),
            age_mean = mean(age),
            age_max = max(age),
            .by = "initial_therapy")

data_wide
# A tibble: 3 × 4
  initial_therapy        age_min age_mean age_max
  <chr>                    <int>    <dbl>   <int>
1 Surgery                     19     41.2      74
2 Chemotherapy + Surgery      21     49.9      85
3 Chemotherapy                25     52.4      85

data_long <- data_wide %>% 
  pivot_longer(cols = c(age_min, age_mean, age_max), 
               names_to = "Statistic", 
               names_prefix = "age_", 
               values_to = "Age")

data_long
# A tibble: 9 × 3
  initial_therapy        Statistic   Age
  <chr>                  <chr>     <dbl>
1 Surgery                min        19  
2 Surgery                mean       41.2
3 Surgery                max        74  
4 Chemotherapy + Surgery min        21  
5 Chemotherapy + Surgery mean       49.9
6 Chemotherapy + Surgery max        85  
7 Chemotherapy           min        25  
8 Chemotherapy           mean       52.4
9 Chemotherapy           max        85  

Pivot from long to wide

With function pivot_wider, we will obtain one row for value of one column (example: one row for each id_patient).

Columns selected with the option names_from = "N" will define the names of the new columns in the output. The values A and B corresponding to each unique value in N will be placed into new columns as names, with the values reported in the column specified by the values_from = "V" option. The identifier column id specified with option id_cols = id remains unchanged.

Pivot from long to wide

data_long
# A tibble: 9 × 3
  initial_therapy        Statistic   Age
  <chr>                  <chr>     <dbl>
1 Surgery                min        19  
2 Surgery                mean       41.2
3 Surgery                max        74  
4 Chemotherapy + Surgery min        21  
5 Chemotherapy + Surgery mean       49.9
6 Chemotherapy + Surgery max        85  
7 Chemotherapy           min        25  
8 Chemotherapy           mean       52.4
9 Chemotherapy           max        85  
data_wide <- data_long %>%
  pivot_wider(id_cols = initial_therapy,
              names_from = Statistic,
              names_prefix = "age_",
              values_from = Age)

data_wide
# A tibble: 3 × 4
  initial_therapy        age_min age_mean age_max
  <chr>                    <dbl>    <dbl>   <dbl>
1 Surgery                     19     41.2      74
2 Chemotherapy + Surgery      21     49.9      85
3 Chemotherapy                25     52.4      85

Join data frames

Type of joins

dplyr package contains several functions to combine two data frames A and B in various ways. These mutating joins functions match observations from A and B, based on the key(s).

The by argument specifies the column(s) that should be used as the key to match rows between the two data frames A and B. If the key in A and B takes different names, you can use option by = join_by("id_a" = "id_b").

After performing a join, if both data frames have columns with the same names (other than key column(s)), it automatically appends a suffix to distinguish them. The default suffixes are .x for the left data frame and .y for the right data frame. You can customize these suffixes with the suffix argument.

Data sets for examples

data_chimio <- rio::import(file = "./Data/data_chimio.csv")

head(data_chimio)
  patient_id date_debut dose_totale_mg
1          2 2019-12-08            500
2          3 2019-02-24            600
3          4 2018-03-25            480
4          5 2017-05-19           1200
5          6 2017-12-08            500
6          7 2017-10-29           1400
data_AE <- rio::import(file = "./Data/data_AE.csv")

head(data_AE)
  patient_id                   ae_soc ae_grade
1          1       Vascular disorders        1
2          1  Immune system disorders        1
3          1           Investigations        1
4          2        Cardiac disorders        4
5          2           Investigations        1
6          3 Nervous system disorders        3

left_join

left_join function from dplyr package is used to join two data frames A and B, combining all rows from the left A data frame with matching rows from the right B data frame. If there is no match in the right data frame, it fills the missing values with NA.

left_join

data_left <- dplyr::left_join(data_chimio, data_AE, by = 'patient_id') %>%
  arrange(patient_id)

data_left
# A tibble: 66 × 5
   patient_id date_debut dose_totale_mg ae_soc                   ae_grade
        <int> <IDate>             <int> <chr>                       <int>
 1          2 2019-12-08            500 Cardiac disorders               4
 2          2 2019-12-08            500 Investigations                  1
 3          3 2019-02-24            600 Nervous system disorders        3
 4          3 2019-02-24            600 Investigations                  2
 5          4 2018-03-25            480 Nervous system disorders        3
 6          5 2017-05-19           1200 Investigations                  3
 7          6 2017-12-08            500 Immune system disorders         3
 8          6 2017-12-08            500 Vascular disorders              4
 9          7 2017-10-29           1400 Immune system disorders         3
10          7 2017-10-29           1400 Vascular disorders              2
# ℹ 56 more rows

right_join

right_join function from dplyr package is used to join two data frame A and B, keeping all the observations of B.

right_join

data_right <- dplyr::right_join(data_chimio, data_AE, by = 'patient_id') %>%
  arrange(patient_id)

data_right
# A tibble: 98 × 5
   patient_id date_debut dose_totale_mg ae_soc                   ae_grade
        <int> <IDate>             <int> <chr>                       <int>
 1          1 NA                     NA Vascular disorders              1
 2          1 NA                     NA Immune system disorders         1
 3          1 NA                     NA Investigations                  1
 4          2 2019-12-08            500 Cardiac disorders               4
 5          2 2019-12-08            500 Investigations                  1
 6          3 2019-02-24            600 Nervous system disorders        3
 7          3 2019-02-24            600 Investigations                  2
 8          4 2018-03-25            480 Nervous system disorders        3
 9          5 2017-05-19           1200 Investigations                  3
10          6 2017-12-08            500 Immune system disorders         3
# ℹ 88 more rows

inner_join

inner_join function from dplyr package is used to join two data frame A and B, keeping only common observations of one variable (matching key) in A and B.

inner_join

data_inner <- dplyr::inner_join(data_chimio, data_AE, by = 'patient_id') %>%
  arrange(patient_id)

data_inner
# A tibble: 64 × 5
   patient_id date_debut dose_totale_mg ae_soc                   ae_grade
        <int> <IDate>             <int> <chr>                       <int>
 1          2 2019-12-08            500 Cardiac disorders               4
 2          2 2019-12-08            500 Investigations                  1
 3          3 2019-02-24            600 Nervous system disorders        3
 4          3 2019-02-24            600 Investigations                  2
 5          4 2018-03-25            480 Nervous system disorders        3
 6          5 2017-05-19           1200 Investigations                  3
 7          6 2017-12-08            500 Immune system disorders         3
 8          6 2017-12-08            500 Vascular disorders              4
 9          7 2017-10-29           1400 Immune system disorders         3
10          7 2017-10-29           1400 Vascular disorders              2
# ℹ 54 more rows

full_join

full_join function from dplyr package is used to join two data frame A and B, keeping all the observations of A and all the observations of B.

full_join

data_full <- dplyr::full_join(data_chimio, data_AE, by = 'patient_id') %>%
  arrange(patient_id)

data_full
# A tibble: 100 × 5
   patient_id date_debut dose_totale_mg ae_soc                   ae_grade
        <int> <IDate>             <int> <chr>                       <int>
 1          1 NA                     NA Vascular disorders              1
 2          1 NA                     NA Immune system disorders         1
 3          1 NA                     NA Investigations                  1
 4          2 2019-12-08            500 Cardiac disorders               4
 5          2 2019-12-08            500 Investigations                  1
 6          3 2019-02-24            600 Nervous system disorders        3
 7          3 2019-02-24            600 Investigations                  2
 8          4 2018-03-25            480 Nervous system disorders        3
 9          5 2017-05-19           1200 Investigations                  3
10          6 2017-12-08            500 Immune system disorders         3
# ℹ 90 more rows

Complete script example

# IMPORT CLINICAL AND BIOLOGICAL DATA ----------------------------------------------
  
data_clinical_unilateral_raw <- readxl::read_xlsx(path = paste(file_data, "donnees_cliniques_bilateral.xlsx", 
                                                  sep = ""), 
                                                  sheet = "Unilat")
  
data_clinical_bilateral_raw <- readxl::read_xlsx(path = paste(file_data, "donnees_cliniques_bilateral.xlsx", 
                                                 sep = ""), 
                                                 sheet = "Bilat")
  
  
data_biological_unilateral_raw <- readxl::read_xlsx(path = paste(file_data, "biological_unilateral.xlsx", sep = ""))
  
data_biological_bilateral_raw <- readxl::read_xlsx(path = paste(file_data, "biological_bilateral.xlsx", sep = ""))
  
(...)
  
  

# CLEAN VARIABLES NAMES AND PIVOT BIOLOGICAL DATA ----------------------------------
  
data_clinical_unilateral <- data_clinical_unilateral_raw %>%
  janitor::clean_names() %>%
  dplyr::rename(traitement = type_de_traitement,
                id_patient = numero_de_dossier_gr,
                age = age_au_debut_du_traitement,
                cancer_primitif = primitif_standardise,
                histologie = histologie_definitive_standardise,
                debut_radiotherapie = debut,
                fin_radiotherapie = fin,
                chimiotherapie_concomittante = chimio_conco,
                recidive = recidivie,
                radiotherapy_adjuvante = rt_adjuvante,
                chimiotherapie_induction = chimio_induction,
                recidive_post_traitement = recidivie_post_tt) 
  
  
data_biological_unilateral_wide <- data_biological_unilateral_raw %>%
  janitor::clean_names() %>%
  tidyr::separate(col = noigr_datedeb_datefin,
                  into = c('id_patient', 'debut_radiotherapie', 'fin_radiotherapie'),
                  sep = "_") %>%
  tidyr::separate_wider_regex(cols = j_avant,
                              patterns = c(biological_baseline = "\\d+\\.\\d.", 
                                           "\\s*-\\s*",
                                           date_prelevement_baseline = "\\d{2}/\\d{2}/\\d{4}\\s+\\d{2}:\\d{2}"),
                              too_few = "align_start") 
  
data_biological_unilateral <- data_biological_unilateral_wide %>%
  tidyr::pivot_longer(cols = tidyselect::starts_with('j'),
                      names_to = "prelevement",
                      values_to = "biological_date") %>%
  tidyr::separate_wider_regex(cols = biological_date,
                              patterns = c(biological = "\\d+\\.\\d.", 
                                           "\\s*-\\s*",
                                           date_prelevement = "\\d{2}/\\d{2}/\\d{4}\\s+\\d{2}:\\d{2}",
                                           "\\s*-\\s*",
                                           jours_prelevement = "\\d*"),
                              too_few = "align_start")
  
(...)
  
  
  
# RECODE TYPE OF VARIABLES ---------------------------------------------------------
  
indicatrice <- function(x) {
  
  # Input : 
  # x : a numeric vector which takes values 1 (for 'Yes') and 0 (for 'No').
  # Output :
  # a vector which is a factor with two levels 'Yes' and 'No'.
  
  return(factor(case_when(x == 1 ~ 'Yes',
                          x == 0 ~ 'No',
                          is.na(x) ~ NA,
                          .default = NA),
                levels = c('Yes', 'No')))
}
  
data_clinical_unilateral <- data_clinical_unilateral %>%
  dplyr::mutate(oms = as.numeric(case_when(oms == 'NC' ~ NA,
                                           .default = oms))) %>%
  dplyr::mutate_if(is.character,
                   .funs = as.factor) %>% 
  dplyr::mutate_at(vars('id_patient'), 
                   .funs = as.character) %>%
  dplyr::mutate_at(vars(c('chimiotherapie_concomittante', 'recidive', 'reirradiation',
                          'radiotherapy_adjuvante', 'chimiotherapie_induction', 
                          'recidive_post_traitement', 'deces')),
                   .funs = indicatrice)
  
(...)
  
  
  
# BIND DATA OF CONTROL AND EXPERIMENTAL ARM ----------------------------------------
  
data_clinical <- data_clinical_unilateral %>%
  dplyr::bind_rows(data_clinical_bilateral) %>%
  dplyr::select(- age) # age not complete : it will be (re)computed later
  
(...)
  
  
  
# DERIVATION OF VARIABLES -----------------------------------------------------------
  
data_clinical <- data_clinical %>%

  dplyr::mutate(age = floor(lubridate::interval(date_de_naissance, debut_radiotherapie)/years(1)),
  oms = case_when(oms == 0 ~ '0',
                  oms == 1 | oms == 2 ~ '1-2',
                  .default = NA),
  t = case_when(t == 'Tx' ~ 'Tx',
                t == 'T0' ~ 'T0',
                t == 'T1' ~ 'T1',
                t == 'T2' ~ 'T2',
                t == 'T3' ~ 'T3',
                t == 'T4a' | t == 'T4b' ~ 'T4',
                .default = NA),
  n = case_when(n == 'Nx' ~ 'Nx',
                n == 'N0' ~ 'N0',
                n == 'N1' ~ 'N1',
                n == 'N2a' | n == 'N2b' | n == 'N2c' ~ 'N2',
                n == 'N3' ~ 'N3',
                .default = NA),
  stade = case_when(m == 'M1' ~ 'Stage IV',
                    n == 'N3' | t == 'T4' ~ 'Stage III',
                    n == 'N2' | t == 'T3' ~ 'Stage II',
                    n == 'N1' | n == 'N0' | t == 'T2' | t == 'T1' | t == 'T0' ~ 'Stage I',
                    t == 'Tx' &  n == 'Nx' ~ 'TxNx',
                    .default = NA),
  stade_new = as.factor(case_when(stade == 'Stage I' | stade == 'Stage II' ~ 'Stage I-II',
                                  stade == 'Stage III' | stade == 'Stage IV' ~ 'Stage III-IV',
                                  .default = NA)),
  
  duree_radiotherapie = floor(lubridate::interval(debut_radiotherapie, fin_radiotherapie)/days(1)),
  date_derniere_nouvelle = case_when(!is.na(date_deces) ~ date_deces,
                                     !is.na(date_derniere_consultation) ~ date_derniere_consultation,
                                     .default = NA),
  delai_OS = lubridate::interval(debut_radiotherapie, date_derniere_nouvelle)/months(1),
  delai_PFS = case_when(!is.na(date_de_recidive) ~ lubridate::interval(debut_radiotherapie, date_de_recidive)/months(1),
                        .default = lubridate::interval(debut_radiotherapie, date_derniere_nouvelle)/months(1)),
  duree_suivi_months = lubridate::interval(debut_radiotherapie, date_derniere_nouvelle)/months(1)) 
  
  
data_biological <- data_biological %>%
  
  dplyr::mutate(normales = "[1.0 - 4.0] (10^9/L)",
  prelevement_timing = as.factor(str_extract(prelevement, pattern = "(?<=_)(.*?)(?=_)")),
  prelevement_number = as.integer(str_extract(prelevement, pattern = "\\d+")),
  jours_prelevement_bis = case_when((!is.na(date_prelevement) & prelevement_timing == 'pendant') ~ floor(lubridate::interval(debut_radiotherapie, date_prelevement)/days(1)),
                                    (!is.na(date_prelevement) & prelevement_timing == 'apres') ~ floor(lubridate::interval(debut_radiotherapie, date_prelevement)/days(1)),
                                    .default = NA),
  mois_prelevement = case_when((!is.na(date_prelevement) & prelevement_timing == 'pendant') ~ floor(lubridate::interval(debut_radiotherapie, date_prelevement)/months(1)),
                               (!is.na(date_prelevement) & prelevement_timing == 'apres') ~ floor(lubridate::interval(debut_radiotherapie, date_prelevement)/months(1)),
                               .default = NA)) 
  
(...)

Conclusion


You are now able to :

  • Import data bases in R environment

  • Visualize the data

  • Manipulate the data using tidyverse

  • Create derived variables

  • Pivot the data

  • Join several data

The next steps are learning how to :

  • create graphs using ggplot2 (training course n°3 created by Nusaïbah)

  • import data from TrialMaster using EDCimport and create description tables using crosstable (training course n°4 created by Dan)

  • compute regression models (training course n°5 created by Dan)

  • create automatic word reports using officer (training course n°6 created by me)!

Thank you for your attention



Good luck for the next training courses !

Code With Me