je me retrouve confronter à un format de data.frame qui n'est pas pratique (CDISC) pour appliquer un modèle (modèle mixte, lme4), dont voici un extrait:
Code : Tout sélectionner
library(tidyverse)
Code : Tout sélectionner
dta <- structure(list(VSORRES = c(35.4, NA, 102, 31.2, 84, 87, 29.4,
NA, NA, NA, NA, 37.9, NA, 38.2, 32.2, 26.6, NA, NA, 80, 80, 85,
80, 76, 80, NA, 81, 93, 79, 80, 68), VISIT = c("BASELINE", "VISIT2",
"VISIT2", "VISIT1", "VISIT3", "VISIT3", "VISIT1", "LAST", "LAST",
"LAST", "LAST", "VISIT2", "VISIT2", "VISIT3", "VISIT3", "BASELINE",
"LAST", "LAST", "BASELINE", "BASELINE", "BASELINE", "BASELINE",
"VISIT1", "VISIT1", "VISIT2", "VISIT2", "VISIT3", "VISIT3", "VISIT1",
"VISIT1"), VSORRESU = c("kg/m2", "mmHg", "mmHg", "kg/m2", "mmHg",
"mmHg", "kg/m2", "mmHg", "mmHg", "mmHg", "mmHg", "kg/m2", "kg/m2",
"kg/m2", "kg/m2", "kg/m2", "kg/m2", "kg/m2", "mmHg", "mmHg",
"mmHg", "mmHg", "mmHg", "mmHg", "mmHg", "mmHg", "mmHg", "mmHg",
"mmHg", "mmHg"), VSCAT = c(NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_), VSTEST = c("Body Mass Index",
"Diastolic Blood Pressure", "Diastolic Blood Pressure", "Body Mass Index",
"Diastolic Blood Pressure", "Diastolic Blood Pressure", "Body Mass Index",
"Diastolic Blood Pressure", "Diastolic Blood Pressure", "Diastolic Blood Pressure",
"Diastolic Blood Pressure", "Body Mass Index", "Body Mass Index",
"Body Mass Index", "Body Mass Index", "Body Mass Index", "Body Mass Index",
"Body Mass Index", "Diastolic Blood Pressure", "Diastolic Blood Pressure",
"Diastolic Blood Pressure", "Diastolic Blood Pressure", "Diastolic Blood Pressure",
"Diastolic Blood Pressure", "Diastolic Blood Pressure", "Diastolic Blood Pressure",
"Diastolic Blood Pressure", "Diastolic Blood Pressure", "Diastolic Blood Pressure",
"Diastolic Blood Pressure"), VSTESTCD = c("BMI", "DIABP", "DIABP",
"BMI", "DIABP", "DIABP", "BMI", "DIABP", "DIABP", "DIABP", "DIABP",
"BMI", "BMI", "BMI", "BMI", "BMI", "BMI", "BMI", "DIABP", "DIABP",
"DIABP", "DIABP", "DIABP", "DIABP", "DIABP", "DIABP", "DIABP",
"DIABP", "DIABP", "DIABP"), VSDTC = c("1994-12-05", NA, "2000-11-14",
"1997-12-02", "2004-03-16", "2004-02-24", "1998-02-10", NA, NA,
NA, NA, "2000-11-14", NA, "2004-02-24", "2004-03-16", "1995-04-04",
NA, NA, "1994-12-05", "1995-04-04", "1994-12-05", "1995-04-04",
"1998-02-10", "1997-12-02", NA, "2000-11-14", "2004-02-24", "2004-03-16",
"1997-12-02", "1998-02-10"), VSTPT = c(NA, "Measurement 2", "Measurement 2",
NA, "Measurement 2", "Measurement 2", NA, "Measurement 1", "Measurement 1",
"Measurement 2", "Measurement 2", NA, NA, NA, NA, NA, NA, NA,
"Measurement 1", "Measurement 1", "Measurement 2", "Measurement 2",
"Measurement 1", "Measurement 1", "Measurement 1", "Measurement 1",
"Measurement 1", "Measurement 1", "Measurement 2", "Measurement 2"
), DOMAIN = c("VS", "VS", "VS", "VS", "VS", "VS", "VS", "VS",
"VS", "VS", "VS", "VS", "VS", "VS", "VS", "VS", "VS", "VS", "VS",
"VS", "VS", "VS", "VS", "VS", "VS", "VS", "VS", "VS", "VS", "VS"
), VSTPTNUM = c(NA, 2L, 2L, NA, 2L, 2L, NA, 1L, 1L, 2L, 2L, NA,
NA, NA, NA, NA, NA, NA, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L), SUBJID = c(83228L, 756L, 83228L, 83228L, 756L, 83228L,
756L, 756L, 83228L, 83228L, 756L, 83228L, 756L, 83228L, 756L,
756L, 756L, 83228L, 83228L, 756L, 83228L, 756L, 756L, 83228L,
756L, 83228L, 83228L, 756L, 83228L, 756L)), .Names = c("VSORRES",
"VISIT", "VSORRESU", "VSCAT", "VSTEST", "VSTESTCD", "VSDTC",
"VSTPT", "DOMAIN", "VSTPTNUM", "SUBJID"), row.names = c("10",
"100753", "102081", "10619", "106848", "107455", "10976", "111706",
"114382", "118765", "119618", "17481", "20560", "22239", "23090",
"3047", "5766", "6246", "67929", "72420", "75889", "76741", "79968",
"83252", "83506", "87034", "89271", "93048", "96608", "98554"
), class = "data.frame")
J'ai une variable "VSORRES" qui contient la/les mesures ("VSTPT"/"VSTPTNUM") réalisées à une "VISIT", pour une variable clinique "VSTESTCD" (et "VSTEST") avec une unité "VSORRESU".
Ce que je souhaite obtenir, c'est un data.frame où chaque valeur de "VSTESTCD" est une colonne contenant la valeur "VSORRES".
Un exemple simple:
Code : Tout sélectionner
tmp <- dta %>%
filter(is.na(VSTPTNUM) | VSTPTNUM==1) %>%
select(VSORRES, VISIT, SUBJID, VSTESTCD)
VSORRES VISIT SUBJID VSTESTCD
1 35.4 BASELINE 83228 BMI
2 31.2 VISIT1 83228 BMI
3 29.4 VISIT1 756 BMI
4 NA LAST 756 DIABP
5 NA LAST 83228 DIABP
6 37.9 VISIT2 83228 BMI
7 NA VISIT2 756 BMI
8 38.2 VISIT3 83228 BMI
9 32.2 VISIT3 756 BMI
10 26.6 BASELINE 756 BMI
11 NA LAST 756 BMI
12 NA LAST 83228 BMI
13 80.0 BASELINE 83228 DIABP
14 80.0 BASELINE 756 DIABP
15 76.0 VISIT1 756 DIABP
16 80.0 VISIT1 83228 DIABP
17 NA VISIT2 756 DIABP
18 81.0 VISIT2 83228 DIABP
19 93.0 VISIT3 83228 DIABP
20 79.0 VISIT3 756 DIABP
Code : Tout sélectionner
tmp %>%
spread(key = VSTESTCD, value = VSORRES)
VISIT SUBJID BMI DIABP
1 BASELINE 756 26.6 80
2 BASELINE 83228 35.4 80
3 LAST 756 NA NA
4 LAST 83228 NA NA
5 VISIT1 756 29.4 76
6 VISIT1 83228 31.2 80
7 VISIT2 756 NA NA
8 VISIT2 83228 37.9 81
9 VISIT3 756 32.2 79
10 VISIT3 83228 38.2 93
Mon soucis est que tidyr::spread ne peut prendre qu'une seule colonne pour "key"/"value" et je patine un peu pour transformer directement mon data.frame d'origine.
Si on rajoute l'unité de la variable "VSORRESU", je souhaiterai qu'à la fois la valeur et l'unité soit "disperser" en même temps, pour avoir par exemple "BMI.value", "BMI.unit", "DIABPlue" et "DIABP.unit", pour le moment j'obtiens plutôt ça:
Code : Tout sélectionner
dta %>%
filter(is.na(VSTPTNUM) | VSTPTNUM==1) %>%
select(VSORRES, VISIT, SUBJID, VSTESTCD, VSORRESU) %>%
spread(key = VSTESTCD, value = VSORRES)
VISIT SUBJID VSORRESU BMI DIABP
1 BASELINE 756 kg/m2 26.6 NA
2 BASELINE 756 mmHg NA 80
3 BASELINE 83228 kg/m2 35.4 NA
4 BASELINE 83228 mmHg NA 80
5 LAST 756 kg/m2 NA NA
6 LAST 756 mmHg NA NA
7 LAST 83228 kg/m2 NA NA
8 LAST 83228 mmHg NA NA
9 VISIT1 756 kg/m2 29.4 NA
10 VISIT1 756 mmHg NA 76
11 VISIT1 83228 kg/m2 31.2 NA
12 VISIT1 83228 mmHg NA 80
13 VISIT2 756 kg/m2 NA NA
14 VISIT2 756 mmHg NA NA
15 VISIT2 83228 kg/m2 37.9 NA
16 VISIT2 83228 mmHg NA 81
17 VISIT3 756 kg/m2 32.2 NA
18 VISIT3 756 mmHg NA 79
19 VISIT3 83228 kg/m2 38.2 NA
20 VISIT3 83228 mmHg NA 93
Si quelqu'un à l'astuce pour formater mes données où si vous connaissez un package permettant de manipuler des données au format CDISC pour les rendre exploitable (analysable) dans R, je suis preneur. (Sinon je finirais bien par faire quelque-chose de très très moche et presque fonctionnel)
Par avance, merci.