Bonjour
Je suis confronté à une difficulté depuis plusieurs jours: faire l'échantillonnage stratifié.
Je ne connais pas encore bien R, mais je l'utilise pour construire un modèle prédictif.
Vous pouvez accéder à mon jeu de données par le lien suivant: https://drive.google.com/file/d/1wwpFy- ... sp=sharing
J'ai plusieurs questions:
1) Y a-t-il un moyen pour automatiser certaines actions qui se répètent avec de légères modifications ? (exemple de la partie ATTRIBUTION A CHAQUE OBSERVATION DU NOMBRE D'OCCURRENCES DE "FORMULAVERSION"; je l'ai fait manuellement, mais c'est très fastidieux et source d'éventuelles erreurs de saisie)
2) Je veux construire des échantillons de test (30 %) et d'entraînement (70 %), voire de validation, de sorte que, dans chaque échantillon, la variable "FORMULAVERSION" se retrouve dans les mêmes proportions que dans le jeu de données global. J'ai donc ajouté au jeu de données une colonne comptant le nombre d'occurrences de chaque "FORMULAVERSION", puis une autre calculant les fréquences. Je souhaite considérer "FORMULAVERSION" comme variable de stratification pour faire l'échantillonnage.
3) Quelles fonctions et quels packages me sont nécessaires ?
Merci pour l'attention que vous porterez à mes inquiétudes.
Bien cordialement,
Dabêgou RAGAWA
Voici ce que j'ai essayé, sans y parvenir ou sans savoir exactement quoi faire.
```{r, include=FALSE}
##
#PACKAGES
##
library(openxlsx)
library(readr)
library(readxl)
library(dplyr)
library(survey)
library(sampling)
library(SamplingStrata)
##
# WORKING DIRECTORY
##
# NOTE(review): absolute, machine-specific path; the script only runs as-is
# on a machine where this directory exists.
setwd(
  "C:/Users/dabegou.ragawa/Data Analyst - Data Analyst/R/Joachim/ANALYSE/DATA/rc/30062020"
)
##
# DATA IMPORT
##
# The workbook is looked up relative to the working directory set above.
clean <- read_xlsx(path = "DigitalTwinDryerHumidityFormula-v2.xlsx")
##
# CREATE A COLUMN THAT WILL HOLD THE NUMBER OF OBSERVATIONS PER "FormulaVersion"
##
# Initialised to NA for every row; filled in by a later chunk.
clean[["NUMBER"]] <- NA
```
```{r,echo = FALSE}
```
```{r,echo = FALSE}
##
# COUNT THE OCCURRENCES OF EACH FormulaVersion
##
# One row per distinct FormulaVersion, with its count in the Freq column.
NBR_FORMULAVERSION <- as.data.frame(
  table(clean[["FormulaVersion"]])
)
# Display the count table.
NBR_FORMULAVERSION
```
```{r,echo = FALSE}
##
# ASSIGN TO EACH OBSERVATION THE NUMBER OF OCCURRENCES OF ITS "FormulaVersion"
##
# Count every distinct FormulaVersion once, then look up each row's value in
# that table. All levels present in the data are handled automatically, so no
# version code has to be typed (or mistyped) by hand, and new versions in a
# refreshed data set need no code change.
formula_counts <- table(clean$FormulaVersion)
# match() gives the position of each row's FormulaVersion among the counted
# levels; rows whose FormulaVersion is missing get NA and therefore NUMBER = NA.
clean$NUMBER <- as.integer(formula_counts)[
  match(as.character(clean$FormulaVersion), names(formula_counts))
]
```
```{r,echo = FALSE}
##
# COMPUTE THE FREQUENCY OF EACH "FormulaVersion" IN A NEW COLUMN "Frequence"
##
# Each row's NUMBER divided by the total of the FormulaVersion count table.
clean[["Frequence"]] <-
  clean[["NUMBER"]] / sum(table(clean[["FormulaVersion"]]))
# Same per-row ratio stored as a one-column data frame. The expression is kept
# as written because as.data.frame() derives the column name from it.
table_frequence <- as.data.frame(
  clean$NUMBER / sum(table(clean$FormulaVersion))
)
##
# DISPLAY THE DATA SET, NOW INCLUDING THE NUMBER AND Frequence COLUMNS
##
clean
##
# SAVE THE NEW DATA SET WITH THE NUMBER COLUMN (currently disabled)
##
#DigitalTwinDryerHumidityFormulaNumberv1<-write.xlsx(clean,file = "DigitalTwinDryerHumidityFormulaNumber-v1.xlsx")
```
```{r,echo = FALSE}
# SAMPLING DESIGN
# Builds a survey design object from `clean`, stratified by FormulaVersion,
# with no cluster variable (id = ~1) and the NUMBER column supplied as the
# finite population correction.
# NOTE(review): svydesign() appears to only describe a design for estimation;
# it presumably does not draw the train/test subsets itself. Confirm against
# the survey package documentation.
plan<-svydesign(id=~1,strata = ~FormulaVersion,data = clean,fpc =~NUMBER)
```