class: center, middle, inverse, title-slide # Wrangling II ### Explorative Datenanalyse mit R
The R Bootcamp
@
CSS
### Dezember 2019 --- layout: true <div class="my-footer"> <span style="text-align:center"> <span> <img src="https://raw.githubusercontent.com/therbootcamp/therbootcamp.github.io/master/_sessions/_image/by-sa.png" height=14 style="vertical-align: middle"/> </span> <a href="https://therbootcamp.github.io/"> <span style="padding-left:82px"> <font color="#7E7E7E"> www.therbootcamp.com </font> </span> </a> <a href="https://therbootcamp.github.io/"> <font color="#7E7E7E"> Explorative Datenanalyse mit R @ CSS | Dezember 2019 </font> </a> </span> </div> --- # Advanced `dplyr` .pull-left45[ <ul> <li class="m1"><span><high>Transformation & Organisation</high> <br><br> <ul class="level"> <li><span>Fehlende Werte ersetzen / entfernen</span></li> <li><span>Ändere alle Variablen, die...</span></li> <li><span>Zeilen zu Spalten oder Spalten zu Zeilen</span></li> </ul> </span></li> <li class="m2"><span><high>Aggregation</high> <br><br> <ul class="level"> <li><span>Nach Variablen gruppieren</span></li> <li><span>Deskriptive Statistiken berechnen</span></li> </ul> </span></li> </ul> ] .pull-right45[ <p align="center"> <img src="image/wrangling.jpeg"><br> <font style="font-size:10px">from <a href="https://DATENsciencebe.com/tag/DATEN-wrangling/">DATENsciencebe.com</a></font> </p> ] --- # Transformation & Organisation .pull-left4[ <ul> <li class="m1"><span><high>Transformation</high> <br><br> <ul class="level"> <li><span><mono>mutate_if()</mono></span></li> <li><span><mono>replace_na()</mono></span></li> <li><span><mono>drop_na()</mono></span></li> </ul> </span></li> <li class="m2"><span><high>Organisation</high> <br><br> <ul class="level"> <li><span><mono>starts_with(), contains()</mono></span></li> <li><span><mono>pivot_longer(), pivot_wider()</mono></span></li> </ul> </span></li> </ul> ] .pull-right5[ ```r patienten_ergebnisse ``` ``` ## # A tibble: 5 x 6 ## id alter bedingung bed_label t_1 t_2 ## <dbl> <dbl> <dbl> <chr> <dbl> <dbl> ## 1 1 37 1 placebo 123 135 ## 2 2 65 2 medikament 143 140 ## 3 3 57 2 medikament NA NA ## 4 4 34 1 placebo 100 105 ## 5 5 45 2 medikament NA NA ``` ] --- # `mutate_if()` .pull-left4[ <ul> <li class="m1"><span><high>Transformation</high> <br><br> <ul class="level"> <li><span><high><mono>mutate_if()</mono></high></span></li> <li><span><mono>replace_na()</mono></span></li> <li><span><mono>drop_na()</mono></span></li> </ul> </span></li> <li class="m2"><span>Organisation <br><br> <ul class="level"> <li><span><mono>starts_with(), contains(), :</mono></span></li> <li><span><mono>pivot_longer(), pivot_wider()</mono></span></li> </ul> </span></li> </ul> ] .pull-right5[ ```r patienten_ergebnisse %>% # Ändere alle numeric in character mutate_if(is.numeric, as.character) ``` ``` ## # A tibble: 5 x 6 ## id alter bedingung bed_label t_1 t_2 ## <chr> <chr> <chr> <chr> <chr> <chr> ## 1 1 37 1 placebo 123 135 ## 2 2 65 2 medikament 143 140 ## 3 3 57 2 medikament <NA> <NA> ## 4 4 34 1 placebo 100 105 ## 5 5 45 2 medikament <NA> <NA> ``` ] --- # `replace_na()` .pull-left4[ <ul> <li class="m1"><span><high>Transformation</high> <br><br> <ul class="level"> <li><span><mono>mutate_if()</mono></span></li> <li><span><high><mono>replace_na()</mono></high></span></li> <li><span><mono>drop_na()</mono></span></li> </ul> </span></li> <li class="m2"><span>Organisation <br><br> <ul class="level"> <li><span><mono>starts_with(), contains(), :</mono></span></li> <li><span><mono>pivot_longer(), pivot_wider()</mono></span></li> </ul> </span></li> </ul> ] .pull-right5[ ```r patienten_ergebnisse %>% # Ändere fehlende Werte in 110 mutate(t_1 = replace_na(t_1, 110)) ``` ``` ## # A tibble: 5 x 6 ## id alter bedingung bed_label t_1 t_2 ## <dbl> <dbl> <dbl> <chr> <dbl> <dbl> ## 1 1 37 1 placebo 123 135 ## 2 2 65 2 medikament 143 140 ## 3 3 57 2 medikament 110 NA ## 4 4 34 1 placebo 100 105 ## 5 5 45 2 medikament 110 NA ``` ] --- # `replace_na()` .pull-left4[ <ul> <li class="m1"><span><high>Transformation</high> <br><br> <ul class="level"> <li><span><mono>mutate_if()</mono></span></li> <li><span><mono>replace_na()</mono></span></li> <li><span><high><mono>drop_na()</mono></high></span></li> </ul> </span></li> <li class="m2"><span>Organisation <br><br> <ul class="level"> <li><span><mono>starts_with(), contains(), :</mono></span></li> <li><span><mono>pivot_longer(), pivot_wider()</mono></span></li> </ul> </span></li> </ul> ] .pull-right5[ ```r patienten_ergebnisse %>% # Entferne Zeielen mit fehlenden Werten drop_na() ``` ``` ## # A tibble: 3 x 6 ## id alter bedingung bed_label t_1 t_2 ## <dbl> <dbl> <dbl> <chr> <dbl> <dbl> ## 1 1 37 1 placebo 123 135 ## 2 2 65 2 medikament 143 140 ## 3 4 34 1 placebo 100 105 ``` ] --- # `starts_with()` .pull-left4[ <ul> <li class="m1"><span>Transformation <br><br> <ul class="level"> <li><span><mono>mutate_if()</mono></span></li> <li><span><mono>replace_na()</mono></span></li> <li><span><mono>drop_na()</mono></span></li> </ul> </span></li> <li class="m2"><span><high>Organisation</high> <br><br> <ul class="level"> <li><span><high><mono>starts_with(), contains(), :</mono></high></span></li> <li><span><mono>pivot_longer(), pivot_wider()</mono></span></li> </ul> </span></li> </ul> ] .pull-right5[ ```r patienten_ergebnisse %>% # Wählen spalten aus die mit "t" beginnen select(starts_with("t")) ``` ``` ## # A tibble: 5 x 2 ## t_1 t_2 ## <dbl> <dbl> ## 1 123 135 ## 2 143 140 ## 3 NA NA ## 4 100 105 ## 5 NA NA ``` ] --- # `contains()` .pull-left4[ <ul> <li class="m1"><span>Transformation <br><br> <ul class="level"> <li><span><mono>mutate_if()</mono></span></li> <li><span><mono>replace_na()</mono></span></li> <li><span><mono>drop_na()</mono></span></li> </ul> </span></li> <li class="m2"><span><high>Organisation</high> <br><br> <ul class="level"> <li><span><high><mono>starts_with(), contains(), :</mono></high></span></li> <li><span><mono>pivot_longer(), pivot_wider()</mono></span></li> </ul> </span></li> </ul> ] .pull-right5[ ```r patienten_ergebnisse %>% # Wählen spalten aus "_" beinhalten select(contains("_")) ``` ``` ## # A tibble: 5 x 3 ## bed_label t_1 t_2 ## <chr> <dbl> <dbl> ## 1 placebo 123 135 ## 2 medikament 143 140 ## 3 medikament NA NA ## 4 placebo 100 105 ## 5 medikament NA NA ``` ] --- # `:` .pull-left4[ <ul> <li class="m1"><span>Transformation <br><br> <ul class="level"> <li><span><mono>mutate_if()</mono></span></li> <li><span><mono>replace_na()</mono></span></li> <li><span><mono>drop_na()</mono></span></li> </ul> </span></li> <li class="m2"><span><high>Organisation</high> <br><br> <ul class="level"> <li><span><high><mono>starts_with(), contains(), :</mono></high></span></li> <li><span><mono>pivot_longer(), pivot_wider()</mono></span></li> </ul> </span></li> </ul> ] .pull-right5[ ```r patienten_ergebnisse %>% # Spalten von alter bis t_1 select(alter:t_1) ``` ``` ## # A tibble: 5 x 4 ## alter bedingung bed_label t_1 ## <dbl> <dbl> <chr> <dbl> ## 1 37 1 placebo 123 ## 2 65 2 medikament 143 ## 3 57 2 medikament NA ## 4 34 1 placebo 100 ## 5 45 2 medikament NA ``` ] --- # `pivot_*()` .pull-left4[ <ul> <li class="m1"><span>Transformation <br><br> <ul class="level"> <li><span><mono>mutate_if()</mono></span></li> <li><span><mono>replace_na()</mono></span></li> <li><span><mono>drop_na()</mono></span></li> </ul> </span></li> <li class="m2"><span><high>Organisation</high> <br><br> <ul class="level"> <li><span><mono>starts_with(), contains(), :</mono></span></li> <li><span><high><mono>pivot_longer(), pivot_wider()</mono></high></span></li> </ul> </span></li> </ul> ] .pull-right5[ <p align="center"> <img src="image/tidyr-spread-gather.gif" height=420px><br> <font style="font-size:10px">adapted from <a href="https://github.com/gadenbuie/tidyexplain">tidyexplain</a></font> </p> ] --- # `pivot_longer()` .pull-left4[ ```r # wide zu long DATEN %>% pivot_longer(cols = SPALTEN, names_to = NAMEN, values_to = WERTE) ``` ] .pull-right5[ ```r # wide zu long patienten_ergebnisse %>% filter(bed_label == "placebo") ``` ``` ## # A tibble: 2 x 6 ## id alter bedingung bed_label t_1 t_2 ## <dbl> <dbl> <dbl> <chr> <dbl> <dbl> ## 1 1 37 1 placebo 123 135 ## 2 4 34 1 placebo 100 105 ``` ] --- # `pivot_longer()` .pull-left4[ ```r # wide zu long DATEN %>% pivot_longer(cols = SPALTEN, names_to = NAMEN, values_to = WERTE) ``` ] .pull-right5[ ```r # wide zu long patienten_ergebnisse %>% filter(bed_label == "placebo") %>% pivot_longer(cols = c("t_1", "t_2"), names_to = "zeit", values_to = "messung") ``` ``` ## # A tibble: 4 x 6 ## id alter bedingung bed_label zeit messung ## <dbl> <dbl> <dbl> <chr> <chr> <dbl> ## 1 1 37 1 placebo t_1 123 ## 2 1 37 1 placebo t_2 135 ## 3 4 34 1 placebo t_1 100 ## 4 4 34 1 placebo t_2 105 ``` ] --- # `pivot_wider()` .pull-left4[ ```r # wide zu long DATEN %>% pivot_wider(names_from = NAMEN, values_from = WERTE) ``` ] .pull-right5[ ```r # wide zu long patienten_ergebnisse_lang ``` ``` ## # A tibble: 4 x 6 ## id alter bedingung bed_label zeit messung ## <dbl> <dbl> <dbl> <chr> <chr> <dbl> ## 1 1 37 1 placebo t_1 123 ## 2 1 37 1 placebo t_2 135 ## 3 4 34 1 placebo t_1 100 ## 4 4 34 1 placebo t_2 105 ``` ] --- # `pivot_wider()` .pull-left4[ ```r # wide zu long DATEN %>% pivot_wider(names_from = NAMEN, values_from = WERTE) ``` ] .pull-right5[ ```r # wide zu long patienten_ergebnisse_lang %>% pivot_wider(names_from = "zeit", values_from = "messung") ``` ``` ## # A tibble: 2 x 6 ## id alter bedingung bed_label t_1 t_2 ## <dbl> <dbl> <dbl> <chr> <dbl> <dbl> ## 1 1 37 1 placebo 123 135 ## 2 4 34 1 placebo 100 105 ``` ] --- # Aggregation .pull-left4[ <ul> <li class="m1"><span><high>Aggregation</high> <br><br> <ul class="level"> <li><span><mono>summarise()</mono></span></li> <li><span><mono>summarise_if()</mono></span></li> <li><span><mono>group_by(), summarise()</mono></span></li> <li><span><mono>n(), first(), last(), nth()</mono></span></li> <li><span><mono>pull()</mono></span></li> </ul> </span></li> </ul> ] .pull-right5[ ```r basel ``` ``` ## # A tibble: 10,000 x 20 ## id geschlecht alter groesse gewicht ## <dbl> <chr> <dbl> <dbl> <dbl> ## 1 1 f 87 165 NA ## 2 2 m 54 175. 85.6 ## 3 3 f 34 147. 53.9 ## 4 4 m 31 166. 105 ## 5 5 m 24 180. 102. ## # … with 9,995 more rows, and 15 more ## # variables: einkommen <dbl>, ## # bildung <chr>, konfession <chr>, ## # kinder <dbl>, glueck <dbl>, ## # fitness <dbl>, essen <dbl>, ## # alkohol <dbl>, tattoos <dbl>, ## # rhein <dbl>, … ``` ] --- # `summarise()` .pull-left4[ ```r DATEN %>% summarise( NAME = SUMMARY_FUN(A), NAME = SUMMARY_FUN(B) ) ``` ] .pull-right5[ ```r basel %>% # Berechne deskriptive Statistiken summarise( alter_mean = mean(alter), groesse_median = median(groesse) ) ``` ``` ## # A tibble: 1 x 2 ## alter_mean groesse_median ## <dbl> <dbl> ## 1 49.4 171. ``` ] --- # `summarise_if()` .pull-left4[ ```r DATEN %>% summarise_if( BEDINGUNG, SUMMARY_FUN ) ``` ] .pull-right5[ ```r basel %>% select(alter, groesse, konfession, einkommen) ``` ``` ## # A tibble: 10,000 x 4 ## alter groesse konfession einkommen ## <dbl> <dbl> <chr> <dbl> ## 1 87 165 katholisch NA ## 2 54 175. konfessionslos 7500 ## 3 34 147. konfessionslos 5500 ## 4 31 166. katholisch NA ## 5 24 180. katholisch 3800 ## # … with 9,995 more rows ``` ] --- # `summarise_if()` .pull-left4[ ```r DATEN %>% summarise_if( BEDINGUNG, SUMMARY_FUN ) ``` ] .pull-right5[ ```r basel %>% # Berechne deskriptive Statistiken select(alter, groesse, konfession, einkommen) %>% summarise_if(is.numeric, mean) ``` ``` ## # A tibble: 1 x 3 ## alter groesse einkommen ## <dbl> <dbl> <dbl> ## 1 49.4 171. NA ``` ] --- # `summarise_if()` .pull-left4[ ```r DATEN %>% summarise_if( BEDINGUNG, SUMMARY_FUN, ARGUMENTE ) ``` ] .pull-right5[ ```r basel %>% # Berechne deskriptive Statistiken select(alter, groesse, konfession, einkommen) %>% summarise_if(is.numeric, mean, na.rm = TRUE) ``` ``` ## # A tibble: 1 x 3 ## alter groesse einkommen ## <dbl> <dbl> <dbl> ## 1 49.4 171. 8355. ``` ] --- # Gruppierte Aggregation <p align="center"> <img src="image/summarsed_data_diagram.png" height="470px"> </p> --- # `group_by()`, `summarise()` .pull-left4[ ```r DATEN %>% group_by(GRUPPENVARIABLE) %>% summarise( NAME = SUMMARY_FUN(A), NAME = SUMMARY_FUN(B) ) ``` ] .pull-right5[ ```r basel %>% # Gruppiere nach geschlecht group_by(geschlecht) %>% # Berechne Statistiken summarise( alter_mean = mean(alter), groesse_median = median(groesse) ) ``` ``` ## # A tibble: 2 x 3 ## geschlecht alter_mean groesse_median ## <chr> <dbl> <dbl> ## 1 f 49.8 164 ## 2 m 49.1 178. ``` ] --- # `n()` .pull-left4[ ```r DATEN %>% group_by(GRUPPENVARIABLE) %>% summarise( NAME = SUMMARY_FUN(A), NAME = SUMMARY_FUN(B) ) ``` ] .pull-right5[ ```r basel %>% # Gruppiere nach geschlecht group_by(geschlecht) %>% # Berechne Statistiken summarise( N = n() ) ``` ``` ## # A tibble: 2 x 2 ## geschlecht N ## <chr> <int> ## 1 f 5000 ## 2 m 5000 ``` ] --- # `first()` .pull-left4[ ```r DATEN %>% group_by(GRUPPENVARIABLE) %>% summarise( NAME = SUMMARY_FUN(A), NAME = SUMMARY_FUN(B) ) ``` ] .pull-right5[ ```r basel %>% # Gruppiere nach geschlecht group_by(geschlecht) %>% # Berechne Statistiken summarise( N = n(), bild_1 = first(bildung) ) ``` ``` ## # A tibble: 2 x 3 ## geschlecht N bild_1 ## <chr> <int> <chr> ## 1 f 5000 obligatorisch ## 2 m 5000 sek III ``` ] --- # `last()` .pull-left4[ ```r DATEN %>% group_by(GRUPPENVARIABLE) %>% summarise( NAME = SUMMARY_FUN(A), NAME = SUMMARY_FUN(B) ) ``` ] .pull-right5[ ```r basel %>% # Gruppiere nach geschlecht group_by(geschlecht) %>% # Berechne Statistiken summarise( N = n(), bild_1 = first(bildung), bild_N = last(bildung) ) ``` ``` ## # A tibble: 2 x 4 ## geschlecht N bild_1 bild_N ## <chr> <int> <chr> <chr> ## 1 f 5000 obligatorisch lehre ## 2 m 5000 sek III lehre ``` ] --- # `nth()` .pull-left4[ ```r DATEN %>% group_by(GRUPPENVARIABLE) %>% summarise( NAME = SUMMARY_FUN(A), NAME = SUMMARY_FUN(B) ) ``` ] .pull-right5[ ```r basel %>% # Gruppiere nach geschlecht group_by(geschlecht) %>% # Berechne Statistiken summarise( N = n(), bild_1 = first(bildung), bild_N = last(bildung), bild_150 = nth(bildung, 150) ) ``` ``` ## # A tibble: 2 x 5 ## geschlecht N bild_1 bild_N bild_150 ## <chr> <int> <chr> <chr> <chr> ## 1 f 5000 obligatoris… lehre obligatori… ## 2 m 5000 sek III lehre lehre ``` ] --- # `pull()` .pull-left4[ ```r DATEN %>% group_by(GRUPPENVARIABLE) %>% summarise( NAME = SUMMARY_FUN(A), ) %>% pull(NAME) ``` ] .pull-right5[ ```r basel %>% # Gruppiere nach geschlecht group_by(geschlecht) %>% # Berechne Statistiken summarise(N = n()) %>% # Extrahiere Vektor pull(N) ``` ``` ## [1] 5000 5000 ``` ] --- class: middle, center <h1><a href="https://therbootcamp.github.io/EDA_2019CSS/_sessions/WranglingII/WranglingII_practical.html">Practical</a></h1>