Get started#


Load data#

-_images/palmerpenguins.png -

To start playing around with the functions from these packages, we will use the palmerpenguins data set. This simple data set has both continuous and categorical variables, which makes it perfect for showcasing how different functions work.

# Download the palmerpenguins data set straight from its GitHub repository.
penguins_url = "https://raw.githubusercontent.com/allisonhorst/palmerpenguins/master/inst/extdata/penguins.csv"
dat = read_csv(url(penguins_url))
# Keep only complete observations: drop every row that contains a missing value.
dat = dat %>% drop_na()
# Preview the first rows of the cleaned table (printed below).
head(dat)
-## # A tibble: 6 × 8
-##   species island bill_length_mm bill_depth_mm flipper_length_… body_mass_g sex  
-##   <chr>   <chr>           <dbl>         <dbl>            <dbl>       <dbl> <chr>
-## 1 Adelie  Torge…           39.1          18.7              181        3750 male 
-## 2 Adelie  Torge…           39.5          17.4              186        3800 fema…
-## 3 Adelie  Torge…           40.3          18                195        3250 fema…
-## 4 Adelie  Torge…           36.7          19.3              193        3450 fema…
-## 5 Adelie  Torge…           39.3          20.6              190        3650 male 
-## 6 Adelie  Torge…           38.9          17.8              181        3625 fema…
-## # … with 1 more variable: year <dbl>

General plotting with ggpubr#


ggpubr lets you quickly make insightful plots for exploration, which in turn can be further customized because the package is built on top of ggplot2.


These are useful links for using this package:

- -

Next, we will try to answer different questions using this library and ggplot2.


How many penguins of each species did we observe in total?#

# Count the penguins per species and show the totals as a pie chart.
dat %>%
  count(species) %>%
  ggpie(x = "n", fill = "species")


How many penguins of each species and sex did we observe across the different islands?#

# Count the penguins per species, sex and island, then draw one panel per
# island with the sexes side by side and the counts shown as bar labels.
ggbarplot(
  dat %>% count(species, sex, island),
  x = "species", y = "n", fill = "sex",
  label = TRUE, position = position_dodge(0.7),
  facet.by = "island", palette = "lancet"
)


What are the distributions of flipper lengths considering penguin species, sex and islands of origin?#

# Histograms of flipper length filled by sex, one panel per
# species/island combination.
dat %>%
  gghistogram(
    x = "flipper_length_mm",
    fill = "sex",
    facet.by = c("species", "island")
  )


Alternatively, we can use strip charts:

# Flipper length per island, one panel per species. Points are jittered and
# dodged by sex; the median and interquartile range of each sex are overlaid
# in black.
dat %>%
  ggstripchart(
    x = "island", y = "flipper_length_mm",
    color = "sex", facet.by = "species",
    alpha = 0.5, position = position_jitterdodge(),
    add = "median_iqr",
    add.params = list(color = "black", group = "sex", size = 0.2)
  )


Are the differences of body mass between sexes significant if we control for species and island?#

# Body mass per island, one panel per species, with points jittered and dodged
# by sex and the median/IQR of each sex overlaid in black.
ggstripchart(
  dat,
  x = "island", y = "body_mass_g",
  color = "sex", facet.by = "species",
  alpha = 0.5, position = position_jitterdodge(),
  add = "median_iqr",
  add.params = list(color = "black", group = "sex", size = 0.2)
) +
  # compare the sexes with a Wilcoxon test and print the significance level
  stat_compare_means(aes(color = sex), label = "p.signif", method = "wilcox.test")


What is the relationship between flipper length, body mass and bill length?#

# Body mass against flipper length, with points coloured by bill length.
dat %>%
  ggscatter(
    x = "flipper_length_mm",
    y = "body_mass_g",
    color = "bill_length_mm",
    alpha = 0.5
  )


Could we have sampling bias in the relationship between flipper length and body mass?#

# Same scatter plot, coloured by sampling year. The year is converted to a
# factor so that it is handled as a discrete group, and an ellipse is drawn
# around the points of each year.
dat %>%
  mutate(year = factor(year)) %>%
  ggscatter(
    x = "flipper_length_mm", y = "body_mass_g",
    alpha = 0.5, color = "year",
    ellipse = TRUE
  )


What is the Spearman correlation coefficient between body mass and flipper length?#

# Scatter plot per sampling year with a regression line, its confidence band
# and the Spearman correlation coefficient printed on the plot.
ggscatter(
  dat %>% mutate(year = factor(year)),
  x = "flipper_length_mm", y = "body_mass_g",
  alpha = 0.5, color = "year",
  add = "reg.line", conf.int = TRUE,
  cor.coef = TRUE,
  # label.sep = "\n" separates the coefficient and the p-value with a newline
  cor.coeff.args = list(method = "spearman", label.sep = "\n")
) +
  theme(aspect.ratio = 1)


Create and save a figure#

# Text sizes shared by all panels: base font size passed to theme_pubr() and
# size of the labels drawn inside the plots.
fontsize = 6
labsize = 2

# overview number of observations of every sex across islands and species
p1 = ggbarplot(
  dat %>% count(species, sex, island),
  x = "species", y = "n", fill = "sex",
  label = TRUE, lab.size = labsize,
  position = position_dodge(0.7), facet.by = "island", palette = "lancet"
) +
  # NOTE(review): upper limit presumably leaves headroom for the bar labels
  ylim(NA, 68)

# sex-related body mass distributions across islands and species
p2 = ggstripchart(
  dat,
  x = "island", y = "body_mass_g", color = "sex", facet.by = "species",
  alpha = 0.5, position = position_jitterdodge(), add = "median_iqr",
  add.params = list(color = "black", group = "sex", size = 0.2),
  palette = "lancet"
) +
  stat_compare_means(
    aes(color = sex),
    label = "p.signif", method = "wilcox.test", size = labsize
  )

# association of flipper length and body mass
p3 = ggscatter(
  dat %>% mutate(year = factor(year)),
  x = "flipper_length_mm", y = "body_mass_g", alpha = 0.5, color = "year",
  add = "reg.line", conf.int = TRUE,
  cor.coef = TRUE,
  cor.coeff.args = list(method = "spearman", label.sep = "\n", size = labsize)
) +
  theme(aspect.ratio = 1)

# stack p1 and p2 in one column with a shared legend, then put p3 next to them
p1p2 = ggarrange(
  p1 + theme_pubr(base_size = fontsize),
  p2 + theme_pubr(base_size = fontsize),
  ncol = 1, common.legend = TRUE
)
fig = ggarrange(
  p1p2, p3 + theme_pubr(base_size = fontsize),
  widths = c(2, 1), heights = c(2, 1), labels = "AUTO"
)

# save
# make sure the output directory exists so that ggsave() does not fail
dir.create("images", showWarnings = FALSE)
# "units" is spelled out in full instead of relying on partial matching
ggsave("images/myfig.png", fig, width = 15, height = 10, units = "cm")


Heatmaps with ComplexHeatmap#


Apart from ggpubr, one of the most common packages for visualizing multiple types of data together is ComplexHeatmap, which allows combining hierarchical clustering of rows and columns with continuous and categorical data.

# we are only interested in numeric columns
cols_oi = c("bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g")

# we can customize the color for each species
colors_species = c("Adelie" = "red", "Chinstrap" = "yellow", "Gentoo" = "grey")
colors_annot = list(species = colors_species)

# "dat" is a tibble, so the annotation columns are converted to a plain
# data frame with as.data.frame() before being passed to HeatmapAnnotation()
annotation_row = HeatmapAnnotation(
  df = dat[, c("island", "species")] %>% as.data.frame(),
  name = "metadata_row",
  which = "row",
  col = colors_annot
)

# numeric matrix of the selected measurements
mat = dat[, cols_oi] %>% as.matrix()
# name the rows on the matrix instead of on "dat": setting row names on a
# tibble is deprecated
rownames(mat) = seq_len(nrow(mat))
# center and scale every column (z-scores)
mat = scale(mat)

# draw the heatmap with the island/species annotation on the right-hand side
Heatmap(
  mat,
  name = "zscore",
  show_row_names = FALSE,
  right_annotation = annotation_row
)



- -

Session Info#

-## R version 4.1.2 (2021-11-01)
-## Platform: x86_64-pc-linux-gnu (64-bit)
-## Running under: Ubuntu 18.04.6 LTS
-## Matrix products: default
-## BLAS:   /usr/lib/x86_64-linux-gnu/openblas/libblas.so.3
-## LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
-## locale:
-##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
-##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_US.UTF-8    
-##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_US.UTF-8   
-##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
-##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
-## attached base packages:
-## [1] grid      stats     graphics  grDevices utils     datasets  methods  
-## [8] base     
-## other attached packages:
-##  [1] ComplexHeatmap_2.10.0 ggpubr_0.4.0          forcats_0.5.1        
-##  [4] stringr_1.4.0         dplyr_1.0.8           purrr_0.3.4          
-##  [7] readr_2.1.2           tidyr_1.2.0           tibble_3.1.6         
-## [10] ggplot2_3.3.5         tidyverse_1.3.1      
-## loaded via a namespace (and not attached):
-##  [1] nlme_3.1-155        matrixStats_0.61.0  fs_1.5.2           
-##  [4] lubridate_1.8.0     bit64_4.0.5         RColorBrewer_1.1-2 
-##  [7] doParallel_1.0.17   httr_1.4.2          ggsci_2.9          
-## [10] tools_4.1.2         backports_1.4.1     utf8_1.2.2         
-## [13] R6_2.5.1            BiocGenerics_0.40.0 DBI_1.1.2          
-## [16] mgcv_1.8-39         colorspace_2.0-3    GetoptLong_1.0.5   
-## [19] withr_2.5.0         tidyselect_1.1.2    gridExtra_2.3      
-## [22] bit_4.0.4           compiler_4.1.2      cli_3.2.0          
-## [25] rvest_1.0.2         xml2_1.3.3          labeling_0.4.2     
-## [28] scales_1.1.1        digest_0.6.29       rmarkdown_2.12     
-## [31] pkgconfig_2.0.3     htmltools_0.5.2     dbplyr_2.1.1       
-## [34] fastmap_1.1.0       highr_0.9           GlobalOptions_0.1.2
-## [37] rlang_1.0.2         readxl_1.3.1        rstudioapi_0.13    
-## [40] shape_1.4.6         farver_2.1.0        generics_0.1.2     
-## [43] jsonlite_1.8.0      vroom_1.5.7         car_3.0-12         
-## [46] magrittr_2.0.2      Matrix_1.4-0        S4Vectors_0.32.3   
-## [49] Rcpp_1.0.8          munsell_0.5.0       fansi_1.0.2        
-## [52] abind_1.4-5         lifecycle_1.0.1     stringi_1.7.6      
-## [55] yaml_2.3.5          carData_3.0-5       parallel_4.1.2     
-## [58] crayon_1.5.0        lattice_0.20-45     haven_2.4.3        
-## [61] cowplot_1.1.1       splines_4.1.2       circlize_0.4.14    
-## [64] hms_1.1.1           knitr_1.37          pillar_1.7.0       
-## [67] rjson_0.2.21        ggsignif_0.6.3      stats4_4.1.2       
-## [70] codetools_0.2-18    reprex_2.0.1        glue_1.6.2         
-## [73] evaluate_0.15       modelr_0.1.8        vctrs_0.3.8        
-## [76] png_0.1-7           tzdb_0.2.0          foreach_1.5.2      
-## [79] cellranger_1.1.0    gtable_0.3.0        clue_0.3-60        
-## [82] assertthat_0.2.1    xfun_0.30           broom_0.7.12       
-## [85] rstatix_0.7.0       iterators_1.0.14    IRanges_2.28.0     
-## [88] cluster_2.1.2       ellipsis_0.3.2
