Plotting

# Global knitr chunk defaults for this document: code is shown and run,
# messages and warnings are hidden, and figures are 6 x 6 inches,
# centred, at 200 dpi.
knitr::opts_chunk$set(
  echo = TRUE,
  eval = TRUE,
  message = FALSE,
  warning = FALSE,
  comment = NA,
  fig.width = 6,
  fig.height = 6,
  fig.align = "center",
  dpi = 200
)

# Embed the ggplot2 hex logo from its remote URL
knitr::include_graphics(
  path = "https://d21ii91i3y6o6h.cloudfront.net/gallery_images/from_proof/9296/small/1447173871/rstudio-hex-ggplot2-dot-psd.png"
)

Source: https://www.rstudio.com/

Overview

In this practical you’ll practice plotting data with the ggplot2 package.

Cheatsheet

# Embed the local screenshot of the ggplot2 cheatsheet
knitr::include_graphics(
  path = "../_image/ggplot_cheatsheet_ss.png"
)

https://www.rstudio.com/wp-content/uploads/2015/03/ggplot2-cheatsheet.pdf

If you don’t have it already, you can access the ggplot2 cheatsheet here https://www.rstudio.com/wp-content/uploads/2015/03/ggplot2-cheatsheet.pdf. This has a nice overview of all the major functions in ggplot2.

Examples

The following examples will take you through the steps of creating both simple and complex plots with ggplot2. Try to go through each line of code and see how it works!

# -----------------------------------------------
# Examples of using ggplot2 on the mpg data
# ------------------------------------------------

library(tidyverse)         # Load tidyverse (which contains ggplot2!)

# Print the mpg data to see what it contains
mpg

# With data but no aesthetic mappings, ggplot() only draws an empty canvas
ggplot(mpg)

# Mapping engine displacement (displ) to x and highway miles per gallon
# (hwy) to y sets up the axes, but nothing is drawn until a geom is added
ggplot(mpg, aes(x = displ, y = hwy))

# Scatterplot: geom_point() draws one point per row
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point()

# Count plot: geom_count() draws points for the same x/y mapping
# (see ?geom_count for how overlapping observations are handled)
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_count()

# Same scatterplot, now with fixed (non-mapped) point settings,
# explicit axis limits, and a different theme for this plot only

ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(
    colour = "red",        # Red points
    size = 3,              # Larger size
    alpha = 0.5,           # Transparent points
    position = "jitter"    # Jitter the points
  ) +
  scale_x_continuous(limits = c(1, 15)) +   # x-axis limits
  scale_y_continuous(limits = c(0, 50)) +   # y-axis limits
  theme_minimal()


# Map the class column to the colour aesthetic and label the plot with labs()

ggplot(mpg, aes(x = displ, y = hwy, colour = class)) +   # One colour per class
  geom_point(position = "jitter", size = 3) +
  labs(
    title = "MPG data",
    subtitle = "Cars with higher engine displacement tend to have lower highway mpg",
    caption = "Source: mpg data in ggplot2",
    x = "Engine Displacement in Liters",
    y = "Highway miles per gallon"
  )
  

# One regression line per class: geom_smooth() inherits color = class,
# so a separate linear fit is drawn for every class

ggplot(mpg, aes(x = displ, y = hwy, colour = class)) +
  geom_point(alpha = 0.9, size = 3) +
  geom_smooth(method = "lm")

# One regression line for all classes: setting a fixed colour on the
# smooth layer overrides the inherited class colour mapping

ggplot(mpg, aes(x = displ, y = hwy, colour = class)) +
  geom_point(alpha = 0.9, size = 3) +
  geom_smooth(method = "lm", colour = "blue")


# One panel per class, with points coloured by number of cylinders
# (cyl is converted to a factor so it gets discrete colours)
ggplot(mpg, aes(x = displ, y = hwy, colour = factor(cyl))) +
  geom_point() +
  facet_wrap(facets = ~ class)


# Another fancier example: city vs highway mileage as a count plot coloured
# by manufacturer, with a smoothed trend line and text labels on the cars
# with city mileage above 25

ggplot(data = mpg,
       mapping = aes(x = cty, y = hwy)) +
  geom_count(aes(color = manufacturer)) +    # Add count geom (see ?geom_count)
  geom_smooth(se = FALSE) +                  # Smoothed line without confidence interval
  geom_text(data = filter(mpg, cty > 25),    # Only label cars with cty > 25
            # x and y are inherited from ggplot(); the labels are the row
            # numbers within the filtered subset, not within the full mpg data
            aes(label = rownames(filter(mpg, cty > 25))),
            position = position_nudge(y = -1),   # Place labels just below the points
            check_overlap = TRUE,
            size = 5) +
  labs(x = "City miles per gallon",
       y = "Highway miles per gallon",
       title = "City and Highway miles per gallon",
       subtitle = "Numbers indicate cars with city mpg > 25",
       caption = "Source: mpg data in ggplot2",
       color = "Manufacturer",
       size = "Counts")

Tasks

Setup

Open your bootcamp R project. Open a new R script and save it under the name plotting_practical.R in your 2_Code folder.
At the top of your script, load the tidyverse package (which includes ggplot2) together with the other packages listed below.

library(tidyverse)
library(speff2trial)
library(car)
library(FFTrees)

Building a plot step-by-step

The diamonds dataset in the ggplot2 package contains information about roughly 54,000 round cut diamonds. Print the diamonds dataset; it should look like this:

# Print the diamonds tibble to inspect its columns
diamonds

# A tibble: 53,940 x 10
   carat cut       color clarity depth table price     x     y     z
   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
 1 0.23  Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
 2 0.21  Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
 3 0.23  Good      E     VS1      56.9    65   327  4.05  4.07  2.31
 4 0.290 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
 5 0.31  Good      J     SI2      63.3    58   335  4.34  4.35  2.75
 6 0.24  Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
 7 0.24  Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
 8 0.26  Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
 9 0.22  Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
10 0.23  Very Good H     VS1      59.4    61   338  4     4.05  2.39
# ... with 53,930 more rows

Using ggplot(), create the following blank plot using the data and mapping arguments (but no geom).

# Axes only: carat on x and price on y, with no geom added yet
ggplot(diamonds, aes(x = carat, y = price))

Now, using geom_point(), add points showing the relationship between the number of carats in the diamonds (carat) and its price (price)

# Scatterplot of price against carat
ggplot(diamonds, aes(x = carat, y = price)) +
  geom_point()

Make the points transparent using the alpha argument to geom_point()

# Same scatterplot with highly transparent points to reduce overplotting
ggplot(diamonds, aes(x = carat, y = price)) +
  geom_point(alpha = 0.05)

Using the color aesthetic mapping, color the points by their cut.

# Colour the transparent points by cut
ggplot(diamonds, aes(x = carat, y = price, colour = cut)) +
  geom_point(alpha = 0.05)

Using the facet_wrap() function, create different plots for each value of cut.

# One panel per cut, all laid out in a single row
ggplot(diamonds, aes(x = carat, y = price, colour = cut)) +
  geom_point(alpha = 0.05) +
  facet_wrap(facets = ~ cut, nrow = 1)

Using the geom_smooth() function, add a black, smoothed mean line to each plot (You can also try turning the line into a regression line using the method argument)

# Faceted scatterplot with a black smoothed line drawn in every panel
ggplot(diamonds, aes(x = carat, y = price, colour = cut)) +
  geom_point(alpha = 0.05) +
  facet_wrap(facets = ~ cut, nrow = 1) +
  geom_smooth(colour = "black")

Playing with themes

Look at the theme help menu with ?theme_bw() to see a list of all of the standard ggplot themes. Then, try adding one of these themes to your previous plots to see how they change.
The ggthemes package contains many additional themes. If you don’t have the package already, install it by running install.packages("ggthemes"). Then, look at the ggthemes vignette by running the following code:

# Open the vignette named "ggthemes" that ships with the ggthemes package
vignette(topic = "ggthemes", package = "ggthemes")

Now, create the following plot from the mpg data using the Five Thirty Eight theme. Note that cty is on the x axis, and hwy is on the y axis.

library(ggthemes)

# City vs highway mileage, one panel per manufacturer,
# styled with the Five Thirty Eight theme from ggthemes
ggplot(mpg, aes(x = cty, y = hwy)) +
  geom_point() +
  labs(title = "City vs highway miles per gallon") +
  facet_wrap(facets = ~ manufacturer) +
  theme_fivethirtyeight()

Density geom with `geom_density()`

Create the following density plot of carat weights from the diamonds data using the following template:

Set the data argument to diamonds
Map carat to the x aesthetic
Add a density geom with geom_density() and set the fill color to "tomato1"
Add labels
Use the minimal theme with theme_minimal()

# Template: replace every XX with the values described in the steps above
ggplot(
  data = XX,
  mapping = aes(x = XX)
) +
  geom_density(fill = "XX") +
  labs(
    x = "XX",
    y = "XX",
    title = "XX",
    subtitle = "XX",
    caption = "XX"
  ) +
  theme_XX()

# Density of diamond carat weights, filled in "tomato1"
ggplot(data = diamonds,
       mapping = aes(x = carat)) +
  geom_density(fill = "tomato1") +
  labs(x = "Carats",
       y = "Density",               # geom_density() plots a density, not a count
       title = "Diamond Carats",    # The plotted variable is carat, not price
       subtitle = "Created with ggplot!",
       caption = "Source: diamonds dataset") +
  theme_minimal()

Boxplot geom `geom_boxplot()`

Look at the help menu for geom_boxplot(). Then, create the following boxplot using the following template

# Template: replace every XX; note that y is plotted on the log scale
ggplot(
  data = XX,
  mapping = aes(x = XX, y = log(XX), fill = XX)
) +
  geom_boxplot() +
  labs(
    x = "XX",
    y = "XX",
    fill = "XX",
    title = "XX",
    subtitle = "XX"
  ) +
  scale_fill_brewer(palette = "XX")

# Boxplots of log(price) for each cut, filled by cut
ggplot(data = diamonds,
  mapping = aes(x = cut, y = log(price), fill = cut)) +
  geom_boxplot() +
  # The legend belongs to the fill aesthetic, so its title is set via `fill`
  labs(y = "Price (log scale)", x = "Cut", fill = "Cut",
       title = "Distribution of diamond prices by cut",
       subtitle = "Data come from a random sample of 1000 diamonds",
       caption = "Source: diamonds dataset from ggplot2") +
  scale_fill_brewer(palette = "YlGnBu")

Violin geom `geom_violin()`

Now make the following plot using geom_violin(). You can also change the color palette in the palette argument to the scale_fill_brewer() function. Look at the help menu with ?scale_fill_brewer() to see all the possibilities. In the plot below, I’m using "Set1"

# Violin plots of log(price) for each cut, filled by cut
ggplot(data = diamonds,
  mapping = aes(x = cut, y = log(price), fill = cut)) +
  geom_violin() +
  # The legend belongs to the fill aesthetic, so its title is set via `fill`
  labs(y = "Price (log scale)", x = "Cut", fill = "Cut",
       title = "Distribution of diamond prices by cut",
       subtitle = "Data come from a random sample of 1000 diamonds",
       caption = "Source: diamonds dataset from ggplot2") +
  scale_fill_brewer(palette = "Set1")

Summary statistics

You can use the stat_summary() function to add summary statistics as geoms to plots. Using the following template, create the following plot showing the mean prices of diamonds for each level of clarity.

# Template: replace every XX. stat_summary() computes the mean of y for
# each x value and draws it as a bar.
ggplot(data = XX,
  mapping = aes(x = XX, y = XX)) +
  stat_summary(fun = "mean",      # `fun` replaces the deprecated `fun.y` (ggplot2 >= 3.3.0)
               geom = "bar",
               fill = "white",
               col = "black") +
  labs(y = "XX",
       x = "XX",
       title = "XX",
       caption = "XX")

# Bar chart of the mean diamond price for each clarity level
ggplot(data = diamonds,
  mapping = aes(x = clarity, y = price)) +
  stat_summary(fun = "mean",      # `fun` replaces the deprecated `fun.y` (ggplot2 >= 3.3.0)
               geom = "bar",
               fill = "white",
               col = "black") +
  # No colour aesthetic is mapped, so no colour legend title is needed
  labs(y = "Mean Price",
       x = "Clarity",
       title = "Mean diamond prices by Clarity",
       caption = "Source: diamonds dataset from ggplot2")

Now, create the following plot from the mpg dataframe

# Bar chart of the mean highway mileage for each manufacturer
ggplot(data = mpg,
  mapping = aes(x = manufacturer, y = hwy)) +
  stat_summary(fun = "mean",      # `fun` replaces the deprecated `fun.y` (ggplot2 >= 3.3.0)
               geom = "bar",
               fill = "white",
               col = "black") +
  labs(y = "Highway Miles per gallon",
       x = "Manufacturer",
       title = "Average fuel efficiency for different car manufacturers",
       caption = "Source: mpg dataset from ggplot2")

You can easily flip the coordinates of a plot by using coord_flip(). Using coord_flip(), flip the x and y coordinates of your previous plot so it looks like this:

# Same bar chart of mean highway mileage per manufacturer, drawn with
# horizontal bars by flipping the coordinates
ggplot(data = mpg,
  mapping = aes(x = manufacturer, y = hwy)) +
  stat_summary(fun = "mean",      # `fun` replaces the deprecated `fun.y` (ggplot2 >= 3.3.0)
               geom = "bar",
               fill = "white",
               col = "black") +
  labs(y = "Highway Miles per gallon",
       x = "Manufacturer",
       title = "Average fuel efficiency for different car manufacturers",
       caption = "Source: mpg dataset from ggplot2") +
  coord_flip()                    # Swap the x and y axes

Saving plots as objects

Create the following plot from the mpg dataset, and save it as an object called myplot

# Build the city vs highway scatterplot and store it instead of printing it
myplot <- ggplot(mpg, aes(x = cty, y = hwy)) +
  geom_point() +
  labs(
    title = "mpg data",
    x = "City Miles per Gallon",
    y = "Highway Miles per Gallon"
  )

# Evaluating the stored object draws the plot
myplot

Now, using object assignment <- add a regression line to the myplot object with geom_smooth(). Then evaluate the object to see the updated version. It should now look like this:

# Overwrite myplot with a version that also contains a linear regression
# line, so that later uses of myplot include the line
myplot <- myplot + geom_smooth(method = "lm")

# Evaluate the object to see the updated plot
myplot

Using ggsave(), save the object as a pdf file called myplot.pdf in your 3_Figures folder. Set the width to 6 inches, and the height to 4 inches. Open the pdf outside of RStudio to make sure it worked!

# Write the stored plot to a 6 x 4 inch pdf in the 3_Figures folder
ggsave(
  filename = "3_Figures/myplot.pdf",
  plot = myplot,
  device = "pdf",
  width = 6,
  height = 4
)

Demographic information of midwest counties in the US

Print the midwest dataset (it’s contained in ggplot2) and look at the help menu to see what values it contains. It should look like this:

# Print the midwest tibble to inspect its columns
midwest

# A tibble: 437 x 28
     PID county    state  area poptotal popdensity popwhite popblack
   <int> <chr>     <chr> <dbl>    <int>      <dbl>    <int>    <int>
 1   561 ADAMS     IL    0.052    66090      1271.    63917     1702
 2   562 ALEXANDER IL    0.014    10626       759      7054     3496
 3   563 BOND      IL    0.022    14991       681.    14477      429
 4   564 BOONE     IL    0.017    30806      1812.    29344      127
 5   565 BROWN     IL    0.018     5836       324.     5264      547
 6   566 BUREAU    IL    0.05     35688       714.    35157       50
 7   567 CALHOUN   IL    0.017     5322       313.     5298        1
 8   568 CARROLL   IL    0.027    16805       622.    16519      111
 9   569 CASS      IL    0.024    13437       560.    13384       16
10   570 CHAMPAIGN IL    0.058   173025      2983.   146506    16559
# ... with 427 more rows, and 20 more variables: popamerindian <int>,
#   popasian <int>, popother <int>, percwhite <dbl>, percblack <dbl>,
#   percamerindan <dbl>, percasian <dbl>, percother <dbl>,
#   popadults <int>, perchsd <dbl>, percollege <dbl>, percprof <dbl>,
#   poppovertyknown <int>, percpovertyknown <dbl>, percbelowpoverty <dbl>,
#   percchildbelowpovert <dbl>, percadultpoverty <dbl>,
#   percelderlypoverty <dbl>, inmetro <int>, category <chr>

Using the following code as a template, create the following plot showing the relationship between college education and poverty

# Template: replace every XX. The points use shape 21, which has a separate
# fill and outline colour: the fill is mapped to a variable, the outline is
# fixed to white.
ggplot(data = XX,
    mapping = aes(x = XX, y = XX)) +
    geom_point(aes(fill = XX, size = XX), shape = 21, color = "white") +
    geom_smooth(aes(x = XX, y = XX)) +
    labs(
        x = "XX",
        y = "XX",
        title = "XX",
        subtitle = "XX",
        caption = "XX") +
    # The mapped aesthetic is fill, so the palette must be set on the fill
    # scale; a colour scale would have no effect here
    scale_fill_brewer(palette = "XX") +
    scale_size(range = c(XX, XX)) +
    # Make the legend keys readable: black outlines in the size legend,
    # larger keys in the fill legend
    guides(size = guide_legend(override.aes = list(col = "black")),
           fill = guide_legend(override.aes = list(size = 5)))

# College education vs poverty in the midwest data: points are filled by
# state and sized by population density, with a smoothed trend line on top
ggplot(data = midwest,
    # percbelowpoverty is the poverty rate; percpovertyknown is only the
    # share of the population whose poverty status is known
    mapping = aes(x = percollege, y = percbelowpoverty)) +
    geom_point(aes(fill = state, size = popdensity), shape = 21, color = "white") +
    geom_smooth() +     # x and y are inherited from ggplot()
    labs(
        x = "Percent with college education",
        y = "Poverty rate",
        title = "Midwest Data",
        subtitle = "States with higher college education rates tend to have lower poverty rates",
        caption = "Source: ggplot2 package") +
    # The mapped aesthetic is fill, so the palette must be set on the fill
    # scale; a colour scale would have no effect here
    scale_fill_brewer(palette = "Set1") +
    scale_size(range = c(0, 12)) +
    # Make the legend keys readable: black outlines in the size legend,
    # larger keys in the fill legend
    guides(size = guide_legend(override.aes = list(col = "black")),
           fill = guide_legend(override.aes = list(size = 5)))

Create the following density plot showing the density of inhabitants with a college education in different states using the following template

# Template: replace every XX; alpha sets how transparent the densities are
ggplot(
  data = XX,
  mapping = aes(x = XX, fill = XX)
) +
  geom_density(alpha = XX) +
  labs(
    x = "XX",
    y = "XX",
    fill = "XX",
    title = "XX",
    subtitle = "XX",
    caption = "XX"
  )

# Overlapping, semi-transparent densities of college education rates,
# one per state
ggplot(midwest, aes(x = percollege, fill = state)) +
  geom_density(alpha = 0.8) +
  labs(
    x = "Percent of inhabitants with a college education",
    y = "Density",
    fill = "State",
    title = "College education rates",
    subtitle = "For 5 Midwest states",
    caption = "Source: midwest dataset in ggplot2"
  ) +
  theme_bw()

Heatplots with `geom_tile()`

You can create heatplots using the geom_tile() function. Try creating the following heatplot of statistics of NBA players using the following template:

# Download the NBA data from the bootcamp's GitHub repository
nba_long <- read_csv(
  file = "https://raw.githubusercontent.com/therbootcamp/therbootcamp.github.io/master/_sessions/_data/nba_long.csv"
)

# Print the data to inspect it
nba_long

# Template: replace every XX. The gradient takes three colours, and
# geom_tile()'s colour sets the tile borders.
ggplot(
  data = XX,
  mapping = aes(x = XX, y = XX, fill = XX)
) +
  geom_tile(color = "XX") +
  scale_fill_gradientn(colours = c("XX", "XX", "XX")) +
  labs(
    x = "XX",
    y = "XX",
    fill = "XX",
    title = "NBA XX performance",
    subtitle = "XX",
    caption = "XX"
  ) +
  coord_flip()

# Download the NBA data from the bootcamp's GitHub repository
nba_long <- read_csv(
  file = "https://raw.githubusercontent.com/therbootcamp/therbootcamp.github.io/master/_sessions/_data/nba_long.csv"
)

# Heatmap with one white-bordered tile per player/statistic pair, filled on
# a red-white-blue gradient; the coordinates are flipped so players run
# down the vertical axis
ggplot(nba_long, aes(x = Name, y = measure, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradientn(colours = c("red", "white", "blue")) +
  labs(
    title = "NBA player performance",
    subtitle = "Each tile represents how well the player performed on that statistic relative to other players.",
    caption = "Source: https://learnr.wordpress.com/2010/01/26/ggplot2-quick-heatmap-plotting/",
    x = "Player",
    y = "Statistic",
    fill = "Performance"
  ) +
  coord_flip() +
  theme_minimal()

Make the following plot of savings data (psavert) from the economics dataset.

# Template: replace every XX to draw a line plot with a smoothed trend
ggplot(
  data = XX,
  mapping = aes(x = XX, y = XX)
) +
  geom_line() +
  labs(
    y = "XX",
    title = "XX",
    subtitle = "XX",
    caption = "XX"
  ) +
  geom_smooth()

# Line plot of the personal savings rate (psavert) over time,
# with a smoothed trend drawn on top
ggplot(economics, aes(x = date, y = psavert)) +
  geom_line() +
  geom_smooth() +
  labs(
    y = "Savings Rate %",
    title = "Personal Savings Rates Changes over Time",
    subtitle = "Ratio of personal saving to disposable income",
    caption = "Source:  http://research.stlouisfed.org/fred2"
  ) +
  theme_bw()

Make the following plot from the ACTG175 dataset in the speff2trial package. To do this, you’ll need to use both geom_boxplot() and geom_point(). To jitter the points, use the position argument to geom_point(), as well as the position_jitter() function to control how much to jitter the points.

# Boxplots of days for each treatment arm, one panel per level of drugs,
# with the individual observations overlaid as jittered points
ggplot(data = ACTG175,
       aes(x = factor(arms), y = days, fill = factor(arms))) +
  facet_wrap(~ drugs, nrow = 1, labeller = label_both) +
  # Hide the boxplot's own outlier points: every observation is already
  # drawn by geom_point() below
  geom_boxplot(outlier.shape = NA) +
  labs(title = "Number of days until a major negative event",
       subtitle = "For different treatment arms and separated by drug users and non drug users",
       caption = "Source: ACTG175 dataset from the speff2trial package",
       x = "Treatment Arm",
       y = "Number of days until a major negative event",
       fill = "Arm") +
  theme_bw() +
  scale_fill_brewer(palette = "Dark2") +
  # Jitter horizontally only, so the plotted days values stay exact
  geom_point(alpha = .2,
             position = position_jitter(width = 0.1, height = 0))

References and Further Reading

Many of the plots in this practical were taken from Selva Prabhakaran’s website http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html
The main ggplot2 webpage at http://ggplot2.tidyverse.org/ has great tutorials and examples.
ggplot2 is also great for making maps. For examples, check out Eric Anderson’s page at http://eriqande.github.io/rep-res-web/lectures/making-maps-with-R.html