Learn R4DS

02-Explore

Introduction

Let's explore some useful tools that have an immediate payoff:

Visualisation
Transformation
Data Exploration - curiosity and scepticism to ask and answer interesting questions about data

03-Data Visualization

First Steps

Using the example cars dataset (mpg) from ggplot2, let's try to answer the following questions:

Do cars with big engines use more fuel than cars with small engines?
What does the relationship between engine size and fuel efficiency look like?
- Is it positive? Negative? Linear? Nonlinear?

# Auto-print the mpg dataset to inspect its columns and first rows.
mpg

## # A tibble: 234 x 11
##    manufacturer model    displ  year   cyl trans   drv     cty   hwy fl    class
##    <chr>        <chr>    <dbl> <int> <int> <chr>   <chr> <int> <int> <chr> <chr>
##  1 audi         a4         1.8  1999     4 auto(l~ f        18    29 p     comp~
##  2 audi         a4         1.8  1999     4 manual~ f        21    29 p     comp~
##  3 audi         a4         2    2008     4 manual~ f        20    31 p     comp~
##  4 audi         a4         2    2008     4 auto(a~ f        21    30 p     comp~
##  5 audi         a4         2.8  1999     6 auto(l~ f        16    26 p     comp~
##  6 audi         a4         2.8  1999     6 manual~ f        18    26 p     comp~
##  7 audi         a4         3.1  2008     6 auto(a~ f        18    27 p     comp~
##  8 audi         a4 quat~   1.8  1999     4 manual~ 4        18    26 p     comp~
##  9 audi         a4 quat~   1.8  1999     4 auto(l~ 4        16    25 p     comp~
## 10 audi         a4 quat~   2    2008     4 manual~ 4        20    28 p     comp~
## # ... with 224 more rows

Graphing

Plot # 1: displ vs hwy

displ - engine displacement, in litres (engine size)
hwy - highway miles per gallon (car’s efficiency)

# Scatter of highway mileage against engine displacement, overlaid with a
# linear fit and its 95% confidence band. The points inherit the plot-level
# x/y mapping.
ggplot(mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point() +
  geom_smooth(method = "lm", se = TRUE, level = 0.95)

The plot shows a negative relationship between engine size (displ) and efficiency (hwy).

Plot # 2: cyl vs hwy

cyl - number of cylinders
hwy - highway miles per gallon (car’s efficiency)

# Cylinder count goes on the x axis and highway mileage on the y axis, so
# the fitted line models hwy as a function of cyl, the same orientation as
# the displ vs hwy plot.
ggplot(data = mpg, aes(x = cyl, y = hwy)) +
  geom_point() +
  geom_smooth(method = "lm", se = TRUE, level = 0.95)

The plot shows a negative relationship between the number of cylinders (cyl) and efficiency (hwy).

Plot # 3: OMG!! 2 seaters have a differentiated mileage

Mix and match color with size, shape and alpha

# Same scatter and linear fit, with the points coloured by vehicle class.
# Colour is mapped only in the point layer, so a single line is fitted.
ggplot(mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point(aes(color = class)) +
  geom_smooth(method = "lm", se = TRUE, level = 0.95)

## `geom_smooth()` using formula 'y ~ x'

Plot # 4: Conditional coloring

# Colour each point by the logical condition displ < 5; the linear fit
# still uses all points.
ggplot(mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point(aes(color = displ < 5)) +
  geom_smooth(method = "lm", se = TRUE, level = 0.95)

## `geom_smooth()` using formula 'y ~ x'

Facets

Plot # 5: by Class

Clear view of the class variable. But the linear regression gave more insight than the facets.

# One panel per vehicle class, laid out over two rows; points are coloured
# by whether displ < 5.
ggplot(mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point(aes(color = displ < 5)) +
  facet_wrap(vars(class), nrow = 2)

Plot # 5b: cyl ~ drv

# Grid of panels with cyl on the rows and drv on the columns; points are
# coloured by whether hwy < 25.
ggplot(mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point(aes(color = hwy < 25)) +
  facet_grid(rows = vars(cyl), cols = vars(drv))

Geom objects

Plot # 6 - GEOM_SMOOTH

# Smoothed trend of hwy over displ using geom_smooth()'s default method.
ggplot(mpg, mapping = aes(x = displ, y = hwy)) +
  geom_smooth()

Plot # 7 - GEOM_POINT

# Plain scatter plot of hwy against displ.
ggplot(mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point()

Plot # 8 - GEOM_SMOOTH with Linetype

# One smoothed line per drv value, distinguished by line type.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_smooth(aes(linetype = drv))

Plot # 9 - Multiple geoms in the same plot

# Layer two geoms on the same plot: the raw points, plus one smoothed line
# per drv value (distinguished by line type).
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy)) +
  geom_smooth(mapping = aes(x = displ, y = hwy, linetype = drv))

Plot # 10 - display different aesthetics in different layers

# x and y are mapped once at the plot level and shared by both layers;
# colour by class is mapped only in the point layer, so the smoother is
# fitted across all classes.
ggplot(mpg, aes(displ, hwy)) +
  geom_point(aes(color = class)) +
  geom_smooth()

Statistical transformations

The figure below describes how this process works with geom_bar()

Plot 11 - default stat is count (shown here as proportions via prop)

# Bar heights show the proportion of diamonds in each cut instead of the
# raw count. group = 1 makes the proportions relative to the whole dataset
# rather than to each bar. after_stat() replaces the deprecated stat()
# helper for referring to computed variables.
ggplot(data = diamonds) +
  geom_bar(mapping = aes(x = cut, y = after_stat(prop), group = 1))

Plot 12 - default stat changed from count to other

# Pre-aggregated frequencies per cut, entered row by row.
demo <- tribble(
  ~cut, ~freq,
  "Fair", 1610,
  "Good", 4906,
  "Very Good", 12082,
  "Premium", 13791,
  "Ideal", 21551
)

# stat = "identity" plots the freq values as bar heights instead of
# counting rows.
ggplot(demo, aes(x = cut, y = freq)) +
  geom_bar(stat = "identity")

Plot 13 - stat_summary

# Summarise depth for each cut: the median as the central value, with the
# minimum and maximum as the lower and upper bounds.
ggplot(diamonds, aes(x = cut, y = depth)) +
  stat_summary(fun = median, fun.min = min, fun.max = max)

Position adjustments

Plot 14 - color

# Count of diamonds per cut; the colour aesthetic is mapped to cut.
ggplot(diamonds, aes(x = cut)) +
  geom_bar(aes(color = cut))

Plot 15 - fill

# Count of diamonds per cut; the fill aesthetic is mapped to cut.
ggplot(diamonds, aes(x = cut)) +
  geom_bar(aes(fill = cut))

Plot 16 - clarity

# Count of diamonds per cut, with each bar split by clarity using the
# default position adjustment.
ggplot(diamonds, aes(x = cut)) +
  geom_bar(aes(fill = clarity))

Plot 17 - 100% stacked - position fill

# Bars split by clarity with the "fill" position adjustment.
ggplot(diamonds, aes(x = cut, fill = clarity)) +
  geom_bar(position = "fill")

Plot 18 - position dodge

# Bars split by clarity with the "dodge" position adjustment.
ggplot(diamonds, aes(x = cut, fill = clarity)) +
  geom_bar(position = "dodge")

Plot 19 - position jitter

# Scatter of hwy against displ with the "jitter" position adjustment
# applied to the points.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(position = "jitter")

Coordinate systems

Plot 20 - coord_flip()

# Box plot of hwy for each vehicle class.
ggplot(mpg, aes(class, hwy)) +
  geom_boxplot()

# The same box plot with coord_flip() applied.
ggplot(mpg, aes(class, hwy)) +
  geom_boxplot() +
  coord_flip()

Plot 21 - coord_quickmap()

# Fetch the "nz" map data as a data frame for plotting.
nz <- map_data("nz")

# Polygon outlines under the default coordinate system.
ggplot(data = nz, mapping = aes(x = long, y = lat, group = group)) +
  geom_polygon(color = "black", fill = "white")

# The same outlines with coord_quickmap() applied.
ggplot(data = nz, mapping = aes(x = long, y = lat, group = group)) +
  geom_polygon(color = "black", fill = "white") +
  coord_quickmap()

Plot 22 - coord_polar()

# Base bar chart of diamond cuts: full-width bars filled by cut, with no
# legend, no axis titles and a square aspect ratio. It is stored in `bar`
# so different coordinate systems can be applied to it below.
bar <- ggplot(diamonds, aes(x = cut, fill = cut)) +
  geom_bar(width = 1, show.legend = FALSE) +
  labs(x = NULL, y = NULL) +
  theme(aspect.ratio = 1)

# Flipped coordinates.
bar + coord_flip()

# Polar coordinates.
bar + coord_polar()