library(tidyverse)
library(plotly)

Introduction

Gapminder is a foundation that maintains a database of global socio-economic numbers about each country across time, to promote data-driven sustainable development worlwide.

One of their iconic visualization shows how life expectancy relates to gdp per capita across time.

I am trying to recreate this vis using ggplot2 (for the static version), and plotly (for the interactive version).

Data

The data is maintained and available on GitHub.

# URLs
life_expectancy_url <- 
  "https://raw.githubusercontent.com/open-numbers/ddf--gapminder--life_expectancy/master/ddf--datapoints--life_expectancy--by--geo--time.csv"

gdp_capita_year_url <- 
  "https://raw.githubusercontent.com/open-numbers/ddf--gapminder--gdp_per_capita_cppp/master/ddf--datapoints--gdp_per_capita_cppp--by--geo--time.csv"

geo_url <- 
  "https://raw.githubusercontent.com/open-numbers/ddf--gapminder--gdp_per_capita_cppp/master/ddf--entities--geo.csv"

continent_url <- 
  "https://docs.google.com/spreadsheets/d/1I9Bt2jnafZKGyWDry4RAnZ_DopLYw9t_SjIiAZwZaGY/pub?gid=1"

population_url <- 
  "https://raw.githubusercontent.com/open-numbers/ddf--gapminder--population/master/ddf--datapoints--population--by--country--year.csv"

Loading into tibbles

Life expectancy

life_expectancy <- 
  life_expectancy_url %>% 
  read_csv()
## Parsed with column specification:
## cols(
##   geo = col_character(),
##   time = col_integer(),
##   life_expectancy = col_double()
## )
if (sum(is.na(life_expectancy)) != 0) {
  cat("The data is not complete.")
}

GDP per capita per year

gdp_capita_year <- 
  gdp_capita_year_url %>% 
  read_csv()
## Parsed with column specification:
## cols(
##   geo = col_character(),
##   time = col_integer(),
##   gdp_per_capita_cppp = col_integer()
## )
if (sum(is.na(gdp_capita_year)) != 0) {
  cat("The data is not complete.")
}

Country names and continents

geo <- 
  geo_url %>% 
  read_csv()
## Parsed with column specification:
## cols(
##   name = col_character(),
##   geo = col_character()
## )
if (sum(is.na(geo)) != 0) {
  cat("The data is not complete.")
}

missing_continent <- 
  tribble(
    ~Entity, ~Group,
    "Central African Republic", "Africa",
    "Czech Republic", "Europe",
    "Dominican Republic", "America",
    "North Korea", "Asia",
    "South Korea", "Asia",
    "Kyrgyz Republic", "Asia",
    "Lao", "Asia",
    "St. Lucia", "America",
    "St. Vincent and the Grenadines", "America",
    "Yemen", "Asia",
    "South Sudan", "Africa")

continent <-
  googlesheets::gs_url(continent_url) %>% 
  googlesheets::gs_read(ws = 2) %>% 
  select(Entity:Group) %>% 
  mutate(Group =  Group %>% 
                    stringr::str_replace_all( "[\\[\\]]", "")) %>% 
  bind_rows(missing_continent)
## Sheet-identifying info appears to be a browser URL.
## googlesheets will attempt to extract sheet key from the URL.
## Putative key: 1I9Bt2jnafZKGyWDry4RAnZ_DopLYw9t_SjIiAZwZaGY
## Worksheets feed constructed with public visibility
## Accessing worksheet titled 'Groups'.
## No encoding supplied: defaulting to UTF-8.

Population

pop <- 
  population_url %>% 
  read_csv()
## Parsed with column specification:
## cols(
##   year = col_integer(),
##   population = col_double(),
##   country = col_character()
## )

Joining the data together

data <- 
  life_expectancy %>% 
  inner_join(gdp_capita_year,
             by = c("geo", "time")) %>% 
  inner_join(geo, by = "geo") %>% 
  left_join(continent,
            by = c("name" = "Entity")) %>% 
  rename(continent = Group,
         country = name,
         year = time,
         gdp_capita = gdp_per_capita_cppp) %>% 
  left_join(pop,
            by = c("year" = "year", "geo" = "country"))

Plot

Plot parameters

continent_col <- 
  c("Africa" = "#00D5E9",
    "Asia" = "#FF5872",
    "America" = "#7FEB00",
    "Europe" = "#FFE700")

Plotting without interaction

plot_year <- function(year_) {
  data %>% 
    filter(year == year_) %>% 
    arrange(desc(population)) %>% 
    ggplot(aes(gdp_capita, life_expectancy)) +
    annotate("text", x = 7000, y = 50, label = year_,
             size = 30, colour = "#E0E0E0", family = "Courier") +
    geom_point(aes(size = population, fill = continent),
               shape = 21, colour = "grey25", show.legend = FALSE) +
    coord_cartesian(xlim = c(350, 500*2^8),
                    ylim = c(15, 88)) +
    scale_x_continuous(trans = "log2",
                       breaks = 500*2^(0:8),
                       labels = c(500*2^(0:4), "16k", "32k", "64k", "128k"), 
                       minor_breaks = NULL,
                       name = "Income per person") +
    scale_y_continuous(breaks = seq(20, 80, 10), minor_breaks = NULL,
                       name = "Life expectancy, years") +
    scale_size(limits = range(data$population), range = c(0.25,20)) +
    scale_fill_manual(values = continent_col) +
    theme_minimal() +
    theme(axis.line = element_line(color = "#999999", size = 0.25),
          panel.grid.major = element_line(linetype = "dotted", color = "#999999", size = 0.25),
          axis.text = element_text(family = "Courier", color = "#999999"),
          axis.title.x = element_text(hjust = 0, colour = "#425663"),
          axis.title.y = element_text(angle = 0, hjust = 0, vjust = 1.08, 
                                      margin = margin(0, -110, 0, 10), colour = "#425663"),
          plot.margin = unit(c(2.5, 1, 1.25, 1), "lines"))
  }

Year 1800 - 2000:

seq(1800, 2000, 50) %>% 
  map(plot_year)
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

Plot with interaction

ggplot + plotly

p <-
 data %>%
   #arrange(desc(population)) %>%
   ggplot(aes(gdp_capita, life_expectancy)) +
   # annotate("text", x = 7000, y = 50, label = year,
   #         size = 30, colour = "#E0E0E0", family = "Courier") +
   geom_point(aes(size = population, fill = continent, frame = year),
              shape = 21, colour = "grey25", show.legend = FALSE) +
   coord_cartesian(xlim = c(350, 500*2^8),
                   ylim = c(15, 88)) +
   scale_x_continuous(trans = "log2",
                      breaks = 500*2^(0:8),
                      labels = c(500*2^(0:4), "16k", "32k", "64k", "128k"),
                      minor_breaks = NULL,
                      name = "Income per person") +
   scale_y_continuous(breaks = seq(20, 80, 10), minor_breaks = NULL,
                      name = "Life expectancy, years") +
   scale_size(limits = range(data$population), range = c(0.25,20)) +
   scale_fill_manual(values = continent_col) +
   theme_minimal() +
   theme(axis.line = element_line(color = "#999999", size = 0.25),
         panel.grid.major = element_line(linetype = "dotted", color = "#999999", size = 0.25),
         axis.text = element_text(family = "Courier", color = "#999999"),
         axis.title.x = element_text(hjust = 0, colour = "#425663"),
         axis.title.y = element_text(angle = 0, hjust = 0, vjust = 1.08,
                                     margin = margin(0, -110, 0, 10), colour = "#425663"),
         plot.margin = unit(c(2.5, 1, 1.25, 1), "lines"))
## Warning: Ignoring unknown aesthetics: frame
ggplotly(p,
         width = 900,
         height = 900*0.6) %>%
 animation_opts(200, easing = "linear-in-out")

Plotly doesn’t play well with stylised ggplot. Let’s see how closely we can recreate the Gapminder plot with Plotly only.

Pure Plotly

font_title <- 
  list(family = "Courier",
       size = 16,
       color = "#425663")

font_axis <- 
  list(family = "Courier",
       size = 12,
       color = "#999999")

slider_opts <- 
  list(prefix = "YEAR ",
       font = list(family = "Courier",
                   size = 20, 
                   color = "#425663"))

x_axis <- 
  list(title = "Income per person",
       type = "log",
       ticktext = c(500*2^(0:4), "16k", "32k", "64k", "128k"),
       tickvals = 500*2^(0:8),
       zeroline = FALSE,
       titlefont = font_title,
       tickfont = font_axis,
       range = c(log(300)/log(10), log(500*2^8.5)/log(10)),
       showline = TRUE,
       ticks = "outside",
       tickcolor = "white",
       ticklen = 10)

y_axis <- 
  list(title = "Life expectancy, years",
       tickvals = seq(20, 80, 10),
       range = c(15, 88),
       titlefont = font_title,
       tickfont = font_axis,
       showline = TRUE,
       ticks = "outside",
       tickcolor = "white",
       ticklen = 10)

data %>%
  plot_ly(x = ~gdp_capita, 
          y = ~life_expectancy,
          text = ~country, 
          hoverinfo = "text",
          size = ~population,
          sizes = c(5, 4000),
          width = 900,
          height = 900*0.6) %>%
  layout(xaxis = x_axis,
        yaxis = y_axis) %>%
  add_trace(color = ~continent, 
            colors = ~continent_col,
            frame = ~year, 
            ids = ~country,
            mode = "markers",
            marker = list(
                      opacity = 1,
                      line = list(
                              color = I("black"),
                              width = 1
                             )
                      ),
           type = "scatter") %>%
  animation_opts(frame = 0, 
                 easing = "linear-in-out") %>%
  animation_slider(currentvalue = slider_opts)

This is the closest to the orginal plot I could get.

  • I couldn’t get the frame animation system to work with both the marker and the in-plot year annotation.
  • I cannot modify the grid line linetype.
  • I couldn’t decouple size and color for the legend.
  • The frame animation system is undocumented: I don’t know how / what can be tweaked in the design of the animation button / slider / axis.
  • I don’t know how to set the plotting order to show the smaller countries on top of the large ones.