library(tidyverse)
library(plotly)
Gapminder is a foundation that maintains a database of global socio-economic numbers about each country across time, to promote data-driven sustainable development worlwide.
One of their iconic visualization shows how life expectancy relates to gdp per capita across time.
I am trying to recreate this vis using ggplot2
(for the static version), and plotly
(for the interactive version).
The data is maintained and available on GitHub.
# URLs
life_expectancy_url <-
"https://raw.githubusercontent.com/open-numbers/ddf--gapminder--life_expectancy/master/ddf--datapoints--life_expectancy--by--geo--time.csv"
gdp_capita_year_url <-
"https://raw.githubusercontent.com/open-numbers/ddf--gapminder--gdp_per_capita_cppp/master/ddf--datapoints--gdp_per_capita_cppp--by--geo--time.csv"
geo_url <-
"https://raw.githubusercontent.com/open-numbers/ddf--gapminder--gdp_per_capita_cppp/master/ddf--entities--geo.csv"
continent_url <-
"https://docs.google.com/spreadsheets/d/1I9Bt2jnafZKGyWDry4RAnZ_DopLYw9t_SjIiAZwZaGY/pub?gid=1"
population_url <-
"https://raw.githubusercontent.com/open-numbers/ddf--gapminder--population/master/ddf--datapoints--population--by--country--year.csv"
life_expectancy <-
life_expectancy_url %>%
read_csv()
## Parsed with column specification:
## cols(
## geo = col_character(),
## time = col_integer(),
## life_expectancy = col_double()
## )
if (sum(is.na(life_expectancy)) != 0) {
cat("The data is not complete.")
}
gdp_capita_year <-
gdp_capita_year_url %>%
read_csv()
## Parsed with column specification:
## cols(
## geo = col_character(),
## time = col_integer(),
## gdp_per_capita_cppp = col_integer()
## )
if (sum(is.na(gdp_capita_year)) != 0) {
cat("The data is not complete.")
}
geo <-
geo_url %>%
read_csv()
## Parsed with column specification:
## cols(
## name = col_character(),
## geo = col_character()
## )
if (sum(is.na(geo)) != 0) {
cat("The data is not complete.")
}
missing_continent <-
tribble(
~Entity, ~Group,
"Central African Republic", "Africa",
"Czech Republic", "Europe",
"Dominican Republic", "America",
"North Korea", "Asia",
"South Korea", "Asia",
"Kyrgyz Republic", "Asia",
"Lao", "Asia",
"St. Lucia", "America",
"St. Vincent and the Grenadines", "America",
"Yemen", "Asia",
"South Sudan", "Africa")
continent <-
googlesheets::gs_url(continent_url) %>%
googlesheets::gs_read(ws = 2) %>%
select(Entity:Group) %>%
mutate(Group = Group %>%
stringr::str_replace_all( "[\\[\\]]", "")) %>%
bind_rows(missing_continent)
## Sheet-identifying info appears to be a browser URL.
## googlesheets will attempt to extract sheet key from the URL.
## Putative key: 1I9Bt2jnafZKGyWDry4RAnZ_DopLYw9t_SjIiAZwZaGY
## Worksheets feed constructed with public visibility
## Accessing worksheet titled 'Groups'.
## No encoding supplied: defaulting to UTF-8.
pop <-
population_url %>%
read_csv()
## Parsed with column specification:
## cols(
## year = col_integer(),
## population = col_double(),
## country = col_character()
## )
data <-
life_expectancy %>%
inner_join(gdp_capita_year,
by = c("geo", "time")) %>%
inner_join(geo, by = "geo") %>%
left_join(continent,
by = c("name" = "Entity")) %>%
rename(continent = Group,
country = name,
year = time,
gdp_capita = gdp_per_capita_cppp) %>%
left_join(pop,
by = c("year" = "year", "geo" = "country"))
continent_col <-
c("Africa" = "#00D5E9",
"Asia" = "#FF5872",
"America" = "#7FEB00",
"Europe" = "#FFE700")
plot_year <- function(year_) {
data %>%
filter(year == year_) %>%
arrange(desc(population)) %>%
ggplot(aes(gdp_capita, life_expectancy)) +
annotate("text", x = 7000, y = 50, label = year_,
size = 30, colour = "#E0E0E0", family = "Courier") +
geom_point(aes(size = population, fill = continent),
shape = 21, colour = "grey25", show.legend = FALSE) +
coord_cartesian(xlim = c(350, 500*2^8),
ylim = c(15, 88)) +
scale_x_continuous(trans = "log2",
breaks = 500*2^(0:8),
labels = c(500*2^(0:4), "16k", "32k", "64k", "128k"),
minor_breaks = NULL,
name = "Income per person") +
scale_y_continuous(breaks = seq(20, 80, 10), minor_breaks = NULL,
name = "Life expectancy, years") +
scale_size(limits = range(data$population), range = c(0.25,20)) +
scale_fill_manual(values = continent_col) +
theme_minimal() +
theme(axis.line = element_line(color = "#999999", size = 0.25),
panel.grid.major = element_line(linetype = "dotted", color = "#999999", size = 0.25),
axis.text = element_text(family = "Courier", color = "#999999"),
axis.title.x = element_text(hjust = 0, colour = "#425663"),
axis.title.y = element_text(angle = 0, hjust = 0, vjust = 1.08,
margin = margin(0, -110, 0, 10), colour = "#425663"),
plot.margin = unit(c(2.5, 1, 1.25, 1), "lines"))
}
Year 1800 - 2000:
seq(1800, 2000, 50) %>%
map(plot_year)
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
p <-
data %>%
#arrange(desc(population)) %>%
ggplot(aes(gdp_capita, life_expectancy)) +
# annotate("text", x = 7000, y = 50, label = year,
# size = 30, colour = "#E0E0E0", family = "Courier") +
geom_point(aes(size = population, fill = continent, frame = year),
shape = 21, colour = "grey25", show.legend = FALSE) +
coord_cartesian(xlim = c(350, 500*2^8),
ylim = c(15, 88)) +
scale_x_continuous(trans = "log2",
breaks = 500*2^(0:8),
labels = c(500*2^(0:4), "16k", "32k", "64k", "128k"),
minor_breaks = NULL,
name = "Income per person") +
scale_y_continuous(breaks = seq(20, 80, 10), minor_breaks = NULL,
name = "Life expectancy, years") +
scale_size(limits = range(data$population), range = c(0.25,20)) +
scale_fill_manual(values = continent_col) +
theme_minimal() +
theme(axis.line = element_line(color = "#999999", size = 0.25),
panel.grid.major = element_line(linetype = "dotted", color = "#999999", size = 0.25),
axis.text = element_text(family = "Courier", color = "#999999"),
axis.title.x = element_text(hjust = 0, colour = "#425663"),
axis.title.y = element_text(angle = 0, hjust = 0, vjust = 1.08,
margin = margin(0, -110, 0, 10), colour = "#425663"),
plot.margin = unit(c(2.5, 1, 1.25, 1), "lines"))
## Warning: Ignoring unknown aesthetics: frame
ggplotly(p,
width = 900,
height = 900*0.6) %>%
animation_opts(200, easing = "linear-in-out")
Plotly doesn’t play well with stylised ggplot. Let’s see how closely we can recreate the Gapminder plot with Plotly only.
font_title <-
list(family = "Courier",
size = 16,
color = "#425663")
font_axis <-
list(family = "Courier",
size = 12,
color = "#999999")
slider_opts <-
list(prefix = "YEAR ",
font = list(family = "Courier",
size = 20,
color = "#425663"))
x_axis <-
list(title = "Income per person",
type = "log",
ticktext = c(500*2^(0:4), "16k", "32k", "64k", "128k"),
tickvals = 500*2^(0:8),
zeroline = FALSE,
titlefont = font_title,
tickfont = font_axis,
range = c(log(300)/log(10), log(500*2^8.5)/log(10)),
showline = TRUE,
ticks = "outside",
tickcolor = "white",
ticklen = 10)
y_axis <-
list(title = "Life expectancy, years",
tickvals = seq(20, 80, 10),
range = c(15, 88),
titlefont = font_title,
tickfont = font_axis,
showline = TRUE,
ticks = "outside",
tickcolor = "white",
ticklen = 10)
data %>%
plot_ly(x = ~gdp_capita,
y = ~life_expectancy,
text = ~country,
hoverinfo = "text",
size = ~population,
sizes = c(5, 4000),
width = 900,
height = 900*0.6) %>%
layout(xaxis = x_axis,
yaxis = y_axis) %>%
add_trace(color = ~continent,
colors = ~continent_col,
frame = ~year,
ids = ~country,
mode = "markers",
marker = list(
opacity = 1,
line = list(
color = I("black"),
width = 1
)
),
type = "scatter") %>%
animation_opts(frame = 0,
easing = "linear-in-out") %>%
animation_slider(currentvalue = slider_opts)
This is the closest to the orginal plot I could get.
frame
animation system to work with both the marker and the in-plot year annotation.size
and color
for the legend.frame
animation system is undocumented: I don’t know how / what can be tweaked in the design of the animation button / slider / axis.