Dec 07 2022
- 1. Load packages and functions
- 2. Example for one country: grandparents in Guatemala
- 3. Replicate for all countries
Code by
Diego Alburez-Gutierrez, PhD
Kinship Inequalities Research Group,
Max Planck Institute for Demographic Research
https://www.demogr.mpg.de/en/research_6120/kinship_inequalities_10703
alburezgutierrez[at]demogr.mpg.de
https://www.twitter.com/d_alburez
Replication material for the article ‘The age of the grandparent has arrived’, published in The Economist on 12 Jan 2023.
These are back-of-the-envelope estimates to answer the question: ‘how many grandparents are there in the world?’ I do this by relying on country-level synthetic population microdata with a genealogical structure, produced using the SOCSIM microsimulation software (https://lab.demog.berkeley.edu/socsim/). The synthetic microdata come from the paper:
Alburez‐Gutierrez, D., Mason, C., and Zagheni, E. (2021). The “Sandwich Generation” Revisited: Global Demographic Drivers of Care Time Demands. Population and Development Review 47(4):997–1023. doi: https://doi.org/10.1111/padr.12436.
Details on the simulation implementation, data sources, and parameters are given in the paper. Here I will analyse the replication data for this paper, which is stored in the Harvard Dataverse: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/SSZL6U.
Note that the estimates are rough approximations and estimates for smaller countries (e.g., with populations smaller than 1 million) should be interpreted with special care.
The scripts access these data using the Harvard Dataverse API, so an internet connection is needed to replicate the analysis. The code runs in R, preferably in RStudio.
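If any of the packages used below are missing, they can be installed first (jsonlite is not attached with library() but is called via its namespace in the functions that follow):
# One-off setup, only needed if the packages are not already installed
install.packages(c("tidyverse", "httr", "countrycode",
                   "data.table", "knitr", "jsonlite"))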
library(tidyverse)   # data wrangling and plotting
library(httr)        # requests to the Harvard Dataverse API
library(countrycode) # convert country names to ISO3 codes
library(data.table)  # fast reading/writing of the large UNWPP csv files
library(knitr)       # kable() tables
# jsonlite must also be installed; it is called below via jsonlite::fromJSON()
Define a number of functions for getting data, re-arranging simulation data, and doing the analysis.
# A function to convert SOCSIM months to calendar years:
# x is a month counter, endmo is the final month of the simulation,
# and FinalSimYear is the calendar year in which the simulation ends
asYr2 <- function(x, FinalSimYear, endmo) {
  return(trunc(FinalSimYear - (endmo - x)/12) + 1)
}
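As a quick sanity check of the conversion, with the parameter values used throughout this document (FinalSimYear = 2200, endmo = 5400), a birth recorded in simulation month 3252 corresponds to calendar year 2022:
# Worked example: (5400 - 3252)/12 = 179, so trunc(2200 - 179) + 1 = 2022
asYr2(3252, FinalSimYear = 2200, endmo = 5400)
## [1] 2022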
# Get the list of simulation files available on the Harvard Dataverse
list_data <- function(){
print("Getting API ready...")
# 1. Get links to download simulation data from Harvard Dataverse
# https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/SSZL6U
# Get list of files from Dataverse
api_call <- "https://dataverse.harvard.edu/api/datasets/:persistentId?persistentId=doi:10.7910/DVN/SSZL6U"
res <- GET(api_call)
res_json <- jsonlite::fromJSON(content(res, 'text'), simplifyVector = FALSE)
data_list <- res_json$data$latestVersion$files
data_df <-
lapply(data_list, function(x) data.frame(x)) %>%
bind_rows() %>%
mutate(
country = str_extract(label, "[A-z]+")
, country = gsub("_", "", country)
, seed = str_extract(label, "[0-9]+")
) %>%
select(country, seed , label, id = dataFile.id)
return(data_df)
}
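Note that [A-z] in the regular expression above also matches the underscore, which is why the country name is cleaned with gsub() afterwards. As an illustration, using the Guatemala file label that appears later in this document:
# How a Dataverse file label is split into country and seed
lab <- "Guatemala_296608"
str_extract(lab, "[A-z]+")                 # "Guatemala_" ([A-z] also matches "_")
gsub("_", "", str_extract(lab, "[A-z]+"))  # "Guatemala"
str_extract(lab, "[0-9]+")                 # "296608"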
find_grandparents <- function(countries, year, data_df, export = T){
# Avoid unnecessary computation by NOT re-running the analysis for countries
# that already have a saved csv file in Output/
if(export){
f <- list.files("Output", pattern = "gp_")
f <- gsub("gp_|.csv", "", f)
f <- gsub("_[0-9]+", "", f)
f <- gsub("_", "", f)
avoid <- countries[countries %in% f]
print(paste0("Note: skipping countries already in Output: ", paste(avoid, collapse = ", ")))
countries <- countries[!countries %in% f]
if(!length(countries)){
return(cat("All countries processed!"))
}
}
file_api_base <- "https://dataverse.harvard.edu/api/access/datafile/"
urls <-
data_df %>%
filter(country %in% countries) %>%
# The first USA sim file is empty, skip it!
filter(!label %in% c("USA_200407")) %>%
group_by(country) %>%
slice(1:num_sims) %>% # num_sims is set globally in the analysis sections below
ungroup() %>%
mutate(url = paste0(file_api_base, id)) %>%
data.frame()
# Get data
print("Start estimations from microdata...")
gp <- find_grandparents2(urls, year, export)
if(!export){
return(gp)
}
}
find_grandparents2 <- function(urls, year, export){
gps_list <-
lapply(1:nrow(urls), function(n, urls, year){
u <- urls$url[n]
con <- urls$country[n]
se <- urls$seed[n]
lab <- urls$label[n]
print(paste0("Loading data for ", lab))
# Load data
load(url(u))
# Convert months to years
opop <-
sims$opop %>%
mutate(
ego_birth_year = asYr2(dob, FinalSimYear, endmo)
, ego_death_year = asYr2(dod, FinalSimYear, endmo)
)
print("Finding grandparents...")
out <- find_grandparents3(opop = opop, year = year)
out$country <- con
out$seed <- se
# out$year <- year
out$share <- out$gp / out$pop
out[is.na(out)] <- 0
if(export){
print("Writing grandparent estimates to Output.")
write.csv(out, paste0("Output/gp_", lab, ".csv"))
} else {
return(out)
}
closeAllConnections()
}, urls = urls, year = year)
gps_df <-
gps_list %>%
bind_rows()
return(gps_df)
}
# Function to identify share of the population that is a grandparent
find_grandparents3 <- function(opop, year){
out_l <-
lapply(year, function(y, opop){
print(paste0("Working on year ", y))
# Keep only people alive in given year
df_alive <-
opop %>%
filter(ego_birth_year <= y, ego_death_year > y) %>%
mutate(ego_age_now = y - ego_birth_year)
# find grandparents of these people
egos <- df_alive$pid
match_rows <- match(egos, opop$pid)
gp_pp <- opop$pop[match(opop$pop[match_rows], opop$pid)] # father's father
gp_pm <- opop$pop[match(opop$mom[match_rows], opop$pid)] # mother's father
gp_mm <- opop$mom[match(opop$mom[match_rows], opop$pid)] # mother's mother
gp_mp <- opop$mom[match(opop$pop[match_rows], opop$pid)] # father's mother
# ID of all grandparents, doesn't matter if they're alive
gp <- unique(c(gp_pp, gp_pm, gp_mm, gp_mp))
# ID of living grandparents in year 'year'
gp_alive <- gp[gp %in% egos]
# Get number of grandpas by age
gp_pop <-
df_alive[match(gp_alive, df_alive$pid), ] %>%
count(age = ego_age_now, name = "gp")
# Get age distribution of population in general
all_pop <-
df_alive %>%
count(age = ego_age_now, name = "pop")
out <-
all_pop %>%
left_join(gp_pop, by = c("age"))
out$year <- y
out
}, opop)
out <- bind_rows(out_l)
out
}
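To make the pointer chasing in find_grandparents3() concrete, here is a minimal sketch with made-up ids (not SOCSIM output): each row of opop stores a person's id (pid) together with the ids of their father (pop) and mother (mom), so a grandparent is reached with two successive match() lookups.
# Toy pedigree: persons 1-4 are founders, 5 and 6 are their children,
# and person 7 is a grandchild of all four founders
toy <- data.frame(
  pid = 1:7,
  pop = c(0, 0, 0, 0, 1, 3, 5), # father's pid (0 = no father recorded)
  mom = c(0, 0, 0, 0, 2, 4, 6)  # mother's pid (0 = no mother recorded)
)
# For ego 7, the father is 5 and the mother is 6, so:
toy$pop[match(toy$pop[match(7, toy$pid)], toy$pid)] # father's father -> 1
toy$mom[match(toy$mom[match(7, toy$pid)], toy$pid)] # mother's mother -> 4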
# Get population data from UNWPP
get_unwpp_pop <- function(countries, my_startyr = 2022, my_endyr = 2022){
base_url <- 'https://population.un.org/dataportalapi/api/v1'
# First, identify which indicator codes we want to use
target <- paste0(base_url,'/indicators/?format=csv')
codes <- read.csv(target, sep='|', skip=1)
pop_code <- codes$Id[codes$ShortName == "PopByAge1AndSex"]
# Get location codes
target <- paste0(base_url, '/locations?sort=id&format=csv')
df_locations <- read.csv(target, sep='|', skip=1)
# find the codes for countries
iso3 <- countrycode(countries, origin = "country.name", destination = "iso3c")
locs <-
df_locations %>%
filter(Iso3 %in% iso3) %>%
pull(Id)
my_location <- paste(locs, collapse = ",")
print(paste0("Getting pop data for ", paste(countries, collapse = ", ")))
# Avoid overwhelming the UN API
if(length(countries) <= 20){
my_indicator <- pop_code
my_location <- my_location
target <- paste0(base_url,
'/data/indicators/',my_indicator,
'/locations/',my_location,
'/start/',my_startyr,
'/end/',my_endyr,
'/?format=csv')
pop <-
read.csv(target, sep='|', skip=1) %>%
filter(Variant == "Median") %>%
select(iso3 = Iso3, country = Location, year = TimeLabel, age = AgeStart, sex = Sex, value = Value)
} else{
print("Many countries, I'll process in batch")
my_indicator <- pop_code
times <- floor(length(locs)/20)
sp_vec <- rep(1:20, times)
extras <- length(locs) - length(sp_vec)
if(extras > 0) sp_vec <- c(sp_vec, 1:extras)
my_location_l <- split(locs, sp_vec)
pop <-
lapply(1:length(my_location_l), function(n, my_location_l){
print(paste0("Processing batch ", n, "/", length(my_location_l) ))
loc_n <- paste(my_location_l[[n]], collapse = ",")
target <- paste0(base_url,
'/data/indicators/',my_indicator,
'/locations/', loc_n,
'/start/',my_startyr,
'/end/',my_endyr,
'/?format=csv')
pop <- read.csv(target, sep='|', skip=1)
Sys.sleep(1)
pop
}, my_location_l) %>%
bind_rows() %>%
filter(Variant == "Median") %>%
select(iso3 = Iso3, country = Location, year = TimeLabel, age = AgeStart, sex = Sex, value = Value)
}
return(pop)
}
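For reference, the request that get_unwpp_pop() assembles for a single batch follows this pattern (the indicator and location ids are placeholders here; the function looks up the real codes from the API at run time):
# URL pattern used by get_unwpp_pop(); "<pop_code>" and "<location_ids>" are
# placeholders for the ids retrieved from the API
base_url <- "https://population.un.org/dataportalapi/api/v1"
paste0(base_url,
       "/data/indicators/", "<pop_code>",
       "/locations/", "<location_ids>",
       "/start/", 2022, "/end/", 2022,
       "/?format=csv")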
First, I show how the estimation works for Guatemala as an example. Later on, I scale this up to all countries in the world.
Define parameters for analysis:
countries <- c("Guatemala")
# How many simulations per country? max 5
# Don't touch this
num_sims <- 1
years <- 2022
# Don't touch this
get_un_pop_from_api <- T
# To convert months to years in SOCSIM
FinalSimYear <- 2200
endmo <- 5400
Load the simulation data from the Harvard Dataverse.
# Show which countries have simulations available
data_df <- list_data()
## [1] "Getting API ready..."
all_countries <- unique(data_df$country)
# Get country iso3c codes
iso3_codes <- countrycode(countries, origin = "country.name", destination = "iso3c")
Now, run the analysis to estimate how many people in the simulation are grandparents.
gps_df <- find_grandparents(countries, years, data_df, export = F)
## [1] "Start estimations from microdata..."
## [1] "Loading data for Guatemala_296608"
## [1] "Finding grandparents..."
## [1] "Working on year 2022"
# Rename columns and add ISO3 country codes
gps <-
gps_df %>%
rename(sim_pop = pop, sim_gp = gp, sim_share = share) %>%
mutate(iso3 = countrycode(country, origin = "country.name", destination = "iso3c")) %>%
select(iso3, year, age, -country, everything())
Combine with UN population numbers to get an approximation of the number of grandparents in the real population.
# Get pop numbers from UN using API
pop <- get_unwpp_pop(countries, my_startyr = years, my_endyr = years)
## [1] "Getting pop data for Guatemala"
# Aggregate by sex
pop_un <-
pop %>%
arrange(iso3) %>%
filter(sex == "Both sexes") %>%
rename(pop_un = value) %>%
select(-country, -sex)
# Combine simulation estimates and UN population numbers to get estimated number of grandparents
pop_gp_by_age <-
gps %>%
left_join(pop_un, by = c("iso3", "year", "age")) %>%
mutate(number_grandparents = sim_share * pop_un) %>%
select(iso3, year, age, number_grandparents, share_grandparents = sim_share, pop_un)
We can now visualise the number of grandparents over age and the ‘per capita’ number of grandparents over age (i.e., the share of the population aged x who are grandparents).
# Distribution of grandparents over age
pop_gp_by_age %>%
rename(`Grandparents per capita` = share_grandparents, `Number of grandparents` = number_grandparents) %>%
select(-pop_un) %>%
pivot_longer(-c(iso3, year, age)) %>%
# mutate() %>%
ggplot(aes(x = age, y = value)) +
geom_line() +
facet_wrap(.~name, scales = "free") +
labs(y = "") +
theme_bw()
Finally, let’s answer the question: how many grandparents are there in Guatemala in 2022, irrespective of how old they are?
# Not by age, but for all ages combined
pop_gp_by_age %>%
group_by(iso3, year) %>%
summarise(
number_grandparents = sum(number_grandparents)
, pop_un = sum(pop_un)
) %>%
ungroup() %>%
mutate(share_grandparents = number_grandparents/pop_un) %>%
mutate(
pop_un = round(pop_un/1e6, 1)
, number_grandparents = round(number_grandparents/1e6, 1)
) %>%
select(iso3, year, `Number of grandparents (millions)` = number_grandparents, `Total population (millions)` = pop_un, `Grandparents per capita` = share_grandparents) %>%
kable()
## `summarise()` has grouped output by 'iso3'. You can override using the
## `.groups` argument.
| iso3 | year | Number of grandparents (millions) | Total population (millions) | Grandparents per capita |
|------|------|-----------------------------------|-----------------------------|-------------------------|
| GTM  | 2022 | 2.9                               | 17.8                        | 0.1615601               |
We do the same thing, but scaling it up to all countries present in the UNWPP data (https://population.un.org/wpp/).
Note that running the code chunk below is likely to take a couple of hours and requires constantly downloading data from the Harvard Dataverse and writing outputs to the disk. The final estimates are stored in the Output directory, at the country level (by age and for all ages combined) and for the entire world. Estimates for 1990-2020 rely on UNWPP ‘historical’ data, whereas estimates for 2025-2040 come from UNWPP projections (medium scenario).
# 1. Preamble -----------
countries <- c("all")
# How many simulations per country? max 5
# Don't touch this
num_sims <- 1
years <- sort(c(seq(1960, 2050, 5), 2022))
# Don't touch this
get_un_pop_from_api <- F
# To convert months to years in SOCSIM
# 20200414 This should work for new estimates up to 2200
FinalSimYear <- 2200
endmo <- 5400
# 2. Locate grandparents in simulations -----------
data_df <- list_data()
all_countries <- unique(data_df$country)
# Get all countries
print("Getting data for all UN countries.")
# Remove regions, etc.
countries_df <-
data_df %>%
filter(!country %in% c("Chinaanddependencies", "ChinaMacaoSAR")) %>%
mutate(iso3 = countrycode(country, origin = "country.name", destination = "iso3c")) %>%
filter(!is.na(iso3))
countries <- unique(countries_df$country)
# Get country iso codes
iso3_codes <- countrycode(countries, origin = "country.name", destination = "iso3c")
find_grandparents(countries, years, data_df, export = T)
# Read gp data from disk
f <- list.files("Output", pattern = "gp_", full.names = T)
gps <-
lapply(f, read.csv) %>%
bind_rows() %>%
select(-X) %>%
rename(sim_pop = pop, sim_gp = gp, sim_share = share) %>%
mutate(iso3 = countrycode(country, origin = "country.name", destination = "iso3c")) %>%
select(iso3, year, age, -country, everything())
# 3. Multiply by population numbers --------------
# Get pop numbers from UN
if(!file.exists("Data/un_pop_full.csv")){
# If this doesn't work, you may need to download the data from
# https://population.un.org/wpp/Download/Files/1_Indicators%20(Standard)/CSV_FILES/WPP2022_Population1JanuaryBySingleAgeSex_Medium_1950-2021.zip
# https://population.un.org/wpp/Download/Files/1_Indicators%20(Standard)/CSV_FILES/WPP2022_Population1JanuaryBySingleAgeSex_Medium_2022-2100.zip
# Download UN data and unzip
# NB: the download.file() calls below are commented out; uncomment them (or
# place the zip files in Data/ manually) before running the unzip() calls
# Estimates and historical data
# download.file("https://population.un.org/wpp/Download/Files/1_Indicators%20(Standard)/CSV_FILES/WPP2022_Population1JanuaryBySingleAgeSex_Medium_1950-2021.zip", "Data/WPP2022_Population1JanuaryBySingleAgeSex_Medium_1950-2021.zip")
unzip("Data/WPP2022_Population1JanuaryBySingleAgeSex_Medium_1950-2021.zip", exdir = "Data")
file.remove("Data/WPP2022_Population1JanuaryBySingleAgeSex_Medium_1950-2021.zip")
# Projections
# download.file("https://population.un.org/wpp/Download/Files/1_Indicators%20(Standard)/CSV_FILES/WPP2022_Population1JanuaryBySingleAgeSex_Medium_2022-2100.zip", "Data/WPP2022_Population1JanuaryBySingleAgeSex_Medium_2022-2100.zip")
unzip("Data/WPP2022_Population1JanuaryBySingleAgeSex_Medium_2022-2100.zip", exdir = "Data")
file.remove("Data/WPP2022_Population1JanuaryBySingleAgeSex_Medium_2022-2100.zip")
pop <-
fread("Data/WPP2022_Population1JanuaryBySingleAgeSex_Medium_1950-2021.csv") %>%
bind_rows(fread("Data/WPP2022_Population1JanuaryBySingleAgeSex_Medium_2022-2100.csv")) %>%
filter(LocTypeName == "Country/Area", Variant == "Medium") %>%
select(iso3 = ISO3_code, year = Time, age = AgeGrp, pop_un = PopTotal) %>%
mutate(
age = as.numeric(ifelse(age == "100+", 100, age))
, pop_un = pop_un*1000
)
fwrite(pop, "Data/un_pop_full.csv", row.names = F)
} else {
pop <- read.csv("Data/un_pop_full.csv", stringsAsFactors = F)
}
pop_un <-
pop %>%
# Keep only relevant years and countries
filter(year %in% years) %>%
filter(iso3 %in% iso3_codes)
# 4. Rough estimate of number of grandparents -------
# 4.1. By country ============
pop_gp_by_age <-
gps %>%
left_join(pop_un, by = c("iso3", "year", "age")) %>%
arrange(iso3) %>%
mutate(number_grandparents = sim_share * pop_un) %>%
select(iso3, year, age, number_grandparents, share_grandparents = sim_share, pop_un)
# Not by age, but for all ages combined
pop_gp <-
pop_gp_by_age %>%
group_by(iso3, year) %>%
summarise(
number_grandparents = sum(number_grandparents)
, pop_un = sum(pop_un)
) %>%
ungroup() %>%
mutate(share_grandparents = number_grandparents/pop_un) %>%
select(iso3, year, number_grandparents, share_grandparents, pop_un)
# For the whole world
pop_gp_world <-
pop_gp %>%
group_by(year) %>%
summarise(
number_grandparents = sum(number_grandparents)
, pop_un = sum(pop_un)
) %>%
ungroup() %>%
mutate(share_grandparents = number_grandparents/pop_un)
# 5. Export
write.csv(pop_gp_by_age, "Output/grandparents_by_country_age.csv", row.names = F)
write.csv(pop_gp, "Output/grandparents_by_country.csv", row.names = F)
write.csv(pop_gp_world, "Output/grandparents_world.csv", row.names = F)
sessionInfo()
## R version 4.2.1 (2022-06-23 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19044)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United Kingdom.utf8
## [2] LC_CTYPE=English_United Kingdom.utf8
## [3] LC_MONETARY=English_United Kingdom.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United Kingdom.utf8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] knitr_1.41 data.table_1.14.6 countrycode_1.4.0 httr_1.4.4
## [5] forcats_0.5.2 stringr_1.5.0 dplyr_1.0.10 purrr_1.0.0
## [9] readr_2.1.3 tidyr_1.2.1 tibble_3.1.8 ggplot2_3.4.0
## [13] tidyverse_1.3.2
##
## loaded via a namespace (and not attached):
## [1] lubridate_1.9.0 assertthat_0.2.1 digest_0.6.31
## [4] utf8_1.2.2 R6_2.5.1 cellranger_1.1.0
## [7] backports_1.4.1 reprex_2.0.2 evaluate_0.19
## [10] highr_0.10 pillar_1.8.1 rlang_1.0.6
## [13] googlesheets4_1.0.1 curl_4.3.3 readxl_1.4.1
## [16] rstudioapi_0.14 rmarkdown_2.19 labeling_0.4.2
## [19] googledrive_2.0.0 munsell_0.5.0 broom_1.0.2
## [22] compiler_4.2.1 modelr_0.1.10 xfun_0.36
## [25] pkgconfig_2.0.3 htmltools_0.5.4 tidyselect_1.2.0
## [28] fansi_1.0.3 crayon_1.5.2 tzdb_0.3.0
## [31] dbplyr_2.2.1 withr_2.5.0 grid_4.2.1
## [34] jsonlite_1.8.4 gtable_0.3.1 lifecycle_1.0.3
## [37] DBI_1.1.3 magrittr_2.0.3 scales_1.2.1
## [40] cli_3.5.0 stringi_1.7.8 farver_2.1.1
## [43] fs_1.5.2 xml2_1.3.3 ellipsis_0.3.2
## [46] generics_0.1.3 vctrs_0.5.1 tools_4.2.1
## [49] glue_1.6.2 hms_1.1.2 fastmap_1.1.0
## [52] yaml_2.3.6 timechange_0.2.0 colorspace_2.0-3
## [55] gargle_1.2.1 rvest_1.0.3 haven_2.5.1