Skip to content

Commit

Permalink
Add new read_nhgis_ext function
Browse files Browse the repository at this point in the history
Update pivot_nhgis_data to use column_title_col instead of label_col (for consistency w/ getACS package)

Also improve handling of MOE values and upper/lower bound values if present
  • Loading branch information
elipousson committed May 17, 2024
1 parent b629650 commit 3bc2adb
Show file tree
Hide file tree
Showing 4 changed files with 297 additions and 27 deletions.
184 changes: 165 additions & 19 deletions R/pivot_ipumsr.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
pivot_nhgis_data <- function(data,
variable_col = "variable",
value_col = "value",
label_col = "label",
column_title_col = "column_title",
denominator_prefix = "denominator_",
variable_starts_with = c(
"A", "B", "D0",
Expand All @@ -22,19 +22,43 @@ pivot_nhgis_data <- function(data,
)) {
check_installed(c("tidyr", "labelled", "tibble"))

nhgis_variables <- data |>
labelled::get_variable_labels() |>
nhgis_var_labels <- data |>
labelled::get_variable_labels()

stopifnot(
!is_empty(nhgis_var_labels)
)

nhgis_variables <- nhgis_var_labels |>
tibble::enframe(
name = variable_col,
value = label_col
value = column_title_col
) |>
dplyr::mutate(
"{label_col}" := as.character(.data[[label_col]])
"{column_title_col}" := as.character(.data[[column_title_col]])
)

data |>
year_cols <- c("GEOGYEAR", "DATAYEAR", "YEAR")

keep_cols <- c(
"GISJOIN", "AREA", "NAME", "AREANAME",
year_cols,
"STATE", "STATEA", "STATEFP", "STATENH", "STATEICP",
"COUNTY", "COUNTYA", "COUNTYFP", "COUNTYNH", "COUNTYICP"
)

data <- data |>
dplyr::mutate(
dplyr::across(
dplyr::any_of(year_cols),
as.character
)
) |>
tidyr::pivot_longer(
cols = dplyr::starts_with(variable_starts_with),
cols = c(
dplyr::starts_with(variable_starts_with),
!dplyr::any_of(keep_cols)
),
names_to = variable_col,
values_to = value_col
) |>
Expand All @@ -46,26 +70,146 @@ pivot_nhgis_data <- function(data,
dplyr::mutate(
"{denominator_prefix}{variable_col}" := dplyr::case_when(
# Totals
.data[[variable_col]] %in% c("A41AA", "AV0AA", "A68AA") ~ .data[[variable_col]],
.data[[variable_col]] %in% c(
"A41AA", "A68AA", "AV0AA", "A00AA", "AR5AA",
"CM7AA", "CL8AA"
) ~ .data[[variable_col]],

# Total units
.data[[variable_col]] %in% c("A43AA", "A43AB") ~ "A41AA",

# Occupied units
.data[[variable_col]] %in% c("B37AA", "B37AB") ~ "A43AA",

# Families
.data[[variable_col]] %in% c(
"A88AA", "A88AB", "A88AC", "A88AD", "A88AE"
) ~ "A68AA",

# Persons
.data[[variable_col]] %in% c(
"A35AA",
"B18AA", "B18AB", "B18AC", "B18AD", "B18AE",
"B57AA", "B57AB", "B57AC", "B57AD", "B57AE",
"B57AF", "B57AG", "B57AH", "B57AI", "B57AJ",
"B57AK", "B57AL", "B57AM", "B57AN", "B57AO",
"B57AP", "B57AQ", "B57AR", "D08AA", "D08AB",
"B14AA", "B14AB"
) ~ "AV0AA",

# Negro
.data[[variable_col]] %in% c("BYA003", "A8L005", "A8L006") ~ "BYA003",

# Households
.data[[variable_col]] %in% c(
"BS7AA", "BS7AB", "BS7AC", "BS7AD",
"CV5AA", "CV5AB", "CV5AC", "CV5AD", "CV5AE", "CV5AF"
) ~ "A43AA"
.data[[variable_col]] %in% c("CM9AA", "CM9AB") ~ "CM7AA",

# Occupied housing units
.data[[variable_col]] %in% c("CV5AA", "CV5AB", "CV5AC",
"CV5AD", "CV5AE", "CV5AF") ~ "A43AA",
# Housing units
# .data[[variable_col]] %in% c("CM9AA", "CM9AB") ~ "CM7AA",

.data[[variable_col]] %in% c("AF15001", "AF15002") ~ "AV0AA",

# White
.data[[variable_col]] %in% c("A8L001", "A8L002") ~ "AF15001",

# White Native-born / White Foreign-born use White denominator
.data[[variable_col]] %in% c("BYA001", "BYA002") ~ "AF15001",

.data[[variable_col]] %in% c("BS7AA", "BS7AB", "BS7AC", "BS7AD") ~ "AR5AA"
),
.after = dplyr::all_of(variable_col)
)

moe_join <- any(c(
stringr::str_detect(data[[column_title_col]], "^(Lower bound|Upper bound|Margin of error)")
))

if (!moe_join) {
return(data)
}

data |>
join_moe_cols(
moe_pattern = "^Lower bound",
moe_variable_remove = "L$",
moe_col = paste0(value_col, "_lower"),
variable_col = variable_col,
column_title_col = column_title_col,
value_col = value_col,
denominator_prefix = denominator_prefix
) |>
join_moe_cols(
moe_pattern = "^Upper bound",
moe_variable_remove = "U$",
moe_col = paste0(value_col, "_upper"),
variable_col = variable_col,
column_title_col = column_title_col,
value_col = value_col,
denominator_prefix = denominator_prefix
) |>
join_moe_cols(
moe_pattern = "^Margin of error",
moe_variable_remove = "M$",
moe_col = "moe",
variable_col = variable_col,
column_title_col = column_title_col,
value_col = value_col,
denominator_prefix = denominator_prefix
)
}

#' @noRd
join_moe_cols <- function(
data,
moe_pattern = "^Lower bound",
moe_variable_remove = "L$",
variable_col = "variable",
column_title_col = "column_title",
value_col = "value",
moe_col = "value_lower",
denominator_prefix = "denominator_") {
moe_pattern_match <- stringr::str_detect(
data[[column_title_col]],
moe_pattern
)

if (!any(moe_pattern_match)) {
return(data)
}

moe_join_data <- data |>
dplyr::filter(moe_pattern_match) |>
dplyr::rename(
dplyr::any_of(set_names(value_col, moe_col))
) |>
dplyr::mutate(
"{variable_col}" := stringr::str_remove(
.data[[variable_col]],
moe_variable_remove
)
) |>
dplyr::select(
!dplyr::any_of(column_title_col),
!dplyr::starts_with(denominator_prefix)
)

if (inherits(moe_join_data, "sf")) {
check_installed("sf")
moe_join_data <- sf::st_drop_geometry(moe_join_data)
}

data |>
dplyr::filter(!moe_pattern_match) |>
dplyr::left_join(
moe_join_data,
na_matches = "never"
) |>
suppressMessages() |>
dplyr::relocate(
dplyr::all_of(moe_col),
.after = dplyr::all_of(value_col)
)
}

Expand All @@ -77,14 +221,15 @@ pivot_nhgis_data <- function(data,
join_nhgis_percent <- function(data,
variable_col = "variable",
value_col = "value",
label_col = "label",
column_title_col = "column_title",
denominator_prefix = "denominator_",
perc_prefix = "perc_") {
perc_prefix = "perc_",
digits = 2) {
denom_variable_col <- paste0(denominator_prefix, variable_col)

denom_value_col <- paste0(denominator_prefix, value_col)

denom_label_col <- paste0(denominator_prefix, label_col)
denom_title_col <- paste0(denominator_prefix, column_title_col)

denominator_data <- data |>
dplyr::filter(
Expand All @@ -94,8 +239,8 @@ join_nhgis_percent <- function(data,
dplyr::rename(
dplyr::all_of(
rlang::set_names(
c(variable_col, value_col, label_col),
c(denom_variable_col, denom_value_col, denom_label_col)
c(variable_col, value_col, column_title_col),
c(denom_variable_col, denom_value_col, denom_title_col)
)
)
)
Expand All @@ -107,8 +252,9 @@ join_nhgis_percent <- function(data,
dplyr::mutate(
"{perc_prefix}{value_col}" := dplyr::if_else(
!is.na(.data[[denom_variable_col]]) & !is.na(.data[[value_col]]) & .data[[denom_value_col]] > 0,
round(.data[[value_col]] / .data[[denom_value_col]], digits = 2),
round(.data[[value_col]] / .data[[denom_value_col]], digits = digits),
NA_real_
)
),
.after = dplyr::starts_with(value_col)
)
}
93 changes: 91 additions & 2 deletions R/read_nhgis_files.R
Original file line number Diff line number Diff line change
Expand Up @@ -58,12 +58,27 @@ read_ipums_geometry <- function(shape_file = NULL,
read_nhgis_data <- function(data_file = NULL,
path = NULL,
file_select = NULL,
multiple = TRUE,
verbose = FALSE,
...) {
...,
format = c("tidy", "wide"),
variable_col = "variable",
value_col = "value",
column_title_col = "column_title",
denominator_prefix = "denominator_",
variable_starts_with = c(
"A", "B", "D0", "AV",
"A4", "BS", "BUQ", "B0J",
"B7", "CV", "CM", "CL"
),
perc_prefix = "perc_",
digits = 2) {
if (is.null(data_file) && has_name(path, "data")) {
data_file <- path[["data"]]
}

format <- arg_match(format, c("tidy", "wide"))

stopifnot(
file.exists(data_file)
)
Expand All @@ -74,12 +89,33 @@ read_nhgis_data <- function(data_file = NULL,
nhgis_data <- lapply(
rlang::set_names(file_select, file_select),
\(file) {
ipumsr::read_nhgis(
data <- ipumsr::read_nhgis(
data_file = data_file,
file_select = file,
verbose = verbose,
...
)

if (format == "tidy") {
data <- data |>
pivot_nhgis_data(
variable_col = variable_col,
value_col = value_col,
column_title_col = column_title_col,
denominator_prefix = denominator_prefix,
variable_starts_with = variable_starts_with
) |>
join_nhgis_percent(
variable_col = variable_col,
value_col = value_col,
column_title_col = column_title_col,
denominator_prefix = denominator_prefix,
perc_prefix = perc_prefix,
digits = digits
)
}

data
}
)

Expand All @@ -89,6 +125,58 @@ read_nhgis_data <- function(data_file = NULL,
)
}

#' @rdname read_nhgis_data
#' @keywords internal
#' @export
read_nhgis_ext <- function(data_file,
file_select = NULL,
verbose = FALSE,
var_attrs = c("val_labels", "var_label", "var_desc"),
...,
format = "tidy",
variable_col = "variable",
value_col = "value",
column_title_col = "column_title",
denominator_prefix = "denominator_",
variable_starts_with = c(
"A", "B", "D0", "AV",
"A4", "BS", "BUQ", "B0J",
"B7", "CV", "CM", "CL"
),
perc_prefix = "perc_",
digits = 2) {
format <- arg_match(format, c("tidy", "wide"))

data <- ipumsr::read_nhgis(
data_file = data_file,
file_select = file,
verbose = verbose,
var_attrs = var_attrs,
...
)

if (format == "wide") {
return(data)
}

data |>
pivot_nhgis_data(
variable_col = variable_col,
value_col = value_col,
column_title_col = column_title_col,
denominator_prefix = denominator_prefix,
variable_starts_with = variable_starts_with
) |>
join_nhgis_percent(
variable_col = variable_col,
value_col = value_col,
column_title_col = column_title_col,
denominator_prefix = denominator_prefix,
perc_prefix = perc_prefix,
digits = digits
)
}

#' Read NHGIS data and geometry
#'
#' Read NHGIS data and geometry to return a named list or a combined `sf`
Expand Down Expand Up @@ -146,6 +234,7 @@ read_nhgis_files <- function(path = NULL,

check_installed("sf")

# FIXME: Should this be replaced with a ipums_shape_join_* function?
nhgis_data <- dplyr::left_join(
nhgis_data,
nhgis_shape,
Expand Down
Loading

0 comments on commit 3bc2adb

Please sign in to comment.