Add new read_nhgis_ext function

Update pivot_nhgis_data to use column_title_col instead of label_col (for consistency w/ getACS package) Also improve handling of MOE values and upper/lower bound values if present
elipousson · May 17, 2024 · 3bc2adb · 3bc2adb
1 parent b629650
commit 3bc2adb
Show file tree

Hide file tree

Showing 4 changed files with 297 additions and 27 deletions.
diff --git a/R/pivot_ipumsr.R b/R/pivot_ipumsr.R
@@ -12,7 +12,7 @@
 pivot_nhgis_data <- function(data,
                              variable_col = "variable",
                              value_col = "value",
-                             label_col = "label",
+                             column_title_col = "column_title",
                              denominator_prefix = "denominator_",
                              variable_starts_with = c(
                                "A", "B", "D0",
@@ -22,19 +22,43 @@ pivot_nhgis_data <- function(data,
                              )) {
   check_installed(c("tidyr", "labelled", "tibble"))
 
-  nhgis_variables <- data |>
-    labelled::get_variable_labels() |>
+  nhgis_var_labels <- data |>
+    labelled::get_variable_labels()
+
+  stopifnot(
+    !is_empty(nhgis_var_labels)
+  )
+
+  nhgis_variables <- nhgis_var_labels |>
     tibble::enframe(
       name = variable_col,
-      value = label_col
+      value = column_title_col
     ) |>
     dplyr::mutate(
-      "{label_col}" := as.character(.data[[label_col]])
+      "{column_title_col}" := as.character(.data[[column_title_col]])
     )
 
-  data |>
+  year_cols <- c("GEOGYEAR", "DATAYEAR", "YEAR")
+
+  keep_cols <- c(
+    "GISJOIN", "AREA", "NAME", "AREANAME",
+    year_cols,
+    "STATE", "STATEA", "STATEFP", "STATENH", "STATEICP",
+    "COUNTY", "COUNTYA", "COUNTYFP", "COUNTYNH", "COUNTYICP"
+  )
+
+  data <- data |>
+    dplyr::mutate(
+      dplyr::across(
+        dplyr::any_of(year_cols),
+        as.character
+      )
+    ) |>
     tidyr::pivot_longer(
-      cols = dplyr::starts_with(variable_starts_with),
+      cols = c(
+        dplyr::starts_with(variable_starts_with),
+        !dplyr::any_of(keep_cols)
+      ),
       names_to = variable_col,
       values_to = value_col
     ) |>
@@ -46,26 +70,146 @@ pivot_nhgis_data <- function(data,
     dplyr::mutate(
       "{denominator_prefix}{variable_col}" := dplyr::case_when(
         # Totals
-        .data[[variable_col]] %in% c("A41AA", "AV0AA", "A68AA") ~ .data[[variable_col]],
+        .data[[variable_col]] %in% c(
+          "A41AA", "A68AA", "AV0AA", "A00AA", "AR5AA",
+          "CM7AA", "CL8AA"
+        ) ~ .data[[variable_col]],
+
         # Total units
         .data[[variable_col]] %in% c("A43AA", "A43AB") ~ "A41AA",
+
         # Occupied units
         .data[[variable_col]] %in% c("B37AA", "B37AB") ~ "A43AA",
+
+        # Families
+        .data[[variable_col]] %in% c(
+          "A88AA", "A88AB", "A88AC", "A88AD", "A88AE"
+        ) ~ "A68AA",
+
         # Persons
         .data[[variable_col]] %in% c(
+          "A35AA",
           "B18AA", "B18AB", "B18AC", "B18AD", "B18AE",
           "B57AA", "B57AB", "B57AC", "B57AD", "B57AE",
           "B57AF", "B57AG", "B57AH", "B57AI", "B57AJ",
           "B57AK", "B57AL", "B57AM", "B57AN", "B57AO",
           "B57AP", "B57AQ", "B57AR", "D08AA", "D08AB",
           "B14AA", "B14AB"
         ) ~ "AV0AA",
+
+        # Negro
+        .data[[variable_col]] %in% c("BYA003", "A8L005", "A8L006") ~ "BYA003",
+
         # Households
-        .data[[variable_col]] %in% c(
-          "BS7AA", "BS7AB", "BS7AC", "BS7AD",
-          "CV5AA", "CV5AB", "CV5AC", "CV5AD", "CV5AE", "CV5AF"
-        ) ~ "A43AA"
+        .data[[variable_col]] %in% c("CM9AA", "CM9AB") ~ "CM7AA",
+
+        # Occupied housing units
+        .data[[variable_col]] %in% c("CV5AA", "CV5AB", "CV5AC",
+                                     "CV5AD", "CV5AE", "CV5AF") ~ "A43AA",
+        # Housing units
+        # .data[[variable_col]] %in% c("CM9AA", "CM9AB") ~ "CM7AA",
+
+        .data[[variable_col]] %in% c("AF15001", "AF15002") ~ "AV0AA",
+
+        # White
+        .data[[variable_col]] %in% c("A8L001", "A8L002") ~ "AF15001",
+
+        # White Native-born / White Foreign-born use White denominator
+        .data[[variable_col]] %in% c("BYA001", "BYA002") ~ "AF15001",
+
+        .data[[variable_col]] %in% c("BS7AA", "BS7AB", "BS7AC", "BS7AD") ~ "AR5AA"
+      ),
+      .after = dplyr::all_of(variable_col)
+    )
+
+  moe_join <- any(c(
+    stringr::str_detect(data[[column_title_col]], "^(Lower bound|Upper bound|Margin of error)")
+  ))
+
+  if (!moe_join) {
+    return(data)
+  }
+
+  data |>
+    join_moe_cols(
+      moe_pattern = "^Lower bound",
+      moe_variable_remove = "L$",
+      moe_col = paste0(value_col, "_lower"),
+      variable_col = variable_col,
+      column_title_col = column_title_col,
+      value_col = value_col,
+      denominator_prefix = denominator_prefix
+    ) |>
+    join_moe_cols(
+      moe_pattern = "^Upper bound",
+      moe_variable_remove = "U$",
+      moe_col = paste0(value_col, "_upper"),
+      variable_col = variable_col,
+      column_title_col = column_title_col,
+      value_col = value_col,
+      denominator_prefix = denominator_prefix
+    ) |>
+    join_moe_cols(
+      moe_pattern = "^Margin of error",
+      moe_variable_remove = "M$",
+      moe_col = "moe",
+      variable_col = variable_col,
+      column_title_col = column_title_col,
+      value_col = value_col,
+      denominator_prefix = denominator_prefix
+    )
+}
+
+#' @noRd
+join_moe_cols <- function(
+    data,
+    moe_pattern = "^Lower bound",
+    moe_variable_remove = "L$",
+    variable_col = "variable",
+    column_title_col = "column_title",
+    value_col = "value",
+    moe_col = "value_lower",
+    denominator_prefix = "denominator_") {
+  moe_pattern_match <- stringr::str_detect(
+    data[[column_title_col]],
+    moe_pattern
+  )
+
+  if (!any(moe_pattern_match)) {
+    return(data)
+  }
+
+  moe_join_data <- data |>
+    dplyr::filter(moe_pattern_match) |>
+    dplyr::rename(
+      dplyr::any_of(set_names(value_col, moe_col))
+    ) |>
+    dplyr::mutate(
+      "{variable_col}" := stringr::str_remove(
+        .data[[variable_col]],
+        moe_variable_remove
       )
+    ) |>
+    dplyr::select(
+      !dplyr::any_of(column_title_col),
+      !dplyr::starts_with(denominator_prefix)
+    )
+
+  if (inherits(moe_join_data, "sf")) {
+    check_installed("sf")
+    moe_join_data <- sf::st_drop_geometry(moe_join_data)
+  }
+
+  data |>
+    dplyr::filter(!moe_pattern_match) |>
+    dplyr::left_join(
+      moe_join_data,
+      na_matches = "never"
+    ) |>
+    suppressMessages() |>
+    dplyr::relocate(
+      dplyr::all_of(moe_col),
+      .after = dplyr::all_of(value_col)
     )
 }
 
@@ -77,14 +221,15 @@ pivot_nhgis_data <- function(data,
 join_nhgis_percent <- function(data,
                                variable_col = "variable",
                                value_col = "value",
-                               label_col = "label",
+                               column_title_col = "column_title",
                                denominator_prefix = "denominator_",
-                               perc_prefix = "perc_") {
+                               perc_prefix = "perc_",
+                               digits = 2) {
   denom_variable_col <- paste0(denominator_prefix, variable_col)
 
   denom_value_col <- paste0(denominator_prefix, value_col)
 
-  denom_label_col <- paste0(denominator_prefix, label_col)
+  denom_title_col <- paste0(denominator_prefix, column_title_col)
 
   denominator_data <- data |>
     dplyr::filter(
@@ -94,8 +239,8 @@ join_nhgis_percent <- function(data,
     dplyr::rename(
       dplyr::all_of(
         rlang::set_names(
-          c(variable_col, value_col, label_col),
-          c(denom_variable_col, denom_value_col, denom_label_col)
+          c(variable_col, value_col, column_title_col),
+          c(denom_variable_col, denom_value_col, denom_title_col)
         )
       )
     )
@@ -107,8 +252,9 @@ join_nhgis_percent <- function(data,
     dplyr::mutate(
       "{perc_prefix}{value_col}" := dplyr::if_else(
         !is.na(.data[[denom_variable_col]]) & !is.na(.data[[value_col]]) & .data[[denom_value_col]] > 0,
-        round(.data[[value_col]] / .data[[denom_value_col]], digits = 2),
+        round(.data[[value_col]] / .data[[denom_value_col]], digits = digits),
         NA_real_
-      )
+      ),
+      .after = dplyr::starts_with(value_col)
     )
 }
diff --git a/R/read_nhgis_files.R b/R/read_nhgis_files.R
@@ -58,12 +58,27 @@ read_ipums_geometry <- function(shape_file = NULL,
 read_nhgis_data <- function(data_file = NULL,
                             path = NULL,
                             file_select = NULL,
+                            multiple = TRUE,
                             verbose = FALSE,
-                            ...) {
+                            ...,
+                            format = c("tidy", "wide"),
+                            variable_col = "variable",
+                            value_col = "value",
+                            column_title_col = "column_title",
+                            denominator_prefix = "denominator_",
+                            variable_starts_with = c(
+                              "A", "B", "D0", "AV",
+                              "A4", "BS", "BUQ", "B0J",
+                              "B7", "CV", "CM", "CL"
+                            ),
+                            perc_prefix = "perc_",
+                            digits = 2) {
   if (is.null(data_file) && has_name(path, "data")) {
     data_file <- path[["data"]]
   }
 
+  format <- arg_match(format, c("tidy", "wide"))
+
   stopifnot(
     file.exists(data_file)
   )
@@ -74,12 +89,33 @@ read_nhgis_data <- function(data_file = NULL,
   nhgis_data <- lapply(
     rlang::set_names(file_select, file_select),
     \(file) {
-      ipumsr::read_nhgis(
+      data <- ipumsr::read_nhgis(
         data_file = data_file,
         file_select = file,
         verbose = verbose,
         ...
       )
+
+      if (format == "tidy") {
+        data <- data |>
+          pivot_nhgis_data(
+            variable_col = variable_col,
+            value_col = value_col,
+            column_title_col = column_title_col,
+            denominator_prefix = denominator_prefix,
+            variable_starts_with = variable_starts_with
+          ) |>
+          join_nhgis_percent(
+            variable_col = variable_col,
+            value_col = value_col,
+            column_title_col = column_title_col,
+            denominator_prefix = denominator_prefix,
+            perc_prefix = perc_prefix,
+            digits = digits
+          )
+      }
+
+      data
     }
   )
 
@@ -89,6 +125,58 @@ read_nhgis_data <- function(data_file = NULL,
   )
 }
 
+#' @rdname read_nhgis_data
+#' @keywords internal
+#' @export
+read_nhgis_ext <- function(data_file,
+                           file_select = NULL,
+                           verbose = FALSE,
+                           var_attrs = c("val_labels", "var_label", "var_desc"),
+                           ...,
+                           format = "tidy",
+                           variable_col = "variable",
+                           value_col = "value",
+                           column_title_col = "column_title",
+                           denominator_prefix = "denominator_",
+                           variable_starts_with = c(
+                             "A", "B", "D0", "AV",
+                             "A4", "BS", "BUQ", "B0J",
+                             "B7", "CV", "CM", "CL"
+                           ),
+                           perc_prefix = "perc_",
+                           digits = 2) {
+  format <- arg_match(format, c("tidy", "wide"))
+
+  data <- ipumsr::read_nhgis(
+    data_file = data_file,
+    file_select = file,
+    verbose = verbose,
+    var_attrs = var_attrs,
+    ...
+  )
+
+  if (format == "wide") {
+    return(data)
+  }
+
+  data |>
+    pivot_nhgis_data(
+      variable_col = variable_col,
+      value_col = value_col,
+      column_title_col = column_title_col,
+      denominator_prefix = denominator_prefix,
+      variable_starts_with = variable_starts_with
+    ) |>
+    join_nhgis_percent(
+      variable_col = variable_col,
+      value_col = value_col,
+      column_title_col = column_title_col,
+      denominator_prefix = denominator_prefix,
+      perc_prefix = perc_prefix,
+      digits = digits
+    )
+}
+
 #' Read NHGIS data and geometry
 #'
 #' Read NHGIS data and geometry to return a named list or a combined `sf`
@@ -146,6 +234,7 @@ read_nhgis_files <- function(path = NULL,
 
   check_installed("sf")
 
+  # FIXME: Should this be replaced with a ipums_shape_join_* function?
   nhgis_data <- dplyr::left_join(
     nhgis_data,
     nhgis_shape,