Skip to content

Commit

Permalink
add support for sdmx parsing
Browse files Browse the repository at this point in the history
this is the first step to resolving
#27 by adding a function that creates
sc_table() like objects based on sdmx
archives

The sdmx format contains all metadata
that is necessary for STATcubeR to reuse
the existing $tabulate() workflow and this
first version already provides support for
various features via the base class (sc_data)

- $tabulate() to aggregate data
- $total_codes() to set/unset total codes
- $recoder to recode datasets (change labels)
  change codes, toggle visibility of
  elements, reorder elements, etc.
- importing german and english labels
  simultaniously (both languages are included
  in a zip download) and allowing to swhitch
  between them using $language<-().

New features
- sdmx arcives provide a $parent column
  in the $fields() table which are used
  to represent hierarchical classifications.
  Previously, this was only possible with
  od_table()

There are still some improvements. See
the issue #27 for more details

- properly parse time variables -
  currently they are treated as generic
  categories.
- parse element annotations (detailed
  descriptions for classification
  elements) and add them to
  $field()$de_desc just like with
  OGD dataset
- parse value annotations (see #39)
- provide a print/fromat method
- add a reasonable logic for total
  codes that takes the parent codes into
  account
- fill meta$measures$fun and
  $meta$measures$precision based on
  the sdmx metadata
- modify very long codes which use
  the @-symbol (probably for escapes)
- extend documentation
- possibly check SuperCROSS compability
  • Loading branch information
GregorDeCillia committed Mar 29, 2023
1 parent 08a5102 commit f4e18ac
Show file tree
Hide file tree
Showing 3 changed files with 154 additions and 0 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ export(sc_table_custom)
export(sc_table_saved)
export(sc_table_saved_list)
export(sc_tabulate)
export(sdmx_table)
importFrom(magrittr,"%<>%")
importFrom(magrittr,"%>%")
importFrom(magrittr,"%T>%")
Expand Down
133 changes: 133 additions & 0 deletions R/sdmx_table.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
#' Import data from SDMX
#'
#' Function that reads STATcube data from an sdmx archive - a zip file
#' consisting of `structure.xml` with metadata and `dataset.xml` for the
#' values.
#'
#' @param file a "sdmx archive" file that was downloaded from STATcube.
#' @return An object of class `sc_data`
#' @keywords experimental
#' @export
sdmx_table <- function(file) {
sdmx_read_zip(file) %>%
sdmx_as_data_object()
}

sdmx_read <- function(folder = ".") {
list(
meta = folder %>% sprintf('%s/structure.xml', .) %>% xml2::read_xml() %>%
xml2::xml_ns_strip(),
data = folder %>% sprintf('%s/dataset.xml', .) %>% xml2::read_xml() %>%
xml2::xml_ns_strip()
)
}

sdmx_read_zip <- function(zip_file) {
exdir <- tempfile()
dir.create(exdir)
on.exit(unlink(exdir, recursive = TRUE))
unzip(zipfile = zip_file, exdir = exdir)
sdmx_read(exdir)
}

sdmx_as_raw_df <- function(x) {
obs <- x$data %>% xml2::xml_find_all(".//ObsValue") %>%
xml2::xml_attr("value") %>% as.numeric()
val <- x$data %>% xml2::xml_find_all(".//SeriesKey//Value") %>%
xml2::xml_attr("value")
# assume that entries of SeriesKey always use the same order
val_lab <- x$data %>% xml2::xml_find_first(".//SeriesKey") %>%
xml2::xml_find_all(".//Value") %>% xml2::xml_attr("concept")
val_split <- split(val, rep(
seq_len(length(val)/length(obs)), length(obs)
))
names(val_split) <- val_lab
n <- which(val_lab == "MEASURES_DIMENSION")
obs_split <- split(obs, val_split[[n]])
res <- c(
lapply(val_split[-n], function(x) {
u <- head(x, length(obs_split[[1]]))
factor(u, unique(u))
}),
obs_split
) %>% vctrs::new_data_frame()
names(res) <- gsub("F-DATA_", "", names(res))
res
}

sdmx_fields <- function(x) {
fields <- x$meta %>%
xml2::xml_find_all(".//CodeList[(@id and starts-with(@id, 'C-'))]")
lapply(fields, function(field) {
names <- xml2::xml_find_all(field, ".//Name") %>%
xml2::xml_text()
codes <- xml2::xml_find_all(field, "Code")
values <- xml2::xml_attr(codes, "value")
desc <- codes %>% xml2::xml_find_all("Description") %>%
xml2::xml_text()
ind_de <- seq(1, length(desc), 2)
ind_en <- seq(2, length(desc), 2)
parent <- xml2::xml_attr(codes, "parentCode")
list(
id = field %>% xml2::xml_attr("id"),
label_en = names[2],
label_de = names[1],
elements = vctrs::new_data_frame(list(
label = desc[ind_en],
code = values,
parsed = desc[ind_en],
label_de = desc[ind_de],
label_en = desc[ind_en],
parent = factor(parent, levels = values),
de_desc = rep(NA_character_, length(values)),
en_desc = rep(NA_character_, length(values))
))
)
})
}

sdmx_meta <- function(x) {
code_measure <- x$meta %>% xml2::xml_find_all(
".//*[(@id = 'CL_MEASURES_DIMENSION')]/Code") %>% xml2::xml_attr("value")
label_measure <- x$meta %>% xml2::xml_find_all(
".//*[(@id = 'CL_MEASURES_DIMENSION')]/Code/Description") %>%
xml2::xml_text()
code_db <- xml2::xml_find_all(x$meta, ".//ConceptScheme") %>%
xml2::xml_attr("id")
label_dataset <- x$meta %>% xml2::xml_find_all(".//ConceptScheme/Name") %>%
xml2::xml_text()
ind_de <- seq(1, length(label_measure), 2)
ind_en <- seq(2, length(label_measure), 2)
list(
source = data_frame(label = label_dataset[2], code = code_db,
lang = "en", label_de = label_dataset[1],
label_en = label_dataset[2]),
measures = data_frame(
label = label_measure[ind_en],
code = code_measure,
label_de = label_measure[ind_de],
label_en = label_measure[ind_en],
NAs = rep(0, length(code_measure))
)
)
}

sdmx_as_data_object <- function(x) {
df <- sdmx_as_raw_df(x)
fields <- sdmx_fields(x)
meta <- sdmx_meta(x)
meta$fields <- data_frame(
code = vapply(fields, function(x) x$id, ""),
label = vapply(fields, function(x) x$label_en, "") ,
label_de = vapply(fields, function(x) x$label_de, ""),
label_en = vapply(fields, function(x) x$label_en, ""),
total_code = rep(NA, length(fields)),
nitems = sapply(fields, function(x) {nrow(x$elements)}),
type = rep("Category", length(fields))
)
fields2 <- lapply(fields, function(x) x$elements)
names(df) <- c(meta$fields$code, meta$measures$code)
y <- sc_data$new(df, meta, fields2)
y$language <- "en"
y
}
20 changes: 20 additions & 0 deletions man/sdmx_table.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit f4e18ac

Please sign in to comment.