-
Notifications
You must be signed in to change notification settings - Fork 1
/
parse_table.R
160 lines (142 loc) · 5.33 KB
/
parse_table.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#' Parsed Census SF1 and ACS Tables
#'
#' Contains parsed table information for the 2010 Decennial Summary File 1 and
#' 2019 ACS 5-year and 1-year tables.
#' This parsed information is used internally in [cens_find_dec()],
#' [cens_find_acs()], [cens_get_dec()], and [cens_get_acs()].
#' For other sets of tables, try using [cens_parse_tables()].
#'
#' @format A list of `cens_table` objects, which are just lists with four elements:
#' - `concept`, a human-readable name
#' - `tables`, the constituent table codes
#' - `surveys`, the supported surveys
#' - `dims`, the parsed names of the dimensions of the tables
#' - `vars`, a `tibble` with all of the parsed variable values
#'
#' @name tables
NULL
#' @rdname tables
"tables_sf1"
#' @rdname tables
"tables_acs"
#' Attempt to Parse Tables from a Census API
#'
#' Uses the same parsing code as that which generates [tables_sf1] and [tables_acs]
#' See <https://www.census.gov/data/developers/data-sets.html> for a list of
#' APIs and corresponding years, or use [censusapi::listCensusApis()].
#'
#' @param api A Census API programmatic name such as `"acs/acs5"`.
#' @param year The year for the data
#'
#' @returns A list of `cens_table` objects, which are just lists with four elements:
#' - `concept`, a human-readable name
#' - `tables`, the constituent table codes
#' - `surveys`, the supported surveys
#' - `dims`, the parsed names of the dimensions of the tables
#' - `vars`, a `tibble` with all of the parsed variable values
#'
#' @examples \dontrun{
#' cens_parse_tables("dec/pl", 2020)
#' }
#'
#' @export
cens_parse_tables = function(api, year) {
vars = get_survey_vars(api, year)
dplyr::group_map(vars, parse_table) %>%
unlist(recursive=FALSE) %>%
lapply(function(x) {
x$surveys = api
x
}) %>%
lapply(new_cens_table)
}
# Internal functions to parse Census tables with the API ------
get_survey_vars = function(api, year) {
raw = censusapi::listCensusMetadata(api, vintage=year, type="variables") %>%
as_tibble() %>%
dplyr::mutate(label = str_trim(.data$label))
if (str_starts(api, "dec")) {
raw = dplyr::filter(raw,
str_starts(.data$label, "(!!)?Total:?!!") |
str_ends(.data$name, "001001"),
!str_detect(.data$label, "!!Not defined"))
} else if (str_starts(api, "acs")) {
raw = dplyr::filter(raw,
str_starts(.data$label, "Estimate!!"),
!str_detect(.data$label, "!!Not defined")) %>%
dplyr::mutate(label = str_sub(.data$label, 11))
}
raw %>%
dplyr::mutate(table = str_extract(.data$group, "[A-Z]*\\d+")) %>%
dplyr::group_by(table) %>%
dplyr::arrange(.data$group)
}
# Handle repeated tables by race
paren_race_re = "\\((WHITE|BLACK|AMERI|ASIAN|NATIVE|SOME OTHER|TWO OR|HISPANIC)[A-Z, ]+\\)"
tidy_names = function(x) {
str_to_lower(x) %>%
make.names(unique = TRUE) %>%
str_replace_all("\\.", "_")
}
# table name to list of nice variables
parse_concept = function(concept) {
parsed = concept %>%
str_replace(paren_race_re, "BY RACE-ETHNICITY") %>%
str_remove_all("AND ") %>%
str_remove_all("\\(IN 20\\d\\d INFLATION-ADJUSTED DOLLARS\\) ") %>%
unique() %>%
str_split(" BY ") %>%
lapply(tidy_names)
parsed[[which.max(lengths(parsed))]]
}
# table column/row name to nice label
parse_label_group = function(lbl) {
if_else(lbl == "", "total", lbl) %>%
str_remove(":") %>%
str_remove(" --") %>%
str_trim() %>%
str_to_lower()
}
parse_table = function(tbl, key) {
dim_cats = str_split(tbl$label, "!!", simplify=TRUE)[, drop=FALSE]
const_cats = apply(dim_cats, 2, dplyr::n_distinct) == 1
if (nrow(tbl) > 1 && any(const_cats)) dim_cats = dim_cats[, !const_cats, drop=FALSE]
paren_cats = str_extract(tbl$concept, paren_race_re)
if (any(!is.na(paren_cats))) {
new_col = dplyr::coalesce(paren_cats, "") %>%
str_remove_all("[()]")
dim_cats = cbind(dim_cats, new_col, deparse.level=0)
}
depth = ncol(dim_cats)
if (depth == 0L) {
dim_cats = matrix("")
depth = 1L
}
dim_lbls = parse_concept(unique(tbl$concept))
short_lbls = str_replace_all(dim_lbls, "_", " ") %>%
str_remove_all("(or|and|of|the|for|in|,) ") %>%
str_to_title() %>%
abbreviate(5, named=FALSE) %>%
str_to_lower() %>%
paste(collapse="_")
if (depth == length(dim_lbls) + 1L) {
new_dim = paste0(short_lbls, "_sub")
dim_lbls = c(dim_lbls, new_dim)
} else if (depth > length(dim_lbls)) {
dim_lbls = str_c(short_lbls, "_", seq_len(depth))
} else if (depth < length(dim_lbls)) {
dim_lbls = dim_lbls[-seq_len(length(dim_lbls) - depth)]
}
vars_tbl = dim_cats %>%
`colnames<-`(dim_lbls) %>%
as_tibble() %>%
dplyr::mutate(variable=tbl$name, .before=1) %>%
dplyr::mutate(dplyr::across(-.data$variable, function(x) as.factor(parse_label_group(x)))) %>%
dplyr::arrange(.data$variable)
out = list(list(concept = tbl$concept[1],
tables = sort(unique(tbl$group)),
dims = dim_lbls,
vars = vars_tbl))
names(out) = key[[1]][1]
out
}