geography + swap censusapi

CoryMcCartan · Aug 25, 2022 · ce97c80 · ce97c80
1 parent 32090cb
commit ce97c80
Show file tree

Hide file tree

Showing 21 changed files with 676 additions and 154 deletions.
diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
@@ -1,4 +1,4 @@
-# Workflow derived from https://github.com/r-lib/actions/tree/master/examples
+# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
 # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 on:
   push:
@@ -10,32 +10,40 @@ name: R-CMD-check
 
 jobs:
   R-CMD-check:
-    runs-on: ubuntu-latest
+    runs-on: ${{ matrix.config.os }}
+
+    name: ${{ matrix.config.os }} (${{ matrix.config.r }})
+
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+          - {os: macOS-latest,   r: 'release'}
+          - {os: windows-latest, r: 'release'}
+          - {os: ubuntu-latest,   r: 'devel', http-user-agent: 'release'}
+          - {os: ubuntu-latest,   r: 'release'}
+          - {os: ubuntu-latest,   r: 'oldrel-1'}
+
     env:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
-      CENSUS_API_KEY: ${{ secrets.CENSUS_API_KEY }}
       R_KEEP_PKG_SOURCE: yes
+
     steps:
       - uses: actions/checkout@v2
 
-      - uses: r-lib/actions/setup-r@v1
+      - uses: r-lib/actions/setup-pandoc@v2
+
+      - uses: r-lib/actions/setup-r@v2
         with:
+          r-version: ${{ matrix.config.r }}
+          http-user-agent: ${{ matrix.config.http-user-agent }}
           use-public-rspm: true
 
-      - uses: r-lib/actions/setup-r-dependencies@v1
+      - uses: r-lib/actions/setup-r-dependencies@v2
         with:
-          extra-packages: rcmdcheck
-
-      - uses: r-lib/actions/check-r-package@v1
-
-      - name: Show testthat output
-        if: always()
-        run: find check -name 'testthat.Rout*' -exec cat '{}' \; || true
-        shell: bash
+          extra-packages: any::rcmdcheck
+          needs: check
 
-      - name: Upload check results
-        if: failure()
-        uses: actions/upload-artifact@main
+      - uses: r-lib/actions/check-r-package@v2
         with:
-          name: ${{ runner.os }}-r${{ matrix.config.r }}-results
-          path: check
+          upload-snapshots: true
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,5 +1,5 @@
 Package: easycensus
-Title: Quickly Extract and Marginalize U.S. Census Tables
+Title: Quickly Find, Extract, and Marginalize U.S. Census Tables
 Version: 1.0.0
 Authors@R: 
     person("Cory", "McCartan", email="[email protected]", role=c("aut", "cre"))
@@ -16,7 +16,6 @@ Imports:
     dplyr (>= 1.0.0),
     tidyr (>= 1.0.0),
     stringr,
-    tidycensus,
     censusapi,
     cli
 Suggests:

diff --git a/NAMESPACE b/NAMESPACE
@@ -24,6 +24,7 @@ S3method(vec_ptype_abbr,estimate)
 export(as_estimate)
 export(cens_find_acs)
 export(cens_find_dec)
+export(cens_geo)
 export(cens_get_acs)
 export(cens_get_dec)
 export(cens_margin_to)
@@ -44,6 +45,7 @@ export(to_rvar)
 import(cli)
 import(stringr)
 import(vctrs)
+importFrom(dplyr,"%>%")
 importFrom(dplyr,as_tibble)
 importFrom(dplyr,if_else)
 importFrom(pillar,pillar_shaft)

diff --git a/NEWS.md b/NEWS.md
@@ -4,7 +4,8 @@
 * Use `censusapi` instead of `tidycensus` for fewer dependencies
 * New `estimate` vector type that tracks uncertainty through mathematical operations 
 * Expose parsed tables to users with a new `cens_table` type
-* Improve tidiers, including income bin tidiers
+* More flexible geography options when downloading data. See `cens_geo()` for details.
+* Improved tidiers, including new income bin and detailed race tidiers
 
 # easycensus 0.2.0
 

diff --git a/R/cens_find.R b/R/cens_find.R
@@ -12,7 +12,7 @@
 #'   possible matches, at the cost of more output. Negative values will be
 #'   converted to positive but will suppress any printing.
 #'
-#' @returns The codes for the top `show` tables, invisibly.
+#' @returns The codes for the top `show` tables, invisibly if `show` is positive.
 #'
 #' @examples
 #' cens_find_dec("sex", "age")
@@ -32,8 +32,10 @@ cens_find_dec <- function(..., show=2) {
     if (show > 0) {
         cli_h1("Top {show} matching table{?s}")
         lapply(tables_sf1[best], print, all=FALSE)
+        invisible(best)
+    } else {
+        best
     }
-    invisible(best)
 }
 
 #' @rdname cens_find
@@ -44,8 +46,10 @@ cens_find_acs <- function(..., show=4) {
     if (show > 0) {
         cli_h1("Top {show} matching table{?s}")
         lapply(tables_acs[best], print, all=FALSE)
+        invisible(best)
+    } else {
+        best
     }
-    invisible(best)
 }
 
 

diff --git a/R/cens_geo.R b/R/cens_geo.R
@@ -0,0 +1,197 @@
+#' Construct a Geography Specification for Census Data
+#'
+#' Currently used mostly internally.
+#' Builds a Census API-formatted specification of which geographies to download
+#' data for. State and county names (or postal abbreviations) are partially
+#' matched to existing tables, for ease of use. Other geographies should be
+#' specified with Census GEOIDs. The `usgazeteer` package, available with
+#' `remotes::install_github("bhaskarvk/usgazetteer")`, may be useful in finding
+#' GEOIDs for other geographies. Consult the "geography" sections of each API
+#' at <https://www.census.gov/data/developers/data-sets.html> for information on
+#' which geographic specifiers may be provided in combination with others.
+#'
+#' Supported geography arguments:
+#'
+#' ```{r geos, echo=FALSE, results="asis"}
+#' code_names = names(geo)
+#' cens_names = unlist(geo)
+#' paren_labels = dplyr::if_else(
+#'     cens_names == code_names, "",
+#'     stringr::str_c(" (", str_to_title(cens_names), ")")
+#'     )
+#' cat(stringr::str_glue("* `{code_names}`{paren_labels}"), sep="\n")
+#' ```
+#'
+#' @param geo The geographic level to return. One of the machine-readable or
+#'   human-readable names listed in the "Details" section. Will return all
+#'   matching geographies of this level, as filtered by the further arguments to
+#'   `...`.  For example, setting `geo="tract"` is equivalent to setting
+#'   `tract="all"`.
+#' @param ... Geographies to return, as supported by the Census API. Order
+#'   matters here---the first argument will be the geographic level to return
+#'   (i.e., it corresponds to the `geo` argument) and additional arguments will
+#'   filter the results. Use `"all"`, `"*"`, `NA`, or `TRUE` to return all units
+#'   of a particular geography.  See the examples for details.
+#' @param check If `TRUE`, validate the provided geographies against the
+#'   available geographies from the relevant Census API. Requires the `api` and
+#'   `year` arguments to be specified.
+#' @param api A Census API programmatic name such as `"acs/acs5"`.
+#' @param year The year for the data
+#'
+#' @returns A list with two elements, `region` and `regionin`, which together
+#'   specify a valid Census API geography argument.
+#'
+#' @examples
+#' cens_geo(state="WA")
+#' cens_geo("county", state="WA") # equivalent to `cens_geo(county="all", state="WA")`
+#' cens_geo(county="King", state="Wash")
+#' cens_geo(zcta="02138", check=FALSE)
+#' cens_geo(zcta=NA, state="WA", check=FALSE)
+#' cens_geo("zcta", state="WA", check=FALSE)
+#' cens_geo(cd="09", state="WA", check=FALSE)
+#' cens_geo("county_part", state="WA", cd="09", check=FALSE)
+#'
+#' @export
+cens_geo <- function(geo=NULL, ..., check=TRUE, api="acs/acs5", year=2019) {
+    geo_in = list(...)
+    geo_in = lapply(geo_in, regularize_wild)
+    names(geo_in) = vapply(names(geo_in), match_geo, character(1))
+
+    # parse state and county
+    has_state = FALSE
+    if ("state" %in% names(geo_in)) {
+        has_state = TRUE
+        geo_in$state = fips_state(geo_in$state)
+    }
+    idx_cty = which(names(geo_in) == "county" | names(geo_in) == "county (or part)")
+    if (length(idx_cty) > 0 && !has_state)
+        cli_abort("{.arg state} must be provided if {.arg county} is specified.")
+    for (i in idx_cty) {
+        geo_in[[i]] = fips_county(geo_in[[i]], geo_in$state)
+    }
+
+    # find the 'for' variable
+    if (is.null(geo)) {
+        if (length(geo_in) == 0) cli_abort("Must provide at least one geographic level.")
+        geo_for = match_geo(names(geo_in)[[1]])
+        geo_for_val = geo_in[[1]]
+        geo_in = geo_in[-1] # remove first from list
+    } else {
+        if (!is.character(geo) && length(geo) == 1)
+            cli_abort("{.arg geo} should be a string.")
+        geo_for = match_geo(geo)
+        geo_for_val = "*"
+    }
+
+    if (isTRUE(check)) {
+        # match geographies to available
+        d_geo = as_tibble(censusapi::listCensusMetadata(name=api, vintage=year, type="geographies"))
+        idx_for = which(geo_for == d_geo$name)
+        if (length(idx_for) == 0) cli_abort("Geography level {.val {geo_for}} not found.")
+        matched = 0L
+
+        req_test = paste0(names(geo_in), collapse="|")
+        # look for perfect 'requires' match
+        req = vapply(d_geo$requires, paste0, character(1), collapse="|")
+        idx_in = which(req_test == req)
+        idx_both = intersect(idx_for, idx_in)
+        matched = length(idx_both)
+
+        # look for match without wildcards
+        if (matched == 0L) {
+            req_soft = character(nrow(d_geo))
+            for (i in seq_along(req_soft)) {
+                req_soft[i] = paste0(setdiff(d_geo$requires[[i]], d_geo$wildcard[[i]]), collapse="|")
+            }
+            idx_in = which(req_test == req_soft)
+            idx_both = intersect(idx_for, idx_in)
+            matched = length(idx_both)
+            if (matched == 1L) {
+                match_geo = d_geo[idx_both, ]
+                for (x in d_geo$wildcard[[idx_both]]) {
+                    geo_in[[x]] = "*"
+                }
+            }
+        }
+
+        hyper = style_hyperlink("<https://www.census.gov/data/developers/data-sets.html>",
+                                "https://www.census.gov/data/developers/data-sets.html")
+        if (matched == 0L) {
+            cli_abort(c("Geography combination {.val {c(geo_for, names(geo_in))}} not found.",
+                        ">"="Check {hyper} and make sure you are using a valid combination
+                        for your survey and file."))
+        }
+        if (matched > 1L) {
+            cli_abort(c("Multiple geography combinations found for {.val {c(geo_for, names(geo_in))}}.",
+                        ">"="Check {hyper} and make sure you are using a valid combination
+                        for your survey and file."))
+        }
+    }
+
+    if (length(geo_in) > 0) {
+        list(region = str_c(geo_for, ":", geo_for_val),
+             regionin = paste0(str_c(names(geo_in), ":", geo_in), collapse="+"))
+    } else {
+        list(region = str_c(geo_for, ":", geo_for_val),
+             regionin = NULL)
+    }
+}
+
+# Internal: match a geography level
+match_geo <- function(g) {
+    geo_levels = unlist(geo)
+    match = dplyr::coalesce(pmatch(g, names(geo)),
+                            pmatch(g, geo_levels))
+    if (is.na(match)) {
+        cli_abort("Geography level {.val {g}} not found.", .envir=parent.frame())
+    } else {
+        geo_levels[match]
+    }
+}
+
+regularize_wild <- function(x) {
+    if_else(x == "all" | x == "*" | isTRUE(x) | is.na(x), "*", as.character(x))
+}
+
+# Internal: match state to FIPS
+fips_state <- function(state) {
+    if (is.numeric(state) && state > 0 && state < 1e2) { # already a fips code
+        state = str_pad(as.character(as.integer(state)), 2, pad="0")
+    } else {
+        state = str_to_lower(state)
+    }
+
+    match = dplyr::coalesce(pmatch(state, states$fips),
+                            pmatch(state, str_to_lower(states$abbr)),
+                            pmatch(state, str_to_lower(states$name)))
+
+    if (is.na(match)) {
+        cli_abort("No match found for {.val {state}}.", .envir=parent.frame())
+    } else {
+        states$fips[match]
+    }
+}
+
+# Internal: match county to FIPS
+fips_county <- function(county, state_fips) {
+    if (is.numeric(county) && county > 0 && county < 1e5) { # already a fips code
+        county = str_pad(as.character(as.integer(county)), 3, pad="0")
+    } else {
+        county = str_to_lower(county)
+    }
+
+    idx_st = which(counties$state == state_fips)
+    names = str_to_lower(counties$name[idx_st])
+    fips = counties$county[idx_st]
+
+    match = dplyr::coalesce(pmatch(county, fips),
+                            pmatch(county, names))
+
+    if (is.na(match)) {
+        # cli_abort("No match found for {.val {county}}.", .envir=parent.frame())
+        cli_abort("No match found for {.val {county}}.")
+    } else {
+        fips[match]
+    }
+}
+