Skip to content

Commit

Permalink
ARROW-15011: [R] Generate documentation for dplyr function bindings (a…
Browse files Browse the repository at this point in the history
…pache#14014)

Approach:

* `register_binding` takes an additional optional argument, `notes`, where you can list any limitations or differences in behavior between the Arrow version and the R function
* These notes are put in the `.cache` environment when the nse_funcs are built.
* New script `data-raw/docgen.R` that reads `arrow:::.cache$docs` and writes out `dplyr-funcs-docs.R` containing roxygen.
* Similarly, we pull the dplyr functions we s3_register and add them to the generated docs. Unfortunately, the notes about feature limitations aren't easily kept alongside the functions themselves because of how they're registered on load. The approach here creates a list in `arrow-package.R`, where the `.onLoad()` happens, and notes go there. 
* Docs and crossreferences are generated by roxygen2 as usual. 

I deferred filling in all of the function notes. See followup JIRAs on ARROW-17665. 

Authored-by: Neal Richardson <[email protected]>
Signed-off-by: Neal Richardson <[email protected]>
  • Loading branch information
nealrichardson authored Sep 16, 2022
1 parent 2e72e0a commit 93626ee
Show file tree
Hide file tree
Showing 16 changed files with 1,109 additions and 109 deletions.
1 change: 1 addition & 0 deletions r/DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ Collate:
'dplyr-funcs-augmented.R'
'dplyr-funcs-conditional.R'
'dplyr-funcs-datetime.R'
'dplyr-funcs-doc.R'
'dplyr-funcs-math.R'
'dplyr-funcs-string.R'
'dplyr-funcs-type.R'
Expand Down
1 change: 1 addition & 0 deletions r/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ style-all:
R -s -e 'styler::style_file(setdiff(dir(pattern = "R$$", recursive = TRUE), source(".styler_excludes.R")$$value))'

doc: style
R -s -f data-raw/docgen.R
R -s -e 'roxygen2::roxygenize()'
-git add --all man/*.Rd

Expand Down
51 changes: 38 additions & 13 deletions r/R/arrow-package.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,25 +31,50 @@
#' @keywords internal
"_PACKAGE"

# TODO(ARROW-17666): Include notes about features not supported here.
supported_dplyr_methods <- list(
select = NULL,
filter = NULL,
collect = NULL,
summarise = NULL,
group_by = NULL,
groups = NULL,
group_vars = NULL,
group_by_drop_default = NULL,
ungroup = NULL,
mutate = NULL,
transmute = NULL,
arrange = NULL,
rename = NULL,
pull = NULL,
relocate = NULL,
compute = NULL,
collapse = NULL,
distinct = NULL,
left_join = NULL,
right_join = NULL,
inner_join = NULL,
full_join = NULL,
semi_join = NULL,
anti_join = NULL,
count = NULL,
tally = NULL,
rename_with = NULL,
union = NULL,
union_all = NULL,
glimpse = NULL,
show_query = NULL,
explain = NULL
)

#' @importFrom vctrs s3_register vec_size vec_cast vec_unique
.onLoad <- function(...) {
# Make sure C++ knows on which thread it is safe to call the R API
InitializeMainRThread()

dplyr_methods <- paste0(
"dplyr::",
c(
"select", "filter", "collect", "summarise", "group_by", "groups",
"group_vars", "group_by_drop_default", "ungroup", "mutate", "transmute",
"arrange", "rename", "pull", "relocate", "compute", "collapse",
"distinct", "left_join", "right_join", "inner_join", "full_join",
"semi_join", "anti_join", "count", "tally", "rename_with", "union",
"union_all", "glimpse", "show_query", "explain"
)
)
for (cl in c("Dataset", "ArrowTabular", "RecordBatchReader", "arrow_dplyr_query")) {
for (m in dplyr_methods) {
s3_register(m, cl)
for (m in names(supported_dplyr_methods)) {
s3_register(paste0("dplyr::", m), cl)
}
}
s3_register("dplyr::tbl_vars", "arrow_dplyr_query")
Expand Down
19 changes: 16 additions & 3 deletions r/R/dplyr-funcs-augmented.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,21 @@
# specific language governing permissions and limitations
# under the License.

#' Add the data filename as a column
#'
#' This function only exists inside `arrow` `dplyr` queries, and it only is
#' valid when quering on a `FileSystemDataset`.
#'
#' @return A `FieldRef` `Expression` that refers to the filename augmented
#' column.
#' @examples
#' \dontrun{
#' open_dataset("nyc-taxi") %>%
#' mutate(file = add_filename())
#' }
#' @keywords internal
add_filename <- function() Expression$field_ref("__filename")

register_bindings_augmented <- function() {
register_binding("add_filename", function() {
Expression$field_ref("__filename")
})
register_binding("arrow::add_filename", add_filename)
}
53 changes: 26 additions & 27 deletions r/R/dplyr-funcs-datetime.R
Original file line number Diff line number Diff line change
Expand Up @@ -649,55 +649,54 @@ register_bindings_datetime_parsers <- function() {

build_expr("assume_timezone", coalesce_output, options = list(timezone = tz))
})

}

register_bindings_datetime_rounding <- function() {
register_binding(
"round_date",
"lubridate::round_date",
function(x,
unit = "second",
week_start = getOption("lubridate.week.start", 7)) {
opts <- parse_period_unit(unit)
if (opts$unit == 7L) { # weeks (unit = 7L) need to accommodate week_start
return(shift_temporal_to_week("round_temporal", x, week_start, options = opts))
}

opts <- parse_period_unit(unit)
if (opts$unit == 7L) { # weeks (unit = 7L) need to accommodate week_start
return(shift_temporal_to_week("round_temporal", x, week_start, options = opts))
Expression$create("round_temporal", x, options = opts)
}

Expression$create("round_temporal", x, options = opts)
})
)

register_binding(
"floor_date",
"lubridate::floor_date",
function(x,
unit = "second",
week_start = getOption("lubridate.week.start", 7)) {
opts <- parse_period_unit(unit)
if (opts$unit == 7L) { # weeks (unit = 7L) need to accommodate week_start
return(shift_temporal_to_week("floor_temporal", x, week_start, options = opts))
}

opts <- parse_period_unit(unit)
if (opts$unit == 7L) { # weeks (unit = 7L) need to accommodate week_start
return(shift_temporal_to_week("floor_temporal", x, week_start, options = opts))
Expression$create("floor_temporal", x, options = opts)
}

Expression$create("floor_temporal", x, options = opts)
})
)

register_binding(
"ceiling_date",
"lubridate::ceiling_date",
function(x,
unit = "second",
change_on_boundary = NULL,
week_start = getOption("lubridate.week.start", 7)) {
opts <- parse_period_unit(unit)
if (is.null(change_on_boundary)) {
change_on_boundary <- ifelse(call_binding("is.Date", x), TRUE, FALSE)
}
opts$ceil_is_strictly_greater <- change_on_boundary

if (opts$unit == 7L) { # weeks (unit = 7L) need to accommodate week_start
return(shift_temporal_to_week("ceil_temporal", x, week_start, options = opts))
}
opts <- parse_period_unit(unit)
if (is.null(change_on_boundary)) {
change_on_boundary <- ifelse(call_binding("is.Date", x), TRUE, FALSE)
}
opts$ceil_is_strictly_greater <- change_on_boundary

Expression$create("ceil_temporal", x, options = opts)
})
if (opts$unit == 7L) { # weeks (unit = 7L) need to accommodate week_start
return(shift_temporal_to_week("ceil_temporal", x, week_start, options = opts))
}

Expression$create("ceil_temporal", x, options = opts)
}
)
}
Loading

0 comments on commit 93626ee

Please sign in to comment.