apacheGH-15247: [R] Error when trying to save a data.frame with NULL …

…column names (apache#34798) Before: ``` df <- data.frame(a=1:3,b=4:6) names(df) <- NULL arrow::write_feather(x=df, sink="~/tmp/df") #> Error: Invalid input type, expected 'character' actual 'NULL' arrow::write_csv_arrow(df, "tst.csv") #> Error in `value[[3L]]()`: #> ! x must be an object of class 'data.frame', 'RecordBatch', 'Dataset', 'Table', or 'RecordBatchReader' not 'data.frame'. #> Backtrace: #> ▆ #> 1. └─arrow::write_csv_arrow(df, "tst.csv") #> 2. └─base::tryCatch(...) #> 3. └─base (local) tryCatchList(expr, classes, parentenv, handlers) #> 4. └─base (local) tryCatchOne(expr, names, parentenv, handlers[[1L]]) #> 5. └─value[[3L]](cond) #> 6. └─rlang::abort(...) arrow::write_parquet(df, "tst.csv") #> Error: Invalid input type, expected 'character' actual 'NULL' ``` After: ``` r df <- data.frame(a=1:3,b=4:6) names(df) <- NULL arrow::write_feather(x=df, sink="~/tmp/df") #> Error in `arrow::write_feather()`: #> ! Input data frame columns must be named #> ℹ Column names are NULL #> Backtrace: #> ▆ #> 1. └─arrow::write_feather(x = df, sink = "~/tmp/df") #> 2. └─arrow:::as_writable_table(x) at r/R/feather.R:124:2 #> 3. ├─base::tryCatch(...) at r/R/util.R:150:2 #> 4. │ └─base (local) tryCatchList(expr, classes, parentenv, handlers) #> 5. │ └─base (local) tryCatchOne(expr, names, parentenv, handlers[[1L]]) #> 6. │ └─base (local) doTryCatch(return(expr), name, parentenv, handler) #> 7. ├─arrow::as_arrow_table(x) at r/R/util.R:150:2 #> 8. └─arrow:::as_arrow_table.data.frame(x) at r/R/table.R:284:2 #> 9. └─arrow:::check_named_cols(x) at r/R/table.R:324:2 #> 10. └─rlang::abort(...) at r/R/util.R:255:4 arrow::write_csv_arrow(df, "tst.csv") #> Error in `arrow::write_csv_arrow()`: #> ! Input data frame columns must be named #> ℹ Column names are NULL #> Backtrace: #> ▆ #> 1. └─arrow::write_csv_arrow(df, "tst.csv") #> 2. └─base::tryCatch(...) at r/R/csv.R:782:4 #> 3. └─base (local) tryCatchList(expr, classes, parentenv, handlers) #> 4. └─base (local) tryCatchOne(expr, names, parentenv, handlers[[1L]]) #> 5. └─value[[3L]](cond) #> 6. └─rlang::abort(conditionMessage(e), parent = NA) at r/R/csv.R:786:10 arrow::write_parquet(df, "tst.csv") #> Error in `arrow::write_parquet()`: #> ! Input data frame columns must be named #> ℹ Column names are NULL #> Backtrace: #> ▆ #> 1. └─arrow::write_parquet(df, "tst.csv") #> 2. └─arrow:::as_writable_table(x) at r/R/parquet.R:164:2 #> 3. ├─base::tryCatch(...) at r/R/util.R:150:2 #> 4. │ └─base (local) tryCatchList(expr, classes, parentenv, handlers) #> 5. │ └─base (local) tryCatchOne(expr, names, parentenv, handlers[[1L]]) #> 6. │ └─base (local) doTryCatch(return(expr), name, parentenv, handler) #> 7. ├─arrow::as_arrow_table(x) at r/R/util.R:150:2 #> 8. └─arrow:::as_arrow_table.data.frame(x) at r/R/table.R:284:2 #> 9. └─arrow:::check_named_cols(x) at r/R/table.R:324:2 #> 10. └─rlang::abort(...) at r/R/util.R:255:4 ``` * Closes: apache#15247 Authored-by: Nic Crane <[email protected]> Signed-off-by: Nic Crane <[email protected]>
doytsujin · Mar 31, 2023 · 8efb43d · 8efb43d
1 parent 68dc40a
commit 8efb43d
Show file tree

Hide file tree

Showing 8 changed files with 43 additions and 6 deletions.
diff --git a/r/R/csv.R b/r/R/csv.R
@@ -782,12 +782,17 @@ write_csv_arrow <- function(x,
     tryCatch(
       x <- as_record_batch_reader(x),
       error = function(e) {
-        abort(
-          paste0(
-            "x must be an object of class 'data.frame', 'RecordBatch', ",
-            "'Dataset', 'Table', or 'RecordBatchReader' not '", class(x)[1], "'."
+        if (grepl("Input data frame columns must be named", conditionMessage(e))) {
+          abort(conditionMessage(e), parent = NA)
+        } else {
+          abort(
+            paste0(
+              "x must be an object of class 'data.frame', 'RecordBatch', ",
+              "'Dataset', 'Table', or 'RecordBatchReader' not '", class(x)[1], "'."
+            ),
+            parent = NA
           )
-        )
+        }
       }
     )
   }

diff --git a/r/R/dataset-write.R b/r/R/dataset-write.R
@@ -141,6 +141,7 @@ write_dataset <- function(dataset,
     # partitioning vars need to be in the `select` schema
     dataset <- ensure_group_vars(dataset)
   } else {
+    check_named_cols(dataset)
     if (inherits(dataset, "grouped_df")) {
       force(partitioning)
       # Drop the grouping metadata before writing; we've already consumed it

diff --git a/r/R/record-batch-reader.R b/r/R/record-batch-reader.R
@@ -237,6 +237,7 @@ as_record_batch_reader.RecordBatch <- function(x, ...) {
 #' @rdname as_record_batch_reader
 #' @export
 as_record_batch_reader.data.frame <- function(x, ...) {
+  check_named_cols(x)
   RecordBatchReader$create(as_record_batch(x))
 }
 

diff --git a/r/R/table.R b/r/R/table.R
@@ -321,6 +321,7 @@ as_arrow_table.RecordBatch <- function(x, ..., schema = NULL) {
 #' @rdname as_arrow_table
 #' @export
 as_arrow_table.data.frame <- function(x, ..., schema = NULL) {
+  check_named_cols(x)
   Table$create(x, schema = schema)
 }
 

diff --git a/r/R/util.R b/r/R/util.R
@@ -171,7 +171,6 @@ recycle_scalars <- function(arrays) {
   is_scalar <- arr_lens == 1
 
   if (length(arrays) > 1 && any(is_scalar) && !all(is_scalar)) {
-
     # Recycling not supported for tibbles and data.frames
     if (all(map_lgl(arrays, ~ inherits(.x, "data.frame")))) {
       abort(c(
@@ -250,3 +249,15 @@ augment_io_error_msg <- function(e, call, schema = NULL, format = NULL) {
   handle_augmented_field_misuse(msg, call)
   abort(msg, call = call)
 }
+
+check_named_cols <- function(df) {
+  if (inherits(df, "data.frame") && is.null(names(df))) {
+    abort(
+      c(
+        "Input data frame columns must be named",
+        i = "Column names are NULL"
+      ),
+      call = caller_env(n = 3)
+    )
+  }
+}
diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R
@@ -705,3 +705,9 @@ test_that("can create empty table from schema", {
   expect_equal(nrow(out), 0)
   expect_equal(out$schema, schema)
 })
+
+test_that("as_arrow_table() errors on data.frame with NULL names", {
+  df <- data.frame(a = 1, b = "two")
+  names(df) <- NULL
+  expect_error(as_arrow_table(df), "Input data frame columns must be named")
+})
diff --git a/r/tests/testthat/test-dataset-write.R b/r/tests/testthat/test-dataset-write.R
@@ -806,3 +806,9 @@ test_that("Can delete filesystem dataset after write_dataset", {
   unlink(dataset_dir2, recursive = TRUE)
   expect_false(dir.exists(dataset_dir2))
 })
+
+test_that("write_dataset() errors on data.frame with NULL names", {
+  df <- data.frame(a = 1, b = "two")
+  names(df) <- NULL
+  expect_error(write_dataset(df, tempfile()), "Input data frame columns must be named")
+})
diff --git a/r/tests/testthat/test-record-batch-reader.R b/r/tests/testthat/test-record-batch-reader.R
@@ -269,3 +269,9 @@ test_that("as_record_batch_reader() works for function", {
     "Expected fun\\(\\) to return batch with schema 'a: string'"
   )
 })
+
+test_that("as_record_batch_reader() errors on data.frame with NULL names", {
+  df <- data.frame(a = 1, b = "two")
+  names(df) <- NULL
+  expect_error(as_record_batch_reader(df), "Input data frame columns must be named")
+})