# test-csv.R — testthat suite for Arrow's CSV reading and writing
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# Only a subset of types round trips through CSV unchanged by default,
# so the shared fixture sticks to those columns.
tbl <- example_data[, c("dbl", "lgl", "false", "chr")]
# Keep a copy without the date column for the writer tests
tbl_no_dates <- tbl
# Append a date column to exercise date parsing on read
tbl$date <- Sys.Date() + seq_len(10)

# Shared output path used by the write_csv_arrow() tests
csv_file <- tempfile()

test_that("Can read csv file", {
  tmp <- tempfile()
  on.exit(unlink(tmp))

  write.csv(tbl, tmp, row.names = FALSE)

  # The same Table should come back regardless of how the input is opened
  expected <- Table$create(tbl)
  from_path <- read_csv_arrow(tmp, as_data_frame = FALSE)
  expect_equal(expected, from_path)
  from_mmap <- read_csv_arrow(mmap_open(tmp), as_data_frame = FALSE)
  expect_equal(expected, from_mmap)
  from_file <- read_csv_arrow(ReadableFile$create(tmp), as_data_frame = FALSE)
  expect_equal(expected, from_file)
})

test_that("read_csv_arrow(as_data_frame=TRUE)", {
  tmp <- tempfile()
  on.exit(unlink(tmp))

  write.csv(tbl, tmp, row.names = FALSE)
  # Reading as a data frame should reproduce the original tibble
  result <- read_csv_arrow(tmp, as_data_frame = TRUE)
  expect_equal(tbl, result)
})

test_that("read_delim_arrow parsing options: delim", {
  tmp <- tempfile()
  on.exit(unlink(tmp))

  write.table(tbl, tmp, sep = "\t", row.names = FALSE)
  # read_tsv_arrow() should agree with read_delim_arrow(delim = "\t")
  via_tsv <- read_tsv_arrow(tmp)
  via_delim <- read_delim_arrow(tmp, delim = "\t")
  expect_equal(via_tsv, via_delim)
  expect_equal(tbl, via_tsv)
})

test_that("read_delim_arrow parsing options: quote", {
  tmp <- tempfile()
  on.exit(unlink(tmp))

  df <- data.frame(a = c(1, 2), b = c("'abc'", "'def'"))
  write.table(df, tmp, sep = ";", row.names = FALSE, quote = FALSE)
  # With quote = "'" the single quotes are treated as field quoting
  got <- read_delim_arrow(tmp, delim = ";", quote = "'")

  # Is this a problem?
  # Component “a”: target is integer64, current is numeric
  got$a <- as.numeric(got$a)
  expect_equal(
    got,
    tibble::tibble(a = c(1, 2), b = c("abc", "def"))
  )
})

test_that("read_csv_arrow parsing options: col_names", {
  tmp <- tempfile()
  on.exit(unlink(tmp))

  # Write the CSV without its header row
  write.table(tbl, tmp, sep = ",", row.names = FALSE, col.names = FALSE)

  # col_names = FALSE autogenerates f0, f1, ... names
  autonamed <- read_csv_arrow(tmp, col_names = FALSE)
  expect_equal(autonamed$f0, tbl[[1]])

  # Supplying names explicitly restores the original structure
  named <- read_csv_arrow(tmp, col_names = names(tbl))
  expect_identical(names(named), names(tbl))
  expect_equal(tbl, named)

  # This errors (correctly) because not enough names were given,
  # but the error message is "Invalid: Empty CSV file", which is not accurate
  expect_error(
    read_csv_arrow(tmp, col_names = names(tbl)[1])
  )
  # Too many names errors as well (same inaccurate message)
  expect_error(
    read_csv_arrow(tmp, col_names = c(names(tbl), names(tbl)))
  )
})

test_that("read_csv_arrow parsing options: skip", {
  tmp <- tempfile()
  on.exit(unlink(tmp))

  # Two junk lines precede the real data; skip = 2 should ignore them
  cat("asdf\nqwer\n", file = tmp)
  suppressWarnings(write.table(tbl, tmp, sep = ",", row.names = FALSE, append = TRUE))

  result <- read_csv_arrow(tmp, skip = 2)

  expect_identical(names(result), names(tbl))
  expect_equal(tbl, result)
})

test_that("read_csv_arrow parsing options: skip_empty_rows", {
  tmp <- tempfile()
  on.exit(unlink(tmp))

  write.csv(tbl, tmp, row.names = FALSE)
  # Append two empty lines at the end of the file
  cat("\n\n", file = tmp, append = TRUE)

  result <- read_csv_arrow(tmp, skip_empty_rows = FALSE)

  # The empty lines become all-NA rows when they are not skipped
  expect_equal(nrow(result), nrow(tbl) + 2)
  expect_true(is.na(tail(result, 1)[[1]]))
})

test_that("read_csv_arrow parsing options: na strings", {
  tmp <- tempfile()
  on.exit(unlink(tmp))

  df <- data.frame(
    a = c(1.2, NA, NA, 3.4),
    b = c(NA, "B", "C", NA),
    stringsAsFactors = FALSE
  )
  write.csv(df, tmp, row.names = FALSE)
  # Sanity check: the default "NA" marker appears on every data row
  expect_equal(grep("NA", readLines(tmp)), 2:5)

  default_na <- read_csv_arrow(tmp)
  expect_equal(is.na(default_na$a), is.na(df$a))
  expect_equal(is.na(default_na$b), is.na(df$b))

  # Delete and write to the same file name again, with a custom NA marker
  unlink(tmp)
  write.csv(df, tmp, row.names = FALSE, na = "asdf")
  expect_equal(grep("asdf", readLines(tmp)), 2:5)

  custom_na <- read_csv_arrow(tmp, na = "asdf")
  expect_equal(is.na(custom_na$a), is.na(df$a))
  expect_equal(is.na(custom_na$b), is.na(df$b))
})

test_that("read_csv_arrow() respects col_select", {
  tmp <- tempfile()
  on.exit(unlink(tmp))

  write.csv(tbl, tmp, row.names = FALSE, quote = FALSE)

  # tidyselect helpers pick out only the matching columns
  as_table <- read_csv_arrow(tmp, col_select = ends_with("l"), as_data_frame = FALSE)
  expect_equal(as_table, Table$create(example_data[, c("dbl", "lgl")]))

  as_df <- read_csv_arrow(tmp, col_select = ends_with("l"), as_data_frame = TRUE)
  expect_equal(as_df, example_data[, c("dbl", "lgl")])
})

test_that("read_csv_arrow() can detect compression from file name", {
  skip_if_not_available("gzip")
  tmp <- tempfile(fileext = ".csv.gz")
  on.exit(unlink(tmp))

  # The .csv.gz extension alone should trigger transparent decompression
  write.csv(tbl, gzfile(tmp), row.names = FALSE, quote = FALSE)
  result <- read_csv_arrow(tmp)
  expect_equal(tbl, result)
})

test_that("read_csv_arrow(schema=)", {
  ints <- example_data[, "int"]
  tmp <- tempfile()
  on.exit(unlink(tmp))
  write.csv(ints, tmp, row.names = FALSE)

  # With an explicit schema, the header row must be skipped manually
  df <- read_csv_arrow(tmp, schema = schema(int = float64()), skip = 1)
  expect_identical(df, tibble::tibble(int = as.numeric(ints$int)))
})

test_that("read_csv_arrow(col_types = <Schema>)", {
  ints <- example_data[, "int"]
  tmp <- tempfile()
  on.exit(unlink(tmp))
  write.csv(ints, tmp, row.names = FALSE)

  # Unlike schema=, col_types still consumes the header row for names
  df <- read_csv_arrow(tmp, col_types = schema(int = float64()))
  expect_identical(df, tibble::tibble(int = as.numeric(ints$int)))
})

test_that("read_csv_arrow(col_types=string, col_names)", {
  ints <- example_data[, "int"]
  tmp <- tempfile()
  on.exit(unlink(tmp))
  write.csv(ints, tmp, row.names = FALSE)

  # Compact readr-style type string: "d" reads the column as double
  df <- read_csv_arrow(tmp, col_names = "int", col_types = "d", skip = 1)
  expect_identical(df, tibble::tibble(int = as.numeric(ints$int)))

  # col_types must be a single string, not a character vector
  expect_error(read_csv_arrow(tmp, col_types = c("i", "d")))
  # A compact col_types string without col_names also errors
  expect_error(read_csv_arrow(tmp, col_types = "d"))
  # Length mismatch between col_types and col_names
  expect_error(read_csv_arrow(tmp, col_types = "i", col_names = c("a", "b")))
  # "y" is not a recognized type abbreviation
  expect_error(read_csv_arrow(tmp, col_types = "y", col_names = "a"))
})

test_that("read_csv_arrow() can read timestamps", {
  ts_tbl <- tibble::tibble(time = as.POSIXct("2020-07-20 16:20", tz = "UTC"))
  tmp <- tempfile()
  on.exit(unlink(tmp))
  write.csv(ts_tbl, tmp, row.names = FALSE)

  # Explicit timestamp type via a Schema
  df <- read_csv_arrow(tmp, col_types = schema(time = timestamp(timezone = "UTC")))
  expect_equal(ts_tbl, df)

  # time zones are being read in as time zone-naive, hence ignore_attr = "tzone"
  df <- read_csv_arrow(tmp, col_types = "T", col_names = "time", skip = 1)
  expect_equal(ts_tbl, df, ignore_attr = "tzone")
})

test_that("read_csv_arrow(timestamp_parsers=)", {
  tmp <- tempfile()
  on.exit(unlink(tmp))
  dates <- tibble::tibble(time = "23/09/2020")
  write.csv(dates, tmp, row.names = FALSE)

  # A custom strptime-style format parses the non-ISO timestamp string
  df <- read_csv_arrow(
    tmp,
    col_types = schema(time = timestamp(timezone = "UTC")),
    timestamp_parsers = "%d/%m/%Y"
  )
  expect_equal(df$time, as.POSIXct(dates$time, format = "%d/%m/%Y", tz = "UTC"))
})

test_that("Skipping columns with null()", {
  tmp <- tempfile()
  on.exit(unlink(tmp))
  selected <- c("dbl", "lgl", "false", "chr")
  subset_tbl <- example_data[, selected]
  write.csv(subset_tbl, tmp, row.names = FALSE)

  # In the compact type string, "-" and "_" both drop a column
  df <- read_csv_arrow(tmp, col_types = "d-_c", col_names = selected, skip = 1)
  expect_identical(df, subset_tbl[, c("dbl", "chr")])
})

test_that("Mix of guessing and declaring types", {
  tmp <- tempfile()
  on.exit(unlink(tmp))
  selected <- c("dbl", "lgl", "false", "chr")
  subset_tbl <- example_data[, selected]
  write.csv(subset_tbl, tmp, row.names = FALSE)

  # Columns absent from the partial schema fall back to type inference
  tab <- read_csv_arrow(tmp, col_types = schema(dbl = float32()), as_data_frame = FALSE)
  expect_equal(tab$schema, schema(dbl = float32(), lgl = bool(), false = bool(), chr = utf8()))

  # In the compact string form, "?" requests inference for that column
  df <- read_csv_arrow(tmp, col_types = "d-?c", col_names = selected, skip = 1)
  expect_identical(df, subset_tbl[, c("dbl", "false", "chr")])
})

test_that("Write a CSV file with header", {
  # write_csv_arrow() returns its input and the file round trips exactly
  written <- write_csv_arrow(tbl_no_dates, csv_file)
  expect_true(file.exists(csv_file))
  expect_identical(written, tbl_no_dates)
  expect_identical(read_csv_arrow(csv_file), tbl_no_dates)

  # The date column should also survive the round trip
  written <- write_csv_arrow(tbl, csv_file)
  expect_true(file.exists(csv_file))
  expect_identical(written, tbl)
  expect_identical(read_csv_arrow(csv_file), tbl)
})

test_that("Write a CSV file with no header", {
  written <- write_csv_arrow(tbl_no_dates, csv_file, include_header = FALSE)
  expect_true(file.exists(csv_file))
  expect_identical(written, tbl_no_dates)

  # Without a header, reading back autogenerates f0..f3 column names
  round_trip <- read_csv_arrow(csv_file, col_names = FALSE)
  expected <- tbl_no_dates
  names(expected) <- c("f0", "f1", "f2", "f3")
  expect_identical(round_trip, expected)
})

test_that("Write a CSV file with different batch sizes", {
  # The round trip should be unaffected by the writer's batch size
  for (size in c(1, 2, 12)) {
    written <- write_csv_arrow(tbl_no_dates, csv_file, batch_size = size)
    expect_true(file.exists(csv_file))
    expect_identical(written, tbl_no_dates)
    expect_identical(read_csv_arrow(csv_file), tbl_no_dates)
  }
})

test_that("Write a CSV file with invalid input type", {
  # Only tabular objects are accepted; a bare Array is rejected
  expect_error(
    write_csv_arrow(Array$create(1:5), csv_file),
    regexp = "x must be an object of class 'data.frame', 'RecordBatch', or 'Table', not 'Array'."
  )
})

test_that("Write a CSV file with invalid batch size", {
  # batch_size must be strictly positive
  expect_error(
    write_csv_arrow(tbl_no_dates, csv_file, batch_size = -1),
    regexp = "batch_size not greater than 0"
  )
})

test_that("time mapping work as expected (ARROW-13624)", {
  times <- tibble::tibble(
    dt = as.POSIXct(c("2020-07-20 16:20", NA), tz = "UTC"),
    time = c(hms::as_hms("16:20:00"), NA)
  )
  tmp <- tempfile()
  on.exit(unlink(tmp))
  write.csv(times, tmp, row.names = FALSE)

  # "T" (timestamp) for dt and "t" (time) for the hms column parses cleanly
  df <- read_csv_arrow(tmp,
    col_names = c("dt", "time"),
    col_types = "Tt",
    skip = 1
  )

  # Swapping the two type codes makes parsing fail
  expect_error(
    read_csv_arrow(tmp,
      col_names = c("dt", "time"),
      col_types = "tT", skip = 1
    )
  )

  expect_equal(df, times, ignore_attr = "tzone")
})