# test-csv.R — testthat suite for Arrow's CSV reading and writing
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# Only a subset of types round trips through CSV unchanged by default,
# so the shared fixture sticks to those columns.
tbl <- example_data[, c("dbl", "lgl", "false", "chr")]
# Keep a copy without the date column for the writer tests
tbl_no_dates <- tbl
# Append a date column to exercise date parsing on read
tbl$date <- Sys.Date() + seq_len(10)

# Shared output path used by the write_csv_arrow() tests
csv_file <- tempfile()

test_that("Can read csv file", {
  tmp <- tempfile()
  on.exit(unlink(tmp))

  write.csv(tbl, tmp, row.names = FALSE)

  # The same Table should come back regardless of how the input is opened
  expected <- Table$create(tbl)
  from_path <- read_csv_arrow(tmp, as_data_frame = FALSE)
  expect_equal(expected, from_path)
  from_mmap <- read_csv_arrow(mmap_open(tmp), as_data_frame = FALSE)
  expect_equal(expected, from_mmap)
  from_file <- read_csv_arrow(ReadableFile$create(tmp), as_data_frame = FALSE)
  expect_equal(expected, from_file)
})

test_that("read_csv_arrow(as_data_frame=TRUE)", {
  tmp <- tempfile()
  on.exit(unlink(tmp))

  write.csv(tbl, tmp, row.names = FALSE)
  # Reading as a data frame should reproduce the original tibble
  result <- read_csv_arrow(tmp, as_data_frame = TRUE)
  expect_equal(tbl, result)
})

test_that("read_delim_arrow parsing options: delim", {
  tmp <- tempfile()
  on.exit(unlink(tmp))

  write.table(tbl, tmp, sep = "\t", row.names = FALSE)
  # read_tsv_arrow() should agree with read_delim_arrow(delim = "\t")
  via_tsv <- read_tsv_arrow(tmp)
  via_delim <- read_delim_arrow(tmp, delim = "\t")
  expect_equal(via_tsv, via_delim)
  expect_equal(tbl, via_tsv)
})

test_that("read_delim_arrow parsing options: quote", {
  tmp <- tempfile()
  on.exit(unlink(tmp))

  df <- data.frame(a = c(1, 2), b = c("'abc'", "'def'"))
  write.table(df, tmp, sep = ";", row.names = FALSE, quote = FALSE)
  # With quote = "'" the single quotes are treated as field quoting
  got <- read_delim_arrow(tmp, delim = ";", quote = "'")

  # Is this a problem?
  # Component “a”: target is integer64, current is numeric
  got$a <- as.numeric(got$a)
  expect_equal(
    got,
    tibble::tibble(a = c(1, 2), b = c("abc", "def"))
  )
})

test_that("read_csv_arrow parsing options: col_names", {
  tmp <- tempfile()
  on.exit(unlink(tmp))

  # Write the CSV without its header row
  write.table(tbl, tmp, sep = ",", row.names = FALSE, col.names = FALSE)

  # col_names = FALSE autogenerates f0, f1, ... names
  autonamed <- read_csv_arrow(tmp, col_names = FALSE)
  expect_equal(autonamed$f0, tbl[[1]])

  # Supplying names explicitly restores the original structure
  named <- read_csv_arrow(tmp, col_names = names(tbl))
  expect_identical(names(named), names(tbl))
  expect_equal(tbl, named)

  # This errors (correctly) because not enough names were given,
  # but the error message is "Invalid: Empty CSV file", which is not accurate
  expect_error(
    read_csv_arrow(tmp, col_names = names(tbl)[1])
  )
  # Too many names errors as well (same inaccurate message)
  expect_error(
    read_csv_arrow(tmp, col_names = c(names(tbl), names(tbl)))
  )
})

test_that("read_csv_arrow parsing options: skip", {
  tmp <- tempfile()
  on.exit(unlink(tmp))

  # Two junk lines precede the real data; skip = 2 should ignore them
  cat("asdf\nqwer\n", file = tmp)
  suppressWarnings(write.table(tbl, tmp, sep = ",", row.names = FALSE, append = TRUE))

  result <- read_csv_arrow(tmp, skip = 2)

  expect_identical(names(result), names(tbl))
  expect_equal(tbl, result)
})

test_that("read_csv_arrow parsing options: skip_empty_rows", {
  tmp <- tempfile()
  on.exit(unlink(tmp))

  write.csv(tbl, tmp, row.names = FALSE)
  # Append two empty lines at the end of the file
  cat("\n\n", file = tmp, append = TRUE)

  result <- read_csv_arrow(tmp, skip_empty_rows = FALSE)

  # The empty lines become all-NA rows when they are not skipped
  expect_equal(nrow(result), nrow(tbl) + 2)
  expect_true(is.na(tail(result, 1)[[1]]))
})

test_that("read_csv_arrow parsing options: na strings", {
  tmp <- tempfile()
  on.exit(unlink(tmp))

  df <- data.frame(
    a = c(1.2, NA, NA, 3.4),
    b = c(NA, "B", "C", NA),
    stringsAsFactors = FALSE
  )
  write.csv(df, tmp, row.names = FALSE)
  # Sanity check: the default "NA" marker appears on every data row
  expect_equal(grep("NA", readLines(tmp)), 2:5)

  default_na <- read_csv_arrow(tmp)
  expect_equal(is.na(default_na$a), is.na(df$a))
  expect_equal(is.na(default_na$b), is.na(df$b))

  # Delete and write to the same file name again, with a custom NA marker
  unlink(tmp)
  write.csv(df, tmp, row.names = FALSE, na = "asdf")
  expect_equal(grep("asdf", readLines(tmp)), 2:5)

  custom_na <- read_csv_arrow(tmp, na = "asdf")
  expect_equal(is.na(custom_na$a), is.na(df$a))
  expect_equal(is.na(custom_na$b), is.na(df$b))
})

test_that("read_csv_arrow() respects col_select", {
  tmp <- tempfile()
  on.exit(unlink(tmp))

  write.csv(tbl, tmp, row.names = FALSE, quote = FALSE)

  # tidyselect helpers pick out only the matching columns
  as_table <- read_csv_arrow(tmp, col_select = ends_with("l"), as_data_frame = FALSE)
  expect_equal(as_table, Table$create(example_data[, c("dbl", "lgl")]))

  as_df <- read_csv_arrow(tmp, col_select = ends_with("l"), as_data_frame = TRUE)
  expect_equal(as_df, example_data[, c("dbl", "lgl")])
})

test_that("read_csv_arrow() can detect compression from file name", {
  skip_if_not_available("gzip")
  tmp <- tempfile(fileext = ".csv.gz")
  on.exit(unlink(tmp))

  # The .csv.gz extension alone should trigger transparent decompression
  write.csv(tbl, gzfile(tmp), row.names = FALSE, quote = FALSE)
  result <- read_csv_arrow(tmp)
  expect_equal(tbl, result)
})

test_that("read_csv_arrow(schema=)", {
  ints <- example_data[, "int"]
  tmp <- tempfile()
  on.exit(unlink(tmp))
  write.csv(ints, tmp, row.names = FALSE)

  # With an explicit schema, the header row must be skipped manually
  df <- read_csv_arrow(tmp, schema = schema(int = float64()), skip = 1)
  expect_identical(df, tibble::tibble(int = as.numeric(ints$int)))
})

test_that("read_csv_arrow(col_types = <Schema>)", {
  ints <- example_data[, "int"]
  tmp <- tempfile()
  on.exit(unlink(tmp))
  write.csv(ints, tmp, row.names = FALSE)

  # Unlike schema=, col_types still consumes the header row for names
  df <- read_csv_arrow(tmp, col_types = schema(int = float64()))
  expect_identical(df, tibble::tibble(int = as.numeric(ints$int)))
})

test_that("read_csv_arrow(col_types=string, col_names)", {
  ints <- example_data[, "int"]
  tmp <- tempfile()
  on.exit(unlink(tmp))
  write.csv(ints, tmp, row.names = FALSE)

  # Compact readr-style type string: "d" reads the column as double
  df <- read_csv_arrow(tmp, col_names = "int", col_types = "d", skip = 1)
  expect_identical(df, tibble::tibble(int = as.numeric(ints$int)))

  # col_types must be a single string, not a character vector
  expect_error(read_csv_arrow(tmp, col_types = c("i", "d")))
  # A compact col_types string without col_names also errors
  expect_error(read_csv_arrow(tmp, col_types = "d"))
  # Length mismatch between col_types and col_names
  expect_error(read_csv_arrow(tmp, col_types = "i", col_names = c("a", "b")))
  # "y" is not a recognized type abbreviation
  expect_error(read_csv_arrow(tmp, col_types = "y", col_names = "a"))
})

test_that("read_csv_arrow() can read timestamps", {
  ts_tbl <- tibble::tibble(time = as.POSIXct("2020-07-20 16:20", tz = "UTC"))
  tmp <- tempfile()
  on.exit(unlink(tmp))
  write.csv(ts_tbl, tmp, row.names = FALSE)

  # Explicit timestamp type via a Schema
  df <- read_csv_arrow(tmp, col_types = schema(time = timestamp(timezone = "UTC")))
  expect_equal(ts_tbl, df)

  # time zones are being read in as time zone-naive, hence ignore_attr = "tzone"
  df <- read_csv_arrow(tmp, col_types = "T", col_names = "time", skip = 1)
  expect_equal(ts_tbl, df, ignore_attr = "tzone")
})

test_that("read_csv_arrow(timestamp_parsers=)", {
  tmp <- tempfile()
  on.exit(unlink(tmp))
  dates <- tibble::tibble(time = "23/09/2020")
  write.csv(dates, tmp, row.names = FALSE)

  # A custom strptime-style format parses the non-ISO timestamp string
  df <- read_csv_arrow(
    tmp,
    col_types = schema(time = timestamp(timezone = "UTC")),
    timestamp_parsers = "%d/%m/%Y"
  )
  expect_equal(df$time, as.POSIXct(dates$time, format = "%d/%m/%Y", tz = "UTC"))
})

test_that("Skipping columns with null()", {
  tmp <- tempfile()
  on.exit(unlink(tmp))
  selected <- c("dbl", "lgl", "false", "chr")
  subset_tbl <- example_data[, selected]
  write.csv(subset_tbl, tmp, row.names = FALSE)

  # In the compact type string, "-" and "_" both drop a column
  df <- read_csv_arrow(tmp, col_types = "d-_c", col_names = selected, skip = 1)
  expect_identical(df, subset_tbl[, c("dbl", "chr")])
})

test_that("Mix of guessing and declaring types", {
  tmp <- tempfile()
  on.exit(unlink(tmp))
  selected <- c("dbl", "lgl", "false", "chr")
  subset_tbl <- example_data[, selected]
  write.csv(subset_tbl, tmp, row.names = FALSE)

  # Columns absent from the partial schema fall back to type inference
  tab <- read_csv_arrow(tmp, col_types = schema(dbl = float32()), as_data_frame = FALSE)
  expect_equal(tab$schema, schema(dbl = float32(), lgl = bool(), false = bool(), chr = utf8()))

  # In the compact string form, "?" requests inference for that column
  df <- read_csv_arrow(tmp, col_types = "d-?c", col_names = selected, skip = 1)
  expect_identical(df, subset_tbl[, c("dbl", "false", "chr")])
})

test_that("Write a CSV file with header", {
  # write_csv_arrow() returns its input and the file round trips exactly
  written <- write_csv_arrow(tbl_no_dates, csv_file)
  expect_true(file.exists(csv_file))
  expect_identical(written, tbl_no_dates)
  expect_identical(read_csv_arrow(csv_file), tbl_no_dates)

  # The date column should also survive the round trip
  written <- write_csv_arrow(tbl, csv_file)
  expect_true(file.exists(csv_file))
  expect_identical(written, tbl)
  expect_identical(read_csv_arrow(csv_file), tbl)
})

test_that("Write a CSV file with no header", {
  written <- write_csv_arrow(tbl_no_dates, csv_file, include_header = FALSE)
  expect_true(file.exists(csv_file))
  expect_identical(written, tbl_no_dates)

  # Without a header, reading back autogenerates f0..f3 column names
  round_trip <- read_csv_arrow(csv_file, col_names = FALSE)
  expected <- tbl_no_dates
  names(expected) <- c("f0", "f1", "f2", "f3")
  expect_identical(round_trip, expected)
})

test_that("Write a CSV file with different batch sizes", {
  # The round trip should be unaffected by the writer's batch size
  for (size in c(1, 2, 12)) {
    written <- write_csv_arrow(tbl_no_dates, csv_file, batch_size = size)
    expect_true(file.exists(csv_file))
    expect_identical(written, tbl_no_dates)
    expect_identical(read_csv_arrow(csv_file), tbl_no_dates)
  }
})

test_that("Write a CSV file with invalid input type", {
  # Only tabular objects are accepted; a bare Array is rejected
  expect_error(
    write_csv_arrow(Array$create(1:5), csv_file),
    regexp = "x must be an object of class 'data.frame', 'RecordBatch', or 'Table', not 'Array'."
  )
})

test_that("Write a CSV file with invalid batch size", {
  # batch_size must be strictly positive
  expect_error(
    write_csv_arrow(tbl_no_dates, csv_file, batch_size = -1),
    regexp = "batch_size not greater than 0"
  )
})

test_that("time mapping work as expected (ARROW-13624)", {
  times <- tibble::tibble(
    dt = as.POSIXct(c("2020-07-20 16:20", NA), tz = "UTC"),
    time = c(hms::as_hms("16:20:00"), NA)
  )
  tmp <- tempfile()
  on.exit(unlink(tmp))
  write.csv(times, tmp, row.names = FALSE)

  # "T" (timestamp) for dt and "t" (time) for the hms column parses cleanly
  df <- read_csv_arrow(tmp,
    col_names = c("dt", "time"),
    col_types = "Tt",
    skip = 1
  )

  # Swapping the two type codes makes parsing fail
  expect_error(
    read_csv_arrow(tmp,
      col_names = c("dt", "time"),
      col_types = "tT", skip = 1
    )
  )

  expect_equal(df, times, ignore_attr = "tzone")
})