]> git.proxmox.com Git - ceph.git/blob - ceph/src/arrow/r/tests/testthat/test-chunked-array.R
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / r / tests / testthat / test-chunked-array.R
1 # Licensed to the Apache Software Foundation (ASF) under one
2 # or more contributor license agreements. See the NOTICE file
3 # distributed with this work for additional information
4 # regarding copyright ownership. The ASF licenses this file
5 # to you under the Apache License, Version 2.0 (the
6 # "License"); you may not use this file except in compliance
7 # with the License. You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing,
12 # software distributed under the License is distributed on an
13 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 # KIND, either express or implied. See the License for the
15 # specific language governing permissions and limitations
16 # under the License.
17
18
19 expect_chunked_roundtrip <- function(x, type) {
20 a <- ChunkedArray$create(!!!x)
21 flat_x <- unlist(x, recursive = FALSE)
22 attributes(flat_x) <- attributes(x[[1]])
23 expect_type_equal(a$type, type)
24 expect_identical(a$num_chunks, length(x))
25 expect_identical(length(a), length(flat_x))
26 if (!inherits(type, "ListType")) {
27 # TODO: revisit how missingness works with ListArrays
28 # R list objects don't handle missingness the same way as other vectors.
29 # Is there some vctrs thing we should do on the roundtrip back to R?
30 expect_identical(as.vector(is.na(a)), is.na(flat_x))
31 }
32 expect_as_vector(a, flat_x)
33 expect_as_vector(a$chunk(0), x[[1]])
34
35 if (length(flat_x)) {
36 a_sliced <- a$Slice(1)
37 x_sliced <- flat_x[-1]
38 expect_type_equal(a_sliced$type, type)
39 expect_identical(length(a_sliced), length(x_sliced))
40 if (!inherits(type, "ListType")) {
41 expect_identical(as.vector(is.na(a_sliced)), is.na(x_sliced))
42 }
43 expect_as_vector(a_sliced, x_sliced)
44 }
45 invisible(a)
46 }
47
48 test_that("ChunkedArray", {
49 x <- expect_chunked_roundtrip(list(1:10, 1:10, 1:5), int32())
50
51 y <- x$Slice(8)
52 expect_equal(y$type, int32())
53 expect_equal(y$num_chunks, 3L)
54 expect_equal(length(y), 17L)
55 expect_as_vector(y, c(9:10, 1:10, 1:5))
56
57 z <- x$Slice(8, 5)
58 expect_equal(z$type, int32())
59 expect_equal(z$num_chunks, 2L)
60 expect_equal(z$length(), 5L)
61 expect_equal(z$as_vector(), c(9:10, 1:3))
62
63 expect_chunked_roundtrip(list(c(1, 2, 3), c(4, 5, 6)), float64())
64
65 # input validation
66 expect_error(x$chunk(14), "subscript out of bounds")
67 expect_error(x$chunk("one"))
68 expect_error(x$chunk(NA_integer_), "'i' cannot be NA")
69 expect_error(x$chunk(-1), "subscript out of bounds")
70
71 expect_error(x$Slice("ten"))
72 expect_error(x$Slice(NA_integer_), "Slice 'offset' cannot be NA")
73 expect_error(x$Slice(NA), "Slice 'offset' cannot be NA")
74 expect_error(x$Slice(10, "ten"))
75 expect_error(x$Slice(10, NA_integer_), "Slice 'length' cannot be NA")
76 expect_error(x$Slice(NA_integer_, NA_integer_), "Slice 'offset' cannot be NA")
77 expect_error(x$Slice(c(10, 10)))
78 expect_error(x$Slice(10, c(10, 10)))
79 expect_error(x$Slice(1000), "Slice 'offset' greater than array length")
80 expect_error(x$Slice(-1), "Slice 'offset' cannot be negative")
81 expect_error(z$Slice(10, 10), "Slice 'offset' greater than array length")
82 expect_error(x$Slice(10, -1), "Slice 'length' cannot be negative")
83 expect_error(x$Slice(-1, 10), "Slice 'offset' cannot be negative")
84
85 expect_warning(x$Slice(10, 15), NA)
86 expect_warning(
87 overslice <- x$Slice(10, 16),
88 "Slice 'length' greater than available length"
89 )
90 expect_equal(length(overslice), 15)
91 expect_warning(z$Slice(2, 10), "Slice 'length' greater than available length")
92 })
93
94 test_that("print ChunkedArray", {
95 verify_output(test_path("test-chunked-array.txt"), {
96 chunked_array(c(1, 2, 3), c(4, 5, 6))
97 chunked_array(1:30, c(4, 5, 6))
98 chunked_array(1:30)
99 chunked_array(factor(c("a", "b")), factor(c("c", "d")))
100 })
101 })
102
103 test_that("ChunkedArray handles !!! splicing", {
104 data <- list(1, 2, 3)
105 x <- chunked_array(!!!data)
106 expect_equal(x$type, float64())
107 expect_equal(x$num_chunks, 3L)
108 })
109
110 test_that("ChunkedArray handles Inf", {
111 data <- list(c(Inf, 2:10), c(1:3, Inf, 5L), 1:10)
112 x <- chunked_array(!!!data)
113 expect_equal(x$type, float64())
114 expect_equal(x$num_chunks, 3L)
115 expect_equal(length(x), 25L)
116 expect_as_vector(x, c(c(Inf, 2:10), c(1:3, Inf, 5), 1:10))
117
118 chunks <- x$chunks
119 expect_as_vector(is.infinite(chunks[[2]]), is.infinite(data[[2]]))
120 expect_equal(
121 as.vector(is.infinite(x)),
122 c(is.infinite(data[[1]]), is.infinite(data[[2]]), is.infinite(data[[3]]))
123 )
124 })
125
126 test_that("ChunkedArray handles NA", {
127 data <- list(1:10, c(NA, 2:10), c(1:3, NA, 5L))
128 x <- chunked_array(!!!data)
129 expect_equal(x$type, int32())
130 expect_equal(x$num_chunks, 3L)
131 expect_equal(length(x), 25L)
132 expect_as_vector(x, c(1:10, c(NA, 2:10), c(1:3, NA, 5)))
133
134 chunks <- x$chunks
135 expect_as_vector(is.na(chunks[[2]]), is.na(data[[2]]))
136 expect_as_vector(is.na(x), c(is.na(data[[1]]), is.na(data[[2]]), is.na(data[[3]])))
137 })
138
139 test_that("ChunkedArray handles NaN", {
140 data <- list(as.numeric(1:10), c(NaN, 2:10), c(1:3, NaN, 5L))
141 x <- chunked_array(!!!data)
142
143 expect_equal(x$type, float64())
144 expect_equal(x$num_chunks, 3L)
145 expect_equal(length(x), 25L)
146 expect_as_vector(x, c(1:10, c(NaN, 2:10), c(1:3, NaN, 5)))
147
148 chunks <- x$chunks
149 expect_as_vector(is.nan(chunks[[2]]), is.nan(data[[2]]))
150 expect_as_vector(is.nan(x), c(is.nan(data[[1]]), is.nan(data[[2]]), is.nan(data[[3]])))
151 })
152
153 test_that("ChunkedArray supports logical vectors (ARROW-3341)", {
154 # with NA
155 data <- purrr::rerun(3, sample(c(TRUE, FALSE, NA), 100, replace = TRUE))
156 expect_chunked_roundtrip(data, bool())
157 # without NA
158 data <- purrr::rerun(3, sample(c(TRUE, FALSE), 100, replace = TRUE))
159 expect_chunked_roundtrip(data, bool())
160 })
161
162 test_that("ChunkedArray supports character vectors (ARROW-3339)", {
163 data <- list(
164 c("itsy", NA, "spider"),
165 c("Climbed", "up", "the", "water", "spout"),
166 c("Down", "came", "the", "rain"),
167 "And washed the spider out. "
168 )
169 expect_chunked_roundtrip(data, utf8())
170 })
171
172 test_that("ChunkedArray supports factors (ARROW-3716)", {
173 f <- factor(c("itsy", "bitsy", "spider", "spider"))
174 expect_chunked_roundtrip(list(f, f, f), dictionary(int8()))
175 })
176
177 test_that("ChunkedArray supports dates (ARROW-3716)", {
178 d <- Sys.Date() + 1:10
179 expect_chunked_roundtrip(list(d, d), date32())
180 })
181
182 test_that("ChunkedArray supports POSIXct (ARROW-3716)", {
183 times <- lubridate::ymd_hms("2018-10-07 19:04:05") + 1:10
184 expect_chunked_roundtrip(list(times, times), timestamp("us", "UTC"))
185 })
186
187 test_that("ChunkedArray supports integer64 (ARROW-3716)", {
188 x <- bit64::as.integer64(1:10) + MAX_INT
189 expect_chunked_roundtrip(list(x, x), int64())
190 # Also with a first chunk that would downcast
191 zero <- Array$create(0L)$cast(int64())
192 expect_type_equal(zero, int64())
193 ca <- ChunkedArray$create(zero, x)
194 expect_type_equal(ca, int64())
195 expect_s3_class(as.vector(ca), "integer64")
196 expect_identical(as.vector(ca), c(bit64::as.integer64(0L), x))
197 })
198
199 test_that("ChunkedArray supports difftime", {
200 time <- hms::hms(56, 34, 12)
201 expect_chunked_roundtrip(list(time, time), time32("s"))
202 })
203
204 test_that("ChunkedArray supports empty arrays (ARROW-13761)", {
205 types <- c(
206 int8(), int16(), int32(), int64(), uint8(), uint16(), uint32(),
207 uint64(), float32(), float64(), timestamp("ns"), binary(),
208 large_binary(), fixed_size_binary(32), date32(), date64(),
209 decimal(4, 2), dictionary(), struct(x = int32())
210 )
211
212 empty_filter <- ChunkedArray$create(type = bool())
213 for (type in types) {
214 one_empty_chunk <- ChunkedArray$create(type = type)
215 expect_type_equal(one_empty_chunk$type, type)
216 if (type != struct(x = int32())) {
217 expect_identical(length(one_empty_chunk), length(as.vector(one_empty_chunk)))
218 } else {
219 # struct -> tbl and length(tbl) is num_columns instead of num_rows
220 expect_identical(length(as.vector(one_empty_chunk)), 1L)
221 }
222 zero_empty_chunks <- one_empty_chunk$Filter(empty_filter)
223 expect_equal(zero_empty_chunks$num_chunks, 0)
224 expect_type_equal(zero_empty_chunks$type, type)
225 if (type != struct(x = int32())) {
226 expect_identical(length(zero_empty_chunks), length(as.vector(zero_empty_chunks)))
227 } else {
228 expect_identical(length(as.vector(zero_empty_chunks)), 1L)
229 }
230 }
231 })
232
233 test_that("integer types casts for ChunkedArray (ARROW-3741)", {
234 int_types <- c(int8(), int16(), int32(), int64())
235 uint_types <- c(uint8(), uint16(), uint32(), uint64())
236 float_types <- c(float32(), float64()) # float16() not really supported in C++ yet
237 all_types <- c(
238 int_types,
239 uint_types,
240 float_types
241 )
242
243 a <- chunked_array(1:10, 1:10)
244 for (type in c(int_types, uint_types)) {
245 casted <- a$cast(type)
246 expect_r6_class(casted, "ChunkedArray")
247 expect_type_equal(casted$type, type)
248 }
249 # Also test casting to double(), not actually a type, a base R function but should be alias for float64
250 dbl <- a$cast(double())
251 expect_r6_class(dbl, "ChunkedArray")
252 expect_type_equal(dbl$type, float64())
253 })
254
255 test_that("chunked_array() supports the type= argument. conversion from INTSXP and int64 to all int types", {
256 num_int32 <- 12L
257 num_int64 <- bit64::as.integer64(10)
258 for (type in all_types) {
259 expect_type_equal(chunked_array(num_int32, type = type)$type, type)
260 expect_type_equal(chunked_array(num_int64, type = type)$type, type)
261 }
262 # also test creating with double() "type"
263 expect_type_equal(chunked_array(num_int32, type = double())$type, float64())
264 })
265
266 test_that("ChunkedArray$create() aborts on overflow", {
267 expect_error(chunked_array(128L, type = int8())$type)
268 expect_error(chunked_array(-129L, type = int8())$type)
269
270 expect_error(chunked_array(256L, type = uint8())$type)
271 expect_error(chunked_array(-1L, type = uint8())$type)
272
273 expect_error(chunked_array(32768L, type = int16())$type)
274 expect_error(chunked_array(-32769L, type = int16())$type)
275
276 expect_error(chunked_array(65536L, type = uint16())$type)
277 expect_error(chunked_array(-1L, type = uint16())$type)
278
279 expect_error(chunked_array(65536L, type = uint16())$type)
280 expect_error(chunked_array(-1L, type = uint16())$type)
281
282 expect_error(chunked_array(bit64::as.integer64(2^31), type = int32()))
283 expect_error(chunked_array(bit64::as.integer64(2^32), type = uint32()))
284 })
285
286 test_that("chunked_array() convert doubles to integers", {
287 for (type in c(int_types, uint_types)) {
288 a <- chunked_array(10, type = type)
289 expect_type_equal(a$type, type)
290 if (type != uint64()) {
291 # exception for unsigned integer 64 that
292 # wa cannot handle yet
293 expect_true(as.vector(a) == 10)
294 }
295 }
296 })
297
298 test_that("chunked_array() uses the first ... to infer type", {
299 a <- chunked_array(10, 10L)
300 expect_type_equal(a$type, float64())
301 })
302
303 test_that("chunked_array() handles downcasting", {
304 a <- chunked_array(10L, 10)
305 expect_type_equal(a$type, int32())
306 expect_as_vector(a, c(10L, 10L))
307 })
308
309 test_that("chunked_array() makes chunks of the same type", {
310 a <- chunked_array(10L, bit64::as.integer64(13), type = int64())
311 for (chunk in a$chunks) {
312 expect_type_equal(chunk$type, int64())
313 }
314 })
315
316 test_that("chunked_array() handles 0 chunks if given a type", {
317 for (type in all_types) {
318 a <- chunked_array(type = type)
319 expect_type_equal(a$type, as_type(type))
320 expect_equal(length(a), 0L)
321 }
322 })
323
324 test_that("chunked_array() can ingest arrays (ARROW-3815)", {
325 expect_equal(
326 as.vector(chunked_array(1:5, Array$create(6:10))),
327 1:10
328 )
329 })
330
331 test_that("chunked_array() handles data frame -> struct arrays (ARROW-3811)", {
332 df <- tibble::tibble(x = 1:10, y = x / 2, z = letters[1:10])
333 a <- chunked_array(df, df)
334 expect_type_equal(a$type, struct(x = int32(), y = float64(), z = utf8()))
335 expect_equal(a$as_vector(), rbind(df, df), ignore_attr = TRUE)
336 })
337
338 test_that("ChunkedArray$View() (ARROW-6542)", {
339 a <- ChunkedArray$create(1:3, 1:4)
340 b <- a$View(float32())
341 expect_equal(b$type, float32())
342 expect_equal(length(b), 7L)
343 expect_true(all(
344 sapply(b$chunks, function(.x) .x$type == float32())
345 ))
346 # Input validation
347 expect_error(a$View("not a type"), "type must be a DataType, not character")
348 })
349
350 test_that("ChunkedArray$Validate()", {
351 a <- ChunkedArray$create(1:10)
352 expect_error(a$Validate(), NA)
353 })
354
355 test_that("[ ChunkedArray", {
356 one_chunk <- chunked_array(2:11)
357 x <- chunked_array(1:10, 31:40, 51:55)
358 # Slice
359 expect_as_vector(x[8:12], c(8:10, 31:32))
360 # Take from same chunk
361 expect_as_vector(x[c(11, 15, 12)], c(31, 35, 32))
362 # Take from multiple chunks (calls Concatenate)
363 expect_as_vector(x[c(2, 11, 15, 12, 3)], c(2, 31, 35, 32, 3))
364 # Take with Array (note these are 0-based)
365 take1 <- Array$create(c(10L, 14L, 11L))
366 expect_as_vector(x[take1], c(31, 35, 32))
367 # Take with ChunkedArray
368 take2 <- ChunkedArray$create(c(10L, 14L), 11L)
369 expect_as_vector(x[take2], c(31, 35, 32))
370
371 # Filter (with recycling)
372 expect_as_vector(
373 one_chunk[c(FALSE, TRUE, FALSE, FALSE, TRUE)],
374 c(3, 6, 8, 11)
375 )
376 # Filter where both are 1-chunk
377 expect_as_vector(
378 one_chunk[ChunkedArray$create(rep(c(FALSE, TRUE, FALSE, FALSE, TRUE), 2))],
379 c(3, 6, 8, 11)
380 )
381 # Filter multi-chunk with logical (-> Array)
382 expect_as_vector(
383 x[c(FALSE, TRUE, FALSE, FALSE, TRUE)],
384 c(2, 5, 7, 10, 32, 35, 37, 40, 52, 55)
385 )
386 # Filter with a chunked array with different sized chunks
387 p1 <- c(FALSE, TRUE, FALSE, FALSE, TRUE)
388 p2 <- c(TRUE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE)
389 filt <- ChunkedArray$create(p1, p2, p2)
390 expect_as_vector(
391 x[filt],
392 c(2, 5, 6, 8, 9, 35, 36, 38, 39, 55)
393 )
394 })
395
396 test_that("ChunkedArray head/tail", {
397 vec <- 11:20
398 a <- ChunkedArray$create(11:15, 16:20)
399 expect_as_vector(head(a), head(vec))
400 expect_as_vector(head(a, 4), head(vec, 4))
401 expect_as_vector(head(a, 40), head(vec, 40))
402 expect_as_vector(head(a, -4), head(vec, -4))
403 expect_as_vector(head(a, -40), head(vec, -40))
404 expect_as_vector(tail(a), tail(vec))
405 expect_as_vector(tail(a, 4), tail(vec, 4))
406 expect_as_vector(tail(a, 40), tail(vec, 40))
407 expect_as_vector(tail(a, -40), tail(vec, -40))
408 })
409
410 test_that("ChunkedArray$Equals", {
411 vec <- 11:20
412 a <- ChunkedArray$create(vec[1:5], vec[6:10])
413 b <- ChunkedArray$create(vec[1:5], vec[6:10])
414 expect_equal(a, b)
415 expect_true(a$Equals(b))
416 expect_false(a$Equals(vec))
417 })
418
419 test_that("Converting a chunked array unifies factors (ARROW-8374)", {
420 f1 <- factor(c("a"), levels = c("a", "b"))
421 f2 <- factor(c("c"), levels = c("c", "d"))
422 f3 <- factor(NA, levels = "a")
423 f4 <- factor()
424
425 res <- factor(c("a", "c", NA), levels = c("a", "b", "c", "d"))
426 ca <- ChunkedArray$create(f1, f2, f3, f4)
427
428 expect_identical(ca$as_vector(), res)
429 })
430
431 test_that("Handling string data with embedded nuls", {
432 raws <- structure(list(
433 as.raw(c(0x70, 0x65, 0x72, 0x73, 0x6f, 0x6e)),
434 as.raw(c(0x77, 0x6f, 0x6d, 0x61, 0x6e)),
435 as.raw(c(0x6d, 0x61, 0x00, 0x6e)), # <-- there's your nul, 0x00
436 as.raw(c(0x66, 0x00, 0x00, 0x61, 0x00, 0x6e)), # multiple nuls
437 as.raw(c(0x63, 0x61, 0x6d, 0x65, 0x72, 0x61)),
438 as.raw(c(0x74, 0x76))
439 ),
440 class = c("arrow_binary", "vctrs_vctr", "list")
441 )
442 chunked_array_with_nul <- ChunkedArray$create(raws)$cast(utf8())
443
444 # The behavior of the warnings/errors is slightly different with and without
445 # altrep. Without it (i.e. 3.5.0 and below, the error would trigger immediately
446 # on `as.vector()` where as with it, the error only happens on materialization)
447 skip_if_r_version("3.5.0")
448
449 v <- expect_error(as.vector(chunked_array_with_nul), NA)
450
451 expect_error(
452 v[],
453 paste0(
454 "embedded nul in string: 'ma\\0n'; to strip nuls when converting from Arrow to R, ",
455 "set options(arrow.skip_nul = TRUE)"
456 ),
457 fixed = TRUE
458 )
459
460 withr::with_options(list(arrow.skip_nul = TRUE), {
461 v <- expect_warning(as.vector(chunked_array_with_nul), NA)
462 expect_warning(
463 expect_identical(v[3], "man"),
464 "Stripping '\\0' (nul) from character vector",
465 fixed = TRUE
466 )
467 })
468 })