]> git.proxmox.com Git - ceph.git/blame - ceph/src/arrow/r/tests/testthat/test-dplyr-funcs-type.R
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / r / tests / testthat / test-dplyr-funcs-type.R
CommitLineData
1d09f67e
TL
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
17
# Gate on the "dataset" feature before anything else runs.
# NOTE(review): skip_if_not_available() is defined outside this file;
# presumably it checks whether Arrow was built with that feature -- confirm.
skip_if_not_available("dataset")

# dplyr supplies the verbs under test; bit64 supplies integer64 helpers.
# Both are attached quietly to keep the test output clean.
library(dplyr, warn.conflicts = FALSE)
suppressPackageStartupMessages(library(bit64))


# Shared input for most tests below. example_data is defined outside this
# file; the tests read its chr, dbl, fct, int and lgl columns.
tbl <- example_data
25
test_that("explicit type conversions with cast()", {
  # Checks that cast() inside transmute() yields a column of the requested
  # Arrow type, for a base R integer input and a bit64 integer64 input.
  num_int32 <- 12L
  num_int64 <- bit64::as.integer64(10)

  int_types <- c(int8(), int16(), int32(), int64())
  uint_types <- c(uint8(), uint16(), uint32(), uint64())
  float_types <- c(float32(), float64())

  types <- c(
    int_types,
    uint_types,
    float_types,
    # double() is not actually a type but a base R function; it should act as
    # an alias for float64. It returns a zero-length vector, which c() would
    # silently drop, so it is wrapped in list() to survive as a loop element.
    list(double()),
    string()
  )

  for (type in types) {
    expect_type_equal(
      object = {
        t1 <- Table$create(x = num_int32) %>%
          transmute(x = cast(x, type)) %>%
          compute()
        t1$schema[[1]]$type
      },
      as_type(type)
    )
    expect_type_equal(
      object = {
        t1 <- Table$create(x = num_int64) %>%
          transmute(x = cast(x, type)) %>%
          compute()
        t1$schema[[1]]$type
      },
      as_type(type)
    )
  }

  # Arrow errors when truncating floats...
  # Only the failing pipeline is wrapped: a type expectation nested inside
  # expect_error() could never run once compute() has raised.
  expect_error(
    Table$create(pi = pi) %>%
      transmute(three = cast(pi, int32())) %>%
      compute(),
    "truncated"
  )

  # ... unless safe = FALSE (or allow_float_truncate = TRUE)
  expect_type_equal(
    object = {
      t1 <- Table$create(pi = pi) %>%
        transmute(three = cast(pi, int32(), safe = FALSE)) %>%
        compute()
      t1$schema[[1]]$type
    },
    int32()
  )
})
88
test_that("explicit type conversions with as.*()", {
  library(bit64)

  # Inputs for the cases that do not use the shared `tbl`.
  digit_strings <- tibble(chr = c("1", "2", "3"))
  ten_billion <- tibble(
    chr = "10000000000",
    dbl = 10000000000,
    i64 = as.integer64(1e10)
  )
  truthy_inputs <- tibble(
    chr = c("TRUE", "FALSE", "true", "false"),
    dbl = c(1, 0, -99, 0),
    int = c(1L, 0L, -99L, 0L)
  )
  inputs_with_na <- tibble(
    dbl = c(1, 0, NA_real_),
    int = c(1L, 0L, NA_integer_),
    lgl = c(TRUE, FALSE, NA)
  )

  # Integer and double columns to each of the four basic target types.
  compare_dplyr_binding(
    .input %>%
      transmute(
        int2chr = as.character(int),
        int2dbl = as.double(int),
        int2int = as.integer(int),
        int2num = as.numeric(int),
        dbl2chr = as.character(dbl),
        dbl2dbl = as.double(dbl),
        dbl2int = as.integer(dbl),
        dbl2num = as.numeric(dbl)
      ) %>%
      collect(),
    tbl
  )

  # Strings holding digits.
  compare_dplyr_binding(
    .input %>%
      transmute(
        chr2chr = as.character(chr),
        chr2dbl = as.double(chr),
        chr2int = as.integer(chr),
        chr2num = as.numeric(chr)
      ) %>%
      collect(),
    digit_strings
  )

  # Conversions to integer64 with a value beyond the 32-bit integer range.
  compare_dplyr_binding(
    .input %>%
      transmute(
        chr2i64 = as.integer64(chr),
        dbl2i64 = as.integer64(dbl),
        i642i64 = as.integer64(i64)
      ) %>%
      collect(),
    ten_billion
  )

  # Conversions to logical.
  compare_dplyr_binding(
    .input %>%
      transmute(
        chr2lgl = as.logical(chr),
        dbl2lgl = as.logical(dbl),
        int2lgl = as.logical(int)
      ) %>%
      collect(),
    truthy_inputs
  )

  # Every pairing again, this time with missing values in the input.
  compare_dplyr_binding(
    .input %>%
      transmute(
        dbl2chr = as.character(dbl),
        dbl2dbl = as.double(dbl),
        dbl2int = as.integer(dbl),
        dbl2lgl = as.logical(dbl),
        int2chr = as.character(int),
        int2dbl = as.double(int),
        int2int = as.integer(int),
        int2lgl = as.logical(int),
        # Arrow yields "true"/"false" for this column ...
        lgl2chr = as.character(lgl),
        lgl2dbl = as.double(lgl),
        lgl2int = as.integer(lgl),
        lgl2lgl = as.logical(lgl)
      ) %>%
      collect() %>%
      # ... while R yields "TRUE"/"FALSE", so upper-case it. toupper() has to
      # come *after* collect(), or the test would need a skip when utf8proc
      # is not available.
      mutate(lgl2chr = toupper(lgl2chr)),
    inputs_with_na
  )
})
167
test_that("is.finite(), is.infinite(), is.nan()", {
  # One column mixing very small and very large doubles, zero, NA, NaN and
  # both infinities.
  edge_values <- tibble(
    x = c(
      -4.94065645841246544e-324, 1.79769313486231570e+308, 0,
      NA_real_, NaN, Inf, -Inf
    )
  )

  compare_dplyr_binding(
    .input %>%
      transmute(
        is_fin = is.finite(x),
        is_inf = is.infinite(x)
      ) %>%
      collect(),
    edge_values
  )

  # ARROW-12850: is.nan() evaluates to FALSE on NA_real_
  compare_dplyr_binding(
    .input %>%
      transmute(is_nan = is.nan(x)) %>%
      collect(),
    edge_values
  )
})
192
test_that("is.na() evaluates to TRUE on NaN (ARROW-12055)", {
  # The input holds both a missing value and a NaN among ordinary doubles.
  values_with_nan <- tibble(
    x = c(1.1, 2.2, NA_real_, 4.4, NaN, 6.6, 7.7)
  )

  compare_dplyr_binding(
    .input %>%
      transmute(is_na = is.na(x)) %>%
      collect(),
    values_with_nan
  )
})
204
test_that("type checks with is() giving Arrow types", {
  # Each expectation builds a one-row Table, applies is() to every
  # column/type pairing, and flattens the single result row into a logical
  # vector. Output names are dropped by t() %>% as.vector(), so only their
  # order matters; they are still named after the type actually tested
  # (f64 for float64 / "double").
  # Expected values are laid out one row per input column, so TRUE should
  # appear only where a column is compared against its own type.

  # with class2=DataType
  expect_equal(
    Table$create(
      i32 = Array$create(1, int32()),
      dec = Array$create(pi)$cast(decimal(3, 2)),
      f64 = Array$create(1.1, float64()),
      str = Array$create("a", arrow::string())
    ) %>% transmute(
      i32_is_i32 = is(i32, int32()),
      i32_is_dec = is(i32, decimal(3, 2)),
      i32_is_f64 = is(i32, float64()),
      i32_is_str = is(i32, arrow::string()),
      dec_is_i32 = is(dec, int32()),
      dec_is_dec = is(dec, decimal(3, 2)),
      dec_is_f64 = is(dec, float64()),
      dec_is_str = is(dec, arrow::string()),
      f64_is_i32 = is(f64, int32()),
      f64_is_dec = is(f64, decimal(3, 2)),
      f64_is_f64 = is(f64, float64()),
      f64_is_str = is(f64, arrow::string()),
      str_is_i32 = is(str, int32()),
      str_is_dec = is(str, decimal(3, 2)),
      str_is_f64 = is(str, float64()),
      str_is_str = is(str, arrow::string())
    ) %>%
      collect() %>%
      t() %>%
      as.vector(),
    c(
      TRUE, FALSE, FALSE, FALSE, # i32
      FALSE, TRUE, FALSE, FALSE, # dec
      FALSE, FALSE, TRUE, FALSE, # f64
      FALSE, FALSE, FALSE, TRUE # str
    )
  )

  # with class2=string
  expect_equal(
    Table$create(
      i32 = Array$create(1, int32()),
      f64 = Array$create(1.1, float64()),
      str = Array$create("a", arrow::string())
    ) %>% transmute(
      i32_is_i32 = is(i32, "int32"),
      i32_is_f64 = is(i32, "double"),
      i32_is_str = is(i32, "string"),
      f64_is_i32 = is(f64, "int32"),
      f64_is_f64 = is(f64, "double"),
      f64_is_str = is(f64, "string"),
      str_is_i32 = is(str, "int32"),
      str_is_f64 = is(str, "double"),
      str_is_str = is(str, "string")
    ) %>%
      collect() %>%
      t() %>%
      as.vector(),
    c(
      TRUE, FALSE, FALSE, # i32
      FALSE, TRUE, FALSE, # f64
      FALSE, FALSE, TRUE # str
    )
  )

  # with class2=string alias
  expect_equal(
    Table$create(
      f16 = Array$create(NA_real_, halffloat()),
      f32 = Array$create(1.1, float()),
      f64 = Array$create(2.2, float64()),
      lgl = Array$create(TRUE, bool()),
      str = Array$create("a", arrow::string())
    ) %>% transmute(
      f16_is_f16 = is(f16, "float16"),
      f16_is_f32 = is(f16, "float32"),
      f16_is_f64 = is(f16, "float64"),
      f16_is_lgl = is(f16, "boolean"),
      f16_is_str = is(f16, "utf8"),
      f32_is_f16 = is(f32, "float16"),
      f32_is_f32 = is(f32, "float32"),
      f32_is_f64 = is(f32, "float64"),
      f32_is_lgl = is(f32, "boolean"),
      f32_is_str = is(f32, "utf8"),
      f64_is_f16 = is(f64, "float16"),
      f64_is_f32 = is(f64, "float32"),
      f64_is_f64 = is(f64, "float64"),
      f64_is_lgl = is(f64, "boolean"),
      f64_is_str = is(f64, "utf8"),
      lgl_is_f16 = is(lgl, "float16"),
      lgl_is_f32 = is(lgl, "float32"),
      lgl_is_f64 = is(lgl, "float64"),
      lgl_is_lgl = is(lgl, "boolean"),
      lgl_is_str = is(lgl, "utf8"),
      str_is_f16 = is(str, "float16"),
      str_is_f32 = is(str, "float32"),
      str_is_f64 = is(str, "float64"),
      str_is_lgl = is(str, "boolean"),
      str_is_str = is(str, "utf8")
    ) %>%
      collect() %>%
      t() %>%
      as.vector(),
    c(
      TRUE, FALSE, FALSE, FALSE, FALSE, # f16
      FALSE, TRUE, FALSE, FALSE, FALSE, # f32
      FALSE, FALSE, TRUE, FALSE, FALSE, # f64
      FALSE, FALSE, FALSE, TRUE, FALSE, # lgl
      FALSE, FALSE, FALSE, FALSE, TRUE # str
    )
  )
})
306
test_that("type checks with is() giving R types", {
  library(bit64)

  # integer64 and list columns are not taken from the shared `tbl` here, so
  # they get a dedicated input.
  i64_and_list <- tibble(
    i64 = as.integer64(1:3),
    lst = list(c("a", "b"), c("d", "e"), c("f", "g"))
  )

  # chr, dbl, fct, int and lgl columns against seven R class names each.
  compare_dplyr_binding(
    .input %>%
      transmute(
        chr_is_chr = is(chr, "character"),
        chr_is_fct = is(chr, "factor"),
        chr_is_int = is(chr, "integer"),
        chr_is_i64 = is(chr, "integer64"),
        chr_is_lst = is(chr, "list"),
        chr_is_lgl = is(chr, "logical"),
        chr_is_num = is(chr, "numeric"),
        dbl_is_chr = is(dbl, "character"),
        dbl_is_fct = is(dbl, "factor"),
        dbl_is_int = is(dbl, "integer"),
        dbl_is_i64 = is(dbl, "integer64"),
        dbl_is_lst = is(dbl, "list"),
        dbl_is_lgl = is(dbl, "logical"),
        dbl_is_num = is(dbl, "numeric"),
        fct_is_chr = is(fct, "character"),
        fct_is_fct = is(fct, "factor"),
        fct_is_int = is(fct, "integer"),
        fct_is_i64 = is(fct, "integer64"),
        fct_is_lst = is(fct, "list"),
        fct_is_lgl = is(fct, "logical"),
        fct_is_num = is(fct, "numeric"),
        int_is_chr = is(int, "character"),
        int_is_fct = is(int, "factor"),
        int_is_int = is(int, "integer"),
        int_is_i64 = is(int, "integer64"),
        int_is_lst = is(int, "list"),
        int_is_lgl = is(int, "logical"),
        int_is_num = is(int, "numeric"),
        lgl_is_chr = is(lgl, "character"),
        lgl_is_fct = is(lgl, "factor"),
        lgl_is_int = is(lgl, "integer"),
        lgl_is_i64 = is(lgl, "integer64"),
        lgl_is_lst = is(lgl, "list"),
        lgl_is_lgl = is(lgl, "logical"),
        lgl_is_num = is(lgl, "numeric")
      ) %>%
      collect(),
    tbl
  )

  # Same checks for the integer64 and list columns. Two pairings stay
  # disabled because Arrow and bit64 are expected to disagree on them.
  compare_dplyr_binding(
    .input %>%
      transmute(
        i64_is_chr = is(i64, "character"),
        i64_is_fct = is(i64, "factor"),
        # disabled: Arrow should return TRUE here, but bit64 returns FALSE
        # i64_is_int = is(i64, "integer"),
        i64_is_i64 = is(i64, "integer64"),
        i64_is_lst = is(i64, "list"),
        i64_is_lgl = is(i64, "logical"),
        # disabled: Arrow should return TRUE here, but bit64 returns FALSE
        # i64_is_num = is(i64, "numeric"),
        lst_is_chr = is(lst, "character"),
        lst_is_fct = is(lst, "factor"),
        lst_is_int = is(lst, "integer"),
        lst_is_i64 = is(lst, "integer64"),
        lst_is_lst = is(lst, "list"),
        lst_is_lgl = is(lst, "logical"),
        lst_is_num = is(lst, "numeric")
      ) %>%
      collect(),
    i64_and_list
  )
})
378
test_that("type checks with is.*()", {
  library(bit64)

  # Dedicated input for the integer64 and list columns.
  i64_and_list <- tibble(
    i64 = as.integer64(1:3),
    lst = list(c("a", "b"), c("d", "e"), c("f", "g"))
  )

  # Eight is.*() predicates applied to each of chr, dbl, fct, int and lgl.
  compare_dplyr_binding(
    .input %>%
      transmute(
        chr_is_chr = is.character(chr),
        chr_is_dbl = is.double(chr),
        chr_is_fct = is.factor(chr),
        chr_is_int = is.integer(chr),
        chr_is_i64 = is.integer64(chr),
        chr_is_lst = is.list(chr),
        chr_is_lgl = is.logical(chr),
        chr_is_num = is.numeric(chr),
        dbl_is_chr = is.character(dbl),
        dbl_is_dbl = is.double(dbl),
        dbl_is_fct = is.factor(dbl),
        dbl_is_int = is.integer(dbl),
        dbl_is_i64 = is.integer64(dbl),
        dbl_is_lst = is.list(dbl),
        dbl_is_lgl = is.logical(dbl),
        dbl_is_num = is.numeric(dbl),
        fct_is_chr = is.character(fct),
        fct_is_dbl = is.double(fct),
        fct_is_fct = is.factor(fct),
        fct_is_int = is.integer(fct),
        fct_is_i64 = is.integer64(fct),
        fct_is_lst = is.list(fct),
        fct_is_lgl = is.logical(fct),
        fct_is_num = is.numeric(fct),
        int_is_chr = is.character(int),
        int_is_dbl = is.double(int),
        int_is_fct = is.factor(int),
        int_is_int = is.integer(int),
        int_is_i64 = is.integer64(int),
        int_is_lst = is.list(int),
        int_is_lgl = is.logical(int),
        int_is_num = is.numeric(int),
        lgl_is_chr = is.character(lgl),
        lgl_is_dbl = is.double(lgl),
        lgl_is_fct = is.factor(lgl),
        lgl_is_int = is.integer(lgl),
        lgl_is_i64 = is.integer64(lgl),
        lgl_is_lst = is.list(lgl),
        lgl_is_lgl = is.logical(lgl),
        lgl_is_num = is.numeric(lgl)
      ) %>%
      collect(),
    tbl
  )

  # integer64 and list columns; two pairings remain commented out.
  compare_dplyr_binding(
    .input %>%
      transmute(
        i64_is_chr = is.character(i64),
        # TODO: find out why this one does not match when run under testthat
        # i64_is_dbl = is.double(i64),
        i64_is_fct = is.factor(i64),
        # disabled: Arrow should return TRUE here, but bit64 returns FALSE
        # i64_is_int = is.integer(i64),
        i64_is_i64 = is.integer64(i64),
        i64_is_lst = is.list(i64),
        i64_is_lgl = is.logical(i64),
        i64_is_num = is.numeric(i64),
        lst_is_chr = is.character(lst),
        lst_is_dbl = is.double(lst),
        lst_is_fct = is.factor(lst),
        lst_is_int = is.integer(lst),
        lst_is_i64 = is.integer64(lst),
        lst_is_lst = is.list(lst),
        lst_is_lgl = is.logical(lst),
        lst_is_num = is.numeric(lst)
      ) %>%
      collect(),
    i64_and_list
  )
})
457
test_that("type checks with is_*()", {
  # The is_*() predicates come from rlang, which is attached only here.
  library(rlang, warn.conflicts = FALSE)

  # Five predicates applied to each of the chr, dbl, int and lgl columns.
  compare_dplyr_binding(
    .input %>%
      transmute(
        chr_is_chr = is_character(chr),
        chr_is_dbl = is_double(chr),
        chr_is_int = is_integer(chr),
        chr_is_lst = is_list(chr),
        chr_is_lgl = is_logical(chr),
        dbl_is_chr = is_character(dbl),
        dbl_is_dbl = is_double(dbl),
        dbl_is_int = is_integer(dbl),
        dbl_is_lst = is_list(dbl),
        dbl_is_lgl = is_logical(dbl),
        int_is_chr = is_character(int),
        int_is_dbl = is_double(int),
        int_is_int = is_integer(int),
        int_is_lst = is_list(int),
        int_is_lgl = is_logical(int),
        lgl_is_chr = is_character(lgl),
        lgl_is_dbl = is_double(lgl),
        lgl_is_int = is_integer(lgl),
        lgl_is_lst = is_list(lgl),
        lgl_is_lgl = is_logical(lgl)
      ) %>%
      collect(),
    tbl
  )
})
488
test_that("type checks on expressions", {
  # The predicates here receive computed expressions (casts, arithmetic,
  # comparisons) rather than bare columns.
  compare_dplyr_binding(
    .input %>%
      transmute(
        a = is.character(as.character(int)),
        b = is.integer(as.character(int)),
        c = is.integer(int + int),
        d = is.double(int + dbl),
        e = is.logical(dbl > pi)
      ) %>%
      collect(),
    tbl
  )

  # Everything after this point relies on RE2, so stop here without it.
  skip_if_not_available("re2")

  compare_dplyr_binding(
    .input %>%
      transmute(a = is.logical(grepl("[def]", chr))) %>%
      collect(),
    tbl
  )
})
515
test_that("type checks on R scalar literals", {
  # Every predicate below is given an R literal instead of a column of the
  # input; `tbl` only serves as the data the pipeline runs on.
  compare_dplyr_binding(
    .input %>%
      transmute(
        chr_is_chr = is.character("foo"),
        int_is_chr = is.character(42L),
        int_is_int = is.integer(42L),
        chr_is_int = is.integer("foo"),
        dbl_is_num = is.numeric(3.14159),
        int_is_num = is.numeric(42L),
        chr_is_num = is.numeric("foo"),
        dbl_is_dbl = is.double(3.14159),
        chr_is_dbl = is.double("foo"),
        lgl_is_lgl = is.logical(TRUE),
        chr_is_lgl = is.logical("foo"),
        fct_is_fct = is.factor(
          factor("foo", levels = c("foo", "bar", "baz"))
        ),
        chr_is_fct = is.factor("foo"),
        lst_is_lst = is.list(list(c(a = "foo", b = "bar"))),
        chr_is_lst = is.list("foo")
      ) %>%
      collect(),
    tbl
  )
})
540
test_that("as.factor()/dictionary_encode()", {
  # The whole test is switched off until the referenced issue is resolved.
  skip("ARROW-12632: ExecuteScalarExpression cannot Execute non-scalar expression")

  letters_df <- tibble(
    x = c("C", "D", "B", NA, "D", "B", "S", "A", "B", "Z", "B")
  )
  numbers_df <- tibble(x = c(5, 5, 5, NA, 2, 3, 6, 8))

  # Return the dictionary values of column `x`, in sorted order, as an R
  # vector.
  sorted_dictionary <- function(batch) {
    dict_values <- batch$x$dictionary()
    as.vector(dict_values$Take(dict_values$SortIndices()))
  }

  compare_dplyr_binding(
    .input %>%
      transmute(x = as.factor(x)) %>%
      collect(),
    letters_df
  )

  # A numeric input is expected to produce a coercion warning.
  expect_warning(
    compare_dplyr_binding(
      .input %>%
        transmute(x = as.factor(x)) %>%
        collect(),
      numbers_df
    ),
    "Coercing dictionary values to R character factor levels"
  )

  # With the default null encoding behavior ("mask"), nulls are left out of
  # the dictionary values.
  expect_equal(
    sorted_dictionary(
      letters_df %>%
        record_batch() %>%
        transmute(x = dictionary_encode(x)) %>%
        compute()
    ),
    sort(unique(letters_df$x), na.last = NA)
  )

  # With the "encode" null encoding behavior, nulls are part of the
  # dictionary values.
  expect_equal(
    sorted_dictionary(
      letters_df %>%
        record_batch() %>%
        transmute(
          x = dictionary_encode(x, null_encoding_behavior = "encode")
        ) %>%
        compute()
    ),
    sort(unique(letters_df$x), na.last = TRUE)
  )
})
591
test_that("bad explicit type conversions with as.*()", {
  # Each case below is expected to fail, because Arrow and R handle these
  # inputs differently.
  logical_input <- tibble(lgl = c(TRUE, FALSE, NA))
  non_numeric_strings <- tibble(chr = c("l.O", "S.S", ""))
  non_logical_strings <- tibble(chr = c("TRU", "FAX", ""))

  # R produces "TRUE"/"FALSE", whereas Arrow produces lowercase
  # "true"/"false".
  expect_error(
    compare_dplyr_binding(
      .input %>%
        transmute(lgl2chr = as.character(lgl)) %>%
        collect(),
      logical_input
    )
  )

  # R turns unparseable strings into NA with a warning, whereas Arrow fails
  # to parse them as numbers.
  expect_error(
    expect_warning(
      compare_dplyr_binding(
        .input %>%
          transmute(chr2num = as.numeric(chr)) %>%
          collect(),
        non_numeric_strings
      )
    )
  )

  # R turns unparseable strings into NA, whereas Arrow fails to parse them
  # as Booleans.
  expect_error(
    compare_dplyr_binding(
      .input %>%
        transmute(chr2lgl = as.logical(chr)) %>%
        collect(),
      non_logical_strings
    )
  )
})
622 transmute(chr2lgl = as.logical(chr)) %>%
623 collect(),
624 tibble(chr = c("TRU", "FAX", ""))
625 )
626 )
627})