]> git.proxmox.com Git - ceph.git/blob - ceph/src/arrow/python/pyarrow/tests/test_types.py
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / python / pyarrow / tests / test_types.py
1 # Licensed to the Apache Software Foundation (ASF) under one
2 # or more contributor license agreements. See the NOTICE file
3 # distributed with this work for additional information
4 # regarding copyright ownership. The ASF licenses this file
5 # to you under the Apache License, Version 2.0 (the
6 # "License"); you may not use this file except in compliance
7 # with the License. You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing,
12 # software distributed under the License is distributed on an
13 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 # KIND, either express or implied. See the License for the
15 # specific language governing permissions and limitations
16 # under the License.
17
18 from collections import OrderedDict
19 from collections.abc import Iterator
20 from functools import partial
21 import datetime
22 import sys
23
24 import pickle
25 import pytest
26 import pytz
27 import hypothesis as h
28 import hypothesis.strategies as st
29 import hypothesis.extra.pytz as tzst
30 import weakref
31
32 import numpy as np
33 import pyarrow as pa
34 import pyarrow.types as types
35 import pyarrow.tests.strategies as past
36
37
def get_many_types():
    """Return a tuple of distinct pyarrow data types of many kinds.

    The tuple is rebuilt on every call instead of living in a module-level
    constant: a ``pa.dictionary`` type holds a pyarrow array, and
    ``test_array.py::test_total_bytes_allocated`` checks that the default
    memory pool has zero allocated bytes.
    """
    dense = pa.lib.UnionMode_DENSE
    sparse = pa.lib.UnionMode_SPARSE

    scalar_like = (
        pa.null(),
        pa.bool_(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.duration('s'),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.decimal128(19, 4),
        pa.decimal256(76, 38),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.large_string(),
        pa.large_binary(),
    )

    list_and_map = (
        pa.list_(pa.int32()),
        pa.list_(pa.int32(), 2),
        pa.large_list(pa.uint16()),
        pa.map_(pa.string(), pa.int32()),
        pa.map_(pa.field('key', pa.int32(), nullable=False),
                pa.field('value', pa.int32())),
    )

    structs = (
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())]),
        pa.struct([pa.field('a', pa.int32(), nullable=False),
                   pa.field('b', pa.int8(), nullable=False),
                   pa.field('c', pa.string())]),
    )

    unions = (
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=dense),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=dense,
                 type_codes=[4, 8]),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=sparse),
        pa.union([pa.field('a', pa.binary(10), nullable=False),
                  pa.field('b', pa.string())], mode=sparse),
    )

    dictionaries = (pa.dictionary(pa.int32(), pa.string()),)

    return scalar_like + list_and_map + structs + unions + dictionaries
86
87
def test_is_boolean():
    """``is_boolean`` accepts ``bool_`` and rejects an integer type."""
    boolean_type = pa.bool_()
    integer_type = pa.int8()

    assert types.is_boolean(boolean_type)
    assert not types.is_boolean(integer_type)
91
92
def test_is_integer():
    """Check the integer predicates on every signed and unsigned width."""
    signed = [pa.int8(), pa.int16(), pa.int32(), pa.int64()]
    unsigned = [pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()]

    for int_type in signed:
        assert types.is_integer(int_type)
        assert types.is_signed_integer(int_type)
        assert not types.is_unsigned_integer(int_type)

    for int_type in unsigned:
        assert types.is_integer(int_type)
        assert types.is_unsigned_integer(int_type)
        assert not types.is_signed_integer(int_type)

    # A floating point type satisfies neither integer predicate
    non_integer = pa.float32()
    assert not types.is_integer(non_integer)
    assert not types.is_signed_integer(non_integer)
110
111
def test_is_floating():
    """``is_floating`` accepts the three float widths and rejects int32."""
    float_types = (pa.float16(), pa.float32(), pa.float64())
    for float_type in float_types:
        assert types.is_floating(float_type)

    assert not types.is_floating(pa.int32())
117
118
def test_is_null():
    """``is_null`` accepts the null type and rejects a list type."""
    null_type = pa.null()
    list_type = pa.list_(pa.int32())

    assert types.is_null(null_type)
    assert not types.is_null(list_type)
122
123
def test_null_field_may_not_be_non_nullable():
    """Creating a non-nullable field of null type raises (ARROW-7273)."""
    null_type = pa.null()
    with pytest.raises(ValueError):
        pa.field('f0', null_type, nullable=False)
128
129
def test_is_decimal():
    """Check the generic and width-specific decimal predicates."""
    dec128 = pa.decimal128(19, 4)
    dec256 = pa.decimal256(76, 38)
    i32 = pa.int32()

    # (predicate, type under test, expected outcome)
    expectations = [
        (types.is_decimal, dec128, True),
        (types.is_decimal, dec256, True),
        (types.is_decimal, i32, False),
        (types.is_decimal128, dec128, True),
        (types.is_decimal128, dec256, False),
        (types.is_decimal128, i32, False),
        (types.is_decimal256, dec128, False),
        (types.is_decimal256, dec256, True),
        (types.is_decimal256, i32, False),
    ]
    for predicate, ty, expected in expectations:
        assert bool(predicate(ty)) is expected
146
147
def test_is_list():
    """Each list predicate accepts only its own flavour of list type."""
    flavours = [
        (types.is_list, pa.list_(pa.int32())),
        (types.is_large_list, pa.large_list(pa.int32())),
        (types.is_fixed_size_list, pa.list_(pa.int32(), 3)),
    ]

    # Apply every predicate to every list type: only the matching pair
    # may succeed.
    for predicate, _ in flavours:
        for owner, list_type in flavours:
            assert bool(predicate(list_type)) is (predicate is owner)

    assert not types.is_list(pa.int32())
164
165
def test_is_map():
    """``is_map`` accepts map types and rejects look-alike types."""
    from_types = pa.map_(pa.utf8(), pa.int32())
    assert types.is_map(from_types)
    assert not types.is_map(pa.int32())

    # Maps may also be built from explicit key/value fields
    key_field = pa.field('key_name', pa.utf8(), nullable=False)
    value_field = pa.field('value_name', pa.int32())
    from_fields = pa.map_(key_field, value_field)
    assert types.is_map(from_fields)

    # A list of key/value structs is not reported as a map
    entries = pa.struct([pa.field('key', pa.int8()),
                         pa.field('value', pa.int8())])
    assert not types.is_map(pa.list_(entries))
180
181
def test_is_dictionary():
    """``is_dictionary`` accepts a dictionary type and rejects int32."""
    dictionary_type = pa.dictionary(pa.int32(), pa.string())

    assert types.is_dictionary(dictionary_type)
    assert not types.is_dictionary(pa.int32())
185
186
def test_is_nested_or_struct():
    """Check ``is_struct`` and ``is_nested`` on struct, list and int32."""
    struct_type = pa.struct([pa.field('a', pa.int32()),
                             pa.field('b', pa.int8()),
                             pa.field('c', pa.string())])
    list_type = pa.list_(pa.int32())
    large_list_type = pa.large_list(pa.int32())

    assert types.is_struct(struct_type)
    assert not types.is_struct(list_type)

    for nested_type in (struct_type, list_type, large_list_type):
        assert types.is_nested(nested_type)
    assert not types.is_nested(pa.int32())
199
200
def test_is_union():
    """``is_union`` accepts sparse and dense unions and rejects a list."""
    members = [pa.field('a', pa.int32()),
               pa.field('b', pa.int8()),
               pa.field('c', pa.string())]

    for union_mode in (pa.lib.UnionMode_SPARSE, pa.lib.UnionMode_DENSE):
        union_type = pa.union(members, mode=union_mode)
        assert types.is_union(union_type)

    assert not types.is_union(pa.list_(pa.int32()))
208
209
# NOTE: is_map is covered by test_is_map above.
211
212
def test_is_binary_string():
    """Each binary/string predicate accepts only its own type."""
    variable_width = [
        (types.is_binary, pa.binary()),
        (types.is_string, pa.string()),
        (types.is_large_binary, pa.large_binary()),
        (types.is_large_string, pa.large_string()),
    ]

    # Apply every predicate to every variable-width type: only the matching
    # pair may succeed.
    for predicate, _ in variable_width:
        for owner, ty in variable_width:
            assert bool(predicate(ty)) is (predicate is owner)

    assert types.is_unicode(pa.string())

    assert types.is_fixed_size_binary(pa.binary(5))
    assert not types.is_fixed_size_binary(pa.binary())
237
238
def test_is_temporal_date_time_timestamp():
    """Check ``is_temporal`` and the per-category temporal predicates.

    Every sample type must be temporal, must satisfy the predicate of its
    own category and must be rejected by the predicates of all the other
    categories. Driving the checks from one table keeps the categories
    symmetric: intervals are now also checked against ``is_duration``,
    which the hand-written interval loop used to omit.
    """
    categories = [
        (types.is_date, [pa.date32(), pa.date64()]),
        (types.is_time, [pa.time32('s'), pa.time64('ns')]),
        (types.is_timestamp, [pa.timestamp('ms')]),
        (types.is_duration, [pa.duration('ms')]),
        (types.is_interval, [pa.month_day_nano_interval()]),
    ]

    for own_predicate, cases in categories:
        for case in cases:
            assert types.is_temporal(case)
            assert own_predicate(case)
            for other_predicate, _ in categories:
                if other_predicate is not own_predicate:
                    assert not other_predicate(case)

    assert not types.is_temporal(pa.int32())
285
286
def test_is_primitive():
    """``is_primitive`` accepts int32 and rejects a list type."""
    primitive_type = pa.int32()
    list_type = pa.list_(primitive_type)

    assert types.is_primitive(primitive_type)
    assert not types.is_primitive(list_type)
290
291
@pytest.mark.parametrize(('tz', 'expected'), [
    (pytz.utc, 'UTC'),
    (pytz.timezone('Europe/Paris'), 'Europe/Paris'),
    # StaticTzInfo.tzname returns with '-09' so we need to infer the timezone's
    # name from the tzinfo.zone attribute
    (pytz.timezone('Etc/GMT-9'), 'Etc/GMT-9'),
    (pytz.FixedOffset(180), '+03:00'),
    (datetime.timezone.utc, 'UTC'),
    (datetime.timezone(datetime.timedelta(hours=1, minutes=30)), '+01:30')
])
def test_tzinfo_to_string(tz, expected):
    """``tzinfo_to_string`` returns the expected name or offset string."""
    converted = pa.lib.tzinfo_to_string(tz)
    assert converted == expected
304
305
def test_tzinfo_to_string_errors():
    """``tzinfo_to_string`` rejects non-tzinfo objects and odd offsets."""
    # The expected message was previously assigned but never checked; pass
    # it to ``match`` so the test pins the actual error.
    msg = "Not an instance of datetime.tzinfo"
    with pytest.raises(TypeError, match=msg):
        pa.lib.tzinfo_to_string("Europe/Budapest")

    if sys.version_info >= (3, 8):
        # before 3.8 it was only possible to create timezone objects with whole
        # number of minutes
        tz = datetime.timezone(datetime.timedelta(hours=1, seconds=30))
        msg = "Offset must represent whole number of minutes"
        with pytest.raises(ValueError, match=msg):
            pa.lib.tzinfo_to_string(tz)
318
319
@h.given(tzst.timezones())
def test_pytz_timezone_roundtrip(tz):
    """A generated pytz timezone survives a string round trip."""
    as_string = pa.lib.tzinfo_to_string(tz)
    restored = pa.lib.string_to_tzinfo(as_string)
    assert restored == tz
325
326
def test_convert_custom_tzinfo_objects_to_string():
    """Convert user-defined ``tzinfo`` subclasses to timezone strings."""

    def make_tzinfo(name, offset):
        # Build an instance of a tzinfo subclass whose tzname() and
        # utcoffset() return the given values unchanged.
        class CustomTimezone(datetime.tzinfo):

            def tzname(self, dt):
                return name

            def utcoffset(self, dt):
                return offset

        return CustomTimezone()

    # No name, valid offset: conversion is using utcoffset()
    from_offset = make_tzinfo(None, datetime.timedelta(hours=-3, minutes=30))
    # Name and offset both given: conversion is using tzname()
    from_name = make_tzinfo("+03:00", datetime.timedelta(hours=3))

    assert pa.lib.tzinfo_to_string(from_offset) == "-02:30"
    assert pa.lib.tzinfo_to_string(from_name) == "+03:00"

    broken = [
        # Unable to infer name or offset
        make_tzinfo(None, None),
        # Wrong offset type
        make_tzinfo(None, "one hour"),
        # Wrong timezone name type
        make_tzinfo(240, None),
    ]
    msg = (r"Object returned by tzinfo.utcoffset\(None\) is not an instance "
           r"of datetime.timedelta")
    for wrong in broken:
        with pytest.raises(ValueError, match=msg):
            pa.lib.tzinfo_to_string(wrong)
391
392
@pytest.mark.parametrize(('string', 'expected'), [
    ('UTC', pytz.utc),
    ('Europe/Paris', pytz.timezone('Europe/Paris')),
    ('+03:00', pytz.FixedOffset(180)),
    ('+01:30', pytz.FixedOffset(90)),
    ('-02:00', pytz.FixedOffset(-120))
])
def test_string_to_tzinfo(string, expected):
    """``string_to_tzinfo`` parses zone names and fixed offsets."""
    parsed = pa.lib.string_to_tzinfo(string)
    assert parsed == expected
403
404
@pytest.mark.parametrize('tz,name', [
    (pytz.FixedOffset(90), '+01:30'),
    (pytz.FixedOffset(-90), '-01:30'),
    (pytz.utc, 'UTC'),
    (pytz.timezone('America/New_York'), 'America/New_York')
])
def test_timezone_string_roundtrip(tz, name):
    """Conversion between tzinfo and string works in both directions."""
    to_string = pa.lib.tzinfo_to_string(tz)
    to_tzinfo = pa.lib.string_to_tzinfo(name)

    assert to_string == name
    assert to_tzinfo == tz
414
415
def test_timestamp():
    """Timestamp types expose their unit and tz; bad units are rejected."""
    units = ('s', 'ms', 'us', 'ns')
    timezones = (None, 'UTC', 'Europe/Paris')

    for unit in units:
        for tz in timezones:
            timestamp_type = pa.timestamp(unit, tz=tz)
            assert timestamp_type.unit == unit
            assert timestamp_type.tz == tz

    for bad_unit in ('m', 'arbit', 'rary'):
        with pytest.raises(ValueError, match='Invalid time unit'):
            pa.timestamp(bad_unit)
426
427
def test_time32_units():
    """``time32`` accepts 's' and 'ms' and rejects other units."""
    for unit in ('s', 'ms'):
        assert pa.time32(unit).unit == unit

    template = 'Invalid time unit for time32: {!r}'
    for bad_unit in ('m', 'us', 'ns'):
        with pytest.raises(ValueError, match=template.format(bad_unit)):
            pa.time32(bad_unit)
437
438
def test_time64_units():
    """``time64`` accepts 'us' and 'ns' and rejects other units."""
    for unit in ('us', 'ns'):
        assert pa.time64(unit).unit == unit

    template = 'Invalid time unit for time64: {!r}'
    for bad_unit in ('m', 's', 'ms'):
        with pytest.raises(ValueError, match=template.format(bad_unit)):
            pa.time64(bad_unit)
448
449
def test_duration():
    """Duration types expose their unit; invalid units are rejected."""
    for unit in ('s', 'ms', 'us', 'ns'):
        duration_type = pa.duration(unit)
        assert duration_type.unit == unit

    for bad_unit in ('m', 'arbit', 'rary'):
        with pytest.raises(ValueError, match='Invalid time unit'):
            pa.duration(bad_unit)
458
459
def test_list_type():
    """Check the class and value type/field of a list type."""
    element_type = pa.int64()
    list_type = pa.list_(element_type)

    assert isinstance(list_type, pa.ListType)
    assert list_type.value_type == element_type
    expected_field = pa.field("item", element_type, nullable=True)
    assert list_type.value_field == expected_field

    with pytest.raises(TypeError):
        pa.list_(None)
468
469
def test_large_list_type():
    """Check the class and value type/field of a large list type."""
    element_type = pa.utf8()
    list_type = pa.large_list(element_type)

    assert isinstance(list_type, pa.LargeListType)
    assert list_type.value_type == element_type
    expected_field = pa.field("item", element_type, nullable=True)
    assert list_type.value_field == expected_field

    with pytest.raises(TypeError):
        pa.large_list(None)
478
479
def test_map_type():
    """Check key/item accessors of a map type and invalid constructions."""
    map_type = pa.map_(pa.utf8(), pa.int32())
    assert isinstance(map_type, pa.MapType)

    expected_key = pa.field("key", pa.utf8(), nullable=False)
    expected_item = pa.field("value", pa.int32(), nullable=True)
    assert map_type.key_type == pa.utf8()
    assert map_type.key_field == expected_key
    assert map_type.item_type == pa.int32()
    assert map_type.item_field == expected_item

    with pytest.raises(TypeError):
        pa.map_(None)
    with pytest.raises(TypeError):
        pa.map_(pa.int32(), None)
    # A nullable key field is rejected
    nullable_key = pa.field("name", pa.string(), nullable=True)
    with pytest.raises(TypeError):
        pa.map_(nullable_key, pa.int64())
494
495
def test_fixed_size_list_type():
    """Check accessors of a fixed-size list and reject a negative size."""
    element_type = pa.float64()
    list_type = pa.list_(element_type, 2)

    assert isinstance(list_type, pa.FixedSizeListType)
    assert list_type.value_type == element_type
    expected_field = pa.field("item", element_type, nullable=True)
    assert list_type.value_field == expected_field
    assert list_type.list_size == 2

    with pytest.raises(ValueError):
        pa.list_(element_type, -2)
505
506
def test_struct_type():
    """Check struct construction, indexing, lookup and iteration.

    The struct is built from a list of fields, a list of ``(name, type)``
    tuples and a mapping. The element-wise comparison loops now ``assert``
    their result; before, the bare ``a == b`` expression was evaluated and
    thrown away, so those loops could never fail.
    """
    fields = [
        # Duplicate field name on purpose
        pa.field('a', pa.int64()),
        pa.field('a', pa.int32()),
        pa.field('b', pa.int32())
    ]
    ty = pa.struct(fields)

    assert len(ty) == ty.num_fields == 3
    assert list(ty) == fields
    assert ty[0].name == 'a'
    assert ty[2].type == pa.int32()
    with pytest.raises(IndexError):
        assert ty[3]

    assert ty['b'] == ty[2]

    # Not found
    with pytest.raises(KeyError):
        ty['c']

    # Neither integer nor string
    with pytest.raises(TypeError):
        ty[None]

    for a, b in zip(ty, fields):
        assert a == b

    # Construct from list of tuples
    ty = pa.struct([('a', pa.int64()),
                    ('a', pa.int32()),
                    ('b', pa.int32())])
    assert list(ty) == fields
    for a, b in zip(ty, fields):
        assert a == b

    # Construct from mapping
    fields = [pa.field('a', pa.int64()),
              pa.field('b', pa.int32())]
    ty = pa.struct(OrderedDict([('a', pa.int64()),
                                ('b', pa.int32())]))
    assert list(ty) == fields
    for a, b in zip(ty, fields):
        assert a == b

    # Invalid args
    with pytest.raises(TypeError):
        pa.struct([('a', None)])
556
557
def test_struct_duplicate_field_names():
    """Check name lookups on a struct with a repeated field name."""
    struct_type = pa.struct([
        pa.field('a', pa.int64()),
        pa.field('b', pa.int32()),
        pa.field('a', pa.int32())
    ])

    # Looking up the duplicated name warns and raises
    with pytest.warns(UserWarning), pytest.raises(KeyError):
        struct_type['a']

    # StructType::GetFieldIndex
    assert struct_type.get_field_index('a') == -1

    # StructType::GetAllFieldIndices
    assert struct_type.get_all_field_indices('a') == [0, 2]
576
577
def test_union_type():
    """Build sparse and dense unions through every available factory."""
    fields = [pa.field('x', pa.list_(pa.int32())),
              pa.field('y', pa.binary())]
    type_codes = [5, 9]

    def check_fields(ty):
        # The union must expose exactly the fields it was built from
        assert ty.num_fields == len(fields)
        assert [ty[i] for i in range(ty.num_fields)] == fields

    # (expected mode string, expected type class, factories to exercise)
    variants = [
        ('sparse', pa.SparseUnionType, [
            partial(pa.union, mode='sparse'),
            partial(pa.union, mode=pa.lib.UnionMode_SPARSE),
            pa.sparse_union,
        ]),
        ('dense', pa.DenseUnionType, [
            partial(pa.union, mode='dense'),
            partial(pa.union, mode=pa.lib.UnionMode_DENSE),
            pa.dense_union,
        ]),
    ]

    for mode_name, type_class, factories in variants:
        for factory in factories:
            # Default type codes
            ty = factory(fields)
            assert isinstance(ty, type_class)
            assert ty.mode == mode_name
            check_fields(ty)
            assert ty.type_codes == [0, 1]

            # Explicit type codes
            ty = factory(fields, type_codes=type_codes)
            assert ty.mode == mode_name
            check_fields(ty)
            assert ty.type_codes == type_codes

            # Invalid number of type codes
            with pytest.raises(ValueError):
                factory(fields, type_codes=type_codes[1:])

    for bad_mode in ('unknown', 2):
        with pytest.raises(ValueError, match='Invalid union mode'):
            pa.union(fields, mode=bad_mode)
630
631
def test_dictionary_type():
    """Check index type, value type and ordered flag of dictionary types."""
    # (dictionary type, expected index type, expected value type, ordered)
    cases = [
        (pa.dictionary(pa.int32(), pa.string()),
         pa.int32(), pa.string(), False),
        (pa.dictionary(pa.int8(), pa.float64(), ordered=True),
         pa.int8(), pa.float64(), True),
        # construct from non-arrow objects
        (pa.dictionary('int8', 'string'),
         pa.int8(), pa.string(), False),
        # allow unsigned integers for index type
        (pa.dictionary(pa.uint32(), pa.string()),
         pa.uint32(), pa.string(), False),
    ]
    for dict_type, index_type, value_type, ordered in cases:
        assert dict_type.index_type == index_type
        assert dict_type.value_type == value_type
        assert dict_type.ordered is ordered

    # invalid index type raises
    with pytest.raises(TypeError):
        pa.dictionary(pa.string(), pa.int64())
658
659
def test_dictionary_ordered_equals():
    """Dictionary type equality considers the ordered flag and index type.

    Python side checking of ARROW-6345.
    """
    reference = pa.dictionary('int32', 'binary', ordered=True)
    unordered = pa.dictionary('int32', 'binary', ordered=False)
    other_index = pa.dictionary('int8', 'binary', ordered=True)
    identical = pa.dictionary('int32', 'binary', ordered=True)

    assert not reference.equals(unordered)
    assert not reference.equals(other_index)
    assert reference.equals(identical)
670
671
def test_types_hashable():
    """Every sample type has a stable hash and works as a dict key."""
    sample_types = get_many_types()
    position_by_type = {}

    for position, ty in enumerate(sample_types):
        assert hash(ty) == hash(ty)
        position_by_type[ty] = position

    # No two sample types collapsed into the same key
    assert len(position_by_type) == len(sample_types)
    for position, ty in enumerate(sample_types):
        assert position_by_type[ty] == position
681
682
def test_types_picklable():
    """Every sample type compares equal after a pickle round trip."""
    for original in get_many_types():
        restored = pickle.loads(pickle.dumps(original))
        assert restored == original
687
688
def test_types_weakref():
    """Data types can be the target of weak references."""
    for ty in get_many_types():
        ref = weakref.ref(ty)
        assert ref() is not None
        # Note that ty may be a singleton and therefore outlive this loop

    singleton_ref = weakref.ref(pa.int32())
    assert singleton_ref() is not None  # singleton
    # The list type below is never bound to a name, so nothing else keeps
    # it alive once the weak reference has been created.
    temporary_ref = weakref.ref(pa.list_(pa.int32()))
    assert temporary_ref() is None  # not a singleton
699
700
def test_fields_hashable():
    """Fields differing in name, type or nullability are distinct keys."""
    fields = [pa.field('a', pa.int32()),
              pa.field('a', pa.int64()),
              pa.field('a', pa.int64(), nullable=False),
              pa.field('b', pa.int32()),
              pa.field('b', pa.int32(), nullable=False)]

    position_by_field = {}
    for position, field in enumerate(fields):
        position_by_field[field] = position

    assert len(position_by_field) == len(fields)
    for position, field in enumerate(fields):
        assert position_by_field[field] == position
713
714
def test_fields_weakrefable():
    """A weak reference to a field is cleared once the field is deleted."""
    field = pa.field('a', pa.int32())
    ref = weakref.ref(field)
    assert ref() is not None

    # Drop the only strong reference held by this test
    del field
    assert ref() is None
721
722
@pytest.mark.parametrize('t,check_func', [
    (pa.date32(), types.is_date32),
    (pa.date64(), types.is_date64),
    (pa.time32('s'), types.is_time32),
    (pa.time64('ns'), types.is_time64),
    (pa.int8(), types.is_int8),
    (pa.int16(), types.is_int16),
    (pa.int32(), types.is_int32),
    (pa.int64(), types.is_int64),
    (pa.uint8(), types.is_uint8),
    (pa.uint16(), types.is_uint16),
    (pa.uint32(), types.is_uint32),
    (pa.uint64(), types.is_uint64),
    (pa.float16(), types.is_float16),
    (pa.float32(), types.is_float32),
    (pa.float64(), types.is_float64)
])
def test_exact_primitive_types(t, check_func):
    """Each exact-type predicate accepts its corresponding type."""
    accepted = check_func(t)
    assert accepted
742
743
def test_type_id():
    """The ``id`` attribute of every sample type is an ``int``."""
    # enum values are not exposed publicly
    for sample_type in get_many_types():
        type_id = sample_type.id
        assert isinstance(type_id, int)
748
749
def test_bit_width():
    """Check ``bit_width`` values and the error for other types."""
    expected_widths = [
        (pa.bool_(), 1),
        (pa.int8(), 8),
        (pa.uint32(), 32),
        (pa.float16(), 16),
        (pa.decimal128(19, 4), 128),
        (pa.decimal256(76, 38), 256),
        (pa.binary(42), 42 * 8),
    ]
    for fixed_type, width in expected_widths:
        assert fixed_type.bit_width == width

    # These types raise instead of reporting a width
    for other_type in (pa.binary(), pa.string(), pa.list_(pa.int16())):
        with pytest.raises(ValueError, match="fixed width"):
            other_type.bit_width
762
763
def test_fixed_size_binary_byte_width():
    """A ``binary(5)`` type reports a byte width of 5."""
    fixed_binary = pa.binary(5)
    assert fixed_binary.byte_width == 5
767
768
def test_decimal_properties():
    """Check byte width, precision and scale of both decimal widths."""
    # (decimal type, expected byte width, expected precision, expected scale)
    cases = [
        (pa.decimal128(19, 4), 16, 19, 4),
        (pa.decimal256(76, 38), 32, 76, 38),
    ]
    for decimal_type, byte_width, precision, scale in cases:
        assert decimal_type.byte_width == byte_width
        assert decimal_type.precision == precision
        assert decimal_type.scale == scale
778
779
def test_decimal_overflow():
    """Decimal factories accept the precision bounds and reject beyond."""
    # (factory, largest precision that must be accepted)
    limits = [(pa.decimal128, 38), (pa.decimal256, 76)]

    for make_decimal, max_precision in limits:
        # Both ends of the valid range construct without error
        make_decimal(1, 0)
        make_decimal(max_precision, 0)

        for bad_precision in (0, -1, max_precision + 1):
            with pytest.raises(ValueError):
                make_decimal(bad_precision, 0)
792
793
def test_type_equality_operators():
    """Each sample type equals only itself among types and plain objects."""
    sample_types = get_many_types()
    non_pyarrow = ('foo', 16, {'s', 'e', 't'})
    candidates = sample_types + non_pyarrow

    # could use two parametrization levels,
    # but that'd bloat pytest's output
    for own_position, ty in enumerate(sample_types):
        for position, other in enumerate(candidates):
            if position == own_position:
                assert ty == other
            else:
                assert ty != other
806
807
def test_key_value_metadata():
    """Exercise construction, lookup, comparison and duplicate keys."""
    meta = pa.KeyValueMetadata({'a': 'A', 'b': 'B'})
    assert len(meta) == 2
    # Lookups and membership tests work with str and bytes keys alike
    for key in ('a', b'a'):
        assert meta[key] == b'A'
        assert key in meta
    assert meta['b'] == b'B'
    assert 'c' not in meta

    from_dict = pa.KeyValueMetadata({'a': 'A', 'b': 'B'})
    from_kwargs = pa.KeyValueMetadata(a='A', b='B')
    from_pairs = pa.KeyValueMetadata([('a', 'A'), ('b', 'B')])

    assert from_dict != 2
    assert from_dict == from_kwargs
    assert from_kwargs == from_pairs
    assert from_dict == {'a': 'A', 'b': 'B'}
    assert from_dict != {'a': 'A', 'b': 'C'}

    # Non-string keys or values are rejected
    with pytest.raises(TypeError):
        pa.KeyValueMetadata({'a': 1})
    with pytest.raises(TypeError):
        pa.KeyValueMetadata({1: 'a'})
    with pytest.raises(TypeError):
        pa.KeyValueMetadata(a=1)

    pairs = [(b'a', b'A'), (b'b', b'B')]
    unpacked = [(key, value) for key, value in from_pairs.items()]
    assert unpacked == pairs
    assert list(from_pairs.items()) == pairs
    assert list(from_pairs.keys()) == [b'a', b'b']
    assert list(from_pairs.values()) == [b'A', b'B']
    assert len(from_pairs) == 2

    # test duplicate key support
    duplicated_input = [
        ('a', 'alpha'),
        ('b', 'beta'),
        ('a', 'Alpha'),
        ('a', 'ALPHA'),
    ]
    md = pa.KeyValueMetadata(duplicated_input)

    pairs = [
        (b'a', b'alpha'),
        (b'b', b'beta'),
        (b'a', b'Alpha'),
        (b'a', b'ALPHA')
    ]
    assert len(md) == 4
    for view in (md.keys(), md.values(), md.items()):
        assert isinstance(view, Iterator)
    assert list(md.items()) == pairs
    assert list(md.keys()) == [key for key, _ in pairs]
    assert list(md.values()) == [value for _, value in pairs]

    # first occurrence
    assert md['a'] == b'alpha'
    assert md['b'] == b'beta'
    assert md.get_all('a') == [b'alpha', b'Alpha', b'ALPHA']
    assert md.get_all('b') == [b'beta']
    assert md.get_all('unkown') == []

    # A keyword argument may not repeat a key already given positionally
    with pytest.raises(KeyError):
        pa.KeyValueMetadata(duplicated_input, b='BETA')
879
880
def test_key_value_metadata_duplicates():
    """Repeating an existing key through a keyword argument raises."""
    existing = pa.KeyValueMetadata({'a': '1', 'b': '2'})

    with pytest.raises(KeyError):
        pa.KeyValueMetadata(existing, a='3')
886
887
def test_field_basic():
    """Check the basic attributes and repr of a field."""
    string_type = pa.string()

    nullable_field = pa.field('foo', string_type)
    assert nullable_field.name == 'foo'
    assert nullable_field.nullable
    # The very same type object is handed back
    assert nullable_field.type is string_type
    assert repr(nullable_field) == "pyarrow.Field<foo: string>"

    # Nullability may be passed positionally
    required_field = pa.field('foo', string_type, False)
    assert not required_field.nullable

    with pytest.raises(TypeError):
        pa.field('foo', None)
902
903
def test_field_equals():
    """``Field.equals`` compares metadata only when asked to."""
    meta_foo = {b'foo': b'bar'}
    meta_bizz = {b'bizz': b'bazz'}

    plain = pa.field('a', pa.int8(), nullable=True)
    plain_twin = pa.field('a', pa.int8(), nullable=True)
    required = pa.field('a', pa.int8(), nullable=False)
    required_int16 = pa.field('a', pa.int16(), nullable=False)
    required_int16_b = pa.field('b', pa.int16(), nullable=False)
    with_foo = pa.field('a', pa.int8(), nullable=True, metadata=meta_foo)
    with_foo_twin = pa.field('a', pa.int8(), nullable=True,
                             metadata=meta_foo)
    with_bizz = pa.field('a', pa.int8(), nullable=True, metadata=meta_bizz)

    assert plain.equals(plain_twin)
    assert with_foo.equals(with_foo_twin)
    assert not plain.equals(required)
    assert not plain.equals(required_int16)
    assert not required.equals(required_int16)
    assert not required_int16.equals(required_int16_b)

    # Metadata on one side only
    assert plain.equals(with_foo)
    assert not plain.equals(with_foo, check_metadata=True)

    # Different metadata
    assert with_foo.equals(with_foo_twin)
    assert with_foo_twin.equals(with_bizz)
    assert not with_foo_twin.equals(with_bizz, check_metadata=True)
932
933
def test_field_equality_operators():
    """Check ``==`` and ``!=`` between fields and against a plain string."""
    field_a = pa.field('a', pa.int8(), nullable=True)
    field_a_twin = pa.field('a', pa.int8(), nullable=True)
    field_b = pa.field('b', pa.int8(), nullable=True)
    field_b_required = pa.field('b', pa.int8(), nullable=False)

    assert field_a == field_a_twin
    assert field_a != field_b
    assert field_b != field_b_required
    assert field_a != 'foo'
944
945
def test_field_metadata():
    """Check ``Field.metadata`` for absent, empty and populated metadata."""
    without_metadata = pa.field('a', pa.int8())
    empty_metadata = pa.field('a', pa.int8(), metadata={})
    with_metadata = pa.field('a', pa.int8(), metadata={b'bizz': b'bazz'})

    assert without_metadata.metadata is None
    assert empty_metadata.metadata == {}
    assert with_metadata.metadata[b'bizz'] == b'bazz'
954
955
def test_field_add_remove_metadata():
    """Check ``with_metadata`` and ``remove_metadata`` on a field."""
    bare = pa.field('foo', pa.int32())
    assert bare.metadata is None

    dict_metadata = {b'foo': b'bar', b'pandas': b'badger'}
    ordered_metadata = OrderedDict([
        (b'a', b'alpha'),
        (b'b', b'beta')
    ])

    annotated = bare.with_metadata(dict_metadata)
    assert annotated.metadata == dict_metadata

    annotated_ordered = bare.with_metadata(ordered_metadata)
    assert annotated_ordered.metadata == ordered_metadata

    # A list of integers is rejected as metadata
    with pytest.raises(TypeError):
        bare.with_metadata([1, 2, 3])

    stripped = annotated.remove_metadata()
    assert stripped.metadata is None

    # idempotent
    stripped_again = stripped.remove_metadata()
    assert stripped_again.metadata is None

    # Metadata given at construction time matches with_metadata()
    constructed = pa.field('foo', pa.int32(), True, dict_metadata)
    attached = bare.with_metadata(dict_metadata)
    assert constructed.equals(attached)
988
989
def test_field_modified_copies():
    """``with_type``/``with_nullable``/``with_name`` return new fields."""
    original = pa.field('foo', pa.int32(), True)
    pristine = pa.field('foo', pa.int32(), True)
    assert original.equals(pristine)

    # (modifier bound to the original, argument, expected resulting field)
    modifications = [
        (original.with_type, pa.int64(),
         pa.field('foo', pa.int64(), True)),
        (original.with_nullable, False,
         pa.field('foo', pa.int32(), False)),
        (original.with_name, 'bar',
         pa.field('bar', pa.int32(), True)),
    ]
    for modifier, argument, expected in modifications:
        modified = modifier(argument)
        assert expected.equals(modified)
        # Original instance is unmodified
        assert original.equals(pristine)
1012
1013
def test_is_integer_value():
    """``is_integer_value`` accepts Python and NumPy ints, not strings."""
    for integer in (1, np.int64(1)):
        assert pa.types.is_integer_value(integer)

    assert not pa.types.is_integer_value('1')
1018
1019
def test_is_float_value():
    """``is_float_value`` accepts Python and NumPy floats only."""
    for number in (1., np.float64(1)):
        assert pa.types.is_float_value(number)

    for not_a_float in (1, '1.0'):
        assert not pa.types.is_float_value(not_a_float)
1025
1026
def test_is_boolean_value():
    """``is_boolean_value`` accepts Python and NumPy booleans, not ints."""
    assert not pa.types.is_boolean_value(1)

    for flag in (True, False, np.bool_(True), np.bool_(False)):
        assert pa.types.is_boolean_value(flag)
1033
1034
@h.given(
    past.all_types |
    past.all_fields |
    past.all_schemas
)
@h.example(
    pa.field(name='', type=pa.null(), metadata={'0': '', '': ''})
)
def test_pickling(field):
    """Generated types, fields and schemas survive a pickle round trip."""
    serialized = pickle.dumps(field)
    restored = pickle.loads(serialized)
    assert restored == field
1046
1047
@h.given(
    st.lists(past.all_types) |
    st.lists(past.all_fields) |
    st.lists(past.all_schemas)
)
def test_hashing(items):
    """Pairwise-unequal generated objects work as distinct dict keys."""
    # well, this is still O(n^2), but makes the input unique
    has_duplicates = any(
        current.equals(earlier)
        for position, current in enumerate(items)
        for earlier in items[:position]
    )
    h.assume(not has_duplicates)

    position_by_item = {}
    for position, item in enumerate(items):
        assert hash(item) == hash(item)
        position_by_item[item] = position

    assert len(position_by_item) == len(items)

    for position, item in enumerate(items):
        assert position_by_item[item] == position