]>
Commit | Line | Data |
---|---|---|
1d09f67e TL |
1 | # Licensed to the Apache Software Foundation (ASF) under one |
2 | # or more contributor license agreements. See the NOTICE file | |
3 | # distributed with this work for additional information | |
4 | # regarding copyright ownership. The ASF licenses this file | |
5 | # to you under the Apache License, Version 2.0 (the | |
6 | # "License"); you may not use this file except in compliance | |
7 | # with the License. You may obtain a copy of the License at | |
8 | # | |
9 | # http://www.apache.org/licenses/LICENSE-2.0 | |
10 | # | |
11 | # Unless required by applicable law or agreed to in writing, | |
12 | # software distributed under the License is distributed on an | |
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
14 | # KIND, either express or implied. See the License for the | |
15 | # specific language governing permissions and limitations | |
16 | # under the License. | |
17 | ||
18 | from collections import OrderedDict | |
19 | import pickle | |
20 | import sys | |
21 | import weakref | |
22 | ||
23 | import pytest | |
24 | import numpy as np | |
25 | import pyarrow as pa | |
26 | ||
27 | import pyarrow.tests.util as test_util | |
28 | from pyarrow.vendored.version import Version | |
29 | ||
30 | ||
def test_schema_constructor_errors():
    """Calling ``pa.Schema()`` directly raises a TypeError with guidance."""
    expected_message = (
        "Do not call Schema's constructor directly, use `pyarrow.schema` "
        "instead"
    )
    with pytest.raises(TypeError, match=expected_message):
        pa.Schema()
36 | ||
37 | ||
def test_type_integers():
    """Each integer factory yields a type whose ``str()`` is its own name."""
    signed_names = ['int8', 'int16', 'int32', 'int64']
    unsigned_names = ['u' + signed for signed in signed_names]

    for type_name in signed_names + unsigned_names:
        arrow_type = getattr(pa, type_name)()
        assert str(arrow_type) == type_name
46 | ||
47 | ||
def test_type_to_pandas_dtype():
    """Verify the NumPy dtype each Arrow type reports for pandas conversion.

    Every ``(arrow_type, numpy_type)`` pair is compared through
    ``arrow_type.to_pandas_dtype()``.
    """
    datetime_ns = np.dtype('datetime64[ns]')

    # Fixed-width numeric types are paired with the NumPy scalar type of the
    # same name.
    numeric_names = ['int8', 'int16', 'int32', 'int64',
                     'uint8', 'uint16', 'uint32', 'uint64',
                     'float16', 'float32', 'float64']

    expectations = [
        (pa.null(), np.object_),
        (pa.bool_(), np.bool_),
    ]
    expectations.extend(
        (getattr(pa, name)(), getattr(np, name)) for name in numeric_names)
    # The temporal types checked here are all expected as datetime64[ns].
    expectations.extend(
        (temporal, datetime_ns)
        for temporal in (pa.date32(), pa.date64(), pa.timestamp('ms')))
    # Binary, string and nested types are expected as object dtype.
    # TODO: pa.list_(pa.int8(), 2) needs pandas conversion before it can be
    # included in this group.
    expectations.extend(
        (boxed, np.object_)
        for boxed in (pa.binary(), pa.binary(12), pa.string(),
                      pa.list_(pa.int8()),
                      pa.map_(pa.int64(), pa.float64())))

    for arrow_type, numpy_type in expectations:
        assert arrow_type.to_pandas_dtype() == numpy_type
76 | ||
77 | ||
@pytest.mark.pandas
def test_type_to_pandas_dtype_check_import():
    """Run the ARROW-7980 regression script via the test utilities."""
    # ARROW-7980
    script_name = 'arrow_7980.py'
    test_util.invoke_script(script_name)
82 | ||
83 | ||
def test_type_list():
    """``pa.list_`` accepts either a value type or a named field."""
    from_value_type = pa.list_(pa.int32())
    assert str(from_value_type) == 'list<item: int32>'

    # When given a field, the field's name shows up in the string form.
    from_field = pa.list_(pa.field('my_item', pa.string()))
    assert str(from_field) == 'list<my_item: string>'
92 | ||
93 | ||
def test_type_comparisons():
    """A DataType equals the same type and its alias, but not an integer."""
    int32_type = pa.int32()

    assert int32_type == pa.int32()
    assert int32_type == 'int32'
    assert int32_type != 5
99 | ||
100 | ||
def test_type_for_alias():
    """``pa.type_for_alias`` resolves every listed string alias."""
    by_alias = {
        'i1': pa.int8(),
        'int8': pa.int8(),
        'i2': pa.int16(),
        'int16': pa.int16(),
        'i4': pa.int32(),
        'int32': pa.int32(),
        'i8': pa.int64(),
        'int64': pa.int64(),
        'u1': pa.uint8(),
        'uint8': pa.uint8(),
        'u2': pa.uint16(),
        'uint16': pa.uint16(),
        'u4': pa.uint32(),
        'uint32': pa.uint32(),
        'u8': pa.uint64(),
        'uint64': pa.uint64(),
        'f4': pa.float32(),
        'float32': pa.float32(),
        'f8': pa.float64(),
        'float64': pa.float64(),
        'date32': pa.date32(),
        'date64': pa.date64(),
        'string': pa.string(),
        'str': pa.string(),
        'binary': pa.binary(),
        'time32[s]': pa.time32('s'),
        'time32[ms]': pa.time32('ms'),
        'time64[us]': pa.time64('us'),
        'time64[ns]': pa.time64('ns'),
    }

    # Timestamps and durations are checked for all four units.
    units = ('s', 'ms', 'us', 'ns')
    for unit in units:
        by_alias['timestamp[{}]'.format(unit)] = pa.timestamp(unit)
    for unit in units:
        by_alias['duration[{}]'.format(unit)] = pa.duration(unit)
    by_alias['month_day_nano_interval'] = pa.month_day_nano_interval()

    for alias, expected in by_alias.items():
        assert pa.type_for_alias(alias) == expected
145 | ||
146 | ||
def test_type_string():
    """The string type renders as ``'string'``."""
    string_type = pa.string()
    assert str(string_type) == 'string'
150 | ||
151 | ||
def test_type_timestamp_with_tz():
    """A timestamp type exposes the unit and time zone it was built with."""
    zone = 'America/Los_Angeles'
    stamp_type = pa.timestamp('ns', tz=zone)

    assert stamp_type.unit == 'ns'
    assert stamp_type.tz == zone
157 | ||
158 | ||
def test_time_types():
    """Check units, string forms and unit validation of time32/time64."""
    by_unit = {
        's': pa.time32('s'),
        'ms': pa.time32('ms'),
        'us': pa.time64('us'),
        'ns': pa.time64('ns'),
    }
    for unit, time_type in by_unit.items():
        assert time_type.unit == unit

    assert str(by_unit['s']) == 'time32[s]'
    assert str(by_unit['ns']) == 'time64[ns]'

    # Each factory rejects a unit that belongs to the other width.
    for factory, wrong_unit in ((pa.time32, 'us'), (pa.time64, 's')):
        with pytest.raises(ValueError):
            factory(wrong_unit)
178 | ||
179 | ||
def test_from_numpy_dtype():
    """Check ``pa.from_numpy_dtype`` for NumPy dtypes and dtype-like inputs.

    Covers the boolean, integer, floating-point, string/bytes, datetime64 and
    timedelta64 dtypes, inputs that NumPy can coerce to a dtype, and the
    errors raised for the object dtype and for a non-dtype string.
    """
    cases = [
        (np.dtype('bool'), pa.bool_()),
        (np.dtype('int8'), pa.int8()),
        (np.dtype('int16'), pa.int16()),
        (np.dtype('int32'), pa.int32()),
        (np.dtype('int64'), pa.int64()),
        (np.dtype('uint8'), pa.uint8()),
        (np.dtype('uint16'), pa.uint16()),
        (np.dtype('uint32'), pa.uint32()),
        # uint64 was previously missing although every other integer width
        # was covered.
        (np.dtype('uint64'), pa.uint64()),
        (np.dtype('float16'), pa.float16()),
        (np.dtype('float32'), pa.float32()),
        (np.dtype('float64'), pa.float64()),
        (np.dtype('U'), pa.string()),
        (np.dtype('S'), pa.binary()),
        (np.dtype('datetime64[s]'), pa.timestamp('s')),
        (np.dtype('datetime64[ms]'), pa.timestamp('ms')),
        (np.dtype('datetime64[us]'), pa.timestamp('us')),
        (np.dtype('datetime64[ns]'), pa.timestamp('ns')),
        (np.dtype('timedelta64[s]'), pa.duration('s')),
        (np.dtype('timedelta64[ms]'), pa.duration('ms')),
        (np.dtype('timedelta64[us]'), pa.duration('us')),
        (np.dtype('timedelta64[ns]'), pa.duration('ns')),
    ]

    for dt, pt in cases:
        result = pa.from_numpy_dtype(dt)
        assert result == pt

    # Things convertible to numpy dtypes work
    assert pa.from_numpy_dtype('U') == pa.string()
    assert pa.from_numpy_dtype(np.str_) == pa.string()
    assert pa.from_numpy_dtype('int32') == pa.int32()
    assert pa.from_numpy_dtype(bool) == pa.bool_()

    # The object dtype is expected to be rejected as not implemented.
    with pytest.raises(NotImplementedError):
        pa.from_numpy_dtype(np.dtype('O'))

    # A string that is not a dtype name is expected to raise TypeError.
    with pytest.raises(TypeError):
        pa.from_numpy_dtype('not_convertible_to_dtype')
220 | ||
221 | ||
def test_schema():
    """Build a schema from Field objects and inspect its basic accessors."""
    foo = pa.field('foo', pa.int32())
    bar = pa.field('bar', pa.string())
    baz = pa.field('baz', pa.list_(pa.int8()))
    sch = pa.schema([foo, bar, baz])

    assert sch.names == ['foo', 'bar', 'baz']
    assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]
    assert len(sch) == 3

    # Positional and by-name access return the first field.
    first = sch[0]
    assert first.name == 'foo'
    assert first.type == foo.type
    by_name = sch.field('foo')
    assert by_name.name == 'foo'
    assert by_name.type == foo.type

    expected_repr = "\n".join([
        "foo: int32",
        "bar: string",
        "baz: list<item: int8>",
        "  child 0, item: int8",
    ])
    assert repr(sch) == expected_repr

    with pytest.raises(TypeError):
        pa.schema([None])
247 | ||
248 | ||
def test_schema_weakref():
    """A Schema can be weakly referenced; the ref clears after ``del``."""
    schema = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8())),
    ])
    schema_ref = weakref.ref(schema)
    assert schema_ref() is not None

    # Dropping the only strong reference must clear the weak reference.
    del schema
    assert schema_ref() is None
260 | ||
261 | ||
def test_schema_to_string_with_metadata():
    """Exercise ``Schema.to_string`` metadata truncation and display flags.

    Checks the default (truncating) output, a metadata value that is not
    truncated, and each combination of ``show_field_metadata`` and
    ``show_schema_metadata`` with ``truncate_metadata=False``.
    """
    lorem = """\
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla accumsan vel
turpis et mollis. Aliquam tincidunt arcu id tortor blandit blandit. Donec
eget leo quis lectus scelerisque varius. Class aptent taciti sociosqu ad
litora torquent per conubia nostra, per inceptos himenaeos. Praesent
faucibus, diam eu volutpat iaculis, tellus est porta ligula, a efficitur
turpis nulla facilisis quam. Aliquam vitae lorem erat. Proin a dolor ac libero
dignissim mollis vitae eu mauris. Quisque posuere tellus vitae massa
pellentesque sagittis. Aenean feugiat, diam ac dignissim fermentum, lorem
sapien commodo massa, vel volutpat orci nisi eu justo. Nulla non blandit
sapien. Quisque pretium vestibulum urna eu vehicula."""
    # ARROW-7063
    foo = pa.field("foo", "int32", False, metadata={"key1": "value1"})
    bar = pa.field("bar", "string", True, metadata={"key3": "value3"})
    my_schema = pa.schema([foo, bar], metadata={"lorem": lorem})

    # Building blocks shared by the expected outputs below.
    fields_with_metadata = "\n".join([
        "foo: int32 not null",
        "  -- field metadata --",
        "  key1: 'value1'",
        "bar: string",
        "  -- field metadata --",
        "  key3: 'value3'",
    ])
    fields_only = "foo: int32 not null\nbar: string"
    schema_header = "-- schema metadata --"
    full_schema_metadata = schema_header + "\nlorem: '" + lorem + "'"

    # By default the long value is cut after 65 characters and followed by
    # the count of characters that were left out.
    truncated_schema_metadata = (
        schema_header + "\nlorem: '" + lorem[:65] + "' + "
        + str(len(lorem) - 65))
    assert my_schema.to_string() == (
        fields_with_metadata + "\n" + truncated_schema_metadata)

    # Metadata that exactly fits
    fitting_value = 'value' + 'x' * 62
    result = pa.schema([('f0', 'int32')],
                       metadata={'key': fitting_value}).to_string()
    assert result == (
        "f0: int32\n" + schema_header + "\nkey: '" + fitting_value + "'")

    assert my_schema.to_string(truncate_metadata=False) == (
        fields_with_metadata + "\n" + full_schema_metadata)

    assert my_schema.to_string(
        truncate_metadata=False,
        show_field_metadata=False,
    ) == fields_only + "\n" + full_schema_metadata

    assert my_schema.to_string(
        truncate_metadata=False,
        show_schema_metadata=False,
    ) == fields_with_metadata

    assert my_schema.to_string(
        truncate_metadata=False,
        show_field_metadata=False,
        show_schema_metadata=False,
    ) == fields_only
331 | ||
332 | ||
def test_schema_from_tuples():
    """``pa.schema`` accepts ``(name, type)`` tuples in place of fields."""
    pairs = [
        ('foo', pa.int32()),
        ('bar', pa.string()),
        ('baz', pa.list_(pa.int8())),
    ]
    sch = pa.schema(pairs)

    assert len(sch) == 3
    assert sch.names == ['foo', 'bar', 'baz']
    assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]

    expected_repr = "\n".join([
        "foo: int32",
        "bar: string",
        "baz: list<item: int8>",
        "  child 0, item: int8",
    ])
    assert repr(sch) == expected_repr

    # A tuple whose type is None is rejected.
    with pytest.raises(TypeError):
        pa.schema([('foo', None)])
351 | ||
352 | ||
def test_schema_from_mapping():
    """``pa.schema`` accepts an ordered name-to-type mapping."""
    mapping = OrderedDict()
    mapping['foo'] = pa.int32()
    mapping['bar'] = pa.string()
    mapping['baz'] = pa.list_(pa.int8())
    sch = pa.schema(mapping)

    assert len(sch) == 3
    assert sch.names == ['foo', 'bar', 'baz']
    assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]

    expected_repr = "\n".join([
        "foo: int32",
        "bar: string",
        "baz: list<item: int8>",
        "  child 0, item: int8",
    ])
    assert repr(sch) == expected_repr

    # A mapping whose value is None is rejected.
    invalid_mapping = OrderedDict([('foo', None)])
    with pytest.raises(TypeError):
        pa.schema(invalid_mapping)
372 | ||
373 | ||
def test_schema_duplicate_fields():
    """A schema may repeat a field name; lookups by that name are ambiguous."""
    first_foo = pa.field('foo', pa.int32())
    bar = pa.field('bar', pa.string())
    second_foo = pa.field('foo', pa.list_(pa.int8()))
    sch = pa.schema([first_foo, bar, second_foo])

    assert len(sch) == 3
    assert sch.names == ['foo', 'bar', 'foo']
    assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]

    expected_repr = "\n".join([
        "foo: int32",
        "bar: string",
        "foo: list<item: int8>",
        "  child 0, item: int8",
    ])
    assert repr(sch) == expected_repr

    head = sch[0]
    assert head.name == 'foo'
    assert head.type == first_foo.type

    # field_by_name warns on every call; for the duplicated name it returns
    # None and may warn with either category.
    with pytest.warns(FutureWarning):
        assert sch.field_by_name('bar') == bar
    with pytest.warns(FutureWarning):
        assert sch.field_by_name('xxx') is None
    with pytest.warns((UserWarning, FutureWarning)):
        assert sch.field_by_name('foo') is None

    # Schema::GetFieldIndex
    assert sch.get_field_index('foo') == -1

    # Schema::GetAllFieldIndices
    assert sch.get_all_field_indices('foo') == [0, 2]
404 | ||
405 | ||
def test_field_flatten():
    """Check ``Field.flatten`` on plain, struct and nested-struct fields."""
    meta = {b'foo': b'bar'}
    f0 = pa.field('foo', pa.int32()).with_metadata(meta)
    # A non-struct field flattens to itself.
    assert f0.flatten() == [f0]

    f1 = pa.field('bar', pa.float64(), nullable=False)
    flat_foo = pa.field('ff.foo', pa.int32()).with_metadata(meta)

    required_parent = pa.field('ff', pa.struct([f0, f1]), nullable=False)
    assert required_parent.flatten() == [
        flat_foo,
        pa.field('ff.bar', pa.float64(), nullable=False)]  # XXX

    # Nullable parent makes flattened child nullable
    nullable_parent = pa.field('ff', pa.struct([f0, f1]))
    assert nullable_parent.flatten() == [
        flat_foo,
        pa.field('ff.bar', pa.float64())]

    # Only one level is flattened: the inner struct stays a struct.
    outer = pa.field('fff', pa.struct([nullable_parent]))
    assert outer.flatten() == [pa.field('fff.ff', pa.struct([f0, f1]))]
424 | ||
425 | ||
def test_schema_add_remove_metadata():
    """``with_metadata`` attaches metadata; ``remove_metadata`` clears it."""
    bare = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8())),
    ])
    assert bare.metadata is None

    metadata = {b'foo': b'bar', b'pandas': b'badger'}
    annotated = bare.with_metadata(metadata)
    assert annotated.metadata == metadata

    stripped = annotated.remove_metadata()
    assert stripped.metadata is None

    # idempotent
    stripped_again = stripped.remove_metadata()
    assert stripped_again.metadata is None
448 | ||
449 | ||
def test_schema_equals():
    """Check ``Schema.equals`` with and without metadata comparison.

    Schemas built from the same fields compare equal; metadata only matters
    when ``check_metadata=True`` is passed. A schema with fewer fields is
    never equal.
    """
    fields = [
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8()))
    ]
    metadata = {b'foo': b'bar', b'pandas': b'badger'}

    sch1 = pa.schema(fields)
    sch2 = pa.schema(fields)
    sch3 = pa.schema(fields, metadata=metadata)
    sch4 = pa.schema(fields, metadata=metadata)

    assert sch1.equals(sch2, check_metadata=True)
    assert sch3.equals(sch4, check_metadata=True)
    # Metadata is ignored unless explicitly requested.
    assert sch1.equals(sch3)
    assert not sch1.equals(sch3, check_metadata=True)
    # The original repeated the line above verbatim; check the reverse
    # direction instead so the comparison is exercised from both sides.
    assert not sch3.equals(sch1, check_metadata=True)

    # Dropping the last field must make the schemas unequal.
    del fields[-1]
    sch3 = pa.schema(fields)
    assert not sch1.equals(sch3)
472 | ||
473 | ||
def test_schema_equals_propagates_check_metadata():
    """``check_metadata`` also applies to field-level metadata."""
    # ARROW-4088
    foo = pa.field('foo', pa.int32())
    plain = pa.schema([foo, pa.field('bar', pa.string())])
    with_field_metadata = pa.schema([
        foo,
        pa.field('bar', pa.string(), metadata={'a': 'alpha'}),
    ])

    assert not plain.equals(with_field_metadata, check_metadata=True)
    assert plain.equals(with_field_metadata)
486 | ||
487 | ||
def test_schema_equals_invalid_type():
    """``Schema.equals`` raises TypeError for non-Schema arguments."""
    # ARROW-5873
    schema = pa.schema([pa.field("a", pa.int64())])

    not_schemas = (None, 'string', pa.array([1, 2]))
    for candidate in not_schemas:
        with pytest.raises(TypeError):
            schema.equals(candidate)
495 | ||
496 | ||
def test_schema_equality_operators():
    """``==`` and ``!=`` on schemas ignore metadata and accept any operand."""
    fields = [
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8())),
    ]
    metadata = {b'foo': b'bar', b'pandas': b'badger'}

    plain_a = pa.schema(fields)
    plain_b = pa.schema(fields)
    meta_a = pa.schema(fields, metadata=metadata)
    meta_b = pa.schema(fields, metadata=metadata)

    assert plain_a == plain_b
    assert meta_a == meta_b

    # __eq__ and __ne__ do not check metadata
    assert plain_a == meta_a
    assert not plain_a != meta_a

    assert plain_b == meta_b

    # comparison with other types doesn't raise
    assert plain_a != []
    assert meta_a != 'foo'
522 | ||
523 | ||
def test_schema_get_fields():
    """``Schema.field`` accepts a name or an index and validates its key."""
    schema = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8())),
    ])

    for key, expected_name in (('foo', 'foo'), (0, 'foo'), (-1, 'baz')):
        assert schema.field(key).name == expected_name

    # Unknown name, unsupported key type and out-of-range index each raise
    # their own exception type.
    invalid_lookups = (
        (KeyError, 'other'),
        (TypeError, 0.0),
        (IndexError, 4),
    )
    for expected_error, bad_key in invalid_lookups:
        with pytest.raises(expected_error):
            schema.field(bad_key)
543 | ||
544 | ||
def test_schema_negative_indexing():
    """Negative indices count from the end; out-of-range indices raise."""
    schema = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8())),
    ])

    # -1, -2, -3 must address the same fields as 2, 1, 0.
    for offset in (1, 2, 3):
        assert schema[-offset].equals(schema[3 - offset])

    for out_of_range in (-4, 3):
        with pytest.raises(IndexError):
            schema[out_of_range]
563 | ||
564 | ||
def test_schema_repr_with_dictionaries():
    """The schema repr spells out a dictionary field's parameters."""
    sch = pa.schema([
        pa.field('one', pa.dictionary(pa.int16(), pa.string())),
        pa.field('two', pa.int32()),
    ])

    expected_lines = [
        "one: dictionary<values=string, indices=int16, ordered=0>",
        "two: int32",
    ]
    assert repr(sch) == "\n".join(expected_lines)
578 | ||
579 | ||
def test_type_schema_pickling():
    """Types, fields and a schema built from them survive a pickle roundtrip."""
    union_fields = [
        pa.field('a', pa.int8()),
        pa.field('b', pa.int16()),
    ]
    picklables = [
        pa.int8(),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.string()),
        pa.map_(pa.string(), pa.int8()),
        pa.struct([
            pa.field('a', 'int8'),
            pa.field('b', 'string'),
        ]),
        pa.union(list(union_fields), pa.lib.UnionMode_SPARSE),
        pa.union(list(union_fields), pa.lib.UnionMode_DENSE),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.date64(),
        pa.timestamp('ms'),
        pa.timestamp('ns'),
        pa.decimal128(12, 2),
        pa.decimal256(76, 38),
        pa.field('a', 'string', metadata={b'foo': b'bar'}),
        pa.list_(pa.field("element", pa.int64())),
        pa.large_list(pa.field("element", pa.int64())),
        pa.map_(pa.field("key", pa.string(), nullable=False),
                pa.field("value", pa.int8())),
    ]

    for original in picklables:
        restored = pickle.loads(pickle.dumps(original))
        assert original == restored

    # Reuse the cases as schema fields: Field objects are taken as they are,
    # bare types are wrapped in a field named after their position.
    fields = [
        item if isinstance(item, pa.Field)
        else pa.field('_f{}'.format(position), item)
        for position, item in enumerate(picklables)
    ]

    schema = pa.schema(fields, metadata={b'foo': b'bar'})
    restored_schema = pickle.loads(pickle.dumps(schema))
    assert schema == restored_schema
629 | ||
630 | ||
def test_empty_table():
    """``Schema.empty_table`` returns a zero-row Table with the same schema."""
    nested_schema = pa.schema([
        pa.field('f0', pa.int64()),
        pa.field('f1', pa.dictionary(pa.int32(), pa.string())),
        pa.field('f2', pa.list_(pa.list_(pa.int64()))),
    ])
    # test it preserves field nullability
    nullability_schema = pa.schema([
        pa.field('a', pa.int64(), nullable=False),
        pa.field('b', pa.int64()),
    ])

    for schema in (nested_schema, nullability_schema):
        empty = schema.empty_table()
        assert isinstance(empty, pa.Table)
        assert empty.num_rows == 0
        assert empty.schema == schema
648 | ||
649 | ||
@pytest.mark.pandas
def test_schema_from_pandas():
    """``Schema.from_pandas`` matches the schema of ``Table.from_pandas``."""
    import pandas as pd

    timestamps = np.array([
        '2007-07-13T01:23:34.123456789',
        '2006-01-13T12:34:56.432539784',
        '2010-08-13T05:46:57.437699912',
    ], dtype='datetime64[ns]')
    columns = [
        list(range(10)),
        pd.Categorical(list(range(10))),
        ['foo', 'bar', None, 'baz', 'qux'],
        timestamps,
    ]
    # The nullable-integer case is only added for pandas 1.0.0 and later.
    if Version(pd.__version__) >= Version('1.0.0'):
        columns.append(pd.array([1, 2, None], dtype=pd.Int32Dtype()))

    for column in columns:
        frame = pd.DataFrame({'a': column})
        inferred = pa.Schema.from_pandas(frame)
        from_table = pa.Table.from_pandas(frame).schema
        assert inferred == from_table
670 | ||
671 | ||
def test_schema_sizeof():
    """``sys.getsizeof`` of a schema grows as its metadata grows."""
    schema = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
    ])
    base_size = sys.getsizeof(schema)
    assert base_size > 30

    short_meta = schema.with_metadata({"key": "some metadata"})
    short_size = sys.getsizeof(short_meta)
    assert short_size > base_size

    long_meta = schema.with_metadata({"key": "some more metadata"})
    assert sys.getsizeof(long_meta) > short_size
684 | ||
685 | ||
def test_schema_merge():
    """``pa.unify_schemas`` merges compatible schemas, rejects conflicts."""
    foo_int32 = pa.field('foo', pa.int32())
    bar = pa.field('bar', pa.string())
    baz = pa.field('baz', pa.list_(pa.int8()))
    qux = pa.field('qux', pa.bool_())
    quux = pa.field('quux', pa.dictionary(pa.int32(), pa.string()))

    a = pa.schema([foo_int32, bar, baz])
    b = pa.schema([foo_int32, qux])
    c = pa.schema([quux])
    # Same field names as ``b`` but 'foo' has a different type.
    d = pa.schema([pa.field('foo', pa.int64()), qux])

    expected = pa.schema([foo_int32, bar, baz, qux, quux])

    merged_from_list = pa.unify_schemas([a, b, c])
    assert merged_from_list.equals(expected)

    with pytest.raises(pa.ArrowInvalid):
        pa.unify_schemas([b, d])

    # ARROW-14002: Try with tuple instead of list
    merged_from_tuple = pa.unify_schemas((a, b, c))
    assert merged_from_tuple.equals(expected)
720 | ||
721 | ||
def test_undecodable_metadata():
    """Metadata bytes that cannot be decoded still appear in ``str(schema)``."""
    # ARROW-10214: undecodable metadata shouldn't fail repr()
    field_payload = b'abcdef\xff\x00'
    schema_payload = b'ghijkl\xff\x00'
    ints = pa.field('ints', pa.int16(), metadata={'key': field_payload})
    schema = pa.schema([ints], metadata={'key': schema_payload})

    rendered = str(schema)
    assert 'abcdef' in rendered
    assert 'ghijkl' in rendered