]> git.proxmox.com Git - ceph.git/blame - ceph/src/arrow/python/pyarrow/tests/test_pandas.py
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / python / pyarrow / tests / test_pandas.py
CommitLineData
1d09f67e
TL
1# Licensed to the Apache Software Foundation (ASF) under one
2# or more contributor license agreements. See the NOTICE file
3# distributed with this work for additional information
4# regarding copyright ownership. The ASF licenses this file
5# to you under the Apache License, Version 2.0 (the
6# "License"); you may not use this file except in compliance
7# with the License. You may obtain a copy of the License at
8#
9# http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing,
12# software distributed under the License is distributed on an
13# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14# KIND, either express or implied. See the License for the
15# specific language governing permissions and limitations
16# under the License.
17
18import gc
19import decimal
20import json
21import multiprocessing as mp
22import sys
23
24from collections import OrderedDict
25from datetime import date, datetime, time, timedelta, timezone
26
27import hypothesis as h
28import hypothesis.extra.pytz as tzst
29import hypothesis.strategies as st
30import numpy as np
31import numpy.testing as npt
32import pytest
33import pytz
34
35from pyarrow.pandas_compat import get_logical_type, _pandas_api
36from pyarrow.tests.util import invoke_script, random_ascii, rands
37import pyarrow.tests.strategies as past
38from pyarrow.vendored.version import Version
39
40import pyarrow as pa
41try:
42 from pyarrow import parquet as pq
43except ImportError:
44 pass
45
46try:
47 import pandas as pd
48 import pandas.testing as tm
49 from .pandas_examples import dataframe_with_arrays, dataframe_with_lists
50except ImportError:
51 pass
52
53
54# Marks all of the tests in this module
55pytestmark = pytest.mark.pandas
56
57
58def _alltypes_example(size=100):
59 return pd.DataFrame({
60 'uint8': np.arange(size, dtype=np.uint8),
61 'uint16': np.arange(size, dtype=np.uint16),
62 'uint32': np.arange(size, dtype=np.uint32),
63 'uint64': np.arange(size, dtype=np.uint64),
64 'int8': np.arange(size, dtype=np.int16),
65 'int16': np.arange(size, dtype=np.int16),
66 'int32': np.arange(size, dtype=np.int32),
67 'int64': np.arange(size, dtype=np.int64),
68 'float32': np.arange(size, dtype=np.float32),
69 'float64': np.arange(size, dtype=np.float64),
70 'bool': np.random.randn(size) > 0,
71 # TODO(wesm): Pandas only support ns resolution, Arrow supports s, ms,
72 # us, ns
73 'datetime': np.arange("2016-01-01T00:00:00.001", size,
74 dtype='datetime64[ms]'),
75 'str': [str(x) for x in range(size)],
76 'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
77 'empty_str': [''] * size
78 })
79
80
def _check_pandas_roundtrip(df, expected=None, use_threads=False,
                            expected_schema=None,
                            check_dtype=True, schema=None,
                            preserve_index=False,
                            as_batch=False):
    """Convert ``df`` to Arrow and back, asserting the result matches.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert.
    expected : pandas.DataFrame, optional
        Frame the round-tripped result is compared to; defaults to ``df``.
    use_threads : bool, default False
        Forwarded to ``to_pandas``; also selects ``nthreads=2`` instead of
        ``nthreads=1`` for ``from_pandas``.
    expected_schema : pyarrow.Schema, optional
        If given, the converted table's schema must equal it.
    check_dtype : bool, default True
        Forwarded to ``assert_frame_equal``.
    schema : pyarrow.Schema, optional
        Forwarded to ``from_pandas``.
    preserve_index : bool, default False
        Forwarded to ``from_pandas``. The index type is compared with
        ``'equiv'`` when true and not compared otherwise.
    as_batch : bool, default False
        Convert through ``pa.RecordBatch`` instead of ``pa.Table``.
    """
    klass = pa.RecordBatch if as_batch else pa.Table
    table = klass.from_pandas(df, schema=schema,
                              preserve_index=preserve_index,
                              nthreads=2 if use_threads else 1)
    result = table.to_pandas(use_threads=use_threads)

    # Compare against None explicitly so that a schema object that happens
    # to evaluate as falsy (e.g. one with no fields) is still checked.
    if expected_schema is not None:
        # every caller of _check_pandas_roundtrip passes expected_schema
        # without the pandas-generated key-value metadata
        assert table.schema.equals(expected_schema)

    if expected is None:
        expected = df

    tm.assert_frame_equal(result, expected, check_dtype=check_dtype,
                          check_index_type=('equiv' if preserve_index
                                            else False))
103
104
def _check_series_roundtrip(s, type_=None, expected_pa_type=None):
    """Convert Series ``s`` to an Arrow array and back, asserting equality.

    When ``expected_pa_type`` is not supplied it falls back to ``type_``;
    if either is available, the Arrow array's type is asserted against it.
    """
    converted = pa.array(s, from_pandas=True, type=type_)

    wanted_type = type_ if expected_pa_type is None else expected_pa_type
    if wanted_type is not None:
        assert converted.type == wanted_type

    roundtripped = pd.Series(converted.to_pandas(), name=s.name)
    tm.assert_series_equal(s, roundtripped)
116
117
def _check_array_roundtrip(values, expected=None, mask=None,
                           type=None):
    """Convert ``values`` to an Arrow array and back, checking nulls and data.

    The Arrow null count must equal the number of pandas-null entries in
    ``values`` (combined with ``mask`` when one is given). If ``expected``
    is omitted it is derived from ``values``, masked by ``mask`` if present.
    """
    converted = pa.array(values, from_pandas=True, mask=mask, type=type)
    roundtripped = converted.to_pandas()

    null_flags = pd.isnull(values)
    if mask is not None:
        null_flags = mask | null_flags
    assert converted.null_count == null_flags.sum()

    if expected is None:
        if mask is not None:
            source = np.ma.masked_array(values, mask=mask)
        else:
            source = values
        expected = pd.Series(source)

    tm.assert_series_equal(pd.Series(roundtripped), expected,
                           check_names=False)
136
137
def _check_array_from_pandas_roundtrip(np_array, type=None):
    """Assert that ``np_array`` survives a trip through ``pa.array``."""
    roundtripped = pa.array(np_array, from_pandas=True, type=type).to_pandas()
    npt.assert_array_equal(roundtripped, np_array)
142
143
class TestConvertMetadata:
    """
    Conversion tests for Pandas metadata & indices.
    """

    def test_non_string_columns(self):
        """An integer column label becomes the string field name '0'."""
        df = pd.DataFrame({0: [1, 2, 3]})
        table = pa.Table.from_pandas(df)
        assert table.field(0).name == '0'

    def test_from_pandas_with_columns(self):
        """``columns=`` yields the same result as indexing the frame first."""
        df = pd.DataFrame({0: [1, 2, 3], 1: [1, 3, 3], 2: [2, 4, 5]},
                          columns=[1, 0])

        table = pa.Table.from_pandas(df, columns=[0, 1])
        expected = pa.Table.from_pandas(df[[0, 1]])
        assert expected.equals(table)

        record_batch_table = pa.RecordBatch.from_pandas(df, columns=[0, 1])
        record_batch_expected = pa.RecordBatch.from_pandas(df[[0, 1]])
        assert record_batch_expected.equals(record_batch_table)

    def test_column_index_names_are_preserved(self):
        """A named column index survives the roundtrip."""
        df = pd.DataFrame({'data': [1, 2, 3]})
        df.columns.names = ['a']
        _check_pandas_roundtrip(df, preserve_index=True)

    def test_range_index_shortcut(self):
        # ARROW-1639
        index_name = 'foo'
        df = pd.DataFrame({'a': [1, 2, 3, 4]},
                          index=pd.RangeIndex(0, 8, step=2, name=index_name))

        df2 = pd.DataFrame({'a': [4, 5, 6, 7]},
                           index=pd.RangeIndex(0, 4))

        table = pa.Table.from_pandas(df)
        table_no_index_name = pa.Table.from_pandas(df2)

        # The RangeIndex is tracked in the metadata only
        assert len(table.schema) == 1

        result = table.to_pandas()
        tm.assert_frame_equal(result, df)
        assert isinstance(result.index, pd.RangeIndex)
        assert _pandas_api.get_rangeindex_attribute(result.index, 'step') == 2
        assert result.index.name == index_name

        result2 = table_no_index_name.to_pandas()
        tm.assert_frame_equal(result2, df2)
        assert isinstance(result2.index, pd.RangeIndex)
        assert _pandas_api.get_rangeindex_attribute(result2.index, 'step') == 1
        assert result2.index.name is None

    def test_range_index_force_serialization(self):
        # ARROW-5427: preserve_index=True will force the RangeIndex to
        # be serialized as a column rather than tracked more
        # efficiently as metadata
        df = pd.DataFrame({'a': [1, 2, 3, 4]},
                          index=pd.RangeIndex(0, 8, step=2, name='foo'))

        table = pa.Table.from_pandas(df, preserve_index=True)
        assert table.num_columns == 2
        assert 'foo' in table.column_names

        restored = table.to_pandas()
        tm.assert_frame_equal(restored, df)

    def test_rangeindex_doesnt_warn(self):
        # ARROW-5606: pandas 0.25 deprecated private _start/stop/step
        # attributes -> can be removed if support < pd 0.25 is dropped
        import warnings

        df = pd.DataFrame(np.random.randn(4, 2), columns=['a', 'b'])

        # pytest.warns(None) is deprecated (and an error in pytest 8), so
        # record warnings with the standard library instead.
        with warnings.catch_warnings(record=True) as record:
            warnings.simplefilter("always")
            _check_pandas_roundtrip(df, preserve_index=True)

        assert len(record) == 0

    def test_multiindex_columns(self):
        """A frame with two-level MultiIndex columns survives the roundtrip."""
        columns = pd.MultiIndex.from_arrays([
            ['one', 'two'], ['X', 'Y']
        ])
        df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=columns)
        _check_pandas_roundtrip(df, preserve_index=True)

    def test_multiindex_columns_with_dtypes(self):
        """MultiIndex columns with a datetime level survive the roundtrip."""
        columns = pd.MultiIndex.from_arrays(
            [
                ['one', 'two'],
                pd.DatetimeIndex(['2017-08-01', '2017-08-02']),
            ],
            names=['level_1', 'level_2'],
        )
        df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=columns)
        _check_pandas_roundtrip(df, preserve_index=True)

    def test_multiindex_with_column_dtype_object(self):
        # ARROW-3651 & ARROW-9096
        # Bug when dtype of the columns is object.

        # underlying dtype: integer
        df = pd.DataFrame([1], columns=pd.Index([1], dtype=object))
        _check_pandas_roundtrip(df, preserve_index=True)

        # underlying dtype: floating
        df = pd.DataFrame([1], columns=pd.Index([1.1], dtype=object))
        _check_pandas_roundtrip(df, preserve_index=True)

        # underlying dtype: datetime
        # ARROW-9096: a simple roundtrip now works
        df = pd.DataFrame([1], columns=pd.Index(
            [datetime(2018, 1, 1)], dtype="object"))
        _check_pandas_roundtrip(df, preserve_index=True)

    def test_multiindex_columns_unicode(self):
        """MultiIndex columns with non-ASCII labels survive the roundtrip."""
        columns = pd.MultiIndex.from_arrays([['あ', 'い'], ['X', 'Y']])
        df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=columns)
        _check_pandas_roundtrip(df, preserve_index=True)

    def test_multiindex_doesnt_warn(self):
        # ARROW-3953: pandas 0.24 rename of MultiIndex labels to codes
        import warnings

        columns = pd.MultiIndex.from_arrays([['one', 'two'], ['X', 'Y']])
        df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=columns)

        # pytest.warns(None) is deprecated (and an error in pytest 8), so
        # record warnings with the standard library instead.
        with warnings.catch_warnings(record=True) as record:
            warnings.simplefilter("always")
            _check_pandas_roundtrip(df, preserve_index=True)

        assert len(record) == 0

    def test_integer_index_column(self):
        """A frame with default integer column labels survives the roundtrip."""
        df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')])
        _check_pandas_roundtrip(df, preserve_index=True)

    def test_index_metadata_field_name(self):
        # test None case, and strangely named non-index columns
        df = pd.DataFrame(
            [(1, 'a', 3.1), (2, 'b', 2.2), (3, 'c', 1.3)],
            index=pd.MultiIndex.from_arrays(
                [['c', 'b', 'a'], [3, 2, 1]],
                names=[None, 'foo']
            ),
            columns=['a', None, '__index_level_0__'],
        )
        with pytest.warns(UserWarning):
            t = pa.Table.from_pandas(df, preserve_index=True)
        js = t.schema.pandas_metadata

        # Three data columns followed by the two index levels.
        col1, col2, col3, idx0, foo = js['columns']

        assert col1['name'] == 'a'
        assert col1['name'] == col1['field_name']

        assert col2['name'] is None
        assert col2['field_name'] == 'None'

        assert col3['name'] == '__index_level_0__'
        assert col3['name'] == col3['field_name']

        idx0_descr, foo_descr = js['index_columns']
        assert idx0_descr == '__index_level_0__'
        assert idx0['field_name'] == idx0_descr
        assert idx0['name'] is None

        assert foo_descr == 'foo'
        assert foo['field_name'] == foo_descr
        assert foo['name'] == foo_descr

    def test_categorical_column_index(self):
        """Check the column-index metadata written for categorical columns."""
        df = pd.DataFrame(
            [(1, 'a', 2.0), (2, 'b', 3.0), (3, 'c', 4.0)],
            columns=pd.Index(list('def'), dtype='category')
        )
        t = pa.Table.from_pandas(df, preserve_index=True)
        js = t.schema.pandas_metadata

        column_indexes, = js['column_indexes']
        assert column_indexes['name'] is None
        assert column_indexes['pandas_type'] == 'categorical'
        assert column_indexes['numpy_type'] == 'int8'

        md = column_indexes['metadata']
        assert md['num_categories'] == 3
        assert md['ordered'] is False

    def test_string_column_index(self):
        """Check the column-index metadata written for named string columns."""
        df = pd.DataFrame(
            [(1, 'a', 2.0), (2, 'b', 3.0), (3, 'c', 4.0)],
            columns=pd.Index(list('def'), name='stringz')
        )
        t = pa.Table.from_pandas(df, preserve_index=True)
        js = t.schema.pandas_metadata

        column_indexes, = js['column_indexes']
        assert column_indexes['name'] == 'stringz'
        assert column_indexes['name'] == column_indexes['field_name']
        assert column_indexes['numpy_type'] == 'object'
        assert column_indexes['pandas_type'] == 'unicode'

        md = column_indexes['metadata']

        assert len(md) == 1
        assert md['encoding'] == 'UTF-8'

    def test_datetimetz_column_index(self):
        """Check the column-index metadata written for tz-aware columns."""
        df = pd.DataFrame(
            [(1, 'a', 2.0), (2, 'b', 3.0), (3, 'c', 4.0)],
            columns=pd.date_range(
                start='2017-01-01', periods=3, tz='America/New_York'
            )
        )
        t = pa.Table.from_pandas(df, preserve_index=True)
        js = t.schema.pandas_metadata

        column_indexes, = js['column_indexes']
        assert column_indexes['name'] is None
        assert column_indexes['pandas_type'] == 'datetimetz'
        assert column_indexes['numpy_type'] == 'datetime64[ns]'

        md = column_indexes['metadata']
        assert md['timezone'] == 'America/New_York'

    def test_datetimetz_row_index(self):
        """A tz-aware datetime row index survives the roundtrip."""
        df = pd.DataFrame({
            'a': pd.date_range(
                start='2017-01-01', periods=3, tz='America/New_York'
            )
        })
        df = df.set_index('a')

        _check_pandas_roundtrip(df, preserve_index=True)

    def test_categorical_row_index(self):
        """A categorical row index survives the roundtrip."""
        df = pd.DataFrame({'a': [1, 2, 3], 'b': [1, 2, 3]})
        df['a'] = df.a.astype('category')
        df = df.set_index('a')

        _check_pandas_roundtrip(df, preserve_index=True)

    def test_duplicate_column_names_does_not_crash(self):
        """Duplicate column names raise ValueError."""
        df = pd.DataFrame([(1, 'a'), (2, 'b')], columns=list('aa'))
        with pytest.raises(ValueError):
            pa.Table.from_pandas(df)

    def test_dictionary_indices_boundscheck(self):
        # ARROW-1658. No validation of indices leads to segfaults in pandas
        indices = [[0, 1], [0, -1]]

        for inds in indices:
            arr = pa.DictionaryArray.from_arrays(inds, ['a'], safe=False)
            batch = pa.RecordBatch.from_arrays([arr], ['foo'])
            table = pa.Table.from_batches([batch, batch, batch])

            with pytest.raises(IndexError):
                arr.to_pandas()

            with pytest.raises(IndexError):
                table.to_pandas()

    def test_unicode_with_unicode_column_and_index(self):
        """Non-ASCII column and index labels survive the roundtrip."""
        df = pd.DataFrame({'あ': ['い']}, index=['う'])

        _check_pandas_roundtrip(df, preserve_index=True)

    def test_mixed_column_names(self):
        # mixed type column names are not reconstructed exactly
        df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})

        for cols in [['あ', b'a'], [1, '2'], [1, 1.5]]:
            df.columns = pd.Index(cols, dtype=object)

            # assert that the from_pandas raises the warning
            with pytest.warns(UserWarning):
                pa.Table.from_pandas(df)

            # The roundtrip is expected to return stringified labels.
            expected = df.copy()
            expected.columns = df.columns.values.astype(str)
            with pytest.warns(UserWarning):
                _check_pandas_roundtrip(df, expected=expected,
                                        preserve_index=True)

    def test_binary_column_name(self):
        column_data = ['い']
        key = 'あ'.encode()
        data = {key: column_data}
        df = pd.DataFrame(data)

        # we can't use _check_pandas_roundtrip here because our metadata
        # is always decoded as utf8: even if binary goes in, utf8 comes out
        t = pa.Table.from_pandas(df, preserve_index=True)
        df2 = t.to_pandas()
        assert df.values[0] == df2.values[0]
        assert df.index.values[0] == df2.index.values[0]
        # NOTE(review): this checks the input frame, not the roundtripped
        # one -- confirm whether df2.columns[0] was the intended subject.
        assert df.columns[0] == key

    def test_multiindex_duplicate_values(self):
        """A row MultiIndex with repeated first-level values roundtrips."""
        num_rows = 3
        numbers = list(range(num_rows))
        index = pd.MultiIndex.from_arrays(
            [['foo', 'foo', 'bar'], numbers],
            names=['foobar', 'some_numbers'],
        )

        df = pd.DataFrame({'numbers': numbers}, index=index)

        _check_pandas_roundtrip(df, preserve_index=True)

    def test_metadata_with_mixed_types(self):
        """A bytes/str object column is recorded with pandas_type 'bytes'."""
        df = pd.DataFrame({'data': [b'some_bytes', 'some_unicode']})
        table = pa.Table.from_pandas(df)
        js = table.schema.pandas_metadata
        assert 'mixed' not in js
        data_column = js['columns'][0]
        assert data_column['pandas_type'] == 'bytes'
        assert data_column['numpy_type'] == 'object'

    def test_ignore_metadata(self):
        """``ignore_metadata=True`` matches converting a metadata-free table."""
        df = pd.DataFrame({'a': [1, 2, 3], 'b': ['foo', 'bar', 'baz']},
                          index=['one', 'two', 'three'])
        table = pa.Table.from_pandas(df)

        result = table.to_pandas(ignore_metadata=True)
        expected = (table.cast(table.schema.remove_metadata())
                    .to_pandas())

        tm.assert_frame_equal(result, expected)

    def test_list_metadata(self):
        """A list<int64> column is recorded with pandas_type 'list[int64]'."""
        df = pd.DataFrame({'data': [[1], [2, 3, 4], [5] * 7]})
        schema = pa.schema([pa.field('data', type=pa.list_(pa.int64()))])
        table = pa.Table.from_pandas(df, schema=schema)
        js = table.schema.pandas_metadata
        assert 'mixed' not in js
        data_column = js['columns'][0]
        assert data_column['pandas_type'] == 'list[int64]'
        assert data_column['numpy_type'] == 'object'

    def test_struct_metadata(self):
        """A column of dicts is recorded with pandas_type 'object'."""
        df = pd.DataFrame({'dicts': [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}]})
        table = pa.Table.from_pandas(df)
        pandas_metadata = table.schema.pandas_metadata
        assert pandas_metadata['columns'][0]['pandas_type'] == 'object'

    def test_decimal_metadata(self):
        """A Decimal column records its precision and scale in the metadata."""
        expected = pd.DataFrame({
            'decimals': [
                decimal.Decimal('394092382910493.12341234678'),
                -decimal.Decimal('314292388910493.12343437128'),
            ]
        })
        table = pa.Table.from_pandas(expected)
        js = table.schema.pandas_metadata
        assert 'mixed' not in js
        data_column = js['columns'][0]
        assert data_column['pandas_type'] == 'decimal'
        assert data_column['numpy_type'] == 'object'
        assert data_column['metadata'] == {'precision': 26, 'scale': 11}

    def test_table_column_subset_metadata(self):
        # ARROW-1883
        # non-default index
        for index in [
                pd.Index(['a', 'b', 'c'], name='index'),
                pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')]:
            df = pd.DataFrame({'a': [1, 2, 3],
                               'b': [.1, .2, .3]}, index=index)
            table = pa.Table.from_pandas(df)

            # Drop data column 'b'; the index column is still present.
            table_subset = table.remove_column(1)
            result = table_subset.to_pandas()
            expected = df[['a']]
            if isinstance(df.index, pd.DatetimeIndex):
                df.index.freq = None
            tm.assert_frame_equal(result, expected)

            # Drop the index column as well; a default index is expected.
            table_subset2 = table_subset.remove_column(1)
            result = table_subset2.to_pandas()
            tm.assert_frame_equal(result, df[['a']].reset_index(drop=True))

    def test_to_pandas_column_subset_multiindex(self):
        # ARROW-10122
        df = pd.DataFrame(
            {"first": list(range(5)),
             "second": list(range(5)),
             "value": np.arange(5)}
        )
        table = pa.Table.from_pandas(df.set_index(["first", "second"]))

        subset = table.select(["first", "value"])
        result = subset.to_pandas()
        expected = df[["first", "value"]].set_index("first")
        tm.assert_frame_equal(result, expected)

    def test_empty_list_metadata(self):
        # Create table with array of empty lists, forced to have type
        # list(string) in pyarrow
        c1 = [["test"], ["a", "b"], None]
        c2 = [[], [], []]
        arrays = OrderedDict([
            ('c1', pa.array(c1, type=pa.list_(pa.string()))),
            ('c2', pa.array(c2, type=pa.list_(pa.string()))),
        ])
        rb = pa.RecordBatch.from_arrays(
            list(arrays.values()),
            list(arrays.keys())
        )
        tbl = pa.Table.from_batches([rb])

        # First roundtrip changes schema, because pandas cannot preserve the
        # type of empty lists
        df = tbl.to_pandas()
        tbl2 = pa.Table.from_pandas(df)
        md2 = tbl2.schema.pandas_metadata

        # Second roundtrip
        df2 = tbl2.to_pandas()
        expected = pd.DataFrame(OrderedDict([('c1', c1), ('c2', c2)]))

        tm.assert_frame_equal(df2, expected)

        assert md2['columns'] == [
            {
                'name': 'c1',
                'field_name': 'c1',
                'metadata': None,
                'numpy_type': 'object',
                'pandas_type': 'list[unicode]',
            },
            {
                'name': 'c2',
                'field_name': 'c2',
                'metadata': None,
                'numpy_type': 'object',
                'pandas_type': 'list[empty]',
            }
        ]

    def test_metadata_pandas_version(self):
        """The pandas metadata carries a non-null 'pandas_version' entry."""
        df = pd.DataFrame({'a': [1, 2, 3], 'b': [1, 2, 3]})
        table = pa.Table.from_pandas(df)
        assert table.schema.pandas_metadata['pandas_version'] is not None

    def test_mismatch_metadata_schema(self):
        # ARROW-10511
        # It is possible that the metadata and actual schema is not fully
        # matching (eg no timezone information for tz-aware column)
        # -> to_pandas() conversion should not fail on that
        df = pd.DataFrame({"datetime": pd.date_range("2020-01-01", periods=3)})

        # OPTION 1: casting after conversion
        table = pa.Table.from_pandas(df)
        # cast the "datetime" column to be tz-aware
        new_col = table["datetime"].cast(pa.timestamp('ns', tz="UTC"))
        new_table1 = table.set_column(
            0, pa.field("datetime", new_col.type), new_col
        )

        # OPTION 2: specify schema during conversion
        schema = pa.schema([("datetime", pa.timestamp('ns', tz="UTC"))])
        new_table2 = pa.Table.from_pandas(df, schema=schema)

        expected = df.copy()
        expected["datetime"] = expected["datetime"].dt.tz_localize("UTC")

        for new_table in [new_table1, new_table2]:
            # ensure the new table still has the pandas metadata
            assert new_table.schema.pandas_metadata is not None
            # convert to pandas
            result = new_table.to_pandas()
            tm.assert_frame_equal(result, expected)
614
class TestConvertPrimitiveTypes:
    """
    Conversion tests for primitive (e.g. numeric) types.
    """

    def test_float_no_nulls(self):
        """float16/32/64 columns roundtrip with the matching Arrow schema."""
        data = {}
        fields = []
        dtypes = [('f2', pa.float16()),
                  ('f4', pa.float32()),
                  ('f8', pa.float64())]
        num_values = 100

        for numpy_dtype, arrow_dtype in dtypes:
            values = np.random.randn(num_values)
            data[numpy_dtype] = values.astype(numpy_dtype)
            fields.append(pa.field(numpy_dtype, arrow_dtype))

        df = pd.DataFrame(data)
        schema = pa.schema(fields)
        _check_pandas_roundtrip(df, expected_schema=schema)

    def test_float_nulls(self):
        """Masked float arrays convert to pandas with NaN at masked slots."""
        num_values = 100

        null_mask = np.random.randint(0, 10, size=num_values) < 3
        dtypes = [('f2', pa.float16()),
                  ('f4', pa.float32()),
                  ('f8', pa.float64())]
        names = ['f2', 'f4', 'f8']
        expected_cols = []

        arrays = []
        fields = []
        for name, arrow_dtype in dtypes:
            values = np.random.randn(num_values).astype(name)

            arr = pa.array(values, from_pandas=True, mask=null_mask)
            arrays.append(arr)
            fields.append(pa.field(name, arrow_dtype))
            # Build the expected column after the Arrow array was created.
            values[null_mask] = np.nan

            expected_cols.append(values)

        ex_frame = pd.DataFrame(dict(zip(names, expected_cols)),
                                columns=names)

        table = pa.Table.from_arrays(arrays, names)
        assert table.schema.equals(pa.schema(fields))
        result = table.to_pandas()
        tm.assert_frame_equal(result, ex_frame)

    def test_float_nulls_to_ints(self):
        # ARROW-2135
        # np.nan instead of the np.NaN alias, which NumPy 2.0 removed.
        df = pd.DataFrame({"a": [1.0, 2.0, np.nan]})
        schema = pa.schema([pa.field("a", pa.int16(), nullable=True)])
        table = pa.Table.from_pandas(df, schema=schema, safe=False)
        assert table[0].to_pylist() == [1, 2, None]
        tm.assert_frame_equal(df, table.to_pandas())

    def test_float_nulls_to_boolean(self):
        """Floats with a null convert to booleans, keeping the null."""
        s = pd.Series([0.0, 1.0, 2.0, None, -3.0])
        expected = pd.Series([False, True, True, None, True])
        _check_array_roundtrip(s, expected=expected, type=pa.bool_())

    def test_series_from_pandas_false_respected(self):
        # Check that explicit from_pandas=False is respected
        s = pd.Series([0.0, np.nan])
        arr = pa.array(s, from_pandas=False)
        assert arr.null_count == 0
        assert np.isnan(arr[1].as_py())

    def test_integer_no_nulls(self):
        """Integer columns roundtrip with the matching Arrow schema."""
        data = OrderedDict()
        fields = []

        numpy_dtypes = [
            ('i1', pa.int8()), ('i2', pa.int16()),
            ('i4', pa.int32()), ('i8', pa.int64()),
            ('u1', pa.uint8()), ('u2', pa.uint16()),
            ('u4', pa.uint32()), ('u8', pa.uint64()),
            ('longlong', pa.int64()), ('ulonglong', pa.uint64())
        ]
        num_values = 100

        for dtype, arrow_dtype in numpy_dtypes:
            info = np.iinfo(dtype)
            # Clamp the sampling range to what np.int_ can represent.
            values = np.random.randint(max(info.min, np.iinfo(np.int_).min),
                                       min(info.max, np.iinfo(np.int_).max),
                                       size=num_values)
            data[dtype] = values.astype(dtype)
            fields.append(pa.field(dtype, arrow_dtype))

        df = pd.DataFrame(data)
        schema = pa.schema(fields)
        _check_pandas_roundtrip(df, expected_schema=schema)

    def test_all_integer_types(self):
        # Test all Numpy integer aliases
        data = OrderedDict()
        numpy_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8',
                        'byte', 'ubyte', 'short', 'ushort', 'intc', 'uintc',
                        'int_', 'uint', 'longlong', 'ulonglong']
        for dtype in numpy_dtypes:
            data[dtype] = np.arange(12, dtype=dtype)
        df = pd.DataFrame(data)
        _check_pandas_roundtrip(df)

        # Do the same with pa.array()
        # (for some reason, it doesn't use the same code paths at all)
        for np_arr in data.values():
            arr = pa.array(np_arr)
            assert arr.to_pylist() == np_arr.tolist()

    def test_integer_byteorder(self):
        # Byteswapped arrays are not supported yet
        int_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8']
        for dt in int_dtypes:
            for order in '=<>':
                data = np.array([1, 2, 42], dtype=order + dt)
                # Exercise both the contiguous array and a strided view;
                # the loop variable (not `data`) must be what is converted.
                for np_arr in (data, data[::2]):
                    if data.dtype.isnative:
                        arr = pa.array(np_arr)
                        assert arr.to_pylist() == np_arr.tolist()
                    else:
                        with pytest.raises(NotImplementedError):
                            pa.array(np_arr)

    def test_integer_with_nulls(self):
        # pandas requires upcast to float dtype

        int_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8']
        num_values = 100

        null_mask = np.random.randint(0, 10, size=num_values) < 3

        expected_cols = []
        arrays = []
        for name in int_dtypes:
            # Generate values in the dtype under test so that every integer
            # width is actually exercised.
            values = np.random.randint(0, 100, size=num_values, dtype=name)

            arr = pa.array(values, mask=null_mask)
            arrays.append(arr)

            expected = values.astype('f8')
            expected[null_mask] = np.nan

            expected_cols.append(expected)

        ex_frame = pd.DataFrame(dict(zip(int_dtypes, expected_cols)),
                                columns=int_dtypes)

        table = pa.Table.from_arrays(arrays, int_dtypes)
        result = table.to_pandas()

        tm.assert_frame_equal(result, ex_frame)

    def test_array_from_pandas_type_cast(self):
        """``type=`` on pa.array matches converting an already-cast array."""
        arr = np.arange(10, dtype='int64')

        target_type = pa.int8()

        result = pa.array(arr, type=target_type)
        expected = pa.array(arr.astype('int8'))
        assert result.equals(expected)

    def test_boolean_no_nulls(self):
        """A boolean column roundtrips with a bool_ Arrow schema."""
        num_values = 100

        np.random.seed(0)

        df = pd.DataFrame({'bools': np.random.randn(num_values) > 0})
        field = pa.field('bools', pa.bool_())
        schema = pa.schema([field])
        _check_pandas_roundtrip(df, expected_schema=schema)

    def test_boolean_nulls(self):
        # pandas requires upcast to object dtype
        num_values = 100
        np.random.seed(0)

        mask = np.random.randint(0, 10, size=num_values) < 3
        values = np.random.randint(0, 10, size=num_values) < 5

        arr = pa.array(values, mask=mask)

        expected = values.astype(object)
        expected[mask] = None

        field = pa.field('bools', pa.bool_())
        schema = pa.schema([field])
        ex_frame = pd.DataFrame({'bools': expected})

        table = pa.Table.from_arrays([arr], ['bools'])
        assert table.schema.equals(schema)
        result = table.to_pandas()

        tm.assert_frame_equal(result, ex_frame)

    def test_boolean_to_int(self):
        # test from dtype=bool
        s = pd.Series([True, True, False, True, True] * 2)
        expected = pd.Series([1, 1, 0, 1, 1] * 2)
        _check_array_roundtrip(s, expected=expected, type=pa.int64())

    def test_boolean_objects_to_int(self):
        # test from dtype=object
        s = pd.Series([True, True, False, True, True] * 2, dtype=object)
        expected = pd.Series([1, 1, 0, 1, 1] * 2)
        expected_msg = 'Expected integer, got bool'
        with pytest.raises(pa.ArrowTypeError, match=expected_msg):
            _check_array_roundtrip(s, expected=expected, type=pa.int64())

    def test_boolean_nulls_to_float(self):
        # test from dtype=object
        s = pd.Series([True, True, False, None, True] * 2)
        expected = pd.Series([1.0, 1.0, 0.0, None, 1.0] * 2)
        _check_array_roundtrip(s, expected=expected, type=pa.float64())

    def test_boolean_multiple_columns(self):
        # ARROW-6325 (multiple columns resulting in strided conversion)
        df = pd.DataFrame(np.ones((3, 2), dtype='bool'), columns=['a', 'b'])
        _check_pandas_roundtrip(df)

    def test_float_object_nulls(self):
        """An object column of floats and None converts to float64."""
        arr = np.array([None, 1.5, np.float64(3.5)] * 5, dtype=object)
        df = pd.DataFrame({'floats': arr})
        expected = pd.DataFrame({'floats': pd.to_numeric(arr)})
        field = pa.field('floats', pa.float64())
        schema = pa.schema([field])
        _check_pandas_roundtrip(df, expected=expected,
                                expected_schema=schema)

    def test_float_with_null_as_integer(self):
        # ARROW-2298
        s = pd.Series([np.nan, 1., 2., np.nan])

        types = [pa.int8(), pa.int16(), pa.int32(), pa.int64(),
                 pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()]
        for ty in types:
            result = pa.array(s, type=ty)
            expected = pa.array([None, 1, 2, None], type=ty)
            assert result.equals(expected)

            df = pd.DataFrame({'has_nulls': s})
            schema = pa.schema([pa.field('has_nulls', ty)])
            result = pa.Table.from_pandas(df, schema=schema,
                                          preserve_index=False)
            assert result[0].chunk(0).equals(expected)

    def test_int_object_nulls(self):
        """An object column of ints and None gets an int64 Arrow schema."""
        arr = np.array([None, 1, np.int64(3)] * 5, dtype=object)
        df = pd.DataFrame({'ints': arr})
        expected = pd.DataFrame({'ints': pd.to_numeric(arr)})
        field = pa.field('ints', pa.int64())
        schema = pa.schema([field])
        _check_pandas_roundtrip(df, expected=expected,
                                expected_schema=schema)

    def test_boolean_object_nulls(self):
        """An object column of bools and None gets a bool_ Arrow schema."""
        arr = np.array([False, None, True] * 100, dtype=object)
        df = pd.DataFrame({'bools': arr})
        field = pa.field('bools', pa.bool_())
        schema = pa.schema([field])
        _check_pandas_roundtrip(df, expected_schema=schema)

    def test_all_nulls_cast_numeric(self):
        """An all-None object array can be given an explicit numeric type."""
        arr = np.array([None], dtype=object)

        def _check_type(t):
            a2 = pa.array(arr, type=t)
            assert a2.type == t
            assert a2[0].as_py() is None

        _check_type(pa.int32())
        _check_type(pa.float64())

    def test_half_floats_from_numpy(self):
        """float16 NaN stays NaN by default and is null with from_pandas."""
        arr = np.array([1.5, np.nan], dtype=np.float16)
        a = pa.array(arr, type=pa.float16())
        x, y = a.to_pylist()
        assert isinstance(x, np.float16)
        assert x == 1.5
        assert isinstance(y, np.float16)
        assert np.isnan(y)

        a = pa.array(arr, type=pa.float16(), from_pandas=True)
        x, y = a.to_pylist()
        assert isinstance(x, np.float16)
        assert x == 1.5
        assert y is None
907
@pytest.mark.parametrize('dtype',
                         ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8'])
def test_array_integer_object_nulls_option(dtype):
    """With ``integer_object_nulls=True`` masked ints come back as None."""
    n_rows = 100

    nulls = np.random.randint(0, 10, size=n_rows) < 3
    ints = np.random.randint(0, 100, size=n_rows, dtype=dtype)

    arrow_arr = pa.array(ints, mask=nulls)

    # Without any masked slot the values are expected back unchanged.
    expected = ints
    if nulls.any():
        expected = ints.astype('O')
        expected[nulls] = None

    converted = arrow_arr.to_pandas(integer_object_nulls=True)

    np.testing.assert_equal(converted, expected)
927
928
@pytest.mark.parametrize('dtype',
                         ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8'])
def test_table_integer_object_nulls_option(dtype):
    """Table variant: masked ints come back as None in an object column."""
    n_rows = 100

    nulls = np.random.randint(0, 10, size=n_rows) < 3
    ints = np.random.randint(0, 100, size=n_rows, dtype=dtype)

    arrow_arr = pa.array(ints, mask=nulls)

    # Without any masked slot the values are expected back unchanged.
    column = ints
    if nulls.any():
        column = ints.astype('O')
        column[nulls] = None

    expected = pd.DataFrame({dtype: column})

    converted = pa.Table.from_arrays([arrow_arr], [dtype]).to_pandas(
        integer_object_nulls=True)

    tm.assert_frame_equal(converted, expected)
951
952
953class TestConvertDateTimeLikeTypes:
954 """
955 Conversion tests for datetime- and timestamp-like types (date64, etc.).
956 """
957
958 def test_timestamps_notimezone_no_nulls(self):
959 df = pd.DataFrame({
960 'datetime64': np.array([
961 '2007-07-13T01:23:34.123456789',
962 '2006-01-13T12:34:56.432539784',
963 '2010-08-13T05:46:57.437699912'],
964 dtype='datetime64[ns]')
965 })
966 field = pa.field('datetime64', pa.timestamp('ns'))
967 schema = pa.schema([field])
968 _check_pandas_roundtrip(
969 df,
970 expected_schema=schema,
971 )
972
973 def test_timestamps_notimezone_nulls(self):
974 df = pd.DataFrame({
975 'datetime64': np.array([
976 '2007-07-13T01:23:34.123456789',
977 None,
978 '2010-08-13T05:46:57.437699912'],
979 dtype='datetime64[ns]')
980 })
981 field = pa.field('datetime64', pa.timestamp('ns'))
982 schema = pa.schema([field])
983 _check_pandas_roundtrip(
984 df,
985 expected_schema=schema,
986 )
987
988 def test_timestamps_with_timezone(self):
989 df = pd.DataFrame({
990 'datetime64': np.array([
991 '2007-07-13T01:23:34.123',
992 '2006-01-13T12:34:56.432',
993 '2010-08-13T05:46:57.437'],
994 dtype='datetime64[ms]')
995 })
996 df['datetime64'] = df['datetime64'].dt.tz_localize('US/Eastern')
997 _check_pandas_roundtrip(df)
998
999 _check_series_roundtrip(df['datetime64'])
1000
1001 # drop-in a null and ns instead of ms
1002 df = pd.DataFrame({
1003 'datetime64': np.array([
1004 '2007-07-13T01:23:34.123456789',
1005 None,
1006 '2006-01-13T12:34:56.432539784',
1007 '2010-08-13T05:46:57.437699912'],
1008 dtype='datetime64[ns]')
1009 })
1010 df['datetime64'] = df['datetime64'].dt.tz_localize('US/Eastern')
1011
1012 _check_pandas_roundtrip(df)
1013
1014 def test_python_datetime(self):
1015 # ARROW-2106
1016 date_array = [datetime.today() + timedelta(days=x) for x in range(10)]
1017 df = pd.DataFrame({
1018 'datetime': pd.Series(date_array, dtype=object)
1019 })
1020
1021 table = pa.Table.from_pandas(df)
1022 assert isinstance(table[0].chunk(0), pa.TimestampArray)
1023
1024 result = table.to_pandas()
1025 expected_df = pd.DataFrame({
1026 'datetime': date_array
1027 })
1028 tm.assert_frame_equal(expected_df, result)
1029
1030 def test_python_datetime_with_pytz_tzinfo(self):
1031 for tz in [pytz.utc, pytz.timezone('US/Eastern'), pytz.FixedOffset(1)]:
1032 values = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=tz)]
1033 df = pd.DataFrame({'datetime': values})
1034 _check_pandas_roundtrip(df)
1035
1036 @h.given(st.none() | tzst.timezones())
1037 def test_python_datetime_with_pytz_timezone(self, tz):
1038 values = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=tz)]
1039 df = pd.DataFrame({'datetime': values})
1040 _check_pandas_roundtrip(df)
1041
1042 def test_python_datetime_with_timezone_tzinfo(self):
1043 from datetime import timezone
1044
1045 if Version(pd.__version__) > Version("0.25.0"):
1046 # older pandas versions fail on datetime.timezone.utc (as in input)
1047 # vs pytz.UTC (as in result)
1048 values = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=timezone.utc)]
1049 # also test with index to ensure both paths roundtrip (ARROW-9962)
1050 df = pd.DataFrame({'datetime': values}, index=values)
1051 _check_pandas_roundtrip(df, preserve_index=True)
1052
1053 # datetime.timezone is going to be pytz.FixedOffset
1054 hours = 1
1055 tz_timezone = timezone(timedelta(hours=hours))
1056 tz_pytz = pytz.FixedOffset(hours * 60)
1057 values = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=tz_timezone)]
1058 values_exp = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=tz_pytz)]
1059 df = pd.DataFrame({'datetime': values}, index=values)
1060 df_exp = pd.DataFrame({'datetime': values_exp}, index=values_exp)
1061 _check_pandas_roundtrip(df, expected=df_exp, preserve_index=True)
1062
1063 def test_python_datetime_subclass(self):
1064
1065 class MyDatetime(datetime):
1066 # see https://github.com/pandas-dev/pandas/issues/21142
1067 nanosecond = 0.0
1068
1069 date_array = [MyDatetime(2000, 1, 1, 1, 1, 1)]
1070 df = pd.DataFrame({"datetime": pd.Series(date_array, dtype=object)})
1071
1072 table = pa.Table.from_pandas(df)
1073 assert isinstance(table[0].chunk(0), pa.TimestampArray)
1074
1075 result = table.to_pandas()
1076 expected_df = pd.DataFrame({"datetime": date_array})
1077
1078 # https://github.com/pandas-dev/pandas/issues/21142
1079 expected_df["datetime"] = pd.to_datetime(expected_df["datetime"])
1080
1081 tm.assert_frame_equal(expected_df, result)
1082
1083 def test_python_date_subclass(self):
1084
1085 class MyDate(date):
1086 pass
1087
1088 date_array = [MyDate(2000, 1, 1)]
1089 df = pd.DataFrame({"date": pd.Series(date_array, dtype=object)})
1090
1091 table = pa.Table.from_pandas(df)
1092 assert isinstance(table[0].chunk(0), pa.Date32Array)
1093
1094 result = table.to_pandas()
1095 expected_df = pd.DataFrame(
1096 {"date": np.array([date(2000, 1, 1)], dtype=object)}
1097 )
1098 tm.assert_frame_equal(expected_df, result)
1099
1100 def test_datetime64_to_date32(self):
1101 # ARROW-1718
1102 arr = pa.array([date(2017, 10, 23), None])
1103 c = pa.chunked_array([arr])
1104 s = c.to_pandas()
1105
1106 arr2 = pa.Array.from_pandas(s, type=pa.date32())
1107
1108 assert arr2.equals(arr.cast('date32'))
1109
1110 @pytest.mark.parametrize('mask', [
1111 None,
1112 np.array([True, False, False, True, False, False]),
1113 ])
1114 def test_pandas_datetime_to_date64(self, mask):
1115 s = pd.to_datetime([
1116 '2018-05-10T00:00:00',
1117 '2018-05-11T00:00:00',
1118 '2018-05-12T00:00:00',
1119 '2018-05-10T10:24:01',
1120 '2018-05-11T10:24:01',
1121 '2018-05-12T10:24:01',
1122 ])
1123 arr = pa.Array.from_pandas(s, type=pa.date64(), mask=mask)
1124
1125 data = np.array([
1126 date(2018, 5, 10),
1127 date(2018, 5, 11),
1128 date(2018, 5, 12),
1129 date(2018, 5, 10),
1130 date(2018, 5, 11),
1131 date(2018, 5, 12),
1132 ])
1133 expected = pa.array(data, mask=mask, type=pa.date64())
1134
1135 assert arr.equals(expected)
1136
1137 def test_array_types_date_as_object(self):
1138 data = [date(2000, 1, 1),
1139 None,
1140 date(1970, 1, 1),
1141 date(2040, 2, 26)]
1142 expected_d = np.array(['2000-01-01', None, '1970-01-01',
1143 '2040-02-26'], dtype='datetime64[D]')
1144
1145 expected_ns = np.array(['2000-01-01', None, '1970-01-01',
1146 '2040-02-26'], dtype='datetime64[ns]')
1147
1148 objects = [pa.array(data),
1149 pa.chunked_array([data])]
1150
1151 for obj in objects:
1152 result = obj.to_pandas()
1153 expected_obj = expected_d.astype(object)
1154 assert result.dtype == expected_obj.dtype
1155 npt.assert_array_equal(result, expected_obj)
1156
1157 result = obj.to_pandas(date_as_object=False)
1158 assert result.dtype == expected_ns.dtype
1159 npt.assert_array_equal(result, expected_ns)
1160
1161 def test_table_convert_date_as_object(self):
1162 df = pd.DataFrame({
1163 'date': [date(2000, 1, 1),
1164 None,
1165 date(1970, 1, 1),
1166 date(2040, 2, 26)]})
1167
1168 table = pa.Table.from_pandas(df, preserve_index=False)
1169
1170 df_datetime = table.to_pandas(date_as_object=False)
1171 df_object = table.to_pandas()
1172
1173 tm.assert_frame_equal(df.astype('datetime64[ns]'), df_datetime,
1174 check_dtype=True)
1175 tm.assert_frame_equal(df, df_object, check_dtype=True)
1176
1177 def test_date_infer(self):
1178 df = pd.DataFrame({
1179 'date': [date(2000, 1, 1),
1180 None,
1181 date(1970, 1, 1),
1182 date(2040, 2, 26)]})
1183 table = pa.Table.from_pandas(df, preserve_index=False)
1184 field = pa.field('date', pa.date32())
1185
1186 # schema's metadata is generated by from_pandas conversion
1187 expected_schema = pa.schema([field], metadata=table.schema.metadata)
1188 assert table.schema.equals(expected_schema)
1189
1190 result = table.to_pandas()
1191 tm.assert_frame_equal(result, df)
1192
1193 def test_date_mask(self):
1194 arr = np.array([date(2017, 4, 3), date(2017, 4, 4)],
1195 dtype='datetime64[D]')
1196 mask = [True, False]
1197 result = pa.array(arr, mask=np.array(mask))
1198 expected = np.array([None, date(2017, 4, 4)], dtype='datetime64[D]')
1199 expected = pa.array(expected, from_pandas=True)
1200 assert expected.equals(result)
1201
1202 def test_date_objects_typed(self):
1203 arr = np.array([
1204 date(2017, 4, 3),
1205 None,
1206 date(2017, 4, 4),
1207 date(2017, 4, 5)], dtype=object)
1208
1209 arr_i4 = np.array([17259, -1, 17260, 17261], dtype='int32')
1210 arr_i8 = arr_i4.astype('int64') * 86400000
1211 mask = np.array([False, True, False, False])
1212
1213 t32 = pa.date32()
1214 t64 = pa.date64()
1215
1216 a32 = pa.array(arr, type=t32)
1217 a64 = pa.array(arr, type=t64)
1218
1219 a32_expected = pa.array(arr_i4, mask=mask, type=t32)
1220 a64_expected = pa.array(arr_i8, mask=mask, type=t64)
1221
1222 assert a32.equals(a32_expected)
1223 assert a64.equals(a64_expected)
1224
1225 # Test converting back to pandas
1226 colnames = ['date32', 'date64']
1227 table = pa.Table.from_arrays([a32, a64], colnames)
1228
1229 ex_values = (np.array(['2017-04-03', '2017-04-04', '2017-04-04',
1230 '2017-04-05'],
1231 dtype='datetime64[D]'))
1232 ex_values[1] = pd.NaT.value
1233
1234 ex_datetime64ns = ex_values.astype('datetime64[ns]')
1235 expected_pandas = pd.DataFrame({'date32': ex_datetime64ns,
1236 'date64': ex_datetime64ns},
1237 columns=colnames)
1238 table_pandas = table.to_pandas(date_as_object=False)
1239 tm.assert_frame_equal(table_pandas, expected_pandas)
1240
1241 table_pandas_objects = table.to_pandas()
1242 ex_objects = ex_values.astype('object')
1243 expected_pandas_objects = pd.DataFrame({'date32': ex_objects,
1244 'date64': ex_objects},
1245 columns=colnames)
1246 tm.assert_frame_equal(table_pandas_objects,
1247 expected_pandas_objects)
1248
1249 def test_pandas_null_values(self):
1250 # ARROW-842
1251 pd_NA = getattr(pd, 'NA', None)
1252 values = np.array([datetime(2000, 1, 1), pd.NaT, pd_NA], dtype=object)
1253 values_with_none = np.array([datetime(2000, 1, 1), None, None],
1254 dtype=object)
1255 result = pa.array(values, from_pandas=True)
1256 expected = pa.array(values_with_none, from_pandas=True)
1257 assert result.equals(expected)
1258 assert result.null_count == 2
1259
1260 # ARROW-9407
1261 assert pa.array([pd.NaT], from_pandas=True).type == pa.null()
1262 assert pa.array([pd_NA], from_pandas=True).type == pa.null()
1263
1264 def test_dates_from_integers(self):
1265 t1 = pa.date32()
1266 t2 = pa.date64()
1267
1268 arr = np.array([17259, 17260, 17261], dtype='int32')
1269 arr2 = arr.astype('int64') * 86400000
1270
1271 a1 = pa.array(arr, type=t1)
1272 a2 = pa.array(arr2, type=t2)
1273
1274 expected = date(2017, 4, 3)
1275 assert a1[0].as_py() == expected
1276 assert a2[0].as_py() == expected
1277
1278 def test_pytime_from_pandas(self):
1279 pytimes = [time(1, 2, 3, 1356),
1280 time(4, 5, 6, 1356)]
1281
1282 # microseconds
1283 t1 = pa.time64('us')
1284
1285 aobjs = np.array(pytimes + [None], dtype=object)
1286 parr = pa.array(aobjs)
1287 assert parr.type == t1
1288 assert parr[0].as_py() == pytimes[0]
1289 assert parr[1].as_py() == pytimes[1]
1290 assert parr[2].as_py() is None
1291
1292 # DataFrame
1293 df = pd.DataFrame({'times': aobjs})
1294 batch = pa.RecordBatch.from_pandas(df)
1295 assert batch[0].equals(parr)
1296
1297 # Test ndarray of int64 values
1298 arr = np.array([_pytime_to_micros(v) for v in pytimes],
1299 dtype='int64')
1300
1301 a1 = pa.array(arr, type=pa.time64('us'))
1302 assert a1[0].as_py() == pytimes[0]
1303
1304 a2 = pa.array(arr * 1000, type=pa.time64('ns'))
1305 assert a2[0].as_py() == pytimes[0]
1306
1307 a3 = pa.array((arr / 1000).astype('i4'),
1308 type=pa.time32('ms'))
1309 assert a3[0].as_py() == pytimes[0].replace(microsecond=1000)
1310
1311 a4 = pa.array((arr / 1000000).astype('i4'),
1312 type=pa.time32('s'))
1313 assert a4[0].as_py() == pytimes[0].replace(microsecond=0)
1314
1315 def test_arrow_time_to_pandas(self):
1316 pytimes = [time(1, 2, 3, 1356),
1317 time(4, 5, 6, 1356),
1318 time(0, 0, 0)]
1319
1320 expected = np.array(pytimes[:2] + [None])
1321 expected_ms = np.array([x.replace(microsecond=1000)
1322 for x in pytimes[:2]] +
1323 [None])
1324 expected_s = np.array([x.replace(microsecond=0)
1325 for x in pytimes[:2]] +
1326 [None])
1327
1328 arr = np.array([_pytime_to_micros(v) for v in pytimes],
1329 dtype='int64')
1330 arr = np.array([_pytime_to_micros(v) for v in pytimes],
1331 dtype='int64')
1332
1333 null_mask = np.array([False, False, True], dtype=bool)
1334
1335 a1 = pa.array(arr, mask=null_mask, type=pa.time64('us'))
1336 a2 = pa.array(arr * 1000, mask=null_mask,
1337 type=pa.time64('ns'))
1338
1339 a3 = pa.array((arr / 1000).astype('i4'), mask=null_mask,
1340 type=pa.time32('ms'))
1341 a4 = pa.array((arr / 1000000).astype('i4'), mask=null_mask,
1342 type=pa.time32('s'))
1343
1344 names = ['time64[us]', 'time64[ns]', 'time32[ms]', 'time32[s]']
1345 batch = pa.RecordBatch.from_arrays([a1, a2, a3, a4], names)
1346
1347 for arr, expected_values in [(a1, expected),
1348 (a2, expected),
1349 (a3, expected_ms),
1350 (a4, expected_s)]:
1351 result_pandas = arr.to_pandas()
1352 assert (result_pandas.values == expected_values).all()
1353
1354 df = batch.to_pandas()
1355 expected_df = pd.DataFrame({'time64[us]': expected,
1356 'time64[ns]': expected,
1357 'time32[ms]': expected_ms,
1358 'time32[s]': expected_s},
1359 columns=names)
1360
1361 tm.assert_frame_equal(df, expected_df)
1362
1363 def test_numpy_datetime64_columns(self):
1364 datetime64_ns = np.array([
1365 '2007-07-13T01:23:34.123456789',
1366 None,
1367 '2006-01-13T12:34:56.432539784',
1368 '2010-08-13T05:46:57.437699912'],
1369 dtype='datetime64[ns]')
1370 _check_array_from_pandas_roundtrip(datetime64_ns)
1371
1372 datetime64_us = np.array([
1373 '2007-07-13T01:23:34.123456',
1374 None,
1375 '2006-01-13T12:34:56.432539',
1376 '2010-08-13T05:46:57.437699'],
1377 dtype='datetime64[us]')
1378 _check_array_from_pandas_roundtrip(datetime64_us)
1379
1380 datetime64_ms = np.array([
1381 '2007-07-13T01:23:34.123',
1382 None,
1383 '2006-01-13T12:34:56.432',
1384 '2010-08-13T05:46:57.437'],
1385 dtype='datetime64[ms]')
1386 _check_array_from_pandas_roundtrip(datetime64_ms)
1387
1388 datetime64_s = np.array([
1389 '2007-07-13T01:23:34',
1390 None,
1391 '2006-01-13T12:34:56',
1392 '2010-08-13T05:46:57'],
1393 dtype='datetime64[s]')
1394 _check_array_from_pandas_roundtrip(datetime64_s)
1395
1396 def test_timestamp_to_pandas_ns(self):
1397 # non-ns timestamp gets cast to ns on conversion to pandas
1398 arr = pa.array([1, 2, 3], pa.timestamp('ms'))
1399 expected = pd.Series(pd.to_datetime([1, 2, 3], unit='ms'))
1400 s = arr.to_pandas()
1401 tm.assert_series_equal(s, expected)
1402 arr = pa.chunked_array([arr])
1403 s = arr.to_pandas()
1404 tm.assert_series_equal(s, expected)
1405
1406 def test_timestamp_to_pandas_out_of_bounds(self):
1407 # ARROW-7758 check for out of bounds timestamps for non-ns timestamps
1408
1409 for unit in ['s', 'ms', 'us']:
1410 for tz in [None, 'America/New_York']:
1411 arr = pa.array([datetime(1, 1, 1)], pa.timestamp(unit, tz=tz))
1412 table = pa.table({'a': arr})
1413
1414 msg = "would result in out of bounds timestamp"
1415 with pytest.raises(ValueError, match=msg):
1416 arr.to_pandas()
1417
1418 with pytest.raises(ValueError, match=msg):
1419 table.to_pandas()
1420
1421 with pytest.raises(ValueError, match=msg):
1422 # chunked array
1423 table.column('a').to_pandas()
1424
1425 # just ensure those don't give an error, but do not
1426 # check actual garbage output
1427 arr.to_pandas(safe=False)
1428 table.to_pandas(safe=False)
1429 table.column('a').to_pandas(safe=False)
1430
1431 def test_timestamp_to_pandas_empty_chunked(self):
1432 # ARROW-7907 table with chunked array with 0 chunks
1433 table = pa.table({'a': pa.chunked_array([], type=pa.timestamp('us'))})
1434 result = table.to_pandas()
1435 expected = pd.DataFrame({'a': pd.Series([], dtype="datetime64[ns]")})
1436 tm.assert_frame_equal(result, expected)
1437
1438 @pytest.mark.parametrize('dtype', [pa.date32(), pa.date64()])
1439 def test_numpy_datetime64_day_unit(self, dtype):
1440 datetime64_d = np.array([
1441 '2007-07-13',
1442 None,
1443 '2006-01-15',
1444 '2010-08-19'],
1445 dtype='datetime64[D]')
1446 _check_array_from_pandas_roundtrip(datetime64_d, type=dtype)
1447
1448 def test_array_from_pandas_date_with_mask(self):
1449 m = np.array([True, False, True])
1450 data = pd.Series([
1451 date(1990, 1, 1),
1452 date(1991, 1, 1),
1453 date(1992, 1, 1)
1454 ])
1455
1456 result = pa.Array.from_pandas(data, mask=m)
1457
1458 expected = pd.Series([None, date(1991, 1, 1), None])
1459 assert pa.Array.from_pandas(expected).equals(result)
1460
1461 @pytest.mark.skipif(
1462 Version('1.16.0') <= Version(np.__version__) < Version('1.16.1'),
1463 reason='Until numpy/numpy#12745 is resolved')
1464 def test_fixed_offset_timezone(self):
1465 df = pd.DataFrame({
1466 'a': [
1467 pd.Timestamp('2012-11-11 00:00:00+01:00'),
1468 pd.NaT
1469 ]
1470 })
1471 _check_pandas_roundtrip(df)
1472 _check_serialize_components_roundtrip(df)
1473
1474 def test_timedeltas_no_nulls(self):
1475 df = pd.DataFrame({
1476 'timedelta64': np.array([0, 3600000000000, 7200000000000],
1477 dtype='timedelta64[ns]')
1478 })
1479 field = pa.field('timedelta64', pa.duration('ns'))
1480 schema = pa.schema([field])
1481 _check_pandas_roundtrip(
1482 df,
1483 expected_schema=schema,
1484 )
1485
1486 def test_timedeltas_nulls(self):
1487 df = pd.DataFrame({
1488 'timedelta64': np.array([0, None, 7200000000000],
1489 dtype='timedelta64[ns]')
1490 })
1491 field = pa.field('timedelta64', pa.duration('ns'))
1492 schema = pa.schema([field])
1493 _check_pandas_roundtrip(
1494 df,
1495 expected_schema=schema,
1496 )
1497
1498 def test_month_day_nano_interval(self):
1499 from pandas.tseries.offsets import DateOffset
1500 df = pd.DataFrame({
1501 'date_offset': [None,
1502 DateOffset(days=3600, months=3600, microseconds=3,
1503 nanoseconds=600)]
1504 })
1505 schema = pa.schema([('date_offset', pa.month_day_nano_interval())])
1506 _check_pandas_roundtrip(
1507 df,
1508 expected_schema=schema)
1509
1510
1511# ----------------------------------------------------------------------
1512# Conversion tests for string and binary types.
1513
1514
1515class TestConvertStringLikeTypes:
1516
1517 def test_pandas_unicode(self):
1518 repeats = 1000
1519 values = ['foo', None, 'bar', 'mañana', np.nan]
1520 df = pd.DataFrame({'strings': values * repeats})
1521 field = pa.field('strings', pa.string())
1522 schema = pa.schema([field])
1523
1524 _check_pandas_roundtrip(df, expected_schema=schema)
1525
1526 def test_bytes_to_binary(self):
1527 values = ['qux', b'foo', None, bytearray(b'barz'), 'qux', np.nan]
1528 df = pd.DataFrame({'strings': values})
1529
1530 table = pa.Table.from_pandas(df)
1531 assert table[0].type == pa.binary()
1532
1533 values2 = [b'qux', b'foo', None, b'barz', b'qux', np.nan]
1534 expected = pd.DataFrame({'strings': values2})
1535 _check_pandas_roundtrip(df, expected)
1536
1537 @pytest.mark.large_memory
1538 def test_bytes_exceed_2gb(self):
1539 v1 = b'x' * 100000000
1540 v2 = b'x' * 147483646
1541
1542 # ARROW-2227, hit exactly 2GB on the nose
1543 df = pd.DataFrame({
1544 'strings': [v1] * 20 + [v2] + ['x'] * 20
1545 })
1546 arr = pa.array(df['strings'])
1547 assert isinstance(arr, pa.ChunkedArray)
1548 assert arr.num_chunks == 2
1549 arr = None
1550
1551 table = pa.Table.from_pandas(df)
1552 assert table[0].num_chunks == 2
1553
1554 @pytest.mark.large_memory
1555 @pytest.mark.parametrize('char', ['x', b'x'])
1556 def test_auto_chunking_pandas_series_of_strings(self, char):
1557 # ARROW-2367
1558 v1 = char * 100000000
1559 v2 = char * 147483646
1560
1561 df = pd.DataFrame({
1562 'strings': [[v1]] * 20 + [[v2]] + [[b'x']]
1563 })
1564 arr = pa.array(df['strings'], from_pandas=True)
1565 assert isinstance(arr, pa.ChunkedArray)
1566 assert arr.num_chunks == 2
1567 assert len(arr.chunk(0)) == 21
1568 assert len(arr.chunk(1)) == 1
1569
1570 def test_fixed_size_bytes(self):
1571 values = [b'foo', None, bytearray(b'bar'), None, None, b'hey']
1572 df = pd.DataFrame({'strings': values})
1573 schema = pa.schema([pa.field('strings', pa.binary(3))])
1574 table = pa.Table.from_pandas(df, schema=schema)
1575 assert table.schema[0].type == schema[0].type
1576 assert table.schema[0].name == schema[0].name
1577 result = table.to_pandas()
1578 tm.assert_frame_equal(result, df)
1579
1580 def test_fixed_size_bytes_does_not_accept_varying_lengths(self):
1581 values = [b'foo', None, b'ba', None, None, b'hey']
1582 df = pd.DataFrame({'strings': values})
1583 schema = pa.schema([pa.field('strings', pa.binary(3))])
1584 with pytest.raises(pa.ArrowInvalid):
1585 pa.Table.from_pandas(df, schema=schema)
1586
1587 def test_variable_size_bytes(self):
1588 s = pd.Series([b'123', b'', b'a', None])
1589 _check_series_roundtrip(s, type_=pa.binary())
1590
1591 def test_binary_from_bytearray(self):
1592 s = pd.Series([bytearray(b'123'), bytearray(b''), bytearray(b'a'),
1593 None])
1594 # Explicitly set type
1595 _check_series_roundtrip(s, type_=pa.binary())
1596 # Infer type from bytearrays
1597 _check_series_roundtrip(s, expected_pa_type=pa.binary())
1598
1599 def test_large_binary(self):
1600 s = pd.Series([b'123', b'', b'a', None])
1601 _check_series_roundtrip(s, type_=pa.large_binary())
1602 df = pd.DataFrame({'a': s})
1603 _check_pandas_roundtrip(
1604 df, schema=pa.schema([('a', pa.large_binary())]))
1605
1606 def test_large_string(self):
1607 s = pd.Series(['123', '', 'a', None])
1608 _check_series_roundtrip(s, type_=pa.large_string())
1609 df = pd.DataFrame({'a': s})
1610 _check_pandas_roundtrip(
1611 df, schema=pa.schema([('a', pa.large_string())]))
1612
1613 def test_table_empty_str(self):
1614 values = ['', '', '', '', '']
1615 df = pd.DataFrame({'strings': values})
1616 field = pa.field('strings', pa.string())
1617 schema = pa.schema([field])
1618 table = pa.Table.from_pandas(df, schema=schema)
1619
1620 result1 = table.to_pandas(strings_to_categorical=False)
1621 expected1 = pd.DataFrame({'strings': values})
1622 tm.assert_frame_equal(result1, expected1, check_dtype=True)
1623
1624 result2 = table.to_pandas(strings_to_categorical=True)
1625 expected2 = pd.DataFrame({'strings': pd.Categorical(values)})
1626 tm.assert_frame_equal(result2, expected2, check_dtype=True)
1627
1628 def test_selective_categoricals(self):
1629 values = ['', '', '', '', '']
1630 df = pd.DataFrame({'strings': values})
1631 field = pa.field('strings', pa.string())
1632 schema = pa.schema([field])
1633 table = pa.Table.from_pandas(df, schema=schema)
1634 expected_str = pd.DataFrame({'strings': values})
1635 expected_cat = pd.DataFrame({'strings': pd.Categorical(values)})
1636
1637 result1 = table.to_pandas(categories=['strings'])
1638 tm.assert_frame_equal(result1, expected_cat, check_dtype=True)
1639 result2 = table.to_pandas(categories=[])
1640 tm.assert_frame_equal(result2, expected_str, check_dtype=True)
1641 result3 = table.to_pandas(categories=('strings',))
1642 tm.assert_frame_equal(result3, expected_cat, check_dtype=True)
1643 result4 = table.to_pandas(categories=tuple())
1644 tm.assert_frame_equal(result4, expected_str, check_dtype=True)
1645
1646 def test_to_pandas_categorical_zero_length(self):
1647 # ARROW-3586
1648 array = pa.array([], type=pa.int32())
1649 table = pa.Table.from_arrays(arrays=[array], names=['col'])
1650 # This would segfault under 0.11.0
1651 table.to_pandas(categories=['col'])
1652
1653 def test_to_pandas_categories_already_dictionary(self):
1654 # Showed up in ARROW-6434, ARROW-6435
1655 array = pa.array(['foo', 'foo', 'foo', 'bar']).dictionary_encode()
1656 table = pa.Table.from_arrays(arrays=[array], names=['col'])
1657 result = table.to_pandas(categories=['col'])
1658 assert table.to_pandas().equals(result)
1659
1660 def test_table_str_to_categorical_without_na(self):
1661 values = ['a', 'a', 'b', 'b', 'c']
1662 df = pd.DataFrame({'strings': values})
1663 field = pa.field('strings', pa.string())
1664 schema = pa.schema([field])
1665 table = pa.Table.from_pandas(df, schema=schema)
1666
1667 result = table.to_pandas(strings_to_categorical=True)
1668 expected = pd.DataFrame({'strings': pd.Categorical(values)})
1669 tm.assert_frame_equal(result, expected, check_dtype=True)
1670
1671 with pytest.raises(pa.ArrowInvalid):
1672 table.to_pandas(strings_to_categorical=True,
1673 zero_copy_only=True)
1674
1675 def test_table_str_to_categorical_with_na(self):
1676 values = [None, 'a', 'b', np.nan]
1677 df = pd.DataFrame({'strings': values})
1678 field = pa.field('strings', pa.string())
1679 schema = pa.schema([field])
1680 table = pa.Table.from_pandas(df, schema=schema)
1681
1682 result = table.to_pandas(strings_to_categorical=True)
1683 expected = pd.DataFrame({'strings': pd.Categorical(values)})
1684 tm.assert_frame_equal(result, expected, check_dtype=True)
1685
1686 with pytest.raises(pa.ArrowInvalid):
1687 table.to_pandas(strings_to_categorical=True,
1688 zero_copy_only=True)
1689
1690 # Regression test for ARROW-2101
1691 def test_array_of_bytes_to_strings(self):
1692 converted = pa.array(np.array([b'x'], dtype=object), pa.string())
1693 assert converted.type == pa.string()
1694
1695 # Make sure that if an ndarray of bytes is passed to the array
1696 # constructor and the type is string, it will fail if those bytes
1697 # cannot be converted to utf-8
1698 def test_array_of_bytes_to_strings_bad_data(self):
1699 with pytest.raises(
1700 pa.lib.ArrowInvalid,
1701 match="was not a utf8 string"):
1702 pa.array(np.array([b'\x80\x81'], dtype=object), pa.string())
1703
1704 def test_numpy_string_array_to_fixed_size_binary(self):
1705 arr = np.array([b'foo', b'bar', b'baz'], dtype='|S3')
1706
1707 converted = pa.array(arr, type=pa.binary(3))
1708 expected = pa.array(list(arr), type=pa.binary(3))
1709 assert converted.equals(expected)
1710
1711 mask = np.array([False, True, False])
1712 converted = pa.array(arr, type=pa.binary(3), mask=mask)
1713 expected = pa.array([b'foo', None, b'baz'], type=pa.binary(3))
1714 assert converted.equals(expected)
1715
1716 with pytest.raises(pa.lib.ArrowInvalid,
1717 match=r'Got bytestring of length 3 \(expected 4\)'):
1718 arr = np.array([b'foo', b'bar', b'baz'], dtype='|S3')
1719 pa.array(arr, type=pa.binary(4))
1720
1721 with pytest.raises(
1722 pa.lib.ArrowInvalid,
1723 match=r'Got bytestring of length 12 \(expected 3\)'):
1724 arr = np.array([b'foo', b'bar', b'baz'], dtype='|U3')
1725 pa.array(arr, type=pa.binary(3))
1726
1727
1728class TestConvertDecimalTypes:
1729 """
1730 Conversion test for decimal types.
1731 """
    # Sample inputs shared by the parametrized tests of this class; each
    # attribute name matches the ``id`` given to its pytest case.
    decimal32 = [
        decimal.Decimal('-1234.123'),
        decimal.Decimal('1234.439')
    ]
    decimal64 = [
        decimal.Decimal('-129934.123331'),
        decimal.Decimal('129534.123731')
    ]
    decimal128 = [
        decimal.Decimal('394092382910493.12341234678'),
        decimal.Decimal('-314292388910493.12343437128')
    ]
1744
1745 @pytest.mark.parametrize(('values', 'expected_type'), [
1746 pytest.param(decimal32, pa.decimal128(7, 3), id='decimal32'),
1747 pytest.param(decimal64, pa.decimal128(12, 6), id='decimal64'),
1748 pytest.param(decimal128, pa.decimal128(26, 11), id='decimal128')
1749 ])
1750 def test_decimal_from_pandas(self, values, expected_type):
1751 expected = pd.DataFrame({'decimals': values})
1752 table = pa.Table.from_pandas(expected, preserve_index=False)
1753 field = pa.field('decimals', expected_type)
1754
1755 # schema's metadata is generated by from_pandas conversion
1756 expected_schema = pa.schema([field], metadata=table.schema.metadata)
1757 assert table.schema.equals(expected_schema)
1758
1759 @pytest.mark.parametrize('values', [
1760 pytest.param(decimal32, id='decimal32'),
1761 pytest.param(decimal64, id='decimal64'),
1762 pytest.param(decimal128, id='decimal128')
1763 ])
1764 def test_decimal_to_pandas(self, values):
1765 expected = pd.DataFrame({'decimals': values})
1766 converted = pa.Table.from_pandas(expected)
1767 df = converted.to_pandas()
1768 tm.assert_frame_equal(df, expected)
1769
1770 def test_decimal_fails_with_truncation(self):
1771 data1 = [decimal.Decimal('1.234')]
1772 type1 = pa.decimal128(10, 2)
1773 with pytest.raises(pa.ArrowInvalid):
1774 pa.array(data1, type=type1)
1775
1776 data2 = [decimal.Decimal('1.2345')]
1777 type2 = pa.decimal128(10, 3)
1778 with pytest.raises(pa.ArrowInvalid):
1779 pa.array(data2, type=type2)
1780
1781 def test_decimal_with_different_precisions(self):
1782 data = [
1783 decimal.Decimal('0.01'),
1784 decimal.Decimal('0.001'),
1785 ]
1786 series = pd.Series(data)
1787 array = pa.array(series)
1788 assert array.to_pylist() == data
1789 assert array.type == pa.decimal128(3, 3)
1790
1791 array = pa.array(data, type=pa.decimal128(12, 5))
1792 expected = [decimal.Decimal('0.01000'), decimal.Decimal('0.00100')]
1793 assert array.to_pylist() == expected
1794
1795 def test_decimal_with_None_explicit_type(self):
1796 series = pd.Series([decimal.Decimal('3.14'), None])
1797 _check_series_roundtrip(series, type_=pa.decimal128(12, 5))
1798
1799 # Test that having all None values still produces decimal array
1800 series = pd.Series([None] * 2)
1801 _check_series_roundtrip(series, type_=pa.decimal128(12, 5))
1802
1803 def test_decimal_with_None_infer_type(self):
1804 series = pd.Series([decimal.Decimal('3.14'), None])
1805 _check_series_roundtrip(series, expected_pa_type=pa.decimal128(3, 2))
1806
1807 def test_strided_objects(self, tmpdir):
1808 # see ARROW-3053
1809 data = {
1810 'a': {0: 'a'},
1811 'b': {0: decimal.Decimal('0.0')}
1812 }
1813
1814 # This yields strided objects
1815 df = pd.DataFrame.from_dict(data)
1816 _check_pandas_roundtrip(df)
1817
1818
1819class TestConvertListTypes:
1820 """
1821 Conversion tests for list<> types.
1822 """
1823
1824 def test_column_of_arrays(self):
1825 df, schema = dataframe_with_arrays()
1826 _check_pandas_roundtrip(df, schema=schema, expected_schema=schema)
1827 table = pa.Table.from_pandas(df, schema=schema, preserve_index=False)
1828
1829 # schema's metadata is generated by from_pandas conversion
1830 expected_schema = schema.with_metadata(table.schema.metadata)
1831 assert table.schema.equals(expected_schema)
1832
1833 for column in df.columns:
1834 field = schema.field(column)
1835 _check_array_roundtrip(df[column], type=field.type)
1836
1837 def test_column_of_arrays_to_py(self):
1838 # Test regression in ARROW-1199 not caught in above test
1839 dtype = 'i1'
1840 arr = np.array([
1841 np.arange(10, dtype=dtype),
1842 np.arange(5, dtype=dtype),
1843 None,
1844 np.arange(1, dtype=dtype)
1845 ], dtype=object)
1846 type_ = pa.list_(pa.int8())
1847 parr = pa.array(arr, type=type_)
1848
1849 assert parr[0].as_py() == list(range(10))
1850 assert parr[1].as_py() == list(range(5))
1851 assert parr[2].as_py() is None
1852 assert parr[3].as_py() == [0]
1853
1854 def test_column_of_boolean_list(self):
1855 # ARROW-4370: Table to pandas conversion fails for list of bool
1856 array = pa.array([[True, False], [True]], type=pa.list_(pa.bool_()))
1857 table = pa.Table.from_arrays([array], names=['col1'])
1858 df = table.to_pandas()
1859
1860 expected_df = pd.DataFrame({'col1': [[True, False], [True]]})
1861 tm.assert_frame_equal(df, expected_df)
1862
1863 s = table[0].to_pandas()
1864 tm.assert_series_equal(pd.Series(s), df['col1'], check_names=False)
1865
1866 def test_column_of_decimal_list(self):
1867 array = pa.array([[decimal.Decimal('1'), decimal.Decimal('2')],
1868 [decimal.Decimal('3.3')]],
1869 type=pa.list_(pa.decimal128(2, 1)))
1870 table = pa.Table.from_arrays([array], names=['col1'])
1871 df = table.to_pandas()
1872
1873 expected_df = pd.DataFrame(
1874 {'col1': [[decimal.Decimal('1'), decimal.Decimal('2')],
1875 [decimal.Decimal('3.3')]]})
1876 tm.assert_frame_equal(df, expected_df)
1877
1878 def test_nested_types_from_ndarray_null_entries(self):
1879 # Root cause of ARROW-6435
1880 s = pd.Series(np.array([np.nan, np.nan], dtype=object))
1881
1882 for ty in [pa.list_(pa.int64()),
1883 pa.large_list(pa.int64()),
1884 pa.struct([pa.field('f0', 'int32')])]:
1885 result = pa.array(s, type=ty)
1886 expected = pa.array([None, None], type=ty)
1887 assert result.equals(expected)
1888
1889 with pytest.raises(TypeError):
1890 pa.array(s.values, type=ty)
1891
1892 def test_column_of_lists(self):
1893 df, schema = dataframe_with_lists()
1894 _check_pandas_roundtrip(df, schema=schema, expected_schema=schema)
1895 table = pa.Table.from_pandas(df, schema=schema, preserve_index=False)
1896
1897 # schema's metadata is generated by from_pandas conversion
1898 expected_schema = schema.with_metadata(table.schema.metadata)
1899 assert table.schema.equals(expected_schema)
1900
1901 for column in df.columns:
1902 field = schema.field(column)
1903 _check_array_roundtrip(df[column], type=field.type)
1904
1905 def test_column_of_lists_first_empty(self):
1906 # ARROW-2124
1907 num_lists = [[], [2, 3, 4], [3, 6, 7, 8], [], [2]]
1908 series = pd.Series([np.array(s, dtype=float) for s in num_lists])
1909 arr = pa.array(series)
1910 result = pd.Series(arr.to_pandas())
1911 tm.assert_series_equal(result, series)
1912
1913 def test_column_of_lists_chunked(self):
1914 # ARROW-1357
1915 df = pd.DataFrame({
1916 'lists': np.array([
1917 [1, 2],
1918 None,
1919 [2, 3],
1920 [4, 5],
1921 [6, 7],
1922 [8, 9]
1923 ], dtype=object)
1924 })
1925
1926 schema = pa.schema([
1927 pa.field('lists', pa.list_(pa.int64()))
1928 ])
1929
1930 t1 = pa.Table.from_pandas(df[:2], schema=schema)
1931 t2 = pa.Table.from_pandas(df[2:], schema=schema)
1932
1933 table = pa.concat_tables([t1, t2])
1934 result = table.to_pandas()
1935
1936 tm.assert_frame_equal(result, df)
1937
1938 def test_empty_column_of_lists_chunked(self):
1939 df = pd.DataFrame({
1940 'lists': np.array([], dtype=object)
1941 })
1942
1943 schema = pa.schema([
1944 pa.field('lists', pa.list_(pa.int64()))
1945 ])
1946
1947 table = pa.Table.from_pandas(df, schema=schema)
1948 result = table.to_pandas()
1949
1950 tm.assert_frame_equal(result, df)
1951
1952 def test_column_of_lists_chunked2(self):
1953 data1 = [[0, 1], [2, 3], [4, 5], [6, 7], [10, 11],
1954 [12, 13], [14, 15], [16, 17]]
1955 data2 = [[8, 9], [18, 19]]
1956
1957 a1 = pa.array(data1)
1958 a2 = pa.array(data2)
1959
1960 t1 = pa.Table.from_arrays([a1], names=['a'])
1961 t2 = pa.Table.from_arrays([a2], names=['a'])
1962
1963 concatenated = pa.concat_tables([t1, t2])
1964
1965 result = concatenated.to_pandas()
1966 expected = pd.DataFrame({'a': data1 + data2})
1967
1968 tm.assert_frame_equal(result, expected)
1969
1970 def test_column_of_lists_strided(self):
1971 df, schema = dataframe_with_lists()
1972 df = pd.concat([df] * 6, ignore_index=True)
1973
1974 arr = df['int64'].values[::3]
1975 assert arr.strides[0] != 8
1976
1977 _check_array_roundtrip(arr)
1978
1979 def test_nested_lists_all_none(self):
1980 data = np.array([[None, None], None], dtype=object)
1981
1982 arr = pa.array(data)
1983 expected = pa.array(list(data))
1984 assert arr.equals(expected)
1985 assert arr.type == pa.list_(pa.null())
1986
1987 data2 = np.array([None, None, [None, None],
1988 np.array([None, None], dtype=object)],
1989 dtype=object)
1990 arr = pa.array(data2)
1991 expected = pa.array([None, None, [None, None], [None, None]])
1992 assert arr.equals(expected)
1993
1994 def test_nested_lists_all_empty(self):
1995 # ARROW-2128
1996 data = pd.Series([[], [], []])
1997 arr = pa.array(data)
1998 expected = pa.array(list(data))
1999 assert arr.equals(expected)
2000 assert arr.type == pa.list_(pa.null())
2001
2002 def test_nested_list_first_empty(self):
2003 # ARROW-2711
2004 data = pd.Series([[], ["a"]])
2005 arr = pa.array(data)
2006 expected = pa.array(list(data))
2007 assert arr.equals(expected)
2008 assert arr.type == pa.list_(pa.string())
2009
2010 def test_nested_smaller_ints(self):
2011 # ARROW-1345, ARROW-2008, there were some type inference bugs happening
2012 # before
2013 data = pd.Series([np.array([1, 2, 3], dtype='i1'), None])
2014 result = pa.array(data)
2015 result2 = pa.array(data.values)
2016 expected = pa.array([[1, 2, 3], None], type=pa.list_(pa.int8()))
2017 assert result.equals(expected)
2018 assert result2.equals(expected)
2019
2020 data3 = pd.Series([np.array([1, 2, 3], dtype='f4'), None])
2021 result3 = pa.array(data3)
2022 expected3 = pa.array([[1, 2, 3], None], type=pa.list_(pa.float32()))
2023 assert result3.equals(expected3)
2024
2025 def test_infer_lists(self):
2026 data = OrderedDict([
2027 ('nan_ints', [[None, 1], [2, 3]]),
2028 ('ints', [[0, 1], [2, 3]]),
2029 ('strs', [[None, 'b'], ['c', 'd']]),
2030 ('nested_strs', [[[None, 'b'], ['c', 'd']], None])
2031 ])
2032 df = pd.DataFrame(data)
2033
2034 expected_schema = pa.schema([
2035 pa.field('nan_ints', pa.list_(pa.int64())),
2036 pa.field('ints', pa.list_(pa.int64())),
2037 pa.field('strs', pa.list_(pa.string())),
2038 pa.field('nested_strs', pa.list_(pa.list_(pa.string())))
2039 ])
2040
2041 _check_pandas_roundtrip(df, expected_schema=expected_schema)
2042
2043 def test_fixed_size_list(self):
2044 # ARROW-7365
2045 fixed_ty = pa.list_(pa.int64(), list_size=4)
2046 variable_ty = pa.list_(pa.int64())
2047
2048 data = [[0, 1, 2, 3], None, [4, 5, 6, 7], [8, 9, 10, 11]]
2049 fixed_arr = pa.array(data, type=fixed_ty)
2050 variable_arr = pa.array(data, type=variable_ty)
2051
2052 result = fixed_arr.to_pandas()
2053 expected = variable_arr.to_pandas()
2054
2055 for left, right in zip(result, expected):
2056 if left is None:
2057 assert right is None
2058 npt.assert_array_equal(left, right)
2059
2060 def test_infer_numpy_array(self):
2061 data = OrderedDict([
2062 ('ints', [
2063 np.array([0, 1], dtype=np.int64),
2064 np.array([2, 3], dtype=np.int64)
2065 ])
2066 ])
2067 df = pd.DataFrame(data)
2068 expected_schema = pa.schema([
2069 pa.field('ints', pa.list_(pa.int64()))
2070 ])
2071
2072 _check_pandas_roundtrip(df, expected_schema=expected_schema)
2073
2074 def test_to_list_of_structs_pandas(self):
2075 ints = pa.array([1, 2, 3], pa.int32())
2076 strings = pa.array([['a', 'b'], ['c', 'd'], ['e', 'f']],
2077 pa.list_(pa.string()))
2078 structs = pa.StructArray.from_arrays([ints, strings], ['f1', 'f2'])
2079 data = pa.ListArray.from_arrays([0, 1, 3], structs)
2080
2081 expected = pd.Series([
2082 [{'f1': 1, 'f2': ['a', 'b']}],
2083 [{'f1': 2, 'f2': ['c', 'd']},
2084 {'f1': 3, 'f2': ['e', 'f']}]
2085 ])
2086
2087 series = pd.Series(data.to_pandas())
2088 tm.assert_series_equal(series, expected)
2089
2090 @pytest.mark.parametrize('t,data,expected', [
2091 (
2092 pa.int64,
2093 [[1, 2], [3], None],
2094 [None, [3], None]
2095 ),
2096 (
2097 pa.string,
2098 [['aaa', 'bb'], ['c'], None],
2099 [None, ['c'], None]
2100 ),
2101 (
2102 pa.null,
2103 [[None, None], [None], None],
2104 [None, [None], None]
2105 )
2106 ])
2107 def test_array_from_pandas_typed_array_with_mask(self, t, data, expected):
2108 m = np.array([True, False, True])
2109
2110 s = pd.Series(data)
2111 result = pa.Array.from_pandas(s, mask=m, type=pa.list_(t()))
2112
2113 assert pa.Array.from_pandas(expected,
2114 type=pa.list_(t())).equals(result)
2115
2116 def test_empty_list_roundtrip(self):
2117 empty_list_array = np.empty((3,), dtype=object)
2118 empty_list_array.fill([])
2119
2120 df = pd.DataFrame({'a': np.array(['1', '2', '3']),
2121 'b': empty_list_array})
2122 tbl = pa.Table.from_pandas(df)
2123
2124 result = tbl.to_pandas()
2125
2126 tm.assert_frame_equal(result, df)
2127
2128 def test_array_from_nested_arrays(self):
2129 df, schema = dataframe_with_arrays()
2130 for field in schema:
2131 arr = df[field.name].values
2132 expected = pa.array(list(arr), type=field.type)
2133 result = pa.array(arr)
2134 assert result.type == field.type # == list<scalar>
2135 assert result.equals(expected)
2136
2137 def test_nested_large_list(self):
2138 s = (pa.array([[[1, 2, 3], [4]], None],
2139 type=pa.large_list(pa.large_list(pa.int64())))
2140 .to_pandas())
2141 tm.assert_series_equal(
2142 s, pd.Series([[[1, 2, 3], [4]], None], dtype=object),
2143 check_names=False)
2144
2145 def test_large_binary_list(self):
2146 for list_type_factory in (pa.list_, pa.large_list):
2147 s = (pa.array([["aa", "bb"], None, ["cc"], []],
2148 type=list_type_factory(pa.large_binary()))
2149 .to_pandas())
2150 tm.assert_series_equal(
2151 s, pd.Series([[b"aa", b"bb"], None, [b"cc"], []]),
2152 check_names=False)
2153 s = (pa.array([["aa", "bb"], None, ["cc"], []],
2154 type=list_type_factory(pa.large_string()))
2155 .to_pandas())
2156 tm.assert_series_equal(
2157 s, pd.Series([["aa", "bb"], None, ["cc"], []]),
2158 check_names=False)
2159
2160 def test_list_of_dictionary(self):
2161 child = pa.array(["foo", "bar", None, "foo"]).dictionary_encode()
2162 arr = pa.ListArray.from_arrays([0, 1, 3, 3, 4], child)
2163
2164 # Expected a Series of lists
2165 expected = pd.Series(arr.to_pylist())
2166 tm.assert_series_equal(arr.to_pandas(), expected)
2167
2168 # Same but with nulls
2169 arr = arr.take([0, 1, None, 3])
2170 expected[2] = None
2171 tm.assert_series_equal(arr.to_pandas(), expected)
2172
2173 @pytest.mark.large_memory
2174 def test_auto_chunking_on_list_overflow(self):
2175 # ARROW-9976
2176 n = 2**21
2177 df = pd.DataFrame.from_dict({
2178 "a": list(np.zeros((n, 2**10), dtype='uint8')),
2179 "b": range(n)
2180 })
2181 table = pa.Table.from_pandas(df)
2182
2183 column_a = table[0]
2184 assert column_a.num_chunks == 2
2185 assert len(column_a.chunk(0)) == 2**21 - 1
2186 assert len(column_a.chunk(1)) == 1
2187
2188 def test_map_array_roundtrip(self):
2189 data = [[(b'a', 1), (b'b', 2)],
2190 [(b'c', 3)],
2191 [(b'd', 4), (b'e', 5), (b'f', 6)],
2192 [(b'g', 7)]]
2193
2194 df = pd.DataFrame({"map": data})
2195 schema = pa.schema([("map", pa.map_(pa.binary(), pa.int32()))])
2196
2197 _check_pandas_roundtrip(df, schema=schema)
2198
2199 def test_map_array_chunked(self):
2200 data1 = [[(b'a', 1), (b'b', 2)],
2201 [(b'c', 3)],
2202 [(b'd', 4), (b'e', 5), (b'f', 6)],
2203 [(b'g', 7)]]
2204 data2 = [[(k, v * 2) for k, v in row] for row in data1]
2205
2206 arr1 = pa.array(data1, type=pa.map_(pa.binary(), pa.int32()))
2207 arr2 = pa.array(data2, type=pa.map_(pa.binary(), pa.int32()))
2208 arr = pa.chunked_array([arr1, arr2])
2209
2210 expected = pd.Series(data1 + data2)
2211 actual = arr.to_pandas()
2212 tm.assert_series_equal(actual, expected, check_names=False)
2213
2214 def test_map_array_with_nulls(self):
2215 data = [[(b'a', 1), (b'b', 2)],
2216 None,
2217 [(b'd', 4), (b'e', 5), (b'f', None)],
2218 [(b'g', 7)]]
2219
2220 # None value in item array causes upcast to float
2221 expected = [[(k, float(v) if v is not None else None) for k, v in row]
2222 if row is not None else None for row in data]
2223 expected = pd.Series(expected)
2224
2225 arr = pa.array(data, type=pa.map_(pa.binary(), pa.int32()))
2226 actual = arr.to_pandas()
2227 tm.assert_series_equal(actual, expected, check_names=False)
2228
2229 def test_map_array_dictionary_encoded(self):
2230 offsets = pa.array([0, 3, 5])
2231 items = pa.array(['a', 'b', 'c', 'a', 'd']).dictionary_encode()
2232 keys = pa.array(list(range(len(items))))
2233 arr = pa.MapArray.from_arrays(offsets, keys, items)
2234
2235 # Dictionary encoded values converted to dense
2236 expected = pd.Series(
2237 [[(0, 'a'), (1, 'b'), (2, 'c')], [(3, 'a'), (4, 'd')]])
2238
2239 actual = arr.to_pandas()
2240 tm.assert_series_equal(actual, expected, check_names=False)
2241
2242
class TestConvertStructTypes:
    """
    Conversion tests for struct types.
    """

    def test_pandas_roundtrip(self):
        """Round-trip a column of dicts, with and without explicit schema."""
        frame = pd.DataFrame(
            {'dicts': [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}]})

        struct_type = pa.struct([('a', pa.int64()), ('b', pa.int64())])
        expected_schema = pa.schema([('dicts', struct_type)])

        _check_pandas_roundtrip(frame, expected_schema=expected_schema)

        # specifying schema explicitly in from_pandas
        _check_pandas_roundtrip(
            frame, schema=expected_schema, expected_schema=expected_schema)

    def test_to_pandas(self):
        """A struct array converts to a series of dicts, nulls as None."""
        int_child = pa.array([None, 2, 3], type=pa.int64())
        str_child = pa.array(['a', None, 'c'], type=pa.string())
        bool_child = pa.array([True, False, None], type=pa.bool_())
        struct_array = pa.StructArray.from_arrays(
            [int_child, str_child, bool_child],
            ['ints', 'strs', 'bools'])

        wanted = pd.Series([
            {'ints': None, 'strs': 'a', 'bools': True},
            {'ints': 2, 'strs': None, 'bools': False},
            {'ints': 3, 'strs': 'c', 'bools': None},
        ])

        actual = pd.Series(struct_array.to_pandas())
        tm.assert_series_equal(actual, wanted)

    def test_to_pandas_multiple_chunks(self):
        """Chunked struct conversion works and releases Arrow memory."""
        # ARROW-11855
        gc.collect()
        allocated_before = pa.total_allocated_bytes()
        first_ints = pa.array([1], type=pa.int64())
        second_ints = pa.array([2], type=pa.int64())
        first_chunk = pa.StructArray.from_arrays([first_ints], ['ints'])
        second_chunk = pa.StructArray.from_arrays([second_ints], ['ints'])
        chunked = pa.chunked_array([first_chunk, second_chunk])

        wanted = pd.Series([
            {'ints': 1},
            {'ints': 2}
        ])

        actual = pd.Series(chunked.to_pandas())
        tm.assert_series_equal(actual, wanted)

        # Drop every reference created above before re-reading the
        # allocation counter.
        del actual
        del chunked
        del first_chunk
        del second_chunk
        del first_ints
        del second_ints
        allocated_after = pa.total_allocated_bytes()
        assert allocated_after == allocated_before

    def test_from_numpy(self):
        """Build struct arrays from numpy structured arrays."""
        # The second field carries a (title, name) pair; 'y' is the name.
        record_dtype = np.dtype([('x', np.int32),
                                 (('y_title', 'y'), np.bool_)])
        struct_type = pa.struct([pa.field('x', pa.int32()),
                                 pa.field('y', pa.bool_())])

        records = np.array([], dtype=record_dtype)
        converted = pa.array(records, type=struct_type)
        assert converted.to_pylist() == []

        records = np.array([(42, True), (43, False)], dtype=record_dtype)
        converted = pa.array(records, type=struct_type)
        assert converted.to_pylist() == [{'x': 42, 'y': True},
                                         {'x': 43, 'y': False}]

        # With mask
        converted = pa.array(records, mask=np.bool_([False, True]),
                             type=struct_type)
        assert converted.to_pylist() == [{'x': 42, 'y': True}, None]

        # Trivial struct type
        record_dtype = np.dtype([])
        struct_type = pa.struct([])

        records = np.array([], dtype=record_dtype)
        converted = pa.array(records, type=struct_type)
        assert converted.to_pylist() == []

        records = np.array([(), ()], dtype=record_dtype)
        converted = pa.array(records, type=struct_type)
        assert converted.to_pylist() == [{}, {}]

    def test_from_numpy_nested(self):
        """Build a nested struct array from a nested structured dtype."""
        # Note: an object field inside a struct
        inner_dtype = np.dtype([('xx', np.int8), ('yy', np.bool_)])
        record_dtype = np.dtype([('x', inner_dtype),
                                 ('y', np.int16),
                                 ('z', np.object_)])
        # Note: itemsize is not a multiple of sizeof(object)
        assert record_dtype.itemsize == 12
        inner_type = pa.struct([pa.field('xx', pa.int8()),
                                pa.field('yy', pa.bool_())])
        struct_type = pa.struct([pa.field('x', inner_type),
                                 pa.field('y', pa.int16()),
                                 pa.field('z', pa.string())])

        records = np.array([], dtype=record_dtype)
        converted = pa.array(records, type=struct_type)
        assert converted.to_pylist() == []

        records = np.array([
            ((1, True), 2, 'foo'),
            ((3, False), 4, 'bar')], dtype=record_dtype)
        converted = pa.array(records, type=struct_type)
        assert converted.to_pylist() == [
            {'x': {'xx': 1, 'yy': True}, 'y': 2, 'z': 'foo'},
            {'x': {'xx': 3, 'yy': False}, 'y': 4, 'z': 'bar'}]

    @pytest.mark.slow
    @pytest.mark.large_memory
    def test_from_numpy_large(self):
        """Convert a multi-gigabyte structured array, with and without mask."""
        # Exercise rechunking + nulls
        target_size = 3 * 1024**3  # 3 GiB
        record_dtype = np.dtype([('x', np.float64), ('y', 'object')])
        payload_size = 65536 - record_dtype.itemsize
        payload = b'.' * payload_size
        n = target_size // (payload_size + record_dtype.itemsize)
        records = np.zeros(n, dtype=record_dtype)
        records['x'] = np.random.random_sample(n)
        records['y'] = payload
        # Add implicit nulls
        records['x'][records['x'] < 0.2] = np.nan

        struct_type = pa.struct([pa.field('x', pa.float64()),
                                 pa.field('y', pa.binary())])
        converted = pa.array(records, type=struct_type, from_pandas=True)
        assert converted.num_chunks == 2

        def iter_chunked_array(chunked):
            # Yield every scalar of every chunk, in order.
            for chunk in chunked.iterchunks():
                yield from chunk

        def check(chunked, source, mask=None):
            # Compare each converted struct against its source record.
            assert len(chunked) == len(source)
            xs = source['x']
            ys = source['y']
            for i, scalar in enumerate(iter_chunked_array(chunked)):
                try:
                    value = scalar.as_py()
                    if mask is not None and mask[i]:
                        assert value is None
                    else:
                        x = xs[i]
                        if np.isnan(x):
                            assert value['x'] is None
                        else:
                            assert value['x'] == x
                        assert value['y'] == ys[i]
                except Exception:
                    # Report the failing position, then propagate.
                    print("Failed at index", i)
                    raise

        check(converted, records)
        del converted

        # Now with explicit mask
        mask = np.random.random_sample(n) < 0.2
        converted = pa.array(records, type=struct_type, mask=mask,
                             from_pandas=True)
        assert converted.num_chunks == 2

        check(converted, records, mask)
        del converted

    def test_from_numpy_bad_input(self):
        """Mismatched field names or non-struct input raise errors."""
        struct_type = pa.struct([pa.field('x', pa.int32()),
                                 pa.field('y', pa.bool_())])
        # Second field is named 'z', so 'y' is missing from the input.
        wrong_dtype = np.dtype([('x', np.int32),
                                ('z', np.bool_)])

        records = np.array([], dtype=wrong_dtype)
        with pytest.raises(ValueError,
                           match="Missing field 'y'"):
            pa.array(records, type=struct_type)

        plain_ints = np.int32([])
        with pytest.raises(TypeError,
                           match="Expected struct array"):
            pa.array(plain_ints, type=struct_type)

    def test_from_tuples(self):
        """Tuples convert to structs when the struct type is specified."""
        frame = pd.DataFrame({'tuples': [(1, 2), (3, 4)]})
        expected_frame = pd.DataFrame(
            {'tuples': [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}]})

        # conversion from tuples works when specifying expected struct type
        struct_type = pa.struct([('a', pa.int64()), ('b', pa.int64())])

        tuple_values = np.asarray(frame['tuples'])
        _check_array_roundtrip(
            tuple_values, expected=expected_frame['tuples'],
            type=struct_type)

        expected_schema = pa.schema([('tuples', struct_type)])
        _check_pandas_roundtrip(
            frame, expected=expected_frame, schema=expected_schema,
            expected_schema=expected_schema)

    def test_struct_of_dictionary(self):
        """Structs with dictionary-encoded children convert to dicts."""
        names = ['ints', 'strs']
        children = [pa.array([456, 789, 456]).dictionary_encode(),
                    pa.array(["foo", "foo", None]).dictionary_encode()]
        struct_array = pa.StructArray.from_arrays(children, names=names)

        # Expected a Series of {field name: field value} dicts
        columns_as_lists = (child.to_pylist() for child in children)
        rows_as_dicts = [dict(zip(names, row))
                         for row in zip(*columns_as_lists)]

        wanted = pd.Series(rows_as_dicts)
        tm.assert_series_equal(struct_array.to_pandas(), wanted)

        # Same but with nulls
        struct_array = struct_array.take([0, None, 2])
        wanted[1] = None
        tm.assert_series_equal(struct_array.to_pandas(), wanted)
2465
2466
class TestZeroCopyConversion:
    """
    Tests that zero-copy conversion works with some types.
    """

    def test_zero_copy_success(self):
        """Plain integers convert with zero_copy_only=True."""
        converted = pa.array([0, 1, 2]).to_pandas(zero_copy_only=True)
        npt.assert_array_equal(converted, [0, 1, 2])

    def test_zero_copy_dictionaries(self):
        """Dictionary arrays convert to categoricals without copying."""
        indices = np.array([0, 0])
        dictionary = np.array([5])
        dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)

        converted = dict_array.to_pandas(zero_copy_only=True)
        wanted = pd.Categorical([5, 5])

        tm.assert_series_equal(pd.Series(converted), pd.Series(wanted),
                               check_names=False)

    def test_zero_copy_timestamp(self):
        """Nanosecond timestamps convert with zero_copy_only=True."""
        stamps = np.array(['2007-07-13'], dtype='datetime64[ns]')
        converted = pa.array(stamps).to_pandas(zero_copy_only=True)
        npt.assert_array_equal(converted, stamps)

    def test_zero_copy_duration(self):
        """Nanosecond durations convert with zero_copy_only=True."""
        durations = np.array([1], dtype='timedelta64[ns]')
        converted = pa.array(durations).to_pandas(zero_copy_only=True)
        npt.assert_array_equal(converted, durations)

    def check_zero_copy_failure(self, arr):
        """Assert that zero-copy conversion of *arr* raises ArrowInvalid."""
        with pytest.raises(pa.ArrowInvalid):
            arr.to_pandas(zero_copy_only=True)

    def test_zero_copy_failure_on_object_types(self):
        strings = pa.array(['A', 'B', 'C'])
        self.check_zero_copy_failure(strings)

    def test_zero_copy_failure_with_int_when_nulls(self):
        ints_with_null = pa.array([0, 1, None])
        self.check_zero_copy_failure(ints_with_null)

    def test_zero_copy_failure_with_float_when_nulls(self):
        floats_with_null = pa.array([0.0, 1.0, None])
        self.check_zero_copy_failure(floats_with_null)

    def test_zero_copy_failure_on_bool_types(self):
        booleans = pa.array([True, False])
        self.check_zero_copy_failure(booleans)

    def test_zero_copy_failure_on_list_types(self):
        lists = pa.array([[1, 2], [8, 9]], type=pa.list_(pa.int64()))
        self.check_zero_copy_failure(lists)

    def test_zero_copy_failure_on_timestamp_with_nulls(self):
        stamps = np.array([1, None], dtype='datetime64[ns]')
        self.check_zero_copy_failure(pa.array(stamps))

    def test_zero_copy_failure_on_duration_with_nulls(self):
        durations = np.array([1, None], dtype='timedelta64[ns]')
        self.check_zero_copy_failure(pa.array(durations))
2524
2525
def _non_threaded_conversion():
    """Round-trip the all-types example frame with threading disabled."""
    frame = _alltypes_example()
    # Once with the default call, once through the as_batch=True path.
    _check_pandas_roundtrip(frame, use_threads=False)
    _check_pandas_roundtrip(frame, use_threads=False, as_batch=True)
2530
2531
def _threaded_conversion():
    """Round-trip the all-types example frame with threading enabled."""
    frame = _alltypes_example()
    # Once with the default call, once through the as_batch=True path.
    _check_pandas_roundtrip(frame, use_threads=True)
    _check_pandas_roundtrip(frame, use_threads=True, as_batch=True)
2536
2537
class TestConvertMisc:
    """
    Miscellaneous conversion tests.
    """

    # (numpy dtype, Arrow type) pairs exercised by test_empty_arrays.
    type_pairs = [
        (np.int8, pa.int8()),
        (np.int16, pa.int16()),
        (np.int32, pa.int32()),
        (np.int64, pa.int64()),
        (np.uint8, pa.uint8()),
        (np.uint16, pa.uint16()),
        (np.uint32, pa.uint32()),
        (np.uint64, pa.uint64()),
        (np.float16, pa.float16()),
        (np.float32, pa.float32()),
        (np.float64, pa.float64()),
        # XXX unsupported
        # (np.dtype([('a', 'i2')]), pa.struct([pa.field('a', pa.int16())])),
        (np.object_, pa.string()),
        (np.object_, pa.binary()),
        (np.object_, pa.binary(10)),
        (np.object_, pa.list_(pa.int64())),
    ]

    def test_all_none_objects(self):
        """Round-trip an object column containing only None."""
        df = pd.DataFrame({'a': [None, None, None]})
        _check_pandas_roundtrip(df)

    def test_all_none_category(self):
        """Round-trip an all-None column cast to category."""
        df = pd.DataFrame({'a': [None, None, None]})
        df['a'] = df['a'].astype('category')
        _check_pandas_roundtrip(df)

    def test_empty_arrays(self):
        """Round-trip an empty array for every entry of ``type_pairs``."""
        for dtype, pa_type in self.type_pairs:
            arr = np.array([], dtype=dtype)
            _check_array_roundtrip(arr, type=pa_type)

    def test_non_threaded_conversion(self):
        _non_threaded_conversion()

    def test_threaded_conversion_multiprocess(self):
        """Run the threaded conversion inside a worker process."""
        # Parallel conversion should work from child processes too (ARROW-2963)
        pool = mp.Pool(2)
        try:
            pool.apply(_threaded_conversion)
        finally:
            # Always shut the pool down, even when the worker call fails.
            pool.close()
            pool.join()

    def test_category(self):
        """Round-trip categorical and plain columns, as frame and arrays."""
        repeats = 5
        v1 = ['foo', None, 'bar', 'qux', np.nan]
        v2 = [4, 5, 6, 7, 8]
        v3 = [b'foo', None, b'bar', b'qux', np.nan]

        arrays = {
            'cat_strings': pd.Categorical(v1 * repeats),
            'cat_strings_with_na': pd.Categorical(v1 * repeats,
                                                  categories=['foo', 'bar']),
            'cat_ints': pd.Categorical(v2 * repeats),
            'cat_binary': pd.Categorical(v3 * repeats),
            'cat_strings_ordered': pd.Categorical(
                v1 * repeats, categories=['bar', 'qux', 'foo'],
                ordered=True),
            'ints': v2 * repeats,
            'ints2': v2 * repeats,
            'strings': v1 * repeats,
            'strings2': v1 * repeats,
            'strings3': v3 * repeats}
        df = pd.DataFrame(arrays)
        _check_pandas_roundtrip(df)

        for k in arrays:
            _check_array_roundtrip(arrays[k])

    def test_category_implicit_from_pandas(self):
        """pa.array accepts pandas Categoricals directly."""
        # ARROW-3374
        def _check(v):
            arr = pa.array(v)
            result = arr.to_pandas()
            tm.assert_series_equal(pd.Series(result), pd.Series(v))

        arrays = [
            pd.Categorical(['a', 'b', 'c'], categories=['a', 'b']),
            pd.Categorical(['a', 'b', 'c'], categories=['a', 'b'],
                           ordered=True)
        ]
        for arr in arrays:
            _check(arr)

    def test_empty_category(self):
        """Round-trip an empty categorical column."""
        # ARROW-2443
        df = pd.DataFrame({'cat': pd.Categorical([])})
        _check_pandas_roundtrip(df)

    def test_category_zero_chunks(self):
        """A dictionary chunked array with no chunks converts to pandas."""
        # ARROW-5952
        for pa_type, dtype in [(pa.string(), 'object'), (pa.int64(), 'int64')]:
            a = pa.chunked_array([], pa.dictionary(pa.int8(), pa_type))
            result = a.to_pandas()
            expected = pd.Categorical([], categories=np.array([], dtype=dtype))
            tm.assert_series_equal(pd.Series(result), pd.Series(expected))

            table = pa.table({'a': a})
            result = table.to_pandas()
            expected = pd.DataFrame({'a': expected})
            tm.assert_frame_equal(result, expected)

    # The original list repeated the first case verbatim; the duplicate only
    # re-ran an identical test, so it is listed once here.
    @pytest.mark.parametrize(
        "data,error_type",
        [
            ({"a": ["a", 1, 2.0]}, pa.ArrowTypeError),
            ({"a": [1, True]}, pa.ArrowTypeError),
            ({"a": [True, "a"]}, pa.ArrowInvalid),
            ({"a": [1, "a"]}, pa.ArrowInvalid),
            ({"a": [1.0, "a"]}, pa.ArrowInvalid),
        ],
    )
    def test_mixed_types_fails(self, data, error_type):
        """Columns of mixed Python types fail with a column-naming message."""
        df = pd.DataFrame(data)
        msg = "Conversion failed for column a with type object"
        with pytest.raises(error_type, match=msg):
            pa.Table.from_pandas(df)

    def test_strided_data_import(self):
        """Round-trip columns taken from 2-D arrays of several dtypes."""
        cases = []

        columns = ['a', 'b', 'c']
        N, K = 100, 3
        random_numbers = np.random.randn(N, K).copy() * 100

        numeric_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8',
                          'f4', 'f8']

        for type_name in numeric_dtypes:
            cases.append(random_numbers.astype(type_name))

        # strings
        cases.append(np.array([random_ascii(10) for i in range(N * K)],
                              dtype=object)
                     .reshape(N, K).copy())

        # booleans
        boolean_objects = (np.array([True, False, True] * N, dtype=object)
                           .reshape(N, K).copy())

        # add some nulls, so dtype comes back as objects
        boolean_objects[5] = None
        cases.append(boolean_objects)

        cases.append(np.arange("2016-01-01T00:00:00.001", N * K,
                               dtype='datetime64[ms]')
                     .reshape(N, K).copy())

        strided_mask = (random_numbers > 0).astype(bool)[:, 0]

        for case in cases:
            df = pd.DataFrame(case, columns=columns)
            col = df['a']

            _check_pandas_roundtrip(df)
            _check_array_roundtrip(col)
            _check_array_roundtrip(col, mask=strided_mask)

    def test_all_nones(self):
        """Object series of only None/NaN convert to a NullArray."""
        def _check_series(s):
            converted = pa.array(s)
            assert isinstance(converted, pa.NullArray)
            assert len(converted) == 3
            assert converted.null_count == 3
            for item in converted:
                assert item is pa.NA

        _check_series(pd.Series([None] * 3, dtype=object))
        _check_series(pd.Series([np.nan] * 3, dtype=object))
        _check_series(pd.Series([None, np.nan, None], dtype=object))

    def test_partial_schema(self):
        """A schema naming a subset of columns selects and orders them."""
        data = OrderedDict([
            ('a', [0, 1, 2, 3, 4]),
            ('b', np.array([-10, -5, 0, 5, 10], dtype=np.int32)),
            ('c', [-10, -5, 0, 5, 10])
        ])
        df = pd.DataFrame(data)

        partial_schema = pa.schema([
            pa.field('c', pa.int64()),
            pa.field('a', pa.int64())
        ])

        _check_pandas_roundtrip(df, schema=partial_schema,
                                expected=df[['c', 'a']],
                                expected_schema=partial_schema)

    def test_table_batch_empty_dataframe(self):
        """Round-trip empty frames, with and without an index."""
        df = pd.DataFrame({})
        _check_pandas_roundtrip(df)
        _check_pandas_roundtrip(df, as_batch=True)

        df2 = pd.DataFrame({}, index=[0, 1, 2])
        _check_pandas_roundtrip(df2, preserve_index=True)
        _check_pandas_roundtrip(df2, as_batch=True, preserve_index=True)

    def test_convert_empty_table(self):
        """Empty Arrow arrays convert to empty series of the right dtype."""
        arr = pa.array([], type=pa.int64())
        empty_objects = pd.Series(np.array([], dtype=object))
        tm.assert_series_equal(arr.to_pandas(),
                               pd.Series(np.array([], dtype=np.int64)))
        arr = pa.array([], type=pa.string())
        tm.assert_series_equal(arr.to_pandas(), empty_objects)
        arr = pa.array([], type=pa.list_(pa.int64()))
        tm.assert_series_equal(arr.to_pandas(), empty_objects)
        arr = pa.array([], type=pa.struct([pa.field('a', pa.int64())]))
        tm.assert_series_equal(arr.to_pandas(), empty_objects)

    def test_non_natural_stride(self):
        """
        ARROW-2172: converting from a Numpy array with a stride that's
        not a multiple of itemsize.
        """
        dtype = np.dtype([('x', np.int32), ('y', np.int16)])
        data = np.array([(42, -1), (-43, 2)], dtype=dtype)
        assert data.strides == (6,)
        arr = pa.array(data['x'], type=pa.int32())
        assert arr.to_pylist() == [42, -43]
        arr = pa.array(data['y'], type=pa.int16())
        assert arr.to_pylist() == [-1, 2]

    def test_array_from_strided_numpy_array(self):
        """A strided float32 view converts correctly to a float64 array."""
        # ARROW-5651
        np_arr = np.arange(0, 10, dtype=np.float32)[1:-1:2]
        pa_arr = pa.array(np_arr, type=pa.float64())
        expected = pa.array([1.0, 3.0, 5.0, 7.0], type=pa.float64())
        # equals() only returns a bool; without the assert this test could
        # never fail.
        assert pa_arr.equals(expected)

    def test_safe_unsafe_casts(self):
        """A lossy float-to-int32 cast fails unless safe=False is passed."""
        # ARROW-2799
        df = pd.DataFrame({
            'A': list('abc'),
            'B': np.linspace(0, 1, 3)
        })

        schema = pa.schema([
            pa.field('A', pa.string()),
            pa.field('B', pa.int32())
        ])

        with pytest.raises(ValueError):
            pa.Table.from_pandas(df, schema=schema)

        table = pa.Table.from_pandas(df, schema=schema, safe=False)
        assert table.column('B').type == pa.int32()

    def test_error_sparse(self):
        """Sparse pandas data is rejected with a TypeError."""
        # ARROW-2818
        try:
            df = pd.DataFrame({'a': pd.arrays.SparseArray([1, np.nan, 3])})
        except AttributeError:
            # pandas.arrays module introduced in pandas 0.24
            df = pd.DataFrame({'a': pd.SparseArray([1, np.nan, 3])})
        with pytest.raises(TypeError, match="Sparse pandas data"):
            pa.Table.from_pandas(df)
2803
2804
def test_safe_cast_from_float_with_nans_to_int():
    """A series with a missing value casts safely to int32 with a null."""
    # TODO(kszucs): write tests for creating Date32 and Date64 arrays, see
    # ARROW-4258 and https://github.com/apache/arrow/pull/3395
    series = pd.Series([1, 2, None, 4])
    converted = pa.Array.from_pandas(series, type=pa.int32(), safe=True)
    wanted = pa.array([1, 2, None, 4], type=pa.int32())
    assert converted.equals(wanted)
2812
2813
def _fully_loaded_dataframe_example():
    """Build a 10-row frame with a MultiIndex and many column kinds."""
    dates = pd.date_range('2000-01-01', periods=10)

    columns = {
        0: dates,
        1: dates.tz_localize('utc'),
        2: dates.tz_localize('US/Eastern'),
        3: dates[::2].tz_localize('utc').repeat(2).astype('category'),
        4: ['foo', 'bar'] * 5,
        5: pd.Series(['foo', 'bar'] * 5).astype('category').values,
        6: [True, False] * 5,
        7: np.random.randn(10),
        8: np.random.randint(0, 100, size=10),
        9: pd.period_range('2013', periods=10, freq='M')
    }

    if Version(pd.__version__) >= Version('0.21'):
        # There is an issue with pickling IntervalIndex in pandas 0.20.x
        columns[10] = pd.interval_range(start=1, freq=1, periods=10)

    # Two-level index: five dates repeated twice, paired with foo/bar.
    row_index = pd.MultiIndex.from_arrays([
        pd.date_range('2000-01-01', periods=5).repeat(2),
        np.tile(np.array(['foo', 'bar'], dtype=object), 5)
    ])

    return pd.DataFrame(columns, index=row_index)
2839
2840
@pytest.mark.parametrize('columns', ([b'foo'], ['foo']))
def test_roundtrip_with_bytes_unicode(columns):
    """Bytes and str column names survive a pandas round trip."""
    frame = pd.DataFrame(columns=columns)
    original = pa.Table.from_pandas(frame)
    roundtripped = pa.Table.from_pandas(original.to_pandas())

    assert original.equals(roundtripped)
    assert original.schema.equals(roundtripped.schema)
    assert original.schema.metadata == roundtripped.schema.metadata
2849
2850
def _check_serialize_components_roundtrip(pd_obj):
    """Serialize *pd_obj* to components and back, then compare.

    Each serialization-context call is expected to emit a FutureWarning.
    """
    with pytest.warns(FutureWarning):
        context = pa.default_serialization_context()

    with pytest.warns(FutureWarning):
        components = context.serialize(pd_obj).to_components()
    with pytest.warns(FutureWarning):
        restored = context.deserialize_components(components)

    # Pick the comparison helper matching the kind of pandas object.
    if isinstance(pd_obj, pd.DataFrame):
        assert_equal = tm.assert_frame_equal
    else:
        assert_equal = tm.assert_series_equal
    assert_equal(pd_obj, restored)
2864
2865
@pytest.mark.skipif(
    Version('1.16.0') <= Version(np.__version__) < Version('1.16.1'),
    reason='Until numpy/numpy#12745 is resolved')
def test_serialize_deserialize_pandas():
    """Serialize and deserialize the fully loaded example frame."""
    # ARROW-1784, serialize and deserialize DataFrame by decomposing
    # BlockManager
    frame = _fully_loaded_dataframe_example()
    _check_serialize_components_roundtrip(frame)
2874
2875
def test_serialize_deserialize_empty_pandas():
    """Serialize and deserialize an empty frame and an empty series."""
    # ARROW-7996, serialize and deserialize empty pandas objects
    empty_frame = pd.DataFrame({'col1': [], 'col2': [], 'col3': []})
    _check_serialize_components_roundtrip(empty_frame)

    empty_series = pd.Series([], dtype=np.float32, name='col')
    _check_serialize_components_roundtrip(empty_series)
2883
2884
def _pytime_from_micros(val):
    """Return the ``datetime.time`` for *val* microseconds since midnight."""
    # Peel off microseconds, then seconds, then minutes; hours remain.
    total_seconds, microseconds = divmod(val, 1000000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return time(hours, minutes, seconds, microseconds)
2893
2894
2895def _pytime_to_micros(pytime):
2896 return (pytime.hour * 3600000000 +
2897 pytime.minute * 60000000 +
2898 pytime.second * 1000000 +
2899 pytime.microsecond)
2900
2901
def test_convert_unsupported_type_error_message():
    """ARROW-1454: failed conversions name the offending column and type."""

    # custom python objects
    class A:
        pass

    frame = pd.DataFrame({'a': [A(), A()]})
    with pytest.raises(
            ValueError,
            match='Conversion failed for column a with type object'):
        pa.Table.from_pandas(frame)

    # period unsupported for pandas <= 0.25
    if Version(pd.__version__) > Version('0.25'):
        return

    frame = pd.DataFrame({
        'a': pd.period_range('2000-01-01', periods=20),
    })
    with pytest.raises(
            (TypeError, ValueError),
            match='Conversion failed for column a with type (period|object)'):
        pa.Table.from_pandas(frame)
2924
2925
2926# ----------------------------------------------------------------------
2927# Hypothesis tests
2928
2929
@h.given(past.arrays(past.pandas_compatible_types))
def test_array_to_pandas_roundtrip(arr):
    """A generated array equals itself after a trip through pandas."""
    as_pandas = arr.to_pandas()
    rebuilt = pa.array(as_pandas, type=arr.type, from_pandas=True)
    assert rebuilt.equals(arr)
2935
2936
2937# ----------------------------------------------------------------------
2938# Test object deduplication in to_pandas
2939
2940
def _generate_dedup_example(nunique, repeats):
    """Return ``nunique`` values from ``rands(10)``, repeated ``repeats`` times.

    The result is one flat list: the block of generated values followed
    by ``repeats - 1`` further copies of that same block.
    """
    pool = []
    for _ in range(nunique):
        pool.append(rands(10))
    return pool * repeats
2944
2945
2946def _assert_nunique(obj, expected):
2947 assert len({id(x) for x in obj}) == expected
2948
2949
def test_to_pandas_deduplicate_strings_array_types():
    """Binary/string arrays share Python objects for repeated values.

    By default only ``nunique`` distinct objects come back; with
    ``deduplicate_objects=False`` there is one object per element.
    """
    n_distinct, n_repeats = 100, 10
    values = _generate_dedup_example(n_distinct, n_repeats)

    candidates = [
        pa.array(values, type=pa.binary()),
        pa.array(values, type=pa.utf8()),
        pa.chunked_array([values, values]),
    ]
    for candidate in candidates:
        deduplicated = candidate.to_pandas()
        _assert_nunique(deduplicated, n_distinct)

        not_deduplicated = candidate.to_pandas(deduplicate_objects=False)
        _assert_nunique(not_deduplicated, len(candidate))
2960
2961
def test_to_pandas_deduplicate_strings_table_types():
    """Object deduplication also applies to RecordBatch and Table."""
    n_distinct, n_repeats = 100, 10
    values = _generate_dedup_example(n_distinct, n_repeats)

    batch = pa.RecordBatch.from_arrays([pa.array(values)], ['foo'])
    table = pa.Table.from_batches([batch])

    for container in (batch, table):
        deduplicated = container.to_pandas()['foo']
        _assert_nunique(deduplicated, n_distinct)

        not_deduplicated = container.to_pandas(
            deduplicate_objects=False)['foo']
        _assert_nunique(not_deduplicated, len(container))
2975
2976
def test_to_pandas_deduplicate_integers_as_objects():
    """Integers converted as objects are deduplicated like strings."""
    n_distinct, n_repeats = 100, 10

    # Python automatically interns smaller integers
    distinct = list(
        np.random.randint(10000000, 1000000000, size=n_distinct))
    distinct[n_distinct // 2] = None

    arr = pa.array(distinct * n_repeats)

    deduplicated = arr.to_pandas(integer_object_nulls=True)
    _assert_nunique(deduplicated, n_distinct)

    # Account for None: it is a single shared object however often it occurs
    expected_without_dedup = (n_distinct - 1) * n_repeats + 1
    not_deduplicated = arr.to_pandas(integer_object_nulls=True,
                                     deduplicate_objects=False)
    _assert_nunique(not_deduplicated, expected_without_dedup)
2992
2993
def test_to_pandas_deduplicate_date_time():
    """Date and time values converted to objects are deduplicated."""
    n_distinct, n_repeats = 100, 10
    distinct = list(range(n_distinct))

    # (storage type, arrow type to cast to, extra to_pandas options)
    cases = [
        ('int32', 'date32', {'date_as_object': True}),
        ('int64', 'date64', {'date_as_object': True}),
        ('int32', 'time32[ms]', {}),
        ('int64', 'time64[us]', {}),
    ]

    for storage_type, arrow_type, options in cases:
        storage = pa.array(distinct * n_repeats, type=storage_type)
        converted = storage.cast(arrow_type)

        deduplicated = converted.to_pandas(**options)
        _assert_nunique(deduplicated, n_distinct)

        not_deduplicated = converted.to_pandas(deduplicate_objects=False,
                                               **options)
        _assert_nunique(not_deduplicated, len(converted))
3017
3018
3019# ---------------------------------------------------------------------
3020
def test_table_from_pandas_checks_field_nullability():
    """ARROW-2136: a null in a non-nullable schema field raises ValueError."""
    frame = pd.DataFrame({'a': [1.2, 2.1, 3.1],
                          'b': [np.nan, 'string', 'foo']})
    non_nullable = pa.schema([
        pa.field('a', pa.float64(), nullable=False),
        pa.field('b', pa.utf8(), nullable=False),
    ])

    with pytest.raises(ValueError):
        pa.Table.from_pandas(frame, schema=non_nullable)
3030
3031
def test_table_from_pandas_keeps_column_order_of_dataframe():
    """Without a schema, the table's fields follow the frame's column order."""
    original = pd.DataFrame(OrderedDict([
        ('partition', [0, 0, 1, 1]),
        ('arrays', [[0, 1, 2], [3, 4], None, None]),
        ('floats', [None, None, 1.1, 3.3]),
    ]))
    reordered = original[['floats', 'partition', 'arrays']]

    expected_original = pa.schema([
        ('partition', pa.int64()),
        ('arrays', pa.list_(pa.int64())),
        ('floats', pa.float64()),
    ])
    expected_reordered = pa.schema([
        ('floats', pa.float64()),
        ('partition', pa.int64()),
        ('arrays', pa.list_(pa.int64())),
    ])

    from_original = pa.Table.from_pandas(original, preserve_index=False)
    from_reordered = pa.Table.from_pandas(reordered, preserve_index=False)

    assert from_original.schema.equals(expected_original)
    assert from_reordered.schema.equals(expected_reordered)
3056
3057
def test_table_from_pandas_keeps_column_order_of_schema():
    """ARROW-3766: with a schema, the schema's field order wins."""
    frame = pd.DataFrame(OrderedDict([
        ('partition', [0, 0, 1, 1]),
        ('arrays', [[0, 1, 2], [3, 4], None, None]),
        ('floats', [None, None, 1.1, 3.3]),
    ]))

    schema = pa.schema([
        ('floats', pa.float64()),
        ('arrays', pa.list_(pa.int32())),
        ('partition', pa.int32()),
    ])

    # Two slices of the same frame, the second with shuffled columns.
    first_part = frame[frame['partition'] == 0]
    second_part = frame[frame['partition'] == 1][
        ['floats', 'partition', 'arrays']]

    first_table = pa.Table.from_pandas(first_part, schema=schema,
                                       preserve_index=False)
    second_table = pa.Table.from_pandas(second_part, schema=schema,
                                        preserve_index=False)

    assert first_table.schema.equals(schema)
    assert first_table.schema.equals(second_table.schema)
3080
3081
def test_table_from_pandas_columns_argument_only_does_filtering():
    """``columns=`` selects (and orders) columns without changing types."""
    frame = pd.DataFrame(OrderedDict([
        ('partition', [0, 0, 1, 1]),
        ('arrays', [[0, 1, 2], [3, 4], None, None]),
        ('floats', [None, None, 1.1, 3.3]),
    ]))

    all_columns = ['arrays', 'floats', 'partition']
    expected_all = pa.schema([
        ('arrays', pa.list_(pa.int64())),
        ('floats', pa.float64()),
        ('partition', pa.int64()),
    ])

    some_columns = ['floats', 'partition']
    expected_some = pa.schema([
        ('floats', pa.float64()),
        ('partition', pa.int64()),
    ])

    table_all = pa.Table.from_pandas(frame, columns=all_columns,
                                     preserve_index=False)
    table_some = pa.Table.from_pandas(frame, columns=some_columns,
                                      preserve_index=False)

    assert table_all.schema.equals(expected_all)
    assert table_some.schema.equals(expected_some)
3107
3108
def test_table_from_pandas_columns_and_schema_are_mutually_exclusive():
    """Passing both ``schema`` and ``columns`` raises ValueError."""
    frame = pd.DataFrame(OrderedDict([
        ('partition', [0, 0, 1, 1]),
        ('arrays', [[0, 1, 2], [3, 4], None, None]),
        ('floats', [None, None, 1.1, 3.3]),
    ]))
    full_schema = pa.schema([
        ('partition', pa.int32()),
        ('arrays', pa.list_(pa.int32())),
        ('floats', pa.float64()),
    ])
    selected = ['arrays', 'floats']

    with pytest.raises(ValueError):
        pa.Table.from_pandas(frame, schema=full_schema, columns=selected)
3124
3125
def test_table_from_pandas_keeps_schema_nullability():
    """ARROW-5169: ``nullable=False`` from a passed schema is retained."""
    frame = pd.DataFrame({'a': [1, 2, 3, 4]})
    non_nullable = pa.schema([
        pa.field('a', pa.int64(), nullable=False),
    ])

    inferred = pa.Table.from_pandas(frame)
    assert inferred.schema.field('a').nullable is True

    explicit = pa.Table.from_pandas(frame, schema=non_nullable)
    assert explicit.schema.field('a').nullable is False
3138
3139
def test_table_from_pandas_schema_index_columns():
    """ARROW-5220: explicit schemas that do or do not list index columns."""
    frame = pd.DataFrame({'a': [1, 2, 3], 'b': [0.1, 0.2, 0.3]})

    index_last = pa.schema([
        ('a', pa.int64()),
        ('b', pa.float64()),
        ('index', pa.int32()),
    ])

    # schema includes index with name not in dataframe
    with pytest.raises(KeyError, match="name 'index' present in the"):
        pa.Table.from_pandas(frame, schema=index_last)

    frame.index.name = 'index'

    # schema includes correct index name -> roundtrip works
    _check_pandas_roundtrip(frame, schema=index_last, preserve_index=True,
                            expected_schema=index_last)

    # schema includes correct index name but preserve_index=False
    with pytest.raises(ValueError, match="'preserve_index=False' was"):
        pa.Table.from_pandas(frame, schema=index_last, preserve_index=False)

    # in case of preserve_index=None -> RangeIndex serialized as metadata
    # clashes with the index in the schema
    with pytest.raises(ValueError, match="name 'index' is present in the "
                                         "schema, but it is a RangeIndex"):
        pa.Table.from_pandas(frame, schema=index_last, preserve_index=None)

    frame.index = pd.Index([0, 1, 2], name='index')

    # for non-RangeIndex, both preserve_index=None and True work
    for preserve in (None, True):
        _check_pandas_roundtrip(frame, schema=index_last,
                                preserve_index=preserve,
                                expected_schema=index_last)

    # schema has different order (index column not at the end)
    index_first = pa.schema([
        ('index', pa.int32()),
        ('a', pa.int64()),
        ('b', pa.float64()),
    ])
    for preserve in (None, True):
        _check_pandas_roundtrip(frame, schema=index_first,
                                preserve_index=preserve,
                                expected_schema=index_first)

    # schema does not include the index -> index is not included as column
    # even though preserve_index=True/None
    no_index = pa.schema([
        ('a', pa.int64()),
        ('b', pa.float64()),
    ])
    without_index = frame.copy().reset_index(drop=True)
    for preserve in (None, True):
        _check_pandas_roundtrip(frame, schema=no_index,
                                preserve_index=preserve,
                                expected_schema=no_index,
                                expected=without_index)

    # dataframe with a MultiIndex
    frame.index = pd.MultiIndex.from_tuples(
        [('a', 1), ('a', 2), ('b', 1)], names=['level1', 'level2'])
    both_levels = pa.schema([
        ('level1', pa.string()),
        ('level2', pa.int64()),
        ('a', pa.int64()),
        ('b', pa.float64()),
    ])
    for preserve in (True, None):
        _check_pandas_roundtrip(frame, schema=both_levels,
                                preserve_index=preserve,
                                expected_schema=both_levels)

    # only one of the levels of the MultiIndex is included
    one_level = pa.schema([
        ('level2', pa.int64()),
        ('a', pa.int64()),
        ('b', pa.float64()),
    ])
    without_level1 = frame.copy().reset_index('level1', drop=True)
    for preserve in (True, None):
        _check_pandas_roundtrip(frame, schema=one_level,
                                preserve_index=preserve,
                                expected_schema=one_level,
                                expected=without_level1)
3228
3229
def test_table_from_pandas_schema_index_columns__unnamed_index():
    """ARROW-6999: unnamed indices in a specified schema."""
    expected_schema = pa.schema([
        ('a', pa.int64()),
        ('b', pa.float64()),
        ('__index_level_0__', pa.int64()),
    ])
    columns = {'a': [1, 2, 3], 'b': [0.1, 0.2, 0.3]}

    # default index, explicitly preserved
    frame = pd.DataFrame(columns)
    schema = pa.Schema.from_pandas(frame, preserve_index=True)
    table = pa.Table.from_pandas(frame, preserve_index=True, schema=schema)
    assert table.schema.remove_metadata().equals(expected_schema)

    # non-RangeIndex (preserved by default)
    frame = pd.DataFrame(columns, index=[0, 1, 2])
    schema = pa.Schema.from_pandas(frame)
    table = pa.Table.from_pandas(frame, schema=schema)
    assert table.schema.remove_metadata().equals(expected_schema)
3249
3250
def test_table_from_pandas_schema_with_custom_metadata():
    """ARROW-7087: custom schema metadata is kept by ``from_pandas``."""
    empty = pd.DataFrame()
    base_schema = pa.Schema.from_pandas(empty)
    tagged_schema = base_schema.with_metadata({'meta': 'True'})

    table = pa.Table.from_pandas(empty, schema=tagged_schema)
    assert table.schema.metadata.get(b'meta') == b'True'
3257
3258
def test_table_from_pandas_schema_field_order_metadat():
    """ARROW-10532: schema field order must not mangle pandas metadata.

    The schema lists the fields in a different order than the frame.
    """
    frame = pd.DataFrame({
        "datetime": pd.date_range("2020-01-01T00:00:00Z", freq="H",
                                  periods=2),
        "float": np.random.randn(2),
    })

    schema = pa.schema([
        pa.field("float", pa.float32(), nullable=True),
        pa.field("datetime", pa.timestamp("s", tz="UTC"), nullable=False),
    ])

    table = pa.Table.from_pandas(frame, schema=schema)
    assert table.schema.equals(schema)

    column_metadata = table.schema.pandas_metadata["columns"]

    float_entry = column_metadata[0]
    assert float_entry["name"] == "float"
    assert float_entry["metadata"] is None

    datetime_entry = column_metadata[1]
    assert datetime_entry["name"] == "datetime"
    assert datetime_entry["metadata"] == {'timezone': 'UTC'}

    roundtripped = table.to_pandas()
    reordered = frame[["float", "datetime"]]
    tm.assert_frame_equal(roundtripped,
                          reordered.astype({"float": "float32"}))
3285
3286
3287# ----------------------------------------------------------------------
3288# RecordBatch, Table
3289
3290
def test_recordbatch_from_to_pandas():
    """A mixed-dtype frame round-trips through a RecordBatch unchanged."""
    frame = pd.DataFrame({
        'c1': np.array([1, 2, 3, 4, 5], dtype='int64'),
        'c2': np.array([1, 2, 3, 4, 5], dtype='uint32'),
        'c3': np.random.randn(5),
        'c4': ['foo', 'bar', None, 'baz', 'qux'],
        'c5': [False, True, False, True, False],
    })

    roundtripped = pa.RecordBatch.from_pandas(frame).to_pandas()
    tm.assert_frame_equal(frame, roundtripped)
3303
3304
def test_recordbatchlist_to_pandas():
    """A table of two batches converts to the concatenation of both frames."""
    first = pd.DataFrame({
        'c1': np.array([1, 1, 2], dtype='uint32'),
        'c2': np.array([1.0, 2.0, 3.0], dtype='float64'),
        'c3': [True, None, False],
        'c4': ['foo', 'bar', None],
    })
    second = pd.DataFrame({
        'c1': np.array([3, 5], dtype='uint32'),
        'c2': np.array([4.0, 5.0], dtype='float64'),
        'c3': [True, True],
        'c4': ['baz', 'qux'],
    })

    batches = [pa.RecordBatch.from_pandas(first),
               pa.RecordBatch.from_pandas(second)]
    converted = pa.Table.from_batches(batches).to_pandas()

    combined = pd.concat([first, second]).reset_index(drop=True)
    tm.assert_frame_equal(combined, converted)
3327
3328
def test_recordbatch_table_pass_name_to_pandas():
    """A column taken from a batch or table keeps its name in pandas."""
    batch = pa.record_batch([pa.array([1, 2, 3, 4])], names=['a0'])
    table = pa.table([pa.array([1, 2, 3, 4])], names=['a0'])

    for container in (batch, table):
        assert container[0].to_pandas().name == 'a0'
3334
3335
3336# ----------------------------------------------------------------------
3337# Metadata serialization
3338
3339
@pytest.mark.parametrize(
    ('type', 'expected'),
    [
        (pa.null(), 'empty'),
        (pa.bool_(), 'bool'),
        (pa.int8(), 'int8'),
        (pa.int16(), 'int16'),
        (pa.int32(), 'int32'),
        (pa.int64(), 'int64'),
        (pa.uint8(), 'uint8'),
        (pa.uint16(), 'uint16'),
        (pa.uint32(), 'uint32'),
        (pa.uint64(), 'uint64'),
        (pa.float16(), 'float16'),
        (pa.float32(), 'float32'),
        (pa.float64(), 'float64'),
        (pa.date32(), 'date'),
        (pa.date64(), 'date'),
        (pa.binary(), 'bytes'),
        (pa.binary(length=4), 'bytes'),
        (pa.string(), 'unicode'),
        (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
        (pa.decimal128(18, 3), 'decimal'),
        (pa.timestamp('ms'), 'datetime'),
        (pa.timestamp('us', 'UTC'), 'datetimetz'),
        (pa.time32('s'), 'time'),
        (pa.time64('us'), 'time'),
    ]
)
def test_logical_type(type, expected):
    """``get_logical_type`` maps each Arrow type to its expected name."""
    logical_name = get_logical_type(type)
    assert logical_name == expected
3371
3372
3373# ----------------------------------------------------------------------
3374# to_pandas uses MemoryPool
3375
def test_array_uses_memory_pool():
    """ARROW-6570: ``Array.to_pandas`` allocations are tracked and freed.

    A masked int64 array allocates ``N * 8`` bytes on conversion, which are
    released once the result is dropped; an unmasked array allocates none.
    """
    length = 10000
    arr = pa.array(np.arange(length, dtype=np.int64),
                   mask=np.random.randint(0, 2, size=length).astype(np.bool_))

    # In the case the gc is caught loafing
    gc.collect()

    baseline = pa.total_allocated_bytes()

    converted = arr.to_pandas()
    assert pa.total_allocated_bytes() - baseline == length * 8

    # Drop the only reference to the converted data and collect it.
    converted = None  # noqa
    gc.collect()

    assert pa.total_allocated_bytes() == baseline

    # zero copy does not allocate memory
    arr = pa.array(np.arange(length, dtype=np.int64))

    baseline = pa.total_allocated_bytes()
    converted = arr.to_pandas()  # noqa
    assert pa.total_allocated_bytes() == baseline
3400
3401
def test_singleton_blocks_zero_copy():
    """Part of ARROW-3789: single-column table with and without split_blocks.

    With ``split_blocks=True`` no memory is allocated; the default
    conversion allocates and returns writeable values.
    """
    column = pa.array(np.arange(1000, dtype=np.int64))
    table = pa.table([column], ['f0'])

    # Zero copy if split_blocks=True
    _check_to_pandas_memory_unchanged(table, split_blocks=True)

    baseline = pa.total_allocated_bytes()
    frame = table.to_pandas()
    assert frame['f0'].values.flags.writeable
    assert pa.total_allocated_bytes() > baseline
3413
3414
def _check_to_pandas_memory_unchanged(obj, **kwargs):
    """Assert ``obj.to_pandas(**kwargs)`` leaves the allocated bytes as is.

    The converted object is still referenced when the check is made.
    """
    baseline = pa.total_allocated_bytes()
    # Bound to a name so the result stays alive during the assertion.
    converted = obj.to_pandas(**kwargs)  # noqa

    # Memory allocation unchanged -- either zero copy or self-destructing
    assert pa.total_allocated_bytes() == baseline
3421
3422
def test_to_pandas_split_blocks():
    """ARROW-3789: ``split_blocks=True`` gives one block per column."""
    type_codes = ['i1', 'i4', 'i8', 'f4', 'f8', 'f8', 'f8', 'f8']
    columns = [pa.array([1, 2, 3, 4, 5], type=code) for code in type_codes]
    names = ['f' + str(position) for position in range(8)]
    table = pa.table(columns, names)

    _check_blocks_created(table, 8)
    _check_to_pandas_memory_unchanged(table, split_blocks=True)
3438
3439
3440def _check_blocks_created(t, number):
3441 x = t.to_pandas(split_blocks=True)
3442 assert len(x._data.blocks) == number
3443
3444
def test_to_pandas_self_destruct():
    """``self_destruct=True`` conversions leave allocated memory unchanged.

    Checked both with and without ``split_blocks=True``.
    """
    n_columns = 50

    def _make_table():
        # Slice to force a copy
        columns = [pa.array(np.random.randn(10000)[::2])
                   for _ in range(n_columns)]
        names = ['f' + str(position) for position in range(n_columns)]
        return pa.table(columns, names)

    t = _make_table()
    _check_to_pandas_memory_unchanged(t, split_blocks=True,
                                      self_destruct=True)

    # Check non-split-block behavior
    t = _make_table()
    _check_to_pandas_memory_unchanged(t, self_destruct=True)
3461
3462
def test_table_uses_memory_pool():
    """``Table.to_pandas`` allocations are tracked and released again.

    Three int64 columns of ``N`` values allocate ``3 * N * 8`` bytes.
    """
    length = 10000
    column = pa.array(np.arange(length, dtype=np.int64))
    table = pa.table([column, column, column], ['f0', 'f1', 'f2'])

    baseline = pa.total_allocated_bytes()
    frame = table.to_pandas()

    assert pa.total_allocated_bytes() - baseline == 3 * length * 8

    # Check successful garbage collection
    frame = None  # noqa
    gc.collect()
    assert pa.total_allocated_bytes() == baseline
3477
3478
def test_object_leak_in_numpy_array():
    """ARROW-6876: dropping the converted array releases its element.

    Deleting the result of ``to_pandas`` must lower the element's
    reference count by exactly one.
    """
    struct_array = pa.array([{'a': 1}])
    converted = struct_array.to_pandas()
    assert converted.dtype == np.dtype('object')

    element = converted[0]
    count_before = sys.getrefcount(element)
    # Reading the count again must not change it.
    assert sys.getrefcount(element) == count_before

    del converted
    assert sys.getrefcount(element) == count_before - 1
3489
3490
def test_object_leak_in_dataframe():
    """ARROW-6876: dropping a converted column releases its element.

    Deleting the column taken from ``Table.to_pandas`` must lower the
    element's reference count by exactly one.
    """
    struct_array = pa.array([{'a': 1}])
    table = pa.table([struct_array], ['f0'])
    column = table.to_pandas()['f0']
    assert column.dtype == np.dtype('object')

    element = column[0]
    count_before = sys.getrefcount(element)
    # Reading the count again must not change it.
    assert sys.getrefcount(element) == count_before

    del column
    assert sys.getrefcount(element) == count_before - 1
3502
3503
3504# ----------------------------------------------------------------------
# Some nested array tests
3506
3507
def test_array_from_py_float32():
    """Python floats with an explicit float32 type match NumPy float32 data.

    Checked for a flat array and for a list-of-float32 array.
    """
    rows = [[1.2, 3.4], [9.0, 42.0]]
    float32 = pa.float32()

    flat = pa.array(rows[0], type=float32)
    nested = pa.array(rows, type=pa.list_(float32))

    expected_flat = np.array(rows[0], dtype=np.float32)
    expected_nested = pd.Series(
        [np.array(rows[0], dtype=np.float32),
         np.array(rows[1], dtype=np.float32)])

    assert flat.type == float32
    assert flat.equals(pa.array(expected_flat))
    assert nested.equals(pa.array(expected_nested))
3523
3524
3525# ----------------------------------------------------------------------
3526# Timestamp tests
3527
3528
def test_cast_timestamp_unit():
    """Timestamp unit and timezone handling when converting and casting.

    Covers ARROW-1680, ARROW-1906 and ARROW-1949.
    """
    # ARROW-1680
    naive = pd.Series([datetime.now()])
    localized = naive.dt.tz_localize('tzlocal()')
    in_new_york = localized.dt.tz_convert('America/New_York')

    micros_new_york = pa.timestamp('us', tz='America/New_York')
    from_aware = pa.Array.from_pandas(in_new_york, type=micros_new_york)

    # ARROW-1906
    assert from_aware.type == micros_new_york

    from_naive = pa.Array.from_pandas(naive, type=pa.timestamp('us'))

    assert from_aware[0].as_py() == in_new_york[0].to_pydatetime()
    assert from_naive[0].as_py() == naive[0].to_pydatetime()

    # Disallow truncation
    millis = pa.array([123123], type='int64').cast(pa.timestamp('ms'))
    truncated = pa.array([123], type='int64').cast(pa.timestamp('s'))

    # sanity check that the cast worked right
    assert millis.type == pa.timestamp('ms')

    seconds_type = pa.timestamp('s')
    with pytest.raises(ValueError):
        millis.cast(seconds_type)

    assert millis.cast(seconds_type, safe=False).equals(truncated)

    # ARROW-1949
    nanos = pd.Series([pd.Timestamp(1), pd.Timestamp(10),
                       pd.Timestamp(1000)])
    micros_type = pa.timestamp('us')
    expected = pa.array([0, 0, 1], type=micros_type)

    # Both entry points refuse the lossy conversion by default ...
    for convert in (pa.array, pa.Array.from_pandas):
        with pytest.raises(ValueError):
            convert(nanos, type=micros_type)

    # ... and both truncate when safe=False is passed.
    for convert in (pa.Array.from_pandas, pa.array):
        converted = convert(nanos, type=micros_type, safe=False)
        assert converted.equals(expected)
3576
3577
def test_nested_with_timestamp_tz_round_trip():
    """A struct of tz-aware timestamps round-trips through pandas."""
    now = pd.Timestamp.now().to_pydatetime()
    stamps = pa.array([now],
                      type=pa.timestamp('us', tz='America/New_York'))
    struct = pa.StructArray.from_arrays([stamps, stamps],
                                        ['start', 'stop'])

    as_pandas = struct.to_pandas()
    assert pa.array(as_pandas).equals(struct)
3587
3588
def test_nested_with_timestamp_tz():
    """ARROW-7723: struct fields convert to datetimes, keeping tz-awareness.

    Checked for second, millisecond and microsecond units, for both a
    StructArray and a table column holding it.
    """
    ts = pd.Timestamp.now()
    ts_dt = ts.to_pydatetime()

    def _drop_micros(value):
        return value.replace(microsecond=0)

    def _unchanged(value):
        return value

    def _assert_fields(record, tz_aware):
        # Both struct fields must be datetimes with the expected awareness.
        for field in ('start', 'stop'):
            assert isinstance(record[field], datetime)
            assert (record[field].tzinfo is not None) is tz_aware

    # XXX: Ensure that this data does not get promoted to nanoseconds (and thus
    # integers) to preserve behavior in 0.15.1
    for unit in ['s', 'ms', 'us']:
        # This is used for verifying timezone conversion to micros are not
        # important
        truncate = _drop_micros if unit in ['s', 'ms'] else _unchanged

        naive = pa.array([ts], type=pa.timestamp(unit))
        aware = pa.array([ts],
                         type=pa.timestamp(unit, tz='America/New_York'))

        naive_struct = pa.StructArray.from_arrays([naive, naive],
                                                  ['start', 'stop'])
        aware_struct = pa.StructArray.from_arrays([aware, aware],
                                                  ['start', 'stop'])

        result = naive_struct.to_pandas()
        _assert_fields(result[0], tz_aware=False)

        result = aware_struct.to_pandas()
        _assert_fields(result[0], tz_aware=True)
        utc_dt = result[0]['start'].astimezone(timezone.utc)
        assert truncate(utc_dt).replace(tzinfo=None) == truncate(ts_dt)

        # same conversion for table
        result = pa.table({'a': naive_struct}).to_pandas()
        _assert_fields(result['a'][0], tz_aware=False)

        result = pa.table({'a': aware_struct}).to_pandas()
        _assert_fields(result['a'][0], tz_aware=True)
3635
3636
3637# ----------------------------------------------------------------------
3638# DictionaryArray tests
3639
3640
def test_dictionary_with_pandas():
    """``DictionaryArray.to_pandas`` matches the equivalent Categorical.

    Signed index types are compared against ``pd.Categorical.from_codes``
    both without and with a null mask.  Unsigned index types are expected
    to raise ``TypeError`` and are not checked any further.
    """
    src_indices = np.repeat([0, 1, 2], 2)
    dictionary = np.array(['foo', 'bar', 'baz'], dtype=object)
    mask = np.array([False, False, True, False, False, False])

    for index_type in ['uint8', 'int8', 'uint16', 'int16', 'uint32', 'int32',
                       'uint64', 'int64']:
        indices = src_indices.astype(index_type)
        d1 = pa.DictionaryArray.from_arrays(indices, dictionary)
        d2 = pa.DictionaryArray.from_arrays(indices, dictionary, mask=mask)

        if index_type[0] == 'u':
            # TODO: unsigned dictionary indices to pandas
            with pytest.raises(TypeError):
                d1.to_pandas()
            continue

        pandas1 = d1.to_pandas()
        ex_pandas1 = pd.Categorical.from_codes(indices, categories=dictionary)

        tm.assert_series_equal(pd.Series(pandas1), pd.Series(ex_pandas1))

        pandas2 = d2.to_pandas()
        assert pandas2.isnull().sum() == 1

        # Only signed index types reach this point (unsigned ones hit the
        # ``continue`` above), so the indices can be used as codes directly;
        # masked positions become the -1 "missing" code.
        ex_pandas2 = pd.Categorical.from_codes(np.where(mask, -1, indices),
                                               categories=dictionary)

        tm.assert_series_equal(pd.Series(pandas2), pd.Series(ex_pandas2))
3675
3676
def random_strings(n, item_size, pct_null=0, dictionary=None):
    """Return an array of ``n`` random strings, optionally with nulls.

    Parameters
    ----------
    n : int
        Number of elements to produce.
    item_size : int
        Passed to ``random_ascii`` for each element; only used when no
        ``dictionary`` is given.
    pct_null : float, default 0
        When positive, each element is replaced by ``None`` wherever a
        uniform random draw falls below this value.
    dictionary : numpy array, optional
        When given, elements are drawn from it at random positions instead
        of being generated.
    """
    if dictionary is None:
        strings = np.array([random_ascii(item_size) for _ in range(n)],
                           dtype=object)
    else:
        positions = np.random.randint(0, len(dictionary), size=n)
        strings = dictionary[positions]

    if pct_null > 0:
        null_mask = np.random.rand(n) < pct_null
        strings[null_mask] = None

    return strings
3688
3689
def test_variable_dictionary_to_pandas():
    """Chunks with different dictionaries convert to one categorical.

    The resulting categories are the concatenation of the chunk
    dictionaries, and the values agree with the dense (string) conversion.
    """
    np.random.seed(12345)

    dict_a = pa.array(random_strings(100, 32), type='string')
    dict_b = pa.array(random_strings(100, 16), type='string')
    dict_c = pa.array(random_strings(10000, 10), type='string')

    chunk_a = pa.DictionaryArray.from_arrays(
        np.random.randint(0, len(dict_a), size=1000, dtype='i4'), dict_a)
    chunk_b = pa.DictionaryArray.from_arrays(
        np.random.randint(0, len(dict_b), size=1000, dtype='i4'), dict_b)
    chunk_c = pa.DictionaryArray.from_arrays(
        np.random.randint(0, len(dict_c), size=1000, dtype='i4'), dict_c)

    # With some nulls
    masked_indices = pa.array(
        np.random.randint(0, len(dict_c), size=1000, dtype='i4'),
        mask=np.random.rand(1000) < 0.1
    )
    chunk_d = pa.DictionaryArray.from_arrays(masked_indices, dict_c)

    expected_dict = pa.concat_arrays([dict_a, dict_b, dict_c])

    chunks = [chunk_a, chunk_b, chunk_c, chunk_d]
    encoded = pa.chunked_array(chunks)
    dense = pa.chunked_array([chunk.cast('string') for chunk in chunks])

    result = encoded.to_pandas()
    result_dense = dense.to_pandas()

    assert (result.cat.categories == expected_dict.to_pandas()).all()

    expected_dense = result.astype('str')
    expected_dense[result_dense.isnull()] = None
    tm.assert_series_equal(result_dense, expected_dense)
3732
3733
def test_dictionary_encoded_nested_to_pandas():
    """ARROW-6899: a list of dictionary-encoded strings gives object arrays."""
    encoded_child = pa.array(['a', 'a', 'a', 'b', 'b']).dictionary_encode()
    list_array = pa.ListArray.from_arrays([0, 3, 5], encoded_child)

    expected = pd.Series([
        np.array(['a', 'a', 'a'], dtype=object),
        np.array(['b', 'b'], dtype=object),
    ])

    tm.assert_series_equal(list_array.to_pandas(), expected)
3745
3746
def test_dictionary_from_pandas():
    """A Categorical becomes an int8-indexed string dictionary array.

    Checked without missing values, with a missing value, and with an
    additional mask.
    """
    expected_type = pa.dictionary(pa.int8(), pa.string())

    def _assert_converted(array, expected_values):
        assert array.to_pylist() == expected_values
        assert array.type.equals(expected_type)

    _assert_converted(pa.array(pd.Categorical(['a', 'b', 'a'])),
                      ['a', 'b', 'a'])

    # with missing values in categorical
    with_missing = pd.Categorical(['a', 'b', None, 'a'])
    _assert_converted(pa.array(with_missing), ['a', 'b', None, 'a'])

    # with additional mask
    extra_mask = np.array([False, False, False, True])
    _assert_converted(pa.array(with_missing, mask=extra_mask),
                      ['a', 'b', None, None])
3766
3767
def test_dictionary_from_pandas_specified_type():
    """ARROW-7168: an explicitly passed type is respected for Categoricals."""
    # the same as cat = pd.Categorical(['a', 'b']) but explicit about dtypes
    cat = pd.Categorical.from_codes(
        np.array([0, 1], dtype='int8'), np.array(['a', 'b'], dtype=object))

    # different index type -> allow this
    # (the type of the 'codes' in pandas is not part of the data type)
    int16_strings = pa.dictionary(index_type=pa.int16(),
                                  value_type=pa.string())
    result = pa.array(cat, type=int16_strings)
    assert result.type.equals(int16_strings)
    assert result.to_pylist() == ['a', 'b']

    # mismatching values type -> raise error
    int8_ints = pa.dictionary(index_type=pa.int8(), value_type=pa.int64())
    with pytest.raises(pa.ArrowInvalid):
        result = pa.array(cat, type=int8_ints)

    # mismatching order -> raise error (for now a deprecation warning)
    ordered_strings = pa.dictionary(
        index_type=pa.int8(), value_type=pa.string(), ordered=True)
    with pytest.warns(FutureWarning, match="The 'ordered' flag of the passed"):
        result = pa.array(cat, type=ordered_strings)
    assert result.to_pylist() == ['a', 'b']

    # with mask
    result = pa.array(cat, type=int16_strings,
                      mask=np.array([False, True]))
    assert result.type.equals(int16_strings)
    assert result.to_pylist() == ['a', None]

    # empty categorical -> be flexible in values type to allow
    empty_cat = pd.Categorical([])
    for value_type in (pa.string(), pa.int64()):
        requested = pa.dictionary(index_type=pa.int8(),
                                  value_type=value_type)
        result = pa.array(empty_cat, type=requested)
        assert result.type.equals(requested)
        assert result.to_pylist() == []

    # passing non-dictionary type
    plain_cat = pd.Categorical(['a', 'b'])
    result = pa.array(plain_cat, type=pa.string())
    assert result.equals(pa.array(['a', 'b'], type=pa.string()))
    assert result.to_pylist() == ['a', 'b']
3818
3819
3820# ----------------------------------------------------------------------
3821# Array protocol in pandas conversions tests
3822
3823
def test_array_protocol():
    """Nullable-integer pandas data converts through ``__arrow_array__``.

    Skipped for pandas older than 0.24; for pandas older than 0.26.0.dev
    the conversion is expected to raise ``TypeError`` instead.
    """
    pandas_version = Version(pd.__version__)
    if pandas_version < Version('0.24.0'):
        pytest.skip('IntegerArray only introduced in 0.24')

    frame = pd.DataFrame({'a': pd.Series([1, 2, None], dtype='Int64')})

    if pandas_version < Version('0.26.0.dev'):
        # with pandas<=0.25, trying to convert nullable integer errors
        with pytest.raises(TypeError):
            pa.table(frame)
        return

    # __arrow_array__ added to pandas IntegerArray in 0.26.0.dev

    # default conversion
    as_int64 = pa.array([1, 2, None], pa.int64())
    assert pa.table(frame)[0].chunk(0).equals(as_int64)

    # with specifying schema
    float_schema = pa.schema([('a', pa.float64())])
    as_float64 = pa.array([1, 2, None], pa.float64())
    assert pa.table(frame, schema=float_schema)[0].chunk(0).equals(as_float64)

    # pass Series to pa.array
    series = frame['a']
    assert pa.array(series).equals(as_int64)
    assert pa.array(series, type=pa.float64()).equals(as_float64)

    # pass actual ExtensionArray to pa.array
    extension_array = frame['a'].values
    assert pa.array(extension_array).equals(as_int64)
    assert pa.array(extension_array, type=pa.float64()).equals(as_float64)
3859
3860
class DummyExtensionType(pa.PyExtensionType):
    """Extension type with int64 storage and no parameters."""

    def __init__(self):
        super().__init__(pa.int64())

    def __reduce__(self):
        # Rebuilt by calling the class without arguments.
        return (DummyExtensionType, ())
3868
3869
def PandasArray__arrow_array__(self, type=None):
    """Stand-in ``__arrow_array__`` that returns a fixed extension array.

    Both ``self`` and ``type`` are ignored: the return value is hardcoded
    because callers only want to check that this method is invoked.
    """
    return pa.ExtensionArray.from_storage(
        DummyExtensionType(), pa.array([1, 2, 3], type=pa.int64()))
3875
3876
def test_array_protocol_pandas_extension_types(monkeypatch):
    """ARROW-7022: the protocol works for Period / Interval extension dtypes.

    ``__arrow_array__`` is patched onto both pandas array classes, and every
    conversion entry point must return what the patched method produces.
    """
    if Version(pd.__version__) < Version('0.24.0'):
        pytest.skip('Period/IntervalArray only introduced in 0.24')

    expected = pa.ExtensionArray.from_storage(
        DummyExtensionType(), pa.array([1, 2, 3], type=pa.int64()))

    # raising=False lets the patch be applied even when the class does not
    # define the attribute.
    for array_cls in (pd.arrays.PeriodArray, pd.arrays.IntervalArray):
        monkeypatch.setattr(array_cls, "__arrow_array__",
                            PandasArray__arrow_array__, raising=False)

    inputs = (
        pd.period_range("2012-01-01", periods=3, freq="D").array,
        pd.interval_range(1, 4).array,
    )
    for values in inputs:
        # The bare array, then wrapped in a Series, then in an Index.
        for wrap in (lambda obj: obj, pd.Series, pd.Index):
            assert pa.array(wrap(values)).equals(expected)

        # Finally as a DataFrame column going through pa.table.
        column = pa.table(pd.DataFrame({'a': values})).column('a')
        assert column.chunk(0).equals(expected)
3900
3901
3902# ----------------------------------------------------------------------
3903# Pandas ExtensionArray support
3904
3905
def _Int64Dtype__from_arrow__(self, array):
    """Build a pandas ``IntegerArray`` from an int64 Arrow array.

    Test-only implementation of the ``__from_arrow__`` protocol method.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
        Values to convert. For a ChunkedArray only the first chunk is
        converted.

    Returns
    -------
    pandas.arrays.IntegerArray
        Holds a copy of the integer values; positions whose validity bit is
        unset are marked as missing.
    """
    # for test only deal with single chunk for now
    # TODO: do we require handling of chunked arrays in the protocol?
    if isinstance(array, pa.Array):
        arr = array
    else:
        # ChunkedArray - here only deal with a single chunk for the test
        arr = array.chunk(0)

    buflist = arr.buffers()
    # The last buffer holds the values; honour the array's offset so that
    # sliced arrays are read from the right position.
    data = np.frombuffer(buflist[-1], dtype='int64')[
        arr.offset:arr.offset + len(arr)]

    bitmask = buflist[0]
    if bitmask is not None:
        # The validity bitmap is shared with the parent buffer, so it must
        # be read starting at the same offset as the values. Without
        # ``offset`` the mask of a sliced array would be misaligned.
        mask = pa.BooleanArray.from_buffers(
            pa.bool_(), len(arr), [None, bitmask], offset=arr.offset)
        mask = np.asarray(mask)
    else:
        # No validity buffer: every position is treated as valid.
        mask = np.ones(len(arr), dtype=bool)

    # ``mask`` flags valid entries whereas IntegerArray expects the
    # positions that are missing, hence the inversion.
    int_arr = pd.arrays.IntegerArray(data.copy(), ~mask, copy=False)
    return int_arr
3926
3927
def test_convert_to_extension_array(monkeypatch):
    """Extension-dtype columns come back from Arrow as extension blocks.

    After ``__from_arrow__`` is removed from the pandas integer dtype base
    class, the same table must convert to a single non-extension block.
    """
    pandas_version = Version(pd.__version__)
    if pandas_version < Version("0.26.0.dev"):
        pytest.skip("Conversion from IntegerArray to arrow not yet supported")

    import pandas.core.internals as internals

    # The table is created from a DataFrame that has an extension dtype
    # column, so its pandas metadata carries that information.
    mixed_frame = pd.DataFrame({
        'a': [1, 2, 3],
        'b': pd.array([2, 3, 4], dtype='Int64'),
        'c': [4, 5, 6],
    })
    mixed_table = pa.table(mixed_frame)

    # Int64Dtype is recognized, so by default the column becomes an
    # extension block again and the frame round-trips.
    roundtripped = mixed_table.to_pandas()
    blocks = roundtripped._data.blocks
    assert not isinstance(blocks[0], internals.ExtensionBlock)
    assert blocks[0].values.dtype == np.dtype("int64")
    assert isinstance(blocks[1], internals.ExtensionBlock)
    tm.assert_frame_equal(roundtripped, mixed_frame)

    # Same check for a column that contains a missing value.
    nullable_frame = pd.DataFrame({'a': pd.array([1, 2, None], dtype='Int64')})
    roundtripped = pa.table(nullable_frame).to_pandas()
    assert isinstance(roundtripped._data.blocks[0], internals.ExtensionBlock)
    tm.assert_frame_equal(roundtripped, nullable_frame)

    # Remove the protocol method from pandas; the class that defines it
    # depends on the pandas version.
    integer_module = pd.core.arrays.integer
    if pandas_version < Version("1.3.0.dev"):
        protocol_owner = integer_module._IntegerDtype
    else:
        protocol_owner = integer_module.NumericDtype
    monkeypatch.delattr(protocol_owner, "__from_arrow__")

    # Without __from_arrow__ the normal conversion is used.
    roundtripped = mixed_table.to_pandas()
    assert len(roundtripped._data.blocks) == 1
    assert not isinstance(
        roundtripped._data.blocks[0], internals.ExtensionBlock)
3967
3968
class MyCustomIntegerType(pa.PyExtensionType):
    """Extension type with int64 storage that maps to ``pd.Int64Dtype``."""

    def __init__(self):
        super().__init__(pa.int64())

    def __reduce__(self):
        # Rebuilt by calling the class without arguments.
        return (MyCustomIntegerType, ())

    def to_pandas_dtype(self):
        """Return the pandas dtype this extension type is linked to."""
        return pd.Int64Dtype()
3979
3980
def test_conversion_extensiontype_to_extensionarray(monkeypatch):
    """An Arrow extension type converts to its linked pandas ExtensionArray.

    The conversion is checked twice: once with ``__from_arrow__`` available
    on the pandas dtype and once after it has been removed.
    """
    import pandas.core.internals as internals

    pandas_version = Version(pd.__version__)
    if pandas_version < Version("0.24.0"):
        pytest.skip("ExtensionDtype introduced in pandas 0.24")

    ext_array = pa.ExtensionArray.from_storage(
        MyCustomIntegerType(), pa.array([1, 2, 3, 4], pa.int64()))
    ext_table = pa.table({'a': ext_array})

    # Older pandas has no protocol method on Int64Dtype, so provide one.
    needs_protocol_shim = pandas_version < Version("0.26.0.dev")
    if needs_protocol_shim:
        monkeypatch.setattr(
            pd.Int64Dtype, '__from_arrow__', _Int64Dtype__from_arrow__,
            raising=False)

    # The extension type points to Int64Dtype, which knows how to create a
    # pandas ExtensionArray.
    series = ext_array.to_pandas()
    assert isinstance(series._data.blocks[0], internals.ExtensionBlock)
    tm.assert_series_equal(series, pd.Series([1, 2, 3, 4], dtype='Int64'))

    frame = ext_table.to_pandas()
    assert isinstance(frame._data.blocks[0], internals.ExtensionBlock)
    tm.assert_frame_equal(
        frame, pd.DataFrame({'a': pd.array([1, 2, 3, 4], dtype='Int64')}))

    # Now make Int64Dtype *not* have the protocol method: drop either the
    # shim added above or the real implementation of recent pandas.
    if needs_protocol_shim:
        protocol_owner = pd.Int64Dtype
    elif pandas_version < Version("1.3.0.dev"):
        protocol_owner = pd.core.arrays.integer._IntegerDtype
    else:
        protocol_owner = pd.core.arrays.integer.NumericDtype
    monkeypatch.delattr(protocol_owner, "__from_arrow__")

    series = ext_array.to_pandas()
    assert not isinstance(series._data.blocks[0], internals.ExtensionBlock)
    tm.assert_series_equal(series, pd.Series([1, 2, 3, 4]))

    with pytest.raises(ValueError):
        ext_table.to_pandas()
4028
4029
def test_to_pandas_extension_dtypes_mapping():
    """``types_mapper`` selects the pandas dtype used by ``to_pandas``."""
    if Version(pd.__version__) < Version("0.26.0.dev"):
        pytest.skip("Conversion to pandas IntegerArray not yet supported")

    int64_table = pa.table({'a': pa.array([1, 2, 3], pa.int64())})

    # Without a mapper a numpy dtype is used.
    assert int64_table.to_pandas()['a'].dtype == np.dtype('int64')

    # A mapper overrides that default.
    converted = int64_table.to_pandas(
        types_mapper={pa.int64(): pd.Int64Dtype()}.get)
    assert isinstance(converted['a'].dtype, pd.Int64Dtype)

    # Types for which the mapper returns None get the normal conversion.
    int32_table = pa.table({'a': pa.array([1, 2, 3], pa.int32())})
    converted = int32_table.to_pandas(
        types_mapper={pa.int64(): pd.Int64Dtype()}.get)
    assert converted['a'].dtype == np.dtype('int32')

    # The mapper also takes precedence over the pandas metadata.
    source = pd.DataFrame({'a': pd.array([1, 2, 3], dtype="Int64")})
    metadata_table = pa.table(source)
    assert isinstance(metadata_table.to_pandas()['a'].dtype, pd.Int64Dtype)

    period_mapper = {pa.int64(): pd.PeriodDtype('D')}.get
    converted = metadata_table.to_pandas(types_mapper=period_mapper)
    assert isinstance(converted['a'].dtype, pd.PeriodDtype)
4056
4057
def test_array_to_pandas():
    """Period and interval arrays survive ``pa.array(...).to_pandas()``."""
    if Version(pd.__version__) < Version("1.1"):
        pytest.skip("ExtensionDtype to_pandas method missing")

    inputs = (
        pd.period_range("2012-01-01", periods=3, freq="D").array,
        pd.interval_range(1, 4).array,
    )
    for values in inputs:
        roundtripped = pa.array(values).to_pandas()
        tm.assert_series_equal(roundtripped, pd.Series(values))

        # TODO implement proper conversion for chunked array
        # result = pa.table({"col": arr})["col"].to_pandas()
        # expected = pd.Series(arr, name="col")
        # tm.assert_series_equal(result, expected)
4072
4073
4074# ----------------------------------------------------------------------
4075# Legacy metadata compatibility tests
4076
4077
def test_metadata_compat_range_index_pre_0_12():
    """Tables carrying old-style RangeIndex metadata still convert.

    Forward compatibility for metadata created from ``pandas.RangeIndex``
    prior to pyarrow 0.13.0: the index is stored as a regular int64 column
    that is listed by name in ``index_columns``.
    """
    a_values = ['foo', 'bar', None, 'baz']
    b_values = ['a', 'a', 'b', 'b']
    a_arrow = pa.array(a_values, type='utf8')
    b_arrow = pa.array(b_values, type='utf8')

    rng_index_arrow = pa.array([0, 2, 4, 6], type='int64')

    gen_name_0 = '__index_level_0__'
    gen_name_1 = '__index_level_1__'

    def column_entry(name, field_name, pandas_type, numpy_type):
        # One element of the "columns" list of the pandas metadata.
        return {'name': name,
                'field_name': field_name,
                'pandas_type': pandas_type,
                'numpy_type': numpy_type,
                'metadata': None}

    def int_index_entry(name, field_name):
        return column_entry(name, field_name, 'int64', 'int64')

    def str_index_entry(name, field_name):
        return column_entry(name, field_name, 'unicode', 'object')

    def legacy_metadata(index_columns, index_entries):
        # Schema metadata as written alongside pandas 0.23.4; every case
        # describes the same single data column 'a'.
        columns = [column_entry('a', 'a', 'unicode', 'object')]
        columns.extend(index_entries)
        return {
            b'pandas': json.dumps(
                {'index_columns': index_columns,
                 'column_indexes': [{'name': None,
                                     'field_name': None,
                                     'pandas_type': 'unicode',
                                     'numpy_type': 'object',
                                     'metadata': {'encoding': 'UTF-8'}}],
                 'columns': columns,
                 'pandas_version': '0.23.4'}
            )}

    def check(frame_column, index, arrays, names, index_columns,
              index_entries):
        # Build the expected frame, attach the legacy metadata to the
        # table and compare the converted result.
        expected = pd.DataFrame({frame_column: a_values}, index=index)
        table = pa.Table.from_arrays(arrays, names=names)
        table = table.replace_schema_metadata(
            legacy_metadata(index_columns, index_entries))
        tm.assert_frame_equal(table.to_pandas(), expected)

    # Case 1: named RangeIndex
    check(frame_column='a',
          index=pd.RangeIndex(0, 8, step=2, name='qux'),
          arrays=[a_arrow, rng_index_arrow],
          names=['a', 'qux'],
          index_columns=['qux'],
          index_entries=[int_index_entry('qux', 'qux')])

    # Case 2: named RangeIndex, but conflicts with an actual column
    check(frame_column='qux',
          index=pd.RangeIndex(0, 8, step=2, name='qux'),
          arrays=[a_arrow, rng_index_arrow],
          names=['qux', gen_name_0],
          index_columns=[gen_name_0],
          index_entries=[int_index_entry('qux', gen_name_0)])

    # Case 3: unnamed RangeIndex
    check(frame_column='a',
          index=pd.RangeIndex(0, 8, step=2, name=None),
          arrays=[a_arrow, rng_index_arrow],
          names=['a', gen_name_0],
          index_columns=[gen_name_0],
          index_entries=[int_index_entry(None, gen_name_0)])

    # Case 4: MultiIndex with named RangeIndex
    check(frame_column='a',
          index=[pd.RangeIndex(0, 8, step=2, name='qux'), b_values],
          arrays=[a_arrow, rng_index_arrow, b_arrow],
          names=['a', 'qux', gen_name_1],
          index_columns=['qux', gen_name_1],
          index_entries=[int_index_entry('qux', 'qux'),
                         str_index_entry(None, gen_name_1)])

    # Case 5: MultiIndex with unnamed RangeIndex
    check(frame_column='a',
          index=[pd.RangeIndex(0, 8, step=2, name=None), b_values],
          arrays=[a_arrow, rng_index_arrow, b_arrow],
          names=['a', gen_name_0, gen_name_1],
          index_columns=[gen_name_0, gen_name_1],
          index_entries=[int_index_entry(None, gen_name_0),
                         str_index_entry(None, gen_name_1)])
4245
4246
def test_metadata_compat_missing_field_name():
    """Metadata lacking ``field_name`` but describing the index still works.

    This combination occurs in the latest versions of fastparquet (0.3.2),
    but not in pyarrow itself (``field_name`` was added in 0.8, index as
    metadata only later).
    """
    a_values = [1, 2, 3, 4]
    b_values = ['a', 'b', 'c', 'd']

    expected = pd.DataFrame(
        {'a': a_values, 'b': b_values},
        index=pd.RangeIndex(0, 8, step=2, name='qux'))

    table = pa.table({
        'a': pa.array(a_values, type='int64'),
        'b': pa.array(b_values, type='utf8'),
    })

    # Metadata as generated by fastparquet 0.3.2: the column entries have
    # no 'field_name' key and the index is described as a range.
    column_indexes = [
        {'field_name': None,
         'metadata': None,
         'name': None,
         'numpy_type': 'object',
         'pandas_type': 'mixed-integer'},
    ]
    columns = [
        {'metadata': None,
         'name': 'a',
         'numpy_type': 'int64',
         'pandas_type': 'int64'},
        {'metadata': None,
         'name': 'b',
         'numpy_type': 'object',
         'pandas_type': 'unicode'},
    ]
    index_columns = [
        {'kind': 'range',
         'name': 'qux',
         'start': 0,
         'step': 2,
         'stop': 8},
    ]
    pandas_metadata = {
        'column_indexes': column_indexes,
        'columns': columns,
        'index_columns': index_columns,
        'pandas_version': '0.25.0',
    }
    table = table.replace_schema_metadata(
        {b'pandas': json.dumps(pandas_metadata)})

    tm.assert_frame_equal(table.to_pandas(), expected)
4296
4297
def test_metadata_index_name_not_json_serializable():
    """A ``np.int64`` index name is recorded as a string in the metadata."""
    # np.int64 is not json serializable by default
    index = pd.RangeIndex(0, 4, name=np.int64(6))
    pandas_metadata = pa.table(pd.DataFrame(index=index)).schema.pandas_metadata
    index_descr = pandas_metadata['index_columns'][0]
    assert index_descr['name'] == '6'
4303
4304
def test_metadata_index_name_is_json_serializable():
    """A plain ``int`` index name is recorded unchanged in the metadata."""
    # a Python int is json serializable by default
    index = pd.RangeIndex(0, 4, name=6)
    pandas_metadata = pa.table(pd.DataFrame(index=index)).schema.pandas_metadata
    index_descr = pandas_metadata['index_columns'][0]
    assert index_descr['name'] == 6
4310
4311
def make_df_with_timestamps():
    """Return a frame with millisecond and nanosecond timestamp columns.

    ``dateTimeMs`` holds ``numpy.datetime64`` values with millisecond unit
    and ``object`` dtype; some of them deliberately don't fit in the range
    that is possible with nanosecond timestamps. ``dateTimeNs`` holds
    nanosecond values stored as ``datetime64[ns]``.

    Returns
    -------
    pandas.DataFrame
        Four rows, columns ``dateTimeMs`` and ``dateTimeNs``.

    Raises
    ------
    AssertionError
        If the constructed columns do not have the dtypes described above.
    """
    # Ask for object dtype explicitly instead of relying on dtype inference
    # falling back to object for the out-of-range values: what pandas infers
    # for such a list is version dependent, while the callers need the
    # column to be object either way.
    millisecond_values = pd.Series([
        np.datetime64('0001-01-01 00:00', 'ms'),
        np.datetime64('2012-05-02 12:35', 'ms'),
        np.datetime64('2012-05-03 15:42', 'ms'),
        np.datetime64('3000-05-03 15:42', 'ms'),
    ], dtype=object)
    df = pd.DataFrame({
        'dateTimeMs': millisecond_values,
        'dateTimeNs': [
            np.datetime64('1991-01-01 00:00', 'ns'),
            np.datetime64('2012-05-02 12:35', 'ns'),
            np.datetime64('2012-05-03 15:42', 'ns'),
            np.datetime64('2050-05-03 15:42', 'ns'),
        ],
    })
    # Not part of what we're testing, just ensuring that the inputs are what
    # we expect.
    assert (df.dateTimeMs.dtype, df.dateTimeNs.dtype) == (
        # O == object, <M8[ns] == timestamp64[ns]
        np.dtype("O"), np.dtype("<M8[ns]")
    )
    return df
4336
4337
@pytest.mark.parquet
def test_timestamp_as_object_parquet(tempdir):
    """Timestamps survive a Parquet round trip with ``timestamp_as_object``.

    Timestamps can be stored as Parquet and reloaded into pandas with no
    loss of information if the ``timestamp_as_object`` option is True.
    """
    original = make_df_with_timestamps()
    path = tempdir / "timestamps_from_pandas.parquet"

    pq.write_table(pa.Table.from_pandas(original), path, version="2.0")
    reloaded = pq.read_table(path).to_pandas(timestamp_as_object=True)

    tm.assert_frame_equal(original, reloaded)
4349
4350
def test_timestamp_as_object_out_of_range():
    """Out-of-range timestamps survive an Arrow round trip as objects.

    Out of range timestamps can be converted to Arrow and reloaded into
    pandas with no loss of information if the ``timestamp_as_object``
    option is True.
    """
    original = make_df_with_timestamps()
    reloaded = pa.Table.from_pandas(original).to_pandas(
        timestamp_as_object=True)
    tm.assert_frame_equal(original, reloaded)
4358
4359
@pytest.mark.parametrize("resolution", ["s", "ms", "us"])
@pytest.mark.parametrize("tz", [None, "America/New_York"])
# One datetime outside nanosecond range, one inside nanosecond range:
@pytest.mark.parametrize("dt", [datetime(1553, 1, 1), datetime(2020, 1, 1)])
def test_timestamp_as_object_non_nanosecond(resolution, tz, dt):
    """Non-nanosecond timestamps come back as ``datetime`` objects.

    Timestamps can be converted to Arrow and reloaded into pandas with no
    loss of information if the ``timestamp_as_object`` option is True. Both
    the array-level and the table-level conversion are checked.
    """
    arr = pa.array([dt], type=pa.timestamp(resolution, tz=tz))
    table = pa.table({'a': arr})

    conversions = [
        arr.to_pandas(timestamp_as_object=True),
        table.to_pandas(timestamp_as_object=True)['a'],
    ]
    for converted in conversions:
        assert converted.dtype == object
        assert isinstance(converted[0], datetime)
        if not tz:
            assert converted[0].tzinfo is None
            expected = dt
        else:
            assert converted[0].tzinfo is not None
            # Expected value: ``dt`` passed through the result's own
            # ``tzinfo.fromutc``.
            expected = converted[0].tzinfo.fromutc(dt)
        assert converted[0] == expected
4383
4384
def test_threaded_pandas_import():
    """Run the ``pandas_threaded_import.py`` helper script."""
    script_name = "pandas_threaded_import.py"
    invoke_script(script_name)