]>
Commit | Line | Data |
---|---|---|
1d09f67e TL |
1 | # Licensed to the Apache Software Foundation (ASF) under one |
2 | # or more contributor license agreements. See the NOTICE file | |
3 | # distributed with this work for additional information | |
4 | # regarding copyright ownership. The ASF licenses this file | |
5 | # to you under the Apache License, Version 2.0 (the | |
6 | # "License"); you may not use this file except in compliance | |
7 | # with the License. You may obtain a copy of the License at | |
8 | # | |
9 | # http://www.apache.org/licenses/LICENSE-2.0 | |
10 | # | |
11 | # Unless required by applicable law or agreed to in writing, | |
12 | # software distributed under the License is distributed on an | |
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
14 | # KIND, either express or implied. See the License for the | |
15 | # specific language governing permissions and limitations | |
16 | # under the License. | |
17 | ||
18 | import io | |
19 | import os | |
20 | import sys | |
21 | import tempfile | |
22 | import pytest | |
23 | import hypothesis as h | |
24 | import hypothesis.strategies as st | |
25 | ||
26 | import numpy as np | |
27 | ||
28 | import pyarrow as pa | |
29 | import pyarrow.tests.strategies as past | |
30 | from pyarrow.feather import (read_feather, write_feather, read_table, | |
31 | FeatherDataset) | |
32 | ||
33 | ||
try:
    # pandas is an optional dependency: tests that need it are marked with
    # @pytest.mark.pandas and are skipped when this import fails.
    from pandas.testing import assert_frame_equal
    import pandas as pd
    import pyarrow.pandas_compat
except ImportError:
    pass
40 | ||
41 | ||
@pytest.fixture(scope='module')
def datadir(base_datadir):
    """Directory containing the checked-in feather test data files."""
    return base_datadir / 'feather'
45 | ||
46 | ||
def random_path(prefix='feather_'):
    """Return a fresh temporary file path starting with *prefix*.

    The file is not created; callers are expected to register the path in
    TEST_FILES so teardown_module can clean it up.
    """
    # NOTE(review): mktemp is race-prone by design; acceptable here because
    # the path is only used for test fixtures.
    path = tempfile.mktemp(prefix=prefix)
    return path
49 | ||
50 | ||
@pytest.fixture(scope="module", params=[1, 2])
def version(request):
    """Parametrized fixture yielding the Feather format version (1 or 2)."""
    yield request.param
54 | ||
55 | ||
@pytest.fixture(scope="module", params=[None, "uncompressed", "lz4", "zstd"])
def compression(request):
    """Parametrized fixture yielding a compression option.

    Skips lz4/zstd when the corresponding codec is not built into pyarrow.
    """
    if request.param in ['lz4', 'zstd'] and not pa.Codec.is_available(
            request.param):
        pytest.skip(f'{request.param} is not available')
    yield request.param
62 | ||
63 | ||
64 | TEST_FILES = None | |
65 | ||
66 | ||
def setup_module(module):
    """Reset the module-level list of temporary files before tests run."""
    global TEST_FILES
    TEST_FILES = []
70 | ||
71 | ||
def teardown_module(module):
    """Best-effort removal of every file registered in TEST_FILES."""
    for path in TEST_FILES:
        try:
            os.remove(path)
        # os.error is a legacy alias of OSError; use the canonical name.
        # Missing files are fine: some tests never actually write the path.
        except OSError:
            pass
78 | ||
79 | ||
@pytest.mark.pandas
def test_file_not_exist():
    """Reading a non-existent path raises ArrowIOError."""
    with pytest.raises(pa.ArrowIOError):
        read_feather('test_invalid_file')
84 | ||
85 | ||
def _check_pandas_roundtrip(df, expected=None, path=None,
                            columns=None, use_threads=False,
                            version=None, compression=None,
                            compression_level=None):
    """Write *df* to Feather, read it back, and compare with *expected*.

    When *expected* is None the original *df* is used for the comparison.
    The file path is registered for cleanup in teardown_module.
    """
    if path is None:
        path = random_path()
    TEST_FILES.append(path)

    write_feather(df, path, compression=compression,
                  compression_level=compression_level, version=version)
    if not os.path.exists(path):
        raise Exception('file not written')

    result = read_feather(path, columns, use_threads=use_threads)
    assert_frame_equal(result, df if expected is None else expected)
104 | ||
105 | ||
def _check_arrow_roundtrip(table, path=None, compression=None):
    """Write *table* to Feather and assert reading it back is equal."""
    if path is None:
        path = random_path()
    TEST_FILES.append(path)

    write_feather(table, path, compression=compression)
    if not os.path.exists(path):
        raise Exception('file not written')

    assert read_table(path).equals(table)
117 | ||
118 | ||
def _assert_error_on_write(df, exc, path=None, version=2):
    """Assert that writing *df* as Feather raises the exception type *exc*.

    The target path is registered in TEST_FILES so teardown_module removes
    any partially written output.
    """
    if path is None:
        path = random_path()
    TEST_FILES.append(path)

    # Use the context-manager form of pytest.raises instead of the legacy
    # callable form wrapped in a throwaway closure.
    with pytest.raises(exc):
        write_feather(df, path, version=version)
132 | ||
133 | ||
def test_dataset(version):
    """A table split across several files round-trips via FeatherDataset."""
    num_rows, num_cols = 100, 100
    num_files = 5
    paths = [random_path() for _ in range(num_files)]
    table = pa.table({"col_" + str(i): np.random.randn(num_rows)
                      for i in range(num_cols)})

    TEST_FILES.extend(paths)
    chunk = num_rows // num_files
    for index, path in enumerate(paths):
        start = index * chunk
        write_feather(table[start:start + chunk], path, version=version)

    result = FeatherDataset(paths).read_table()
    assert result.equals(table)
155 | ||
156 | ||
@pytest.mark.pandas
def test_float_no_nulls(version):
    """Float columns without nulls round-trip through pandas."""
    num_values = 100
    data = {dtype: np.random.randn(num_values).astype(dtype)
            for dtype in ('f4', 'f8')}
    _check_pandas_roundtrip(pd.DataFrame(data), version=version)
169 | ||
170 | ||
@pytest.mark.pandas
def test_read_table(version):
    """read_table returns the written table, with and without memory map."""
    path = random_path()
    TEST_FILES.append(path)

    values = np.random.randint(0, 100, size=(100, 100))
    names = ['col_' + str(i) for i in range(100)]
    table = pa.Table.from_arrays(values, names)

    write_feather(table, path, version=version)

    assert read_table(path).equals(table)

    # Test without memory mapping
    assert read_table(path, memory_map=False).equals(table)
    assert_frame_equal(table.to_pandas(),
                       read_feather(path, memory_map=False))
193 | ||
194 | ||
@pytest.mark.pandas
def test_float_nulls(version):
    """Float columns with a null mask round-trip as Arrow and pandas."""
    num_values = 100
    path = random_path()
    TEST_FILES.append(path)

    null_mask = np.random.randint(0, 10, size=num_values) < 3
    dtypes = ['f4', 'f8']

    arrays = []
    expected_cols = []
    for dtype in dtypes:
        values = np.random.randn(num_values).astype(dtype)
        # build the Arrow array first, then overwrite the masked slots so
        # expected_cols holds NaN where the Arrow values are null
        arrays.append(pa.array(values, mask=null_mask))
        values[null_mask] = np.nan
        expected_cols.append(values)

    table = pa.table(arrays, names=dtypes)
    _check_arrow_roundtrip(table)
    _check_pandas_roundtrip(table.to_pandas(), version=version)
220 | ||
221 | ||
@pytest.mark.pandas
def test_integer_no_nulls(version):
    """All integer dtypes without nulls round-trip via pandas and Arrow."""
    num_values = 100
    numpy_dtypes = ['i1', 'i2', 'i4', 'i8',
                    'u1', 'u2', 'u4', 'u8']

    columns = {dtype: np.random.randint(0, 100,
                                        size=num_values).astype(dtype)
               for dtype in numpy_dtypes}

    _check_pandas_roundtrip(pd.DataFrame(columns), version=version)
    _check_arrow_roundtrip(pa.table(list(columns.values()),
                                    names=numpy_dtypes))
240 | ||
241 | ||
242 | @pytest.mark.pandas | |
243 | def test_platform_numpy_integers(version): | |
244 | data = {} | |
245 | ||
246 | numpy_dtypes = ['longlong'] | |
247 | num_values = 100 | |
248 | ||
249 | for dtype in numpy_dtypes: | |
250 | values = np.random.randint(0, 100, size=num_values) | |
251 | data[dtype] = values.astype(dtype) | |
252 | ||
253 | df = pd.DataFrame(data) | |
254 | _check_pandas_roundtrip(df, version=version) | |
255 | ||
256 | ||
@pytest.mark.pandas
def test_integer_with_nulls(version):
    """Nullable integer columns round-trip; pandas upcasts them to float."""
    path = random_path()
    TEST_FILES.append(path)

    int_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8']
    num_values = 100
    null_mask = np.random.randint(0, 10, size=num_values) < 3

    arrays = []
    expected_cols = []
    for dtype in int_dtypes:
        values = np.random.randint(0, 100, size=num_values)
        arrays.append(pa.array(values, mask=null_mask))

        # pandas represents the nulls as NaN in a float64 column
        expected = values.astype('f8')
        expected[null_mask] = np.nan
        expected_cols.append(expected)

    table = pa.table(arrays, names=int_dtypes)
    _check_arrow_roundtrip(table)
    _check_pandas_roundtrip(table.to_pandas(), version=version)
283 | ||
284 | ||
@pytest.mark.pandas
def test_boolean_no_nulls(version):
    """A boolean column without nulls round-trips via pandas."""
    num_values = 100
    np.random.seed(0)
    bools = np.random.randn(num_values) > 0
    _check_pandas_roundtrip(pd.DataFrame({'bools': bools}), version=version)
293 | ||
294 | ||
@pytest.mark.pandas
def test_boolean_nulls(version):
    """Nullable booleans round-trip; pandas upcasts them to object dtype."""
    path = random_path()
    TEST_FILES.append(path)

    num_values = 100
    np.random.seed(0)

    mask = np.random.randint(0, 10, size=num_values) < 3
    values = np.random.randint(0, 10, size=num_values) < 5

    table = pa.table([pa.array(values, mask=mask)], names=['bools'])
    _check_arrow_roundtrip(table)
    _check_pandas_roundtrip(table.to_pandas(), version=version)
312 | ||
313 | ||
def test_buffer_bounds_error(version):
    """Regression test for ARROW-1676: varying lengths with a leading null."""
    path = random_path()
    TEST_FILES.append(path)

    for length in range(16, 256):
        values = pa.array([None] + list(range(length)), type=pa.float64())
        _check_arrow_roundtrip(
            pa.Table.from_arrays([values], names=["arr"]))
325 | ||
326 | ||
def test_boolean_object_nulls(version):
    """Object-dtype booleans containing None round-trip as Arrow."""
    repeats = 100
    arr = np.array([False, None, True] * repeats, dtype=object)
    _check_arrow_roundtrip(pa.Table.from_arrays([arr], names=["arr"]))
334 | ||
335 | ||
@pytest.mark.pandas
def test_delete_partial_file_on_error(version):
    """A failed write must not leave a partially written file behind."""
    if sys.platform == 'win32':
        pytest.skip('Windows hangs on to file handle for some reason')

    class CustomClass:
        pass

    # strings will fail
    df = pd.DataFrame(
        {
            'numbers': range(5),
            'strings': [b'foo', None, 'bar', CustomClass(), np.nan]},
        columns=['numbers', 'strings'])

    path = random_path()
    try:
        write_feather(df, path, version=version)
    except Exception:
        pass

    # the incomplete output file must have been cleaned up by the writer
    assert not os.path.exists(path)
358 | ||
359 | ||
@pytest.mark.pandas
def test_strings(version):
    """String columns round-trip; mixed bytes/str is coerced to binary."""
    repeats = 1000

    # Mixed bytes, unicode, strings coerced to binary
    values = [b'foo', None, 'bar', 'qux', np.nan]
    expected = pd.DataFrame(
        {'strings': [b'foo', None, b'bar', b'qux', np.nan] * repeats})
    _check_pandas_roundtrip(pd.DataFrame({'strings': values * repeats}),
                            expected, version=version)

    # embedded nulls are ok, whether None or NaN
    for values in (['foo', None, 'bar', 'qux', None],
                   ['foo', None, 'bar', 'qux', np.nan]):
        df = pd.DataFrame({'strings': values * repeats})
        expected = pd.DataFrame({'strings': values * repeats})
        _check_pandas_roundtrip(df, expected, version=version)
382 | ||
383 | ||
@pytest.mark.pandas
def test_empty_strings(version):
    """A column of empty strings round-trips."""
    df = pd.DataFrame({'strings': [''] * 10})
    _check_pandas_roundtrip(df, version=version)
388 | ||
389 | ||
@pytest.mark.pandas
def test_all_none(version):
    """A column that is entirely None round-trips."""
    df = pd.DataFrame({'all_none': [None] * 10})
    _check_pandas_roundtrip(df, version=version)
394 | ||
395 | ||
@pytest.mark.pandas
def test_all_null_category(version):
    """Regression test for ARROW-1188: fully-null categorical column."""
    df = pd.DataFrame({"A": (1, 2, 3), "B": (None, None, None)})
    df = df.assign(B=df.B.astype("category"))
    _check_pandas_roundtrip(df, version=version)
402 | ||
403 | ||
@pytest.mark.pandas
def test_multithreaded_read(version):
    """A wide frame reads back correctly with use_threads=True."""
    columns = {'c{}'.format(i): [''] * 10 for i in range(100)}
    _check_pandas_roundtrip(pd.DataFrame(columns), use_threads=True,
                            version=version)
410 | ||
411 | ||
@pytest.mark.pandas
def test_nan_as_null(version):
    """NaN values, including a NaN that is not the numpy.nan singleton,
    are treated as nulls."""
    # Create a nan that is not numpy.nan
    values = np.array(['foo', np.nan, np.nan * 2, 'bar'] * 10)
    df = pd.DataFrame({'strings': values})
    _check_pandas_roundtrip(df, version=version)
418 | ||
419 | ||
@pytest.mark.pandas
def test_category(version):
    """Categorical columns round-trip; NaN categories come back as None."""
    repeats = 1000
    raw = ['foo', None, 'bar', 'qux', np.nan]
    df = pd.DataFrame({'strings': raw * repeats})
    df['strings'] = df['strings'].astype('category')

    clean = ['foo', None, 'bar', 'qux', None]
    expected = pd.DataFrame({'strings': pd.Categorical(clean * repeats)})
    _check_pandas_roundtrip(df, expected, version=version)
430 | ||
431 | ||
@pytest.mark.pandas
def test_timestamp(version):
    """Naive and timezone-aware timestamps round-trip."""
    df = pd.DataFrame({'naive': pd.date_range('2016-03-28', periods=10)})
    df['with_tz'] = (df.naive.dt.tz_localize('utc')
                     .dt.tz_convert('America/Los_Angeles'))

    _check_pandas_roundtrip(df, version=version)
439 | ||
440 | ||
@pytest.mark.pandas
def test_timestamp_with_nulls(version):
    """Timestamp columns containing None round-trip, with and without tz."""
    df = pd.DataFrame({'test': [pd.Timestamp(2016, 1, 1),
                                None,
                                pd.Timestamp(2016, 1, 3)]})
    df['with_tz'] = df.test.dt.tz_localize('utc')

    _check_pandas_roundtrip(df, version=version)
449 | ||
450 | ||
@pytest.mark.pandas
@pytest.mark.xfail(reason="not supported", raises=TypeError)
def test_timedelta_with_nulls_v1():
    """Timedelta columns are not supported by Feather V1 (expected failure)."""
    df = pd.DataFrame({'test': [pd.Timedelta('1 day'),
                                None,
                                pd.Timedelta('3 day')]})
    _check_pandas_roundtrip(df, version=1)
458 | ||
459 | ||
460 | @pytest.mark.pandas | |
461 | def test_timedelta_with_nulls(): | |
462 | df = pd.DataFrame({'test': [pd.Timedelta('1 day'), | |
463 | None, | |
464 | pd.Timedelta('3 day')]}) | |
465 | _check_pandas_roundtrip(df, version=2) | |
466 | ||
467 | ||
468 | @pytest.mark.pandas | |
469 | def test_out_of_float64_timestamp_with_nulls(version): | |
470 | df = pd.DataFrame( | |
471 | {'test': pd.DatetimeIndex([1451606400000000001, | |
472 | None, 14516064000030405])}) | |
473 | df['with_tz'] = df.test.dt.tz_localize('utc') | |
474 | _check_pandas_roundtrip(df, version=version) | |
475 | ||
476 | ||
477 | @pytest.mark.pandas | |
478 | def test_non_string_columns(version): | |
479 | df = pd.DataFrame({0: [1, 2, 3, 4], | |
480 | 1: [True, False, True, False]}) | |
481 | ||
482 | expected = df.rename(columns=str) | |
483 | _check_pandas_roundtrip(df, expected, version=version) | |
484 | ||
485 | ||
486 | @pytest.mark.pandas | |
487 | @pytest.mark.skipif(not os.path.supports_unicode_filenames, | |
488 | reason='unicode filenames not supported') | |
489 | def test_unicode_filename(version): | |
490 | # GH #209 | |
491 | name = (b'Besa_Kavaj\xc3\xab.feather').decode('utf-8') | |
492 | df = pd.DataFrame({'foo': [1, 2, 3, 4]}) | |
493 | _check_pandas_roundtrip(df, path=random_path(prefix=name), | |
494 | version=version) | |
495 | ||
496 | ||
497 | @pytest.mark.pandas | |
498 | def test_read_columns(version): | |
499 | df = pd.DataFrame({ | |
500 | 'foo': [1, 2, 3, 4], | |
501 | 'boo': [5, 6, 7, 8], | |
502 | 'woo': [1, 3, 5, 7] | |
503 | }) | |
504 | expected = df[['boo', 'woo']] | |
505 | ||
506 | _check_pandas_roundtrip(df, expected, version=version, | |
507 | columns=['boo', 'woo']) | |
508 | ||
509 | ||
def test_overwritten_file(version):
    """Overwriting an existing feather file with a smaller table works."""
    path = random_path()
    TEST_FILES.append(path)

    num_values = 100
    np.random.seed(0)
    values = np.random.randint(0, 10, size=num_values)

    # write a first table, then overwrite the same path with a smaller one
    write_feather(pa.table({'ints': values}), path)
    _check_arrow_roundtrip(pa.table({'more_ints': values[:num_values // 2]}),
                           path=path)
524 | ||
525 | ||
526 | @pytest.mark.pandas | |
527 | def test_filelike_objects(version): | |
528 | buf = io.BytesIO() | |
529 | ||
530 | # the copy makes it non-strided | |
531 | df = pd.DataFrame(np.arange(12).reshape(4, 3), | |
532 | columns=['a', 'b', 'c']).copy() | |
533 | write_feather(df, buf, version=version) | |
534 | ||
535 | buf.seek(0) | |
536 | ||
537 | result = read_feather(buf) | |
538 | assert_frame_equal(result, df) | |
539 | ||
540 | ||
541 | @pytest.mark.pandas | |
542 | @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") | |
543 | @pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") | |
544 | def test_sparse_dataframe(version): | |
545 | if not pa.pandas_compat._pandas_api.has_sparse: | |
546 | pytest.skip("version of pandas does not support SparseDataFrame") | |
547 | # GH #221 | |
548 | data = {'A': [0, 1, 2], | |
549 | 'B': [1, 0, 1]} | |
550 | df = pd.DataFrame(data).to_sparse(fill_value=1) | |
551 | expected = df.to_dense() | |
552 | _check_pandas_roundtrip(df, expected, version=version) | |
553 | ||
554 | ||
555 | @pytest.mark.pandas | |
556 | def test_duplicate_columns_pandas(): | |
557 | ||
558 | # https://github.com/wesm/feather/issues/53 | |
559 | # not currently able to handle duplicate columns | |
560 | df = pd.DataFrame(np.arange(12).reshape(4, 3), | |
561 | columns=list('aaa')).copy() | |
562 | _assert_error_on_write(df, ValueError) | |
563 | ||
564 | ||
def test_duplicate_columns():
    """Duplicate column names round-trip with V2 but are rejected by V1."""
    table = pa.table([[1, 2, 3], [4, 5, 6], [7, 8, 9]], names=['a', 'a', 'b'])
    _check_arrow_roundtrip(table)
    _assert_error_on_write(table, ValueError, version=1)
570 | ||
571 | ||
572 | @pytest.mark.pandas | |
573 | def test_unsupported(): | |
574 | # https://github.com/wesm/feather/issues/240 | |
575 | # serializing actual python objects | |
576 | ||
577 | # custom python objects | |
578 | class A: | |
579 | pass | |
580 | ||
581 | df = pd.DataFrame({'a': [A(), A()]}) | |
582 | _assert_error_on_write(df, ValueError) | |
583 | ||
584 | # non-strings | |
585 | df = pd.DataFrame({'a': ['a', 1, 2.0]}) | |
586 | _assert_error_on_write(df, TypeError) | |
587 | ||
588 | ||
589 | @pytest.mark.pandas | |
590 | def test_v2_set_chunksize(): | |
591 | df = pd.DataFrame({'A': np.arange(1000)}) | |
592 | table = pa.table(df) | |
593 | ||
594 | buf = io.BytesIO() | |
595 | write_feather(table, buf, chunksize=250, version=2) | |
596 | ||
597 | result = buf.getvalue() | |
598 | ||
599 | ipc_file = pa.ipc.open_file(pa.BufferReader(result)) | |
600 | assert ipc_file.num_record_batches == 4 | |
601 | assert len(ipc_file.get_batch(0)) == 250 | |
602 | ||
603 | ||
604 | @pytest.mark.pandas | |
605 | @pytest.mark.lz4 | |
606 | @pytest.mark.snappy | |
607 | @pytest.mark.zstd | |
608 | def test_v2_compression_options(): | |
609 | df = pd.DataFrame({'A': np.arange(1000)}) | |
610 | ||
611 | cases = [ | |
612 | # compression, compression_level | |
613 | ('uncompressed', None), | |
614 | ('lz4', None), | |
615 | ('zstd', 1), | |
616 | ('zstd', 10) | |
617 | ] | |
618 | ||
619 | for compression, compression_level in cases: | |
620 | _check_pandas_roundtrip(df, compression=compression, | |
621 | compression_level=compression_level) | |
622 | ||
623 | buf = io.BytesIO() | |
624 | ||
625 | # LZ4 doesn't support compression_level | |
626 | with pytest.raises(pa.ArrowInvalid, | |
627 | match="doesn't support setting a compression level"): | |
628 | write_feather(df, buf, compression='lz4', compression_level=10) | |
629 | ||
630 | # Trying to compress with V1 | |
631 | with pytest.raises( | |
632 | ValueError, | |
633 | match="Feather V1 files do not support compression option"): | |
634 | write_feather(df, buf, compression='lz4', version=1) | |
635 | ||
636 | # Trying to set chunksize with V1 | |
637 | with pytest.raises( | |
638 | ValueError, | |
639 | match="Feather V1 files do not support chunksize option"): | |
640 | write_feather(df, buf, chunksize=4096, version=1) | |
641 | ||
642 | # Unsupported compressor | |
643 | with pytest.raises(ValueError, | |
644 | match='compression="snappy" not supported'): | |
645 | write_feather(df, buf, compression='snappy') | |
646 | ||
647 | ||
def test_v2_lz4_default_compression():
    """ARROW-8750: compression=None selects lz4 when the codec is built."""
    if not pa.Codec.is_available('lz4_frame'):
        pytest.skip("LZ4 compression support is not built in C++")

    # some highly compressible data
    table = pa.table([np.repeat(0, 100000)], names=['f0'])

    def _serialize(**kwargs):
        # serialize the table to bytes with the given write options
        buf = io.BytesIO()
        write_feather(table, buf, **kwargs)
        return buf.getvalue()

    # the default output must be smaller than the uncompressed output
    assert len(_serialize()) < len(_serialize(compression='uncompressed'))
666 | ||
667 | ||
def test_v1_unsupported_types():
    """Nested (list) types are rejected by Feather V1 with a clear message."""
    table = pa.table([pa.array([[1, 2, 3], [], None])], names=['f0'])

    buf = io.BytesIO()
    with pytest.raises(TypeError,
                       match=("Unsupported Feather V1 type: "
                              "list<item: int64>. "
                              "Use V2 format to serialize all Arrow types.")):
        write_feather(table, buf, version=1)
677 | ||
678 | ||
679 | @pytest.mark.slow | |
680 | @pytest.mark.pandas | |
681 | def test_large_dataframe(version): | |
682 | df = pd.DataFrame({'A': np.arange(400000000)}) | |
683 | _check_pandas_roundtrip(df, version=version) | |
684 | ||
685 | ||
686 | @pytest.mark.large_memory | |
687 | @pytest.mark.pandas | |
688 | def test_chunked_binary_error_message(): | |
689 | # ARROW-3058: As Feather does not yet support chunked columns, we at least | |
690 | # make sure it's clear to the user what is going on | |
691 | ||
692 | # 2^31 + 1 bytes | |
693 | values = [b'x'] + [ | |
694 | b'x' * (1 << 20) | |
695 | ] * 2 * (1 << 10) | |
696 | df = pd.DataFrame({'byte_col': values}) | |
697 | ||
698 | # Works fine with version 2 | |
699 | buf = io.BytesIO() | |
700 | write_feather(df, buf, version=2) | |
701 | result = read_feather(pa.BufferReader(buf.getvalue())) | |
702 | assert_frame_equal(result, df) | |
703 | ||
704 | with pytest.raises(ValueError, match="'byte_col' exceeds 2GB maximum " | |
705 | "capacity of a Feather binary column. This restriction " | |
706 | "may be lifted in the future"): | |
707 | write_feather(df, io.BytesIO(), version=1) | |
708 | ||
709 | ||
def test_feather_without_pandas(tempdir, version):
    """ARROW-8345: the Arrow-table path works without pandas installed."""
    table = pa.table([pa.array([1, 2, 3])], names=['f0'])
    path = str(tempdir / "data.feather")
    _check_arrow_roundtrip(table, path)
715 | ||
716 | ||
717 | @pytest.mark.pandas | |
718 | def test_read_column_selection(version): | |
719 | # ARROW-8641 | |
720 | df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=['a', 'b', 'c']) | |
721 | ||
722 | # select columns as string names or integer indices | |
723 | _check_pandas_roundtrip( | |
724 | df, columns=['a', 'c'], expected=df[['a', 'c']], version=version) | |
725 | _check_pandas_roundtrip( | |
726 | df, columns=[0, 2], expected=df[['a', 'c']], version=version) | |
727 | ||
728 | # different order is followed | |
729 | _check_pandas_roundtrip( | |
730 | df, columns=['b', 'a'], expected=df[['b', 'a']], version=version) | |
731 | _check_pandas_roundtrip( | |
732 | df, columns=[1, 0], expected=df[['b', 'a']], version=version) | |
733 | ||
734 | ||
def test_read_column_duplicated_selection(tempdir, version):
    """A column may appear more than once in the read selection."""
    table = pa.table([[1, 2, 3], [4, 5, 6], [7, 8, 9]], names=['a', 'b', 'c'])
    path = str(tempdir / "data.feather")
    write_feather(table, path, version=version)

    expected = pa.table([[1, 2, 3], [4, 5, 6], [1, 2, 3]],
                        names=['a', 'b', 'a'])
    for selection in (['a', 'b', 'a'], [0, 1, 0]):
        assert read_table(path, columns=selection).equals(expected)
746 | ||
747 | ||
def test_read_column_duplicated_in_file(tempdir):
    """Duplicate column names stored in the file itself (V2-only)."""
    table = pa.table([[1, 2, 3], [4, 5, 6], [7, 8, 9]], names=['a', 'b', 'a'])
    path = str(tempdir / "data.feather")
    write_feather(table, path, version=2)

    # no selection works fine
    assert read_table(path).equals(table)

    # selection with indices works
    assert read_table(path, columns=[0, 2]).column_names == ['a', 'a']

    # selection by (ambiguous) column names errors
    with pytest.raises(ValueError):
        read_table(path, columns=['a', 'b'])
765 | ||
766 | ||
def test_nested_types(compression):
    """ARROW-8860: struct, list, and nested list columns round-trip."""
    struct_col = pa.StructArray.from_arrays([[0, 1, 2], [1, 2, 3]],
                                            names=["f1", "f2"])
    list_col = pa.array([[1, 2], [3, 4]])
    nested_list_col = pa.array([[[1, 2], [3, 4]], [[5, 6], None]])

    for col in (struct_col, list_col, nested_list_col):
        _check_arrow_roundtrip(pa.table({'col': col}),
                               compression=compression)
778 | ||
779 | ||
@h.given(past.all_tables, st.sampled_from(["uncompressed", "lz4", "zstd"]))
def test_roundtrip(table, compression):
    """Property-based round-trip over hypothesis-generated Arrow tables."""
    _check_arrow_roundtrip(table, compression=compression)
783 | ||
784 | ||
@pytest.mark.lz4
def test_feather_v017_experimental_compression_backward_compatibility(datadir):
    """ARROW-11163: newer pyarrow can read 0.17.0 files written with the
    experimental compression support (predating IPC-format compression
    officially added in 1.0.0)."""

    # file generated with:
    #     table = pa.table({'a': range(5)})
    #     from pyarrow import feather
    #     feather.write_feather(
    #         table, "v0.17.0.version=2-compression=lz4.feather",
    #         compression="lz4", version=2)
    expected = pa.table({'a': range(5)})
    result = read_table(datadir / "v0.17.0.version=2-compression=lz4.feather")
    assert result.equals(expected)