]>
Commit | Line | Data |
---|---|---|
1d09f67e TL |
1 | # Licensed to the Apache Software Foundation (ASF) under one |
2 | # or more contributor license agreements. See the NOTICE file | |
3 | # distributed with this work for additional information | |
4 | # regarding copyright ownership. The ASF licenses this file | |
5 | # to you under the Apache License, Version 2.0 (the | |
6 | # "License"); you may not use this file except in compliance | |
7 | # with the License. You may obtain a copy of the License at | |
8 | # | |
9 | # http://www.apache.org/licenses/LICENSE-2.0 | |
10 | # | |
11 | # Unless required by applicable law or agreed to in writing, | |
12 | # software distributed under the License is distributed on an | |
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
14 | # KIND, either express or implied. See the License for the | |
15 | # specific language governing permissions and limitations | |
16 | # under the License. | |
17 | ||
18 | import numpy as np | |
19 | import pandas as pd | |
20 | import pandas.util.testing as tm | |
21 | ||
22 | import pyarrow as pa | |
23 | ||
24 | ||
class PandasConversionsBase(object):
    """Shared fixture for the pandas <-> Arrow conversion benchmarks.

    Subclasses inherit ``setup``, which stores a one-column DataFrame on
    ``self.data``.
    """

    def setup(self, n, dtype):
        """Build ``self.data`` as a DataFrame with a single 'column' of n rows.

        Parameters
        ----------
        n : int
            Number of rows; the values are ``0 .. n-1`` cast to ``dtype``.
        dtype : str
            A numpy dtype name, or the special value 'float64_nans', which
            yields float64 data where every multiple of 10 is replaced by NaN.
        """
        with_nans = dtype == 'float64_nans'
        values = np.arange(n).astype('float64' if with_nans else dtype)
        if with_nans:
            # Knock out 0, 10, 20, ... so the column contains missing values.
            values[values % 10 == 0] = np.nan
        self.data = pd.DataFrame({'column': values})
33 | ||
34 | ||
class PandasConversionsToArrow(PandasConversionsBase):
    """Time the conversion of a pandas DataFrame into an Arrow table.

    Parametrized over row count ('size') and column dtype ('dtype'); the
    input frame is the one the inherited ``setup`` stores on ``self.data``.
    """

    param_names = ('size', 'dtype')
    params = (
        (10, 10 ** 6),
        ('int64', 'float64', 'float64_nans', 'str'),
    )

    def time_from_series(self, n, dtype):
        """Convert ``self.data`` with ``pa.Table.from_pandas``.

        ``n`` and ``dtype`` are not read here; only ``self.data`` is used.
        """
        frame = self.data
        pa.Table.from_pandas(frame)
41 | ||
42 | ||
class PandasConversionsFromArrow(PandasConversionsBase):
    """Time the conversion of an Arrow table back into pandas.

    Parametrized over row count ('size') and column dtype ('dtype').
    """

    param_names = ('size', 'dtype')
    params = (
        (10, 10 ** 6),
        ('int64', 'float64', 'float64_nans', 'str'),
    )

    def setup(self, n, dtype):
        """Build the pandas fixture, then pre-convert it to an Arrow table.

        The Arrow table is stored on ``self.arrow_data`` so that the timed
        method only measures the Arrow -> pandas direction.
        """
        super(PandasConversionsFromArrow, self).setup(n, dtype)
        frame = self.data
        self.arrow_data = pa.Table.from_pandas(frame)

    def time_to_series(self, n, dtype):
        """Call ``to_pandas()`` on the prepared table; arguments are unused."""
        table = self.arrow_data
        table.to_pandas()
53 | ||
54 | ||
class ToPandasStrings(object):
    """Benchmark ``to_pandas`` on a string array with repeated values.

    Parametrized over 'uniqueness' (the fraction of ``total`` that is made of
    distinct strings) and 'total' (the target number of values).
    """

    param_names = ('uniqueness', 'total')
    params = ((0.001, 0.01, 0.1, 0.5), (1000000,))
    string_length = 25

    @staticmethod
    def _random_strings(count, length):
        """Return ``count`` random strings of ``length`` ASCII alphanumerics.

        Local replacement for ``pandas.util.testing.rands`` so the fixture
        does not depend on pandas' deprecated testing helpers.
        """
        import string

        alphabet = np.array(list(string.ascii_letters + string.digits))
        # One draw for the whole (count, length) grid, then one join per row.
        picks = np.random.choice(alphabet, size=(count, length))
        return [''.join(row) for row in picks]

    def setup(self, uniqueness, total):
        """Create ``self.arr`` (string array) and ``self.table`` (column 'f0').

        The distinct strings are repeated ``total // nunique`` times, so the
        array length is ``total`` rounded down to a multiple of ``nunique``.
        """
        # Keep at least one distinct value: a tiny ``total * uniqueness``
        # would otherwise truncate to 0 and make the division below fail.
        nunique = max(1, int(total * uniqueness))
        unique_values = self._random_strings(nunique, self.string_length)
        values = unique_values * (total // nunique)
        self.arr = pa.array(values, type=pa.string())
        self.table = pa.Table.from_arrays([self.arr], ['f0'])

    def time_to_pandas_dedup(self, *args):
        """Time ``self.arr.to_pandas()`` with default options."""
        self.arr.to_pandas()

    def time_to_pandas_no_dedup(self, *args):
        """Time ``self.arr.to_pandas`` with ``deduplicate_objects=False``."""
        self.arr.to_pandas(deduplicate_objects=False)
73 | ||
74 | ||
class ZeroCopyPandasRead(object):
    """Benchmark deserializing a pre-serialized pandas DataFrame.

    ``setup`` serializes the frame once; the timed methods only measure
    reading it back, from a single buffer or from its components.
    """

    def setup(self):
        """Serialize a random 100000 x 10 DataFrame and keep both encodings."""
        # Generate as (10, 100000) and transpose to make column-major
        frame = pd.DataFrame(np.random.randn(10, 100000).T)

        context = pa.default_serialization_context()
        serialized = context.serialize(frame)

        self.serialized = serialized
        self.as_buffer = serialized.to_buffer()
        self.as_components = serialized.to_components()

    def time_deserialize_from_buffer(self):
        """Time ``pa.deserialize`` on the single-buffer form."""
        pa.deserialize(self.as_buffer)

    def time_deserialize_from_components(self):
        """Time ``pa.deserialize_components`` on the components form."""
        pa.deserialize_components(self.as_components)
93 | ||
94 | ||
class SerializeDeserializePandas(object):
    """Benchmark ``pa.serialize_pandas`` and ``pa.deserialize_pandas``."""

    def setup(self):
        """Create a one-column random DataFrame and its serialized form."""
        num_rows = 10000000  # 10 million length
        frame = pd.DataFrame({'data': np.random.randn(num_rows)})
        self.df = frame
        # Serialized once up front so the deserialize benchmark has its input.
        self.serialized = pa.serialize_pandas(frame)

    def time_serialize_pandas(self):
        """Time serializing ``self.df``."""
        pa.serialize_pandas(self.df)

    def time_deserialize_pandas(self):
        """Time deserializing the payload prepared in ``setup``."""
        pa.deserialize_pandas(self.serialized)
108 | ||
109 | ||
class TableFromPandasMicroperformance(object):
    """Benchmark repeated ``Table.from_pandas`` on a wide frame (ARROW-4629)."""

    def setup(self):
        """Build ``self.df``: 10000 rows x 100 columns labelled 0..99.

        Every column starts as an independent deep copy of the same integer
        series; columns 0..49 are then cast to ``str``.
        """
        base = pd.Series(range(10000))
        columns = {}
        for col in range(100):
            columns[col] = base.copy(deep=True)
        frame = pd.DataFrame(columns)
        # Simulate a real dataset by converting some columns to strings
        string_columns = dict.fromkeys(range(50), str)
        self.df = frame.astype(string_columns)

    def time_Table_from_pandas(self):
        """Run 50 single-threaded ``pa.Table.from_pandas`` conversions."""
        frame = self.df
        for _ in range(50):
            pa.Table.from_pandas(frame, nthreads=1)