# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import numpy as np
import pandas as pd
import pandas.util.testing as tm

import pyarrow as pa

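# Note: the classes below follow the airspeed velocity (asv) benchmark
# convention: asv instantiates each class, calls setup() once per
# combination of ``params`` (matched positionally to ``param_names``),
# and times every ``time_*`` method.
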
class PandasConversionsBase(object):
    def setup(self, n, dtype):
        if dtype == 'float64_nans':
            arr = np.arange(n).astype('float64')
            arr[arr % 10 == 0] = np.nan
        else:
            arr = np.arange(n).astype(dtype)
        self.data = pd.DataFrame({'column': arr})


class PandasConversionsToArrow(PandasConversionsBase):
    param_names = ('size', 'dtype')
    params = ((10, 10 ** 6), ('int64', 'float64', 'float64_nans', 'str'))

    def time_from_series(self, n, dtype):
        pa.Table.from_pandas(self.data)


class PandasConversionsFromArrow(PandasConversionsBase):
    param_names = ('size', 'dtype')
    params = ((10, 10 ** 6), ('int64', 'float64', 'float64_nans', 'str'))

    def setup(self, n, dtype):
        super(PandasConversionsFromArrow, self).setup(n, dtype)
        self.arrow_data = pa.Table.from_pandas(self.data)

    def time_to_series(self, n, dtype):
        self.arrow_data.to_pandas()


class ToPandasStrings(object):

    param_names = ('uniqueness', 'total')
    params = ((0.001, 0.01, 0.1, 0.5), (1000000,))
    string_length = 25

    def setup(self, uniqueness, total):
        nunique = int(total * uniqueness)
        unique_values = [tm.rands(self.string_length) for i in range(nunique)]
        values = unique_values * (total // nunique)
        self.arr = pa.array(values, type=pa.string())
        self.table = pa.Table.from_arrays([self.arr], ['f0'])

    def time_to_pandas_dedup(self, *args):
        self.arr.to_pandas()

    def time_to_pandas_no_dedup(self, *args):
        self.arr.to_pandas(deduplicate_objects=False)

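# Note: the custom serialization API used below
# (pa.default_serialization_context, pa.serialize, pa.deserialize) is
# deprecated in newer pyarrow releases in favor of pickle or the Arrow IPC
# format, but it is still available in the pyarrow version bundled here.
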
class ZeroCopyPandasRead(object):

    def setup(self):
        # Transpose to make column-major
        values = np.random.randn(10, 100000)

        df = pd.DataFrame(values.T)
        ctx = pa.default_serialization_context()

        self.serialized = ctx.serialize(df)
        self.as_buffer = self.serialized.to_buffer()
        self.as_components = self.serialized.to_components()

    def time_deserialize_from_buffer(self):
        pa.deserialize(self.as_buffer)

    def time_deserialize_from_components(self):
        pa.deserialize_components(self.as_components)


class SerializeDeserializePandas(object):

    def setup(self):
        # 10 million length
        n = 10000000
        self.df = pd.DataFrame({'data': np.random.randn(n)})
        self.serialized = pa.serialize_pandas(self.df)

    def time_serialize_pandas(self):
        pa.serialize_pandas(self.df)

    def time_deserialize_pandas(self):
        pa.deserialize_pandas(self.serialized)


class TableFromPandasMicroperformance(object):
    # ARROW-4629

    def setup(self):
        ser = pd.Series(range(10000))
        df = pd.DataFrame({col: ser.copy(deep=True) for col in range(100)})
        # Simulate a real dataset by converting some columns to strings
        self.df = df.astype({col: str for col in range(50)})

    def time_Table_from_pandas(self):
        for _ in range(50):
            pa.Table.from_pandas(self.df, nthreads=1)
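

# Minimal sketch for exercising one of the benchmarks by hand, outside of
# asv, as a quick sanity check. The size/dtype values below are arbitrary
# examples and not part of the benchmark suite itself.
if __name__ == '__main__':
    bench = PandasConversionsToArrow()
    bench.setup(10 ** 6, 'float64_nans')
    bench.time_from_series(10 ** 6, 'float64_nans')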