1 # Licensed to the Apache Software Foundation (ASF) under one
2 # or more contributor license agreements. See the NOTICE file
3 # distributed with this work for additional information
4 # regarding copyright ownership. The ASF licenses this file
5 # to you under the Apache License, Version 2.0 (the
6 # "License"); you may not use this file except in compliance
7 # with the License. You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing,
12 # software distributed under the License is distributed on an
13 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 # KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
20 import pandas
.util
.testing
as tm
class PandasConversionsBase(object):
    """Shared fixture: builds a one-column DataFrame in ``self.data``."""

    def setup(self, n, dtype):
        """Populate ``self.data`` with ``n`` rows of the requested dtype.

        Parameters
        ----------
        n : int
            Number of rows to generate (values are ``0 .. n - 1``).
        dtype : str
            Either a dtype string accepted by ``ndarray.astype`` or the
            pseudo-dtype ``'float64_nans'``, which produces float64 values
            where every entry divisible by 10 is replaced with NaN.
        """
        if dtype == 'float64_nans':
            arr = np.arange(n).astype('float64')
            arr[arr % 10 == 0] = np.nan
        else:
            # 'float64_nans' is not a real NumPy dtype, so the generic cast
            # must only run for the remaining dtype values; running it
            # unconditionally would also discard the NaN-filled array.
            arr = np.arange(n).astype(dtype)
        self.data = pd.DataFrame({'column': arr})
class PandasConversionsToArrow(PandasConversionsBase):
    """Times converting the prepared pandas DataFrame to an Arrow table."""

    param_names = ('size', 'dtype')
    params = (
        (10, 10 ** 6),
        ('int64', 'float64', 'float64_nans', 'str'),
    )

    def time_from_series(self, n, dtype):
        """Convert ``self.data`` to an Arrow table.

        ``n`` and ``dtype`` are accepted but not read here; the data frame
        was already built from them by the inherited ``setup``.
        """
        frame = self.data
        pa.Table.from_pandas(frame)
class PandasConversionsFromArrow(PandasConversionsBase):
    """Times converting an Arrow table back into pandas."""

    param_names = ('size', 'dtype')
    params = (
        (10, 10 ** 6),
        ('int64', 'float64', 'float64_nans', 'str'),
    )

    def setup(self, n, dtype):
        """Build the pandas frame via the base class, then its Arrow table.

        The resulting table is stored in ``self.arrow_data``.
        """
        parent = super(PandasConversionsFromArrow, self)
        parent.setup(n, dtype)
        frame = self.data
        self.arrow_data = pa.Table.from_pandas(frame)

    def time_to_series(self, n, dtype):
        """Convert ``self.arrow_data`` to pandas; arguments are not read."""
        table = self.arrow_data
        table.to_pandas()
class ToPandasStrings(object):
    """Times Arrow string array -> pandas conversion at varying uniqueness."""

    param_names = ('uniqueness', 'total')
    params = ((0.001, 0.01, 0.1, 0.5), (1000000,))
    # Length of each generated random string. ``setup`` reads
    # ``self.string_length`` but no definition is visible in the reviewed
    # excerpt; 25 is assumed here -- TODO confirm against the full file.
    string_length = 25

    def setup(self, uniqueness, total):
        """Create ``self.arr`` and ``self.table`` holding repeated strings.

        Parameters
        ----------
        uniqueness : float
            Fraction of ``total`` that should be distinct values.
        total : int
            Target number of values; the actual length is
            ``nunique * (total // nunique)``.
        """
        nunique = int(total * uniqueness)
        # NOTE(review): ``tm.rands`` may be unavailable in recent pandas
        # releases -- confirm against the pinned pandas version.
        unique_values = [tm.rands(self.string_length)
                         for _ in range(nunique)]
        values = unique_values * (total // nunique)
        self.arr = pa.array(values, type=pa.string())
        self.table = pa.Table.from_arrays([self.arr], ['f0'])

    def time_to_pandas_dedup(self, *args):
        """Convert ``self.arr`` to pandas with object deduplication on."""
        self.arr.to_pandas(deduplicate_objects=True)

    def time_to_pandas_no_dedup(self, *args):
        """Convert ``self.arr`` to pandas with object deduplication off."""
        self.arr.to_pandas(deduplicate_objects=False)
class ZeroCopyPandasRead(object):
    """Times deserializing a serialized DataFrame from buffer/components."""

    def setup(self):
        """Serialize a random DataFrame and keep both serialized forms.

        Stores the serialized object in ``self.serialized``, its buffer form
        in ``self.as_buffer`` and its component form in
        ``self.as_components``.
        """
        # The fixture must live inside a method: it assigns to ``self`` and
        # would otherwise execute (and fail) while the class body is built.
        # NOTE(review): the pyarrow serialization-context API used below is
        # believed to be deprecated in newer pyarrow releases -- confirm
        # against the pinned version.

        # Transpose to make column-major
        values = np.random.randn(10, 100000)

        df = pd.DataFrame(values.T)
        ctx = pa.default_serialization_context()

        self.serialized = ctx.serialize(df)
        self.as_buffer = self.serialized.to_buffer()
        self.as_components = self.serialized.to_components()

    def time_deserialize_from_buffer(self):
        """Deserialize from the single-buffer representation."""
        pa.deserialize(self.as_buffer)

    def time_deserialize_from_components(self):
        """Deserialize from the components representation."""
        pa.deserialize_components(self.as_components)
class SerializeDeserializePandas(object):
    """Times ``pa.serialize_pandas`` and ``pa.deserialize_pandas``."""

    def setup(self):
        """Create ``self.df`` and its serialized form ``self.serialized``."""
        # The row count is not defined anywhere in the reviewed excerpt;
        # 10 million rows is assumed here -- TODO confirm against the
        # full file.
        n = 10 ** 7
        self.df = pd.DataFrame({'data': np.random.randn(n)})
        self.serialized = pa.serialize_pandas(self.df)

    def time_serialize_pandas(self):
        """Serialize the prepared DataFrame."""
        pa.serialize_pandas(self.df)

    def time_deserialize_pandas(self):
        """Deserialize the payload produced in ``setup``."""
        pa.deserialize_pandas(self.serialized)
class TableFromPandasMicroperformance(object):
    """Times ``pa.Table.from_pandas`` on a wide, mixed-type DataFrame."""

    def setup(self):
        """Build ``self.df``: 100 columns of 10000 rows, half cast to str."""
        # The fixture must live inside a method: it assigns to ``self`` and
        # would otherwise execute (and fail) while the class body is built.
        ser = pd.Series(range(10000))
        df = pd.DataFrame({col: ser.copy(deep=True) for col in range(100)})
        # Simulate a real dataset by converting some columns to strings
        self.df = df.astype({col: str for col in range(50)})

    def time_Table_from_pandas(self):
        """Convert ``self.df`` to an Arrow table using a single thread."""
        pa.Table.from_pandas(self.df, nthreads=1)