# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import numpy as np
import pandas as pd
import pandas.util.testing as tm

import pyarrow as pa

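# Note: the classes below follow the airspeed velocity (asv) benchmark
# convention: asv instantiates each class, calls setup() once per
# combination of ``params`` (matched positionally to ``param_names``),
# and times every ``time_*`` method.
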
class PandasConversionsBase(object):
    def setup(self, n, dtype):
        if dtype == 'float64_nans':
            arr = np.arange(n).astype('float64')
            arr[arr % 10 == 0] = np.nan
        else:
            arr = np.arange(n).astype(dtype)
        self.data = pd.DataFrame({'column': arr})


class PandasConversionsToArrow(PandasConversionsBase):
    param_names = ('size', 'dtype')
    params = ((10, 10 ** 6), ('int64', 'float64', 'float64_nans', 'str'))

    def time_from_series(self, n, dtype):
        pa.Table.from_pandas(self.data)


class PandasConversionsFromArrow(PandasConversionsBase):
    param_names = ('size', 'dtype')
    params = ((10, 10 ** 6), ('int64', 'float64', 'float64_nans', 'str'))

    def setup(self, n, dtype):
        super(PandasConversionsFromArrow, self).setup(n, dtype)
        self.arrow_data = pa.Table.from_pandas(self.data)

    def time_to_series(self, n, dtype):
        self.arrow_data.to_pandas()


class ToPandasStrings(object):

    param_names = ('uniqueness', 'total')
    params = ((0.001, 0.01, 0.1, 0.5), (1000000,))
    string_length = 25

    def setup(self, uniqueness, total):
        nunique = int(total * uniqueness)
        unique_values = [tm.rands(self.string_length) for i in range(nunique)]
        values = unique_values * (total // nunique)
        self.arr = pa.array(values, type=pa.string())
        self.table = pa.Table.from_arrays([self.arr], ['f0'])

    def time_to_pandas_dedup(self, *args):
        self.arr.to_pandas()

    def time_to_pandas_no_dedup(self, *args):
        self.arr.to_pandas(deduplicate_objects=False)

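# Note: the custom serialization API used below
# (pa.default_serialization_context, pa.serialize, pa.deserialize) is
# deprecated in newer pyarrow releases in favor of pickle or the Arrow IPC
# format, but it is still available in the pyarrow version bundled here.
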
class ZeroCopyPandasRead(object):

    def setup(self):
        # Transpose to make column-major
        values = np.random.randn(10, 100000)

        df = pd.DataFrame(values.T)
        ctx = pa.default_serialization_context()

        self.serialized = ctx.serialize(df)
        self.as_buffer = self.serialized.to_buffer()
        self.as_components = self.serialized.to_components()

    def time_deserialize_from_buffer(self):
        pa.deserialize(self.as_buffer)

    def time_deserialize_from_components(self):
        pa.deserialize_components(self.as_components)


class SerializeDeserializePandas(object):

    def setup(self):
        # 10 million length
        n = 10000000
        self.df = pd.DataFrame({'data': np.random.randn(n)})
        self.serialized = pa.serialize_pandas(self.df)

    def time_serialize_pandas(self):
        pa.serialize_pandas(self.df)

    def time_deserialize_pandas(self):
        pa.deserialize_pandas(self.serialized)


class TableFromPandasMicroperformance(object):
    # ARROW-4629

    def setup(self):
        ser = pd.Series(range(10000))
        df = pd.DataFrame({col: ser.copy(deep=True) for col in range(100)})
        # Simulate a real dataset by converting some columns to strings
        self.df = df.astype({col: str for col in range(50)})

    def time_Table_from_pandas(self):
        for _ in range(50):
            pa.Table.from_pandas(self.df, nthreads=1)
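

# Minimal sketch for exercising one of the benchmarks by hand, outside of
# asv, as a quick sanity check. The size/dtype values below are arbitrary
# examples and not part of the benchmark suite itself.
if __name__ == '__main__':
    bench = PandasConversionsToArrow()
    bench.setup(10 ** 6, 'float64_nans')
    bench.time_from_series(10 ** 6, 'float64_nans')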