ceph/src/arrow/ruby/red-arrow/benchmark/raw-records/dictionary.yml

   1 # Licensed to the Apache Software Foundation (ASF) under one
   2 # or more contributor license agreements.  See the NOTICE file
   3 # distributed with this work for additional information
   4 # regarding copyright ownership.  The ASF licenses this file
   5 # to you under the Apache License, Version 2.0 (the
   6 # "License"); you may not use this file except in compliance
   7 # with the License.  You may obtain a copy of the License at
   8 #
   9 #   http://www.apache.org/licenses/LICENSE-2.0
  10 #
  11 # Unless required by applicable law or agreed to in writing,
  12 # software distributed under the License is distributed on an
  13 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  14 # KIND, either express or implied.  See the License for the
  15 # specific language governing permissions and limitations
  16 # under the License.
  17
  18 contexts:
       # A single context named "master". Its prelude prepends "ext/arrow" and
       # then "lib" (both expanded relative to the current working directory)
       # to Ruby's load path, so "lib" ends up first in the search order.
       # NOTE(review): presumably this makes `require "arrow"` load the
       # in-tree build rather than an installed gem - confirm.
  19   - name: master
  20     prelude: |
  21       $LOAD_PATH.unshift(File.expand_path("ext/arrow"))
  22       $LOAD_PATH.unshift(File.expand_path("lib"))
  23 prelude: |-
       # Shared setup: builds `record_batch`, a batch of n_columns
       # dictionary-typed columns with n_rows rows, used by the snippets
       # under `benchmark`.
  24   require "arrow"
  25   require "faker"
  26
       # Seed Faker's random source from FAKER_RANDOM_SEED (17 when the
       # variable is unset).
       # NOTE(review): presumably so the generated data is reproducible
       # between runs - confirm.
  27   state = ENV.fetch("FAKER_RANDOM_SEED", 17).to_i
  28   Faker::Config.random = Random.new(state)
  29
  30   n_rows = 1000
  31   n_columns = 10
       # Dictionary type built from :int8 and :string.
       # NOTE(review): the trailing `true` is presumably the "ordered" flag -
       # confirm against the Arrow::DictionaryDataType constructor.
  32   type = Arrow::DictionaryDataType.new(:int8, :string, true)
  33
       # Every field (:column_0, :column_1, ...) uses the same dictionary type.
  34   fields = n_columns.times.map {|i| ["column_#{i}".to_sym, type] }.to_h
  35   schema = Arrow::Schema.new(**fields)
       # Dictionary values: 100 generated book genres, de-duplicated and
       # sorted, so the dictionary holds at most 100 entries.
  36   dictionary = Arrow::StringArray.new(
  37     100.times.map { Faker::Book.genre }.uniq.sort
  38   )
       # One random index per row, drawn from the exclusive range
       # 0 ... dictionary.length. With at most 100 dictionary entries the
       # indices stay below 100, which fits in an Int8Array.
  39   indices = Arrow::Int8Array.new(
  40     n_rows.times.map {
  41       Faker::Number.within(range: 0 ... dictionary.length)
  42     }
  43   )
       # Each column is a separate DictionaryArray object, but all of them are
       # built from the same `indices` and `dictionary` arrays.
  44   arrays = n_columns.times.map do
  45     Arrow::DictionaryArray.new(
  46       type,
  47       indices,
  48       dictionary,
  49     )
  50   end
  51   record_batch = Arrow::RecordBatch.new(schema, n_rows, arrays)
  52
  53   def pure_ruby_raw_records(record_batch)
         # Hand-written Ruby routine used by the `pure_ruby` benchmark entry.
         #
         # record_batch - object responding to n_rows, n_columns and columns.
         #
         # Returns an Array with one inner Array per row. Each inner Array
         # holds, for every column in order, the element at that row of the
         # column's `data.indices` (i.e. the values read here come from the
         # index array, not from the dictionary).
         #
         # NOTE(review): the index-based `while` loops and the per-cell
         # `columns[j].data.indices` lookup are part of what this benchmark
         # measures - confirm with the benchmark owners before restructuring.
  54     n_rows = record_batch.n_rows
  55     n_columns = record_batch.n_columns
  56     columns = record_batch.columns
  57     records = []
  58     i = 0
         # Outer loop: one record per row.
  59     while i < n_rows
  60       record = []
  61       j = 0
           # Inner loop: append this row's entry from every column.
  62       while j < n_columns
  63         record << columns[j].data.indices[i]
  64         j += 1
  65       end
  66       records << record
  67       i += 1
  68     end
  69     records
  70   end
  71 benchmark:
       # Two entries, both operating on the `record_batch` built in the
       # prelude: `pure_ruby` calls the pure_ruby_raw_records method defined
       # in the prelude, `raw_records` calls record_batch.raw_records.
  72   pure_ruby: |-
  73     pure_ruby_raw_records(record_batch)
  74   raw_records: |-
  75     record_batch.raw_records