]> git.proxmox.com Git - ceph.git/blob - ceph/src/arrow/ruby/red-arrow/benchmark/raw-records/dictionary.yml
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / ruby / red-arrow / benchmark / raw-records / dictionary.yml
1 # Licensed to the Apache Software Foundation (ASF) under one
2 # or more contributor license agreements. See the NOTICE file
3 # distributed with this work for additional information
4 # regarding copyright ownership. The ASF licenses this file
5 # to you under the Apache License, Version 2.0 (the
6 # "License"); you may not use this file except in compliance
7 # with the License. You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing,
12 # software distributed under the License is distributed on an
13 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 # KIND, either express or implied. See the License for the
15 # specific language governing permissions and limitations
16 # under the License.
17
# Benchmark setup. NOTE(review): indentation is not visible in this view; the
# first `prelude` presumably belongs to the `master` context and the second one
# is presumably the shared top-level prelude -- confirm against the real file.
18 contexts:
19 - name: master
# Prepend ext/arrow and lib to $LOAD_PATH so the `require "arrow"` below looks
# there first. File.expand_path resolves both relative paths against the
# current working directory.
20 prelude: |
21 $LOAD_PATH.unshift(File.expand_path("ext/arrow"))
22 $LOAD_PATH.unshift(File.expand_path("lib"))
23 prelude: |-
24 require "arrow"
25 require "faker"
26
# Seed Faker's random source so generated data is repeatable; the seed comes
# from FAKER_RANDOM_SEED and falls back to 17 when the variable is unset.
27 state = ENV.fetch("FAKER_RANDOM_SEED", 17).to_i
28 Faker::Config.random = Random.new(state)
29
# Shape of the benchmarked record batch: 1000 rows by 10 columns, every column
# using the same dictionary type (int8 indices, string values).
# NOTE(review): the third argument (`true`) is presumably the "ordered" flag --
# confirm against Arrow::DictionaryDataType.
30 n_rows = 1000
31 n_columns = 10
32 type = Arrow::DictionaryDataType.new(:int8, :string, true)
33
# Schema fields are named :column_0 ... :column_9 and all share `type`.
34 fields = n_columns.times.map {|i| ["column_#{i}".to_sym, type] }.to_h
35 schema = Arrow::Schema.new(**fields)
# Dictionary values: 100 generated genres, de-duplicated and sorted, so the
# dictionary holds at most 100 entries.
36 dictionary = Arrow::StringArray.new(
37 100.times.map { Faker::Book.genre }.uniq.sort
38 )
# One index per row, drawn from 0 ... dictionary.length (exclusive upper bound),
# so every index refers to an existing dictionary entry.
39 indices = Arrow::Int8Array.new(
40 n_rows.times.map {
41 Faker::Number.within(range: 0 ... dictionary.length)
42 }
43 )
# Every column is built from the same `indices` and `dictionary` objects, so
# all 10 columns carry identical data.
44 arrays = n_columns.times.map do
45 Arrow::DictionaryArray.new(
46 type,
47 indices,
48 dictionary,
49 )
50 end
51 record_batch = Arrow::RecordBatch.new(schema, n_rows, arrays)
52
# Pure-Ruby baseline for the benchmark: builds the record batch's rows as an
# Array of Arrays, one inner Array per row with one entry per column.
#
# record_batch - object responding to n_rows, n_columns and columns, where each
#                column responds to data.indices[i].
#
# Returns an Array with n_rows elements; each element is an Array of n_columns
# values read from columns[j].data.indices[i] (i.e. the entry of the column's
# index array, not a looked-up dictionary string).
#
# NOTE(review): the explicit `while` loops with manual counters are presumably
# deliberate, to keep block-call overhead out of the measured baseline --
# confirm before rewriting with iterators.
53 def pure_ruby_raw_records(record_batch)
54 n_rows = record_batch.n_rows
55 n_columns = record_batch.n_columns
56 columns = record_batch.columns
57 records = []
58 i = 0
59 while i < n_rows
60 record = []
61 j = 0
62 while j < n_columns
# The `.data.indices` chain is re-evaluated for every cell; it is part of the
# work this baseline measures.
63 record << columns[j].data.indices[i]
64 j += 1
65 end
66 records << record
67 i += 1
68 end
69 records
70 end
# Benchmarked snippets; both operate on the `record_batch` built in the prelude.
71 benchmark:
# Baseline: the hand-written Ruby loop defined in the prelude.
72 pure_ruby: |-
73 pure_ruby_raw_records(record_batch)
# Subject: the library's own RecordBatch#raw_records.
74 raw_records: |-
75 record_batch.raw_records