]>
Commit | Line | Data |
---|---|---|
1d09f67e TL |
1 | # Licensed to the Apache Software Foundation (ASF) under one |
2 | # or more contributor license agreements. See the NOTICE file | |
3 | # distributed with this work for additional information | |
4 | # regarding copyright ownership. The ASF licenses this file | |
5 | # to you under the Apache License, Version 2.0 (the | |
6 | # "License"); you may not use this file except in compliance | |
7 | # with the License. You may obtain a copy of the License at | |
8 | # | |
9 | # http://www.apache.org/licenses/LICENSE-2.0 | |
10 | # | |
11 | # Unless required by applicable law or agreed to in writing, | |
12 | # software distributed under the License is distributed on an | |
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
14 | # KIND, either express or implied. See the License for the | |
15 | # specific language governing permissions and limitations | |
16 | # under the License. | |
17 | ||
# Benchmark contexts. A single context named "master" is defined.
contexts:
  - name: master
    # Context-specific prelude: prepends the "ext/arrow" and "lib" directories
    # (expanded to absolute paths by File.expand_path) to Ruby's load path.
    # NOTE(review): presumably so that `require "arrow"` resolves to a local
    # build rather than an installed gem -- confirm.
    prelude: |
      $LOAD_PATH.unshift(File.expand_path("ext/arrow"))
      $LOAD_PATH.unshift(File.expand_path("lib"))
23 | prelude: |- | |
24 | require "arrow" | |
25 | require "faker" | |
26 | ||
# Seed Faker's random source so the generated fixture is reproducible.
# The seed comes from FAKER_RANDOM_SEED when set, otherwise 17.
state = ENV.fetch("FAKER_RANDOM_SEED", 17).to_i
Faker::Config.random = Random.new(state)

# Fixture dimensions.
n_rows = 1000
n_columns = 10

# Dictionary type shared by every column: int8 indices into string values.
# NOTE(review): the third argument presumably marks the dictionary as
# ordered -- confirm against the Arrow::DictionaryDataType documentation.
type = Arrow::DictionaryDataType.new(:int8, :string, true)

# Schema with fields :column_0 .. :column_9, all of the dictionary type.
fields = {}
n_columns.times { |i| fields[:"column_#{i}"] = type }
schema = Arrow::Schema.new(**fields)

# Dictionary values: 100 generated book genres, de-duplicated and sorted.
genres = Array.new(100) { Faker::Book.genre }
dictionary = Arrow::StringArray.new(genres.uniq.sort)

# One random index into the dictionary per row (end-exclusive range).
row_indices = Array.new(n_rows) do
  Faker::Number.within(range: 0 ... dictionary.length)
end
indices = Arrow::Int8Array.new(row_indices)

# Every column is built from the same indices and the same dictionary.
arrays = Array.new(n_columns) do
  Arrow::DictionaryArray.new(type, indices, dictionary)
end

record_batch = Arrow::RecordBatch.new(schema, n_rows, arrays)
52 | ||
# Builds the raw records of a record batch of dictionary columns in pure Ruby.
#
# The result has one entry per row; each entry is an Array holding, for every
# column in order, the element at that row of the column's +data.indices+
# (that is, whatever +indices[i]+ returns -- not the dictionary value).
#
# @param record_batch [#n_rows, #n_columns, #columns] batch to convert. Each
#   column must respond to +data+, and that result must respond to +indices+.
# @return [Array<Array>] row-major records: +n_rows+ arrays of +n_columns+
#   elements each.
def pure_ruby_raw_records(record_batch)
  n_rows = record_batch.n_rows
  n_columns = record_batch.n_columns
  columns = record_batch.columns
  # Resolve each column's index array once up front. The lookup does not
  # depend on the row, so repeating it for every cell only adds overhead.
  column_indices = Array.new(n_columns) { |j| columns[j].data.indices }
  records = []
  # NOTE(review): the manual while loops are kept from the original,
  # presumably to keep this baseline free of block-call overhead -- confirm.
  i = 0
  while i < n_rows
    record = []
    j = 0
    while j < n_columns
      record << column_indices[j][i]
      j += 1
    end
    records << record
    i += 1
  end
  records
end
# Measured scripts. Both operate on the `record_batch` built in the prelude.
benchmark:
  # Baseline: the hand-written Ruby implementation defined in the prelude.
  pure_ruby: |-
    pure_ruby_raw_records(record_batch)
  # Subject: the record batch's own raw_records method.
  raw_records: |-
    record_batch.raw_records