]> git.proxmox.com Git - ceph.git/blob - ceph/src/arrow/ruby/red-arrow/lib/arrow/table.rb
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / ruby / red-arrow / lib / arrow / table.rb
1 # Licensed to the Apache Software Foundation (ASF) under one
2 # or more contributor license agreements. See the NOTICE file
3 # distributed with this work for additional information
4 # regarding copyright ownership. The ASF licenses this file
5 # to you under the Apache License, Version 2.0 (the
6 # "License"); you may not use this file except in compliance
7 # with the License. You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing,
12 # software distributed under the License is distributed on an
13 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 # KIND, either express or implied. See the License for the
15 # specific language governing permissions and limitations
16 # under the License.
17
18 require "arrow/raw-table-converter"
19
20 module Arrow
21 class Table
22 include ColumnContainable
23 include GenericFilterable
24 include GenericTakeable
25 include RecordContainable
26
# Loads a table by delegating to TableLoader.
#
# @param path The location to load from; passed to TableLoader.load as is.
# @param options [Hash] The load options; passed to TableLoader.load as is.
# @return The value returned by TableLoader.load.
def self.load(path, options={})
  TableLoader.load(path, options)
end
32
33 alias_method :initialize_raw, :initialize
34 private :initialize_raw
35
36 # Creates a new {Arrow::Table}.
37 #
38 # @overload initialize(columns)
39 #
40 # @param columns [::Array<Arrow::Column>] The columns of the table.
41 #
42 # @example Create a table from columns
43 # count_field = Arrow::Field.new("count", :uint32)
44 # count_array = Arrow::UInt32Array.new([0, 2, nil, 4])
45 # count_column = Arrow::Column.new(count_field, count_array)
46 # visible_field = Arrow::Field.new("visible", :boolean)
47 # visible_array = Arrow::BooleanArray.new([true, nil, nil, false])
48 # visible_column = Arrow::Column.new(visible_field, visible_array)
49 # Arrow::Table.new([count_column, visible_column])
50 #
51 # @overload initialize(raw_table)
52 #
53 # @param raw_table [Hash<String, Arrow::Array>]
54 # The pairs of column name and values of the table. Each column's
55 # values are given as an `Arrow::Array`.
56 #
57 # @example Create a table from column name and values
58 # Arrow::Table.new("count" => Arrow::UInt32Array.new([0, 2, nil, 4]),
59 # "visible" => Arrow::BooleanArray.new([true, nil, nil, false]))
60 #
61 # @overload initialize(raw_table)
62 #
63 # @param raw_table [Hash<String, Arrow::ChunkedArray>]
64 # The pairs of column name and values of the table. Each column's
65 # values are given as an `Arrow::ChunkedArray`.
66 #
67 # @example Create a table from column name and values
68 # count_chunks = [
69 # Arrow::UInt32Array.new([0, 2]),
70 # Arrow::UInt32Array.new([nil, 4]),
71 # ]
72 # visible_chunks = [
73 # Arrow::BooleanArray.new([true]),
74 # Arrow::BooleanArray.new([nil, nil, false]),
75 # ]
76 # Arrow::Table.new("count" => Arrow::ChunkedArray.new(count_chunks),
77 # "visible" => Arrow::ChunkedArray.new(visible_chunks))
78 #
79 # @overload initialize(raw_table)
80 #
81 # @param raw_table [Hash<String, ::Array>]
82 # The pairs of column name and values of the table. Each column's
83 # values are given as a Ruby `Array`.
84 #
85 # @example Create a table from column name and values
86 # Arrow::Table.new("count" => [0, 2, nil, 4],
87 # "visible" => [true, nil, nil, false])
88 #
89 # @overload initialize(schema, columns)
90 #
91 # @param schema [Arrow::Schema] The schema of the table.
92 # You can also specify schema as primitive Ruby objects.
93 # See {Arrow::Schema#initialize} for details.
94 #
95 # @param columns [::Array<Arrow::Column>] The data of the table.
96 #
97 # @example Create a table from schema and columns
98 # count_field = Arrow::Field.new("count", :uint32)
99 # count_array = Arrow::UInt32Array.new([0, 2, nil, 4])
100 # count_column = Arrow::Column.new(count_field, count_array)
101 # visible_field = Arrow::Field.new("visible", :boolean)
102 # visible_array = Arrow::BooleanArray.new([true, nil, nil, false])
103 # visible_column = Arrow::Column.new(visible_field, visible_array)
104 # Arrow::Table.new(Arrow::Schema.new([count_field, visible_field]),
105 # [count_column, visible_column])
106 #
107 # @overload initialize(schema, arrays)
108 #
109 # @param schema [Arrow::Schema] The schema of the table.
110 # You can also specify schema as primitive Ruby objects.
111 # See {Arrow::Schema#initialize} for details.
112 #
113 # @param arrays [::Array<Arrow::Array>] The data of the table.
114 #
115 # @example Create a table from schema and arrays
116 # count_field = Arrow::Field.new("count", :uint32)
117 # count_array = Arrow::UInt32Array.new([0, 2, nil, 4])
118 # visible_field = Arrow::Field.new("visible", :boolean)
119 # visible_array = Arrow::BooleanArray.new([true, nil, nil, false])
120 # Arrow::Table.new(Arrow::Schema.new([count_field, visible_field]),
121 # [count_array, visible_array])
122 #
123 # @overload initialize(schema, record_batches)
124 #
125 # @param schema [Arrow::Schema] The schema of the table.
126 # You can also specify schema as primitive Ruby objects.
127 # See {Arrow::Schema#initialize} for details.
128 #
129 # @param record_batches [::Array<Arrow::RecordBatch>] The data of the table.
130 #
131 # @example Create a table from schema and record batches
132 # count_field = Arrow::Field.new("count", :uint32)
133 # visible_field = Arrow::Field.new("visible", :boolean)
134 # schema = Arrow::Schema.new([count_field, visible_field])
135 # record_batches = [
136 # Arrow::RecordBatch.new(schema, [[0, true], [2, nil], [nil, nil]]),
137 # Arrow::RecordBatch.new(schema, [[4, false]]),
138 # ]
139 # Arrow::Table.new(schema, record_batches)
140 #
141 # @overload initialize(schema, raw_records)
142 #
143 # @param schema [Arrow::Schema] The schema of the table.
144 # You can also specify schema as primitive Ruby objects.
145 # See {Arrow::Schema#initialize} for details.
146 #
147 # @param raw_records [::Array<::Array>] The data of the table as primitive
148 # Ruby objects.
149 #
150 # @example Create a table from schema and raw records
151 # schema = {
152 # count: :uint32,
153 # visible: :boolean,
154 # }
155 # raw_records = [
156 # [0, true],
157 # [2, nil],
158 # [nil, nil],
159 # [4, false],
160 # ]
161 # Arrow::Table.new(schema, raw_records)
def initialize(*args)
  case args.size
  when 1
    # A single argument is handed to RawTableConverter, which supplies
    # both the schema and the values.
    converter = RawTableConverter.new(args[0])
    schema = converter.schema
    values = converter.values
  when 2
    schema, values = args
    # Anything that is not already a Schema is passed to Schema.new.
    schema = Schema.new(schema) unless schema.is_a?(Schema)
    # The first element decides how the values are interpreted.
    first_value = values[0]
    if first_value.is_a?(::Array)
      # Ruby arrays: wrap all of them in a single RecordBatch.
      values = [RecordBatch.new(schema, values)]
    elsif first_value.is_a?(Column)
      # Columns: only their data is passed on.
      values = values.collect(&:data)
    end
  else
    message =
      "wrong number of arguments (given #{args.size}, expected 1..2)"
    raise ArgumentError, message
  end
  initialize_raw(schema, values)
end
185
# Yields the record batches read from a TableBatchReader created for
# this table, one at a time, until the reader returns a falsy value.
#
# @yield [record_batch] Each value returned by the reader's read_next.
# @return [Enumerator, nil] An enumerator when no block is given,
#   otherwise nil.
def each_record_batch
  unless block_given?
    return to_enum(__method__)
  end

  reader = TableBatchReader.new(self)
  record_batch = reader.read_next
  while record_batch
    yield(record_batch)
    record_batch = reader.read_next
  end
end
194
195 alias_method :size, :n_rows
196 alias_method :length, :n_rows
197
198 alias_method :slice_raw, :slice
199
200 # @overload slice(offset, length)
201 #
202 # @param offset [Integer] The offset of sub Arrow::Table.
203 # @param length [Integer] The length of sub Arrow::Table.
204 # @return [Arrow::Table]
205 # The sub `Arrow::Table` that covers only from
206 # `offset` to `offset + length` range.
207 #
208 # @overload slice(index)
209 #
210 # @param index [Integer] The index in this table.
211 # @return [Arrow::Record]
212 # The `Arrow::Record` corresponding to index of
213 # the table.
214 #
215 # @overload slice(booleans)
216 #
217 # @param booleans [::Array<Boolean>]
218 # The values indicating the target rows.
219 # @return [Arrow::Table]
220 # The sub `Arrow::Table` that covers only the rows whose
221 # corresponding value in `booleans` is true.
222 #
223 # @overload slice(boolean_array)
224 #
225 # @param boolean_array [Arrow::BooleanArray]
226 # The values indicating the target rows.
227 # @return [Arrow::Table]
228 # The sub `Arrow::Table` that covers only rows of indices
229 # the values of `boolean_array` is true.
230 #
231 # @overload slice(range)
232 #
233 # @param range [Range] The range indicating the target rows.
234 # @return [Arrow::Table]
235 # The sub `Arrow::Table` that covers only rows of the range of indices.
236 #
237 # @overload slice(conditions)
238 #
239 # @param conditions [Hash] The conditions to select records.
240 # @return [Arrow::Table]
241 # The sub `Arrow::Table` that covers only rows matched by condition
242 #
243 # @overload slice
244 #
245 # @yield [slicer] Gives slicer that constructs condition to select records.
246 # @yieldparam slicer [Arrow::Slicer] The slicer that helps us to
247 # build condition.
248 # @yieldreturn [Arrow::Slicer::Condition, ::Array<Arrow::Slicer::Condition>]
249 # The condition to select records.
250 # @return [Arrow::Table]
251 # The sub `Arrow::Table` that covers only rows matched by condition
252 # specified by slicer.
def slice(*args)
  slicers = []
  if block_given?
    unless args.empty?
      raise ArgumentError, "must not specify both arguments and block"
    end
    # The block may return one condition or an array of conditions.
    block_slicer = yield(Slicer.new(self))
    case block_slicer
    when ::Array
      slicers.concat(block_slicer)
    else
      slicers << block_slicer
    end
  else
    case args.size
    when 1
      case args[0]
      when Integer
        # slice(index): a single record, or nil when out of range.
        index = args[0]
        index += n_rows if index < 0
        return nil if index < 0
        return nil if index >= n_rows
        return Record.new(self, index)
      when Hash
        # slice(conditions): every pair must match (conditions are AND-ed).
        condition_pairs = args[0]
        slicer = Slicer.new(self)
        conditions = []
        condition_pairs.each do |key, value|
          case value
          when Range
            # TODO: Optimize "begin <= key <= end" case by missing "between" kernel
            # https://issues.apache.org/jira/browse/ARROW-9843
            unless value.begin.nil?
              conditions << (slicer[key] >= value.begin)
            end
            unless value.end.nil?
              if value.exclude_end?
                conditions << (slicer[key] < value.end)
              else
                conditions << (slicer[key] <= value.end)
              end
            end
          else
            conditions << (slicer[key] == value)
          end
        end
        slicers << conditions.inject(:&)
      else
        slicers << args[0]
      end
    when 2
      # slice(offset, length): handled directly instead of through an
      # exclusive Range. Going through "offset...(offset + length)" made
      # an end of 0 wrap around to the last row, so slice(0, 0) returned
      # the whole table and slice(-2, 5) returned a single row.
      offset, length = args
      original_offset = offset
      offset += n_rows if offset < 0
      if offset < 0 or offset >= n_rows
        message =
          "offset is out of range (-#{n_rows + 1},#{n_rows}): " +
          "#{original_offset}"
        raise ArgumentError, message
      end
      # A negative length selects no rows.
      length = 0 if length < 0
      return slice_raw(offset, length)
    else
      message = "wrong number of arguments " +
                "(given #{args.size}, expected 1..2)"
      raise ArgumentError, message
    end
  end

  filter_options = Arrow::FilterOptions.new
  filter_options.null_selection_behavior = :emit_null
  sliced_tables = []
  slicers.each do |slicer|
    slicer = slicer.evaluate if slicer.respond_to?(:evaluate)
    case slicer
    when Integer
      # An Integer slicer selects from that row to the end of the table.
      slicer += n_rows if slicer < 0
      sliced_tables << slice_by_range(slicer, n_rows - 1)
    when Range
      # A beginless range starts at the first row.
      original_from = from = slicer.begin || 0
      from += n_rows if from < 0
      if from < 0 or from >= n_rows
        message =
          "offset is out of range (-#{n_rows + 1},#{n_rows}): " +
          "#{original_from}"
        raise ArgumentError, message
      end
      to = slicer.end
      if to.nil?
        # An endless range runs to the last row.
        to = n_rows - 1
      else
        # Resolve a negative end BEFORE dropping an excluded end;
        # otherwise "x...0" turns into -1 and wraps to the last row.
        to += n_rows if to < 0
        to -= 1 if slicer.exclude_end?
      end
      # Never ask for a negative number of rows.
      to = from - 1 if to < from - 1
      sliced_tables << slice_by_range(from, to)
    when ::Array, BooleanArray, ChunkedArray
      sliced_tables << filter(slicer, filter_options)
    else
      message = "slicer must be Integer, Range, (from, to), " +
                "Arrow::ChunkedArray of Arrow::BooleanArray, " +
                "Arrow::BooleanArray or Arrow::Slicer::Condition: #{slicer.inspect}"
      raise ArgumentError, message
    end
  end
  if sliced_tables.size > 1
    sliced_tables[0].concatenate(sliced_tables[1..-1])
  else
    sliced_tables[0]
  end
end
354
# Builds a new table from this table's columns combined with `other`.
#
# Existing columns keep their position. A column whose name appears in
# `other` is replaced in place, a column mapped to a falsy value in a
# Hash is dropped, and columns of `other` that do not exist in this
# table are appended at the end.
#
# @param other [Hash, Arrow::Table]
#   A Hash of column name (converted with `to_s`) to column data, or a
#   table whose columns are merged in.
# @return [Arrow::Table]
#   A new table created with `self.class.new(fields, arrays)`.
# @raise [ArgumentError] If `other` is neither a Hash nor a table.
def merge(other)
  replacements = {}
  removals = {}

  case other
  when Hash
    other.each do |name, value|
      name = name.to_s
      if value
        replacements[name] = ensure_raw_column(name, value)
      else
        # A falsy value marks the column for removal.
        removals[name] = true
      end
    end
  when Table
    other.columns.each do |column|
      name = column.name
      replacements[name] = ensure_raw_column(name, column)
    end
  else
    message = "merge target must be Hash or Arrow::Table: " +
              "<#{other.inspect}>: #{inspect}"
    raise ArgumentError, message
  end

  merged_columns = []
  columns.each do |column|
    name = column.name
    # delete: a replacement is used at most once, for the first match.
    replacement = replacements.delete(name)
    if replacement
      merged_columns << replacement
    elsif not removals.key?(name)
      merged_columns << ensure_raw_column(name, column)
    end
  end
  # Whatever was not consumed above is a brand new column.
  merged_columns.concat(replacements.values)

  fields = merged_columns.collect {|raw_column| raw_column[:field]}
  arrays = merged_columns.collect {|raw_column| raw_column[:data]}
  self.class.new(fields, arrays)
end
406
407 alias_method :remove_column_raw, :remove_column
# Removes a column identified by name or by position.
#
# @param name_or_index [String, Symbol, Integer]
#   A String or Symbol is looked up by column name. Anything else is
#   used as an index, where a negative index counts from the end.
# @return The value returned by `remove_column_raw` for the resolved index.
# @raise [KeyError] If no column has the given name.
# @raise [IndexError] If the index is outside `0..n_columns - 1`.
def remove_column(name_or_index)
  case name_or_index
  when String, Symbol
    column_name = name_or_index.to_s
    position = columns.index {|column| column.name == column_name}
    unless position
      message = "unknown column: #{name_or_index.inspect}: #{inspect}"
      raise KeyError.new(message)
    end
  else
    position = name_or_index
    position += n_columns if position < 0
    unless position >= 0 and position < n_columns
      message = "out of index (0..#{n_columns - 1}): " +
                "#{name_or_index.inspect}: #{inspect}"
      raise IndexError.new(message)
    end
  end
  remove_column_raw(position)
end
428
# Experimental
#
# Creates a Group for this table and the given keys.
#
# @param keys The grouping keys; collected into an array and passed
#   to Group.new unchanged.
# @return The new Group.
def group(*keys)
  target_table = self
  Group.new(target_table, keys)
end
433
# Experimental
#
# Creates a RollingWindow for this table.
#
# @param size The window size passed to RollingWindow.new; nil when omitted.
# @return The new RollingWindow.
def window(size: nil)
  target_table = self
  RollingWindow.new(target_table, size)
end
438
# Saves this table by delegating to a TableSaver.
#
# @param output The destination; passed to TableSaver.new as is.
# @param options [Hash] The save options; passed to TableSaver.new as is.
# @return The value returned by TableSaver#save.
def save(output, options={})
  TableSaver.new(self, output, options).save
end
443
# Creates a new table with the same schema whose column data are the
# results of calling `pack` on each column's data.
#
# @return A new instance of this table's class built from `schema`
#   and the packed data.
def pack
  packed_data = columns.map {|column| column.data.pack}
  self.class.new(schema, packed_data)
end
450
451 alias_method :to_s_raw, :to_s
# Formats this table as a string.
#
# @param options [Hash] The formatting options. `:format` selects the
#   layout; the whole Hash is also passed to the chosen formatter.
# @option options [:column, :list, :table, nil] :format
#   `:column` returns `to_s_raw`, `:list` uses TableListFormatter, and
#   `:table` or nil uses TableTableFormatter.
# @return The formatted table.
# @raise [ArgumentError] If `:format` is any other value.
def to_s(options={})
  format = options[:format]
  formatter_class =
    case format
    when :column
      # The column layout is the original to_s; no formatter is involved.
      return to_s_raw
    when :list
      TableListFormatter
    when :table, nil
      TableTableFormatter
    else
      message = ":format must be :column, :list, :table or nil"
      raise ArgumentError, "#{message}: <#{format.inspect}>"
    end
  formatter_class.new(self, options).format
end
468
469 alias_method :inspect_raw, :inspect
# Returns the inherited inspect output followed by a newline and the
# `to_s` output of this table.
#
# @return [String]
def inspect
  inherited = super
  "#{inherited}\n#{to_s}"
end
473
# Reports a method as available when `find_column` returns a truthy
# value for its name; otherwise defers to the inherited implementation.
#
# @param name The method name being queried.
# @param include_private Passed on to the inherited implementation.
# @return [Boolean]
def respond_to_missing?(name, include_private)
  find_column(name) ? true : super
end
478
# Resolves a call without arguments to the column that `find_column`
# returns for the method name. Every other call goes to the inherited
# implementation.
#
# @param name The name of the missing method.
# @param args The call arguments; columns are only looked up when empty.
# @return The column found by `find_column`, or the inherited result.
def method_missing(name, *args, &block)
  column = find_column(name) if args.empty?
  return column if column
  super
end
486
487 private
# Slices with an inclusive row range by converting it to the
# (offset, length) pair that `slice_raw` takes.
#
# @param from [Integer] The first row index.
# @param to [Integer] The last row index (inclusive).
# @return The value returned by `slice_raw`.
def slice_by_range(from, to)
  length = to - from + 1
  slice_raw(from, length)
end
491
# Normalizes column data into a Hash with a `:field` and a `:data`
# entry, where `:data` is a ChunkedArray.
#
# NOTE(review): this method lives inside `module Arrow`, so `Array`
# below presumably resolves to `Arrow::Array` rather than Ruby's
# `::Array` - confirm against the enclosing namespace.
#
# @param name The column name; used for the new field unless `data`
#   is a Column, whose own field is reused.
# @param data [Array, ChunkedArray, Column] The column data.
# @return [Hash] `{field: ..., data: ...}`.
# @raise [ArgumentError] If `data` is none of the supported classes.
def ensure_raw_column(name, data)
  case data
  when Array
    field = Field.new(name, data.value_data_type)
    # A plain array becomes a one-chunk ChunkedArray.
    chunked_array = ChunkedArray.new([data])
  when ChunkedArray
    field = Field.new(name, data.value_data_type)
    chunked_array = data
  when Column
    chunked_array = data.data
    unless chunked_array.is_a?(ChunkedArray)
      chunked_array = ChunkedArray.new([chunked_array])
    end
    field = data.field
  else
    message = "column must be Arrow::Array or Arrow::Column: " +
              "<#{name}>: <#{data.inspect}>: #{inspect}"
    raise ArgumentError, message
  end
  {
    field: field,
    data: chunked_array,
  }
end
518 end
519 end