# Origin: ceph.git, ceph/src/arrow/ruby/red-arrow/lib/arrow/table.rb
# (retrieved from the gitweb blob view at git.proxmox.com)
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
18 require "arrow/raw-table-converter"
# Mixins pulled into the enclosing class.
include ColumnContainable
include GenericFilterable
include GenericTakeable
include RecordContainable
# Loads a table by delegating to `TableLoader.load`.
#
# @param path The location to load from; forwarded unchanged.
# @param options [Hash] Loader options; forwarded unchanged.
# @return Whatever `TableLoader.load` returns.
def load(path, options={})
  TableLoader.load(path, options)
end
# Keep the original constructor reachable as a private method; the
# redefined #initialize delegates to it after normalizing its arguments.
alias_method :initialize_raw, :initialize
private :initialize_raw
# Creates a new {Arrow::Table}.
#
# @overload initialize(columns)
#
#   @param columns [::Array<Arrow::Column>] The columns of the table.
#
#   @example Create a table from columns
#     count_field = Arrow::Field.new("count", :uint32)
#     count_array = Arrow::UInt32Array.new([0, 2, nil, 4])
#     count_column = Arrow::Column.new(count_field, count_array)
#     visible_field = Arrow::Field.new("visible", :boolean)
#     visible_array = Arrow::BooleanArray.new([true, nil, nil, false])
#     visible_column = Arrow::Column.new(visible_field, visible_array)
#     Arrow::Table.new([count_column, visible_column])
#
# @overload initialize(raw_table)
#
#   @param raw_table [Hash<String, Arrow::Array>]
#     The pairs of column name and values of the table. Column values are
#     `Arrow::Array`.
#
#   @example Create a table from column name and values
#     Arrow::Table.new("count" => Arrow::UInt32Array.new([0, 2, nil, 4]),
#                      "visible" => Arrow::BooleanArray.new([true, nil, nil, false]))
#
# @overload initialize(raw_table)
#
#   @param raw_table [Hash<String, Arrow::ChunkedArray>]
#     The pairs of column name and values of the table. Column values are
#     `Arrow::ChunkedArray`.
#
#   @example Create a table from column name and values
#     count_chunks = [
#       Arrow::UInt32Array.new([0, 2]),
#       Arrow::UInt32Array.new([nil, 4]),
#     ]
#     visible_chunks = [
#       Arrow::BooleanArray.new([true]),
#       Arrow::BooleanArray.new([nil, nil, false]),
#     ]
#     Arrow::Table.new("count" => Arrow::ChunkedArray.new(count_chunks),
#                      "visible" => Arrow::ChunkedArray.new(visible_chunks))
#
# @overload initialize(raw_table)
#
#   @param raw_table [Hash<String, ::Array>]
#     The pairs of column name and values of the table. Column values are
#     `Array`.
#
#   @example Create a table from column name and values
#     Arrow::Table.new("count" => [0, 2, nil, 4],
#                      "visible" => [true, nil, nil, false])
#
# @overload initialize(schema, columns)
#
#   @param schema [Arrow::Schema] The schema of the table.
#     You can also specify schema as primitive Ruby objects.
#     See {Arrow::Schema#initialize} for details.
#
#   @param columns [::Array<Arrow::Column>] The data of the table.
#
#   @example Create a table from schema and columns
#     count_field = Arrow::Field.new("count", :uint32)
#     count_array = Arrow::UInt32Array.new([0, 2, nil, 4])
#     count_column = Arrow::Column.new(count_field, count_array)
#     visible_field = Arrow::Field.new("visible", :boolean)
#     visible_array = Arrow::BooleanArray.new([true, nil, nil, false])
#     visible_column = Arrow::Column.new(visible_field, visible_array)
#     Arrow::Table.new(Arrow::Schema.new([count_field, visible_field]),
#                      [count_column, visible_column])
#
# @overload initialize(schema, arrays)
#
#   @param schema [Arrow::Schema] The schema of the table.
#     You can also specify schema as primitive Ruby objects.
#     See {Arrow::Schema#initialize} for details.
#
#   @param arrays [::Array<Arrow::Array>] The data of the table.
#
#   @example Create a table from schema and arrays
#     count_field = Arrow::Field.new("count", :uint32)
#     count_array = Arrow::UInt32Array.new([0, 2, nil, 4])
#     visible_field = Arrow::Field.new("visible", :boolean)
#     visible_array = Arrow::BooleanArray.new([true, nil, nil, false])
#     Arrow::Table.new(Arrow::Schema.new([count_field, visible_field]),
#                      [count_array, visible_array])
#
# @overload initialize(schema, record_batches)
#
#   @param schema [Arrow::Schema] The schema of the table.
#     You can also specify schema as primitive Ruby objects.
#     See {Arrow::Schema#initialize} for details.
#
#   @param record_batches [::Array<Arrow::RecordBatch>] The data of the table.
#
#   @example Create a table from schema and record batches
#     count_field = Arrow::Field.new("count", :uint32)
#     visible_field = Arrow::Field.new("visible", :boolean)
#     schema = Arrow::Schema.new([count_field, visible_field])
#     record_batches = [
#       Arrow::RecordBatch.new(schema, [[0, true], [2, nil], [nil, nil]]),
#       Arrow::RecordBatch.new(schema, [[4, false]]),
#     ]
#     Arrow::Table.new(schema, record_batches)
#
# @overload initialize(schema, raw_records)
#
#   @param schema [Arrow::Schema] The schema of the table.
#     You can also specify schema as primitive Ruby objects.
#     See {Arrow::Schema#initialize} for details.
#
#   @param raw_records [::Array<::Array>] The data of the table as primitive
#     Ruby objects.
#
#   @example Create a table from schema and raw records
#     schema = {
#       count: :uint32,
#       visible: :boolean,
#     }
#     raw_records = [
#       [0, true],
#       [2, nil],
#       [nil, nil],
#       [4, false],
#     ]
#     Arrow::Table.new(schema, raw_records)
def initialize(*args)
  n_args = args.size
  case n_args
  when 1
    # Single argument: let RawTableConverter derive both schema and values.
    raw_table_converter = RawTableConverter.new(args[0])
    schema = raw_table_converter.schema
    values = raw_table_converter.values
  when 2
    # Two arguments: (schema, values). Anything that is not already a
    # Schema is passed to Schema.new.
    schema = args[0]
    schema = Schema.new(schema) unless schema.is_a?(Schema)
    values = args[1]
    # Only the first element decides how the values are normalized.
    case values[0]
    when ::Array
      # Raw Ruby rows are wrapped in a single record batch.
      values = [RecordBatch.new(schema, values)]
    when Column
      # Columns are replaced by their underlying data.
      values = values.collect(&:data)
    end
  else
    message = "wrong number of arguments (given #{n_args}, expected 1..2)"
    raise ArgumentError, message
  end
  initialize_raw(schema, values)
end
# Iterates over the record batches produced by a TableBatchReader built
# on this table.
#
# @yield [record_batch] Each value returned by the reader's `read_next`
#   until it returns a falsy value.
# @return [Enumerator] when no block is given.
def each_record_batch
  return to_enum(__method__) unless block_given?

  reader = TableBatchReader.new(self)
  # Parenthesized to make the assignment-in-condition explicit.
  while (record_batch = reader.read_next)
    yield(record_batch)
  end
end
# #size and #length are synonyms of #n_rows.
alias_method :size, :n_rows
alias_method :length, :n_rows

# Keep the original #slice reachable as #slice_raw; #slice_by_range
# delegates to it.
alias_method :slice_raw, :slice
# @overload slice(offset, length)
#
#   @param offset [Integer] The offset of sub Arrow::Table.
#   @param length [Integer] The length of sub Arrow::Table.
#   @return [Arrow::Table]
#     The sub `Arrow::Table` that covers only from
#     `offset` to `offset + length` range.
#
# @overload slice(index)
#
#   @param index [Integer] The index in this table.
#   @return [Arrow::Record]
#     The `Arrow::Record` corresponding to index of
#     the table.
#
# @overload slice(booleans)
#
#   @param booleans [::Array<Boolean>]
#     The values indicating the target rows.
#   @return [Arrow::Table]
#     The sub `Arrow::Table` that covers only the rows at the indices
#     where the value of `booleans` is true.
#
# @overload slice(boolean_array)
#
#   @param boolean_array [Arrow::BooleanArray]
#     The values indicating the target rows.
#   @return [Arrow::Table]
#     The sub `Arrow::Table` that covers only the rows at the indices
#     where the value of `boolean_array` is true.
#
# @overload slice(range)
#
#   @param range [Range] The range indicating the target rows.
#   @return [Arrow::Table]
#     The sub `Arrow::Table` that covers only rows of the range of indices.
#
# @overload slice(conditions)
#
#   @param conditions [Hash] The conditions to select records.
#   @return [Arrow::Table]
#     The sub `Arrow::Table` that covers only rows matched by the
#     conditions.
#
# @overload slice
#
#   @yield [slicer] Gives slicer that constructs condition to select records.
#   @yieldparam slicer [Arrow::Slicer] The slicer that helps us to
#     build condition.
#   @yieldreturn [Arrow::Slicer::Condition, ::Array<Arrow::Slicer::Condition>]
#     The condition to select records.
#   @return [Arrow::Table]
#     The sub `Arrow::Table` that covers only rows matched by condition
#     specified by slicer.
def slice(*args)
  slicers = []
  if block_given?
    unless args.empty?
      raise ArgumentError, "must not specify both arguments and block"
    end
    # The block receives a Slicer and returns one slicer or an array of them.
    block_slicer = yield(Slicer.new(self))
    case block_slicer
    when ::Array
      slicers.concat(block_slicer)
    else
      slicers << block_slicer
    end
  else
    expected_n_args = nil
    case args.size
    when 1
      case args[0]
      when Integer
        # A single index yields one Record, or nil when it is out of range
        # (negative indices count from the end).
        index = args[0]
        index += n_rows if index < 0
        return nil if index < 0
        return nil if index >= n_rows
        return Record.new(self, index)
      when Hash
        # Each key/value pair becomes one or two conditions on slicer[key];
        # all conditions are then combined with `&`.
        condition_pairs = args[0]
        slicer = Slicer.new(self)
        conditions = []
        condition_pairs.each do |key, value|
          case value
          when Range
            # TODO: Optimize "begin <= key <= end" case by missing "between" kernel
            # https://issues.apache.org/jira/browse/ARROW-9843
            unless value.begin.nil?
              conditions << (slicer[key] >= value.begin)
            end
            unless value.end.nil?
              if value.exclude_end?
                conditions << (slicer[key] < value.end)
              else
                conditions << (slicer[key] <= value.end)
              end
            end
          else
            conditions << (slicer[key] == value)
          end
        end
        slicers << conditions.inject(:&)
      else
        slicers << args[0]
      end
    when 2
      # (offset, length) is handled as the half-open range of row indices.
      offset, length = args
      slicers << (offset...(offset + length))
    else
      expected_n_args = "1..2"
    end
    if expected_n_args
      message = "wrong number of arguments " +
        "(given #{args.size}, expected #{expected_n_args})"
      raise ArgumentError, message
    end
  end

  filter_options = Arrow::FilterOptions.new
  filter_options.null_selection_behavior = :emit_null
  sliced_tables = []
  slicers.each do |slicer|
    # Objects that respond to #evaluate are evaluated first; the result is
    # then dispatched like any other slicer.
    slicer = slicer.evaluate if slicer.respond_to?(:evaluate)
    case slicer
    when Integer
      # An Integer slicer selects from that row through the last row.
      slicer += n_rows if slicer < 0
      sliced_tables << slice_by_range(slicer, n_rows - 1)
    when Range
      original_from = from = slicer.first
      to = slicer.last
      to -= 1 if slicer.exclude_end?
      from += n_rows if from < 0
      if from < 0 || from >= n_rows
        message =
          "offset is out of range (-#{n_rows + 1},#{n_rows}): " +
          "#{original_from}"
        raise ArgumentError, message
      end
      to += n_rows if to < 0
      sliced_tables << slice_by_range(from, to)
    when ::Array, BooleanArray, ChunkedArray
      sliced_tables << filter(slicer, filter_options)
    else
      message = "slicer must be Integer, Range, (from, to), " +
        "Arrow::ChunkedArray of Arrow::BooleanArray, " +
        "Arrow::BooleanArray or Arrow::Slicer::Condition: #{slicer.inspect}"
      raise ArgumentError, message
    end
  end
  # Several slicers produce several tables; they are concatenated in order.
  if sliced_tables.size > 1
    sliced_tables[0].concatenate(sliced_tables[1..-1])
  else
    sliced_tables[0]
  end
end
# Builds a new table from this table's columns combined with `other`.
#
# When `other` is a Hash, keys are converted with `to_s`; a truthy value
# adds or replaces the column with that name, and a falsy value removes
# it. When `other` is a Table, every one of its columns adds or replaces
# the column with the same name. Existing columns keep their position;
# columns with new names are appended.
#
# @param other [Hash, Arrow::Table] The merge target.
# @return [Arrow::Table]
# @raise [ArgumentError] If `other` is neither a Hash nor a Table.
def merge(other)
  added_columns = {}
  removed_columns = {}

  case other
  when Hash
    other.each do |name, value|
      name = name.to_s
      if value
        added_columns[name] = ensure_raw_column(name, value)
      else
        removed_columns[name] = true
      end
    end
  when Table
    other.columns.each do |column|
      name = column.name
      added_columns[name] = ensure_raw_column(name, column)
    end
  else
    message = "merge target must be Hash or Arrow::Table: " +
      "<#{other.inspect}>: #{inspect}"
    raise ArgumentError, message
  end

  new_columns = []
  columns.each do |column|
    column_name = column.name
    # A replacement takes the position of the column it replaces.
    new_column = added_columns.delete(column_name)
    if new_column
      new_columns << new_column
      next
    end
    next if removed_columns.key?(column_name)
    new_columns << ensure_raw_column(column_name, column)
  end
  # Whatever is still in added_columns has a new name: append it.
  added_columns.each_value do |new_column|
    new_columns << new_column
  end
  new_fields = []
  new_arrays = []
  new_columns.each do |new_column|
    new_fields << new_column[:field]
    new_arrays << new_column[:data]
  end
  self.class.new(new_fields, new_arrays)
end
# Keep the original #remove_column reachable as #remove_column_raw; the
# redefined #remove_column resolves an index and delegates to it.
alias_method :remove_column_raw, :remove_column
# Removes one column, addressed either by name or by index.
#
# @param name_or_index [String, Symbol, Integer] A column name, or a
#   column index (negative indices count from the end).
# @return Whatever `remove_column_raw` returns for the resolved index.
# @raise [KeyError] If no column has the given name.
# @raise [IndexError] If the index is outside `0...n_columns`.
def remove_column(name_or_index)
  case name_or_index
  when String, Symbol
    name = name_or_index.to_s
    index = columns.index {|column| column.name == name}
    if index.nil?
      message = "unknown column: #{name_or_index.inspect}: #{inspect}"
      raise KeyError, message
    end
  else
    index = name_or_index
    index += n_columns if index < 0
    if index < 0 || index >= n_columns
      message = "out of index (0..#{n_columns - 1}): " +
        "#{name_or_index.inspect}: #{inspect}"
      raise IndexError, message
    end
  end
  remove_column_raw(index)
end
# Creates a Group over this table for the given keys.
#
# @param keys The grouping keys; collected into an array and handed to
#   `Group.new` unchanged.
# @return [Arrow::Group]
def group(*keys)
  Group.new(self, keys)
end
# Creates a RollingWindow over this table.
#
# @param size The window size handed to `RollingWindow.new`; defaults to nil.
# @return [Arrow::RollingWindow]
def window(size: nil)
  RollingWindow.new(self, size)
end
# Saves this table through a TableSaver.
#
# @param output The destination handed to `TableSaver.new`.
# @param options [Hash] Saver options handed to `TableSaver.new`.
# @return Whatever the saver's `save` returns.
def save(output, options={})
  saver = TableSaver.new(self, output, options)
  saver.save
end
# Builds a new table with the same schema whose column data has been
# replaced by the result of calling `pack` on each column's data.
#
# @return A new instance of this table's class.
def pack
  packed_arrays = columns.collect do |column|
    column.data.pack
  end
  self.class.new(schema, packed_arrays)
end
# Keep the original #to_s reachable as #to_s_raw.
alias_method :to_s_raw, :to_s
# Formats this table as a string.
#
# @param options [Hash] Formatting options. `options[:format]` selects
#   the layout: `:column` returns `to_s_raw`, `:list` uses
#   TableListFormatter, and `:table` or nil uses TableTableFormatter.
#   The whole hash is also handed to the formatter.
# @return The formatter's result, or the result of `to_s_raw`.
# @raise [ArgumentError] If `options[:format]` is any other value.
def to_s(options={})
  format = options[:format]
  case format
  when :column
    return to_s_raw
  when :list
    formatter_class = TableListFormatter
  when :table, nil
    formatter_class = TableTableFormatter
  else
    message = ":format must be :column, :list, :table or nil"
    raise ArgumentError, "#{message}: <#{format.inspect}>"
  end
  formatter = formatter_class.new(self, options)
  formatter.format
end
# Keep the original #inspect reachable as #inspect_raw.
alias_method :inspect_raw, :inspect
# Reports column names as methods this object responds to, matching the
# column lookup done in #method_missing.
#
# @param name [Symbol] The method name being queried.
# @param include_private [Boolean] Forwarded to `super`.
# @return [Boolean] true when `find_column(name)` finds a column,
#   otherwise the result of `super`.
def respond_to_missing?(name, include_private)
  return true if find_column(name)
  super
end
# Lets a column be read by calling its name as a method without arguments.
#
# @param name [Symbol] The missing method name.
# @return The column found by `find_column(name)`. Any call with
#   arguments, or a name that matches no column, falls through to `super`.
def method_missing(name, *args, &block)
  if args.empty?
    column = find_column(name)
    return column if column
  end
  super
end
# Slices the inclusive row range `from..to` by converting it to the
# (offset, length) pair that `slice_raw` takes.
#
# @param from [Integer] The first row index.
# @param to [Integer] The last row index (inclusive).
# @return Whatever `slice_raw` returns.
def slice_by_range(from, to)
  slice_raw(from, to - from + 1)
end
# Normalizes column data into a `{field:, data:}` pair.
#
# @param name The column name; used to build the field for Array and
#   ChunkedArray data, and in the error message.
# @param data [Array, ChunkedArray, Column] The column data. An Array is
#   wrapped in a one-chunk ChunkedArray; a Column contributes its own
#   field, so `name` is not used for the field in that case.
# @return [Hash] `field` and `data` (always a ChunkedArray).
# @raise [ArgumentError] If `data` is none of the supported types.
def ensure_raw_column(name, data)
  case data
  when Array
    {
      field: Field.new(name, data.value_data_type),
      data: ChunkedArray.new([data]),
    }
  when ChunkedArray
    {
      field: Field.new(name, data.value_data_type),
      data: data,
    }
  when Column
    column = data
    data = column.data
    data = ChunkedArray.new([data]) unless data.is_a?(ChunkedArray)
    {
      field: column.field,
      data: data,
    }
  else
    message = "column must be Arrow::Array or Arrow::Column: " +
      "<#{name}>: <#{data.inspect}>: #{inspect}"
    raise ArgumentError, message
  end
end