]> git.proxmox.com Git - ceph.git/blob - ceph/src/arrow/r/R/table.R
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / r / R / table.R
1 # Licensed to the Apache Software Foundation (ASF) under one
2 # or more contributor license agreements. See the NOTICE file
3 # distributed with this work for additional information
4 # regarding copyright ownership. The ASF licenses this file
5 # to you under the Apache License, Version 2.0 (the
6 # "License"); you may not use this file except in compliance
7 # with the License. You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing,
12 # software distributed under the License is distributed on an
13 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 # KIND, either express or implied. See the License for the
15 # specific language governing permissions and limitations
16 # under the License.
17
18 #' @include record-batch.R
19 #' @title Table class
20 #' @description A Table is a sequence of [chunked arrays][ChunkedArray]. They
21 #' have a similar interface to [record batches][RecordBatch], but they can be
22 #' composed from multiple record batches or chunked arrays.
23 #' @usage NULL
24 #' @format NULL
25 #' @docType class
26 #'
27 #' @section S3 Methods and Usage:
28 #' Tables are data-frame-like, and many methods you expect to work on
29 #' a `data.frame` are implemented for `Table`. This includes `[`, `[[`,
30 #' `$`, `names`, `dim`, `nrow`, `ncol`, `head`, and `tail`. You can also pull
31 #' the data from an Arrow table into R with `as.data.frame()`. See the
32 #' examples.
33 #'
34 #' A caveat about the `$` method: because `Table` is an `R6` object,
35 #' `$` is also used to access the object's methods (see below). Methods take
36 #' precedence over the table's columns. So, `tab$Slice` would return the
37 #' "Slice" method function even if there were a column in the table called
38 #' "Slice".
39 #'
40 #' @section R6 Methods:
41 #' In addition to the more R-friendly S3 methods, a `Table` object has
42 #' the following R6 methods that map onto the underlying C++ methods:
43 #'
44 #' - `$column(i)`: Extract a `ChunkedArray` by integer position from the table
45 #' - `$ColumnNames()`: Get all column names (called by `names(tab)`)
46 #' - `$RenameColumns(value)`: Set all column names (called by `names(tab) <- value`)
47 #' - `$GetColumnByName(name)`: Extract a `ChunkedArray` by string name
48 #' - `$field(i)`: Extract a `Field` from the table schema by integer position
49 #' - `$SelectColumns(indices)`: Return new `Table` with specified columns, expressed as 0-based integers.
50 #' - `$Slice(offset, length = NULL)`: Create a zero-copy view starting at the
51 #' indicated integer offset and going for the given length, or to the end
52 #' of the table if `NULL`, the default.
53 #' - `$Take(i)`: return a `Table` with rows at positions given by
54 #' integers `i`. If `i` is an Arrow `Array` or `ChunkedArray`, it will be
55 #' coerced to an R vector before taking.
56 #' - `$Filter(i, keep_na = TRUE)`: return a `Table` with rows at positions where logical
57 #' vector or Arrow boolean-type `(Chunked)Array` `i` is `TRUE`.
58 #' - `$SortIndices(names, descending = FALSE)`: return an `Array` of integer row
59 #' positions that can be used to rearrange the `Table` in ascending or descending
60 #' order by the first named column, breaking ties with further named columns.
61 #' `descending` can be a logical vector of length one or of the same length as
62 #' `names`.
63 #' - `$serialize(output_stream, ...)`: Write the table to the given
64 #' [OutputStream]
65 #' - `$cast(target_schema, safe = TRUE, options = cast_options(safe))`: Alter
66 #' the schema of the table.
67 #'
68 #' There are also some active bindings:
69 #' - `$num_columns`
70 #' - `$num_rows`
71 #' - `$schema`
72 #' - `$metadata`: Returns the key-value metadata of the `Schema` as a named list.
73 #' Modify or replace by assigning in (`tab$metadata <- new_metadata`).
74 #' All list elements are coerced to string. See `schema()` for more information.
75 #' - `$columns`: Returns a list of `ChunkedArray`s
76 #' @rdname Table
77 #' @name Table
78 #' @export
Table <- R6Class("Table",
  inherit = ArrowTabular,
  public = list(
    # Column lookup and naming: each method hands `self` and its arguments
    # to the `Table__*` binding of the same name.
    column = function(i) {
      Table__column(self, i)
    },
    ColumnNames = function() {
      Table__ColumnNames(self)
    },
    RenameColumns = function(value) {
      Table__RenameColumns(self, value)
    },
    # `name` must be a character vector of length one; both conditions are
    # asserted before the binding is called.
    GetColumnByName = function(name) {
      assert_is(name, "character")
      assert_that(length(name) == 1)
      Table__GetColumnByName(self, name)
    },

    # Column editing: forwarded unchanged to the matching bindings.
    RemoveColumn = function(i) {
      Table__RemoveColumn(self, i)
    },
    AddColumn = function(i, new_field, value) {
      Table__AddColumn(self, i, new_field, value)
    },
    SetColumn = function(i, new_field, value) {
      Table__SetColumn(self, i, new_field, value)
    },
    ReplaceSchemaMetadata = function(new) {
      Table__ReplaceSchemaMetadata(self, new)
    },
    field = function(i) {
      Table__field(self, i)
    },

    # Writes this table via `write_table()`, passing `...` along.
    serialize = function(output_stream, ...) {
      write_table(self, output_stream, ...)
    },
    # Converts the table with `Table__to_dataframe()`; the threading flag is
    # read from `option_use_threads()` at call time.
    to_data_frame = function() {
      threads <- option_use_threads()
      Table__to_dataframe(self, use_threads = threads)
    },
    # Casts to `target_schema`, which must be a `Schema` whose names are
    # identical to this table's schema names. `options` defaults to
    # `cast_options(safe, ...)` and is only evaluated once both checks pass.
    cast = function(target_schema, safe = TRUE, ..., options = cast_options(safe, ...)) {
      assert_is(target_schema, "Schema")
      names_match <- identical(self$schema$names, target_schema$names)
      assert_that(names_match, msg = "incompatible schemas")
      Table__cast(self, target_schema, options)
    },
    SelectColumns = function(indices) {
      Table__SelectColumns(self, indices)
    },
    # With a `length`, uses the two-argument slice binding; with the default
    # `NULL`, uses the offset-only binding.
    Slice = function(offset, length = NULL) {
      if (!is.null(length)) {
        Table__Slice2(self, offset, length)
      } else {
        Table__Slice1(self, offset)
      }
    },
    # Take, Filter, and SortIndices are methods on ArrowTabular

    # FALSE for anything that does not inherit from "Table"; otherwise the
    # result of `Table__Equals()`. `check_metadata` is reduced to a single
    # TRUE/FALSE with `isTRUE()`. `...` is accepted but not used here.
    Equals = function(other, check_metadata = FALSE, ...) {
      other_is_table <- inherits(other, "Table")
      other_is_table && Table__Equals(self, other, isTRUE(check_metadata))
    },
    Validate = function() {
      Table__Validate(self)
    },
    ValidateFull = function() {
      Table__ValidateFull(self)
    },
    # Calls the `_arrow_Table__Reset` native routine on this object, then
    # runs the parent class's `invalidate()`.
    invalidate = function() {
      .Call(`_arrow_Table__Reset`, self)
      super$invalidate()
    }
  ),
  active = list(
    # Read-only bindings; each one queries the matching `Table__*` binding.
    num_columns = function() {
      Table__num_columns(self)
    },
    num_rows = function() {
      Table__num_rows(self)
    },
    schema = function() {
      Table__schema(self)
    },
    columns = function() {
      Table__columns(self)
    }
  )
)
132
# Build a Table from the objects supplied in `...`.
#
# `...`    one or more inputs, collected with `list2()`; at least one is
#          required.
# `schema` passed through to the underlying constructor; defaults to NULL.
#
# When `all_record_batches()` is true for the inputs they are combined with
# `Table__from_record_batches()`. Otherwise the inputs go through
# `recycle_scalars()` and then `Table__from_dots()`.
Table$create <- function(..., schema = NULL) {
  dots <- list2(...)
  # Guarantee a names attribute: wholly unnamed input gets empty-string names
  if (is.null(names(dots))) {
    names(dots) <- character(length(dots))
  }
  stopifnot(length(dots) > 0)

  if (!all_record_batches(dots)) {
    # If any arrays are length 1, recycle them
    recycled <- recycle_scalars(dots)
    return(Table__from_dots(recycled, schema, option_use_threads()))
  }

  Table__from_record_batches(dots, schema)
}
150
151 #' @export
names.Table <- function(x) {
  # S3 `names()` method for Table: returns the result of `$ColumnNames()`
  x$ColumnNames()
}
153
154 #' @param ... A `data.frame` or a named set of Arrays or vectors. If given a
155 #' mixture of data.frames and named vectors, the inputs will be autospliced together
156 #' (see examples). Alternatively, you can provide a single Arrow IPC
157 #' `InputStream`, `Message`, `Buffer`, or R `raw` object containing a `Buffer`.
158 #' @param schema a [Schema], or `NULL` (the default) to infer the schema from
159 #' the data in `...`. When providing an Arrow IPC buffer, `schema` is required.
160 #' @rdname Table
161 #' @examplesIf arrow_available()
162 #' tbl <- arrow_table(name = rownames(mtcars), mtcars)
163 #' dim(tbl)
164 #' dim(head(tbl))
165 #' names(tbl)
166 #' tbl$mpg
167 #' tbl[["cyl"]]
168 #' as.data.frame(tbl[4:8, c("gear", "hp", "wt")])
169 #' @export
# `arrow_table` is bound to the same function object as `Table$create`, so
# both names take identical arguments (`...`, `schema`).
arrow_table <- Table$create