]> git.proxmox.com Git - ceph.git/blob - ceph/src/arrow/r/R/feather.R
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / r / R / feather.R
1 # Licensed to the Apache Software Foundation (ASF) under one
2 # or more contributor license agreements. See the NOTICE file
3 # distributed with this work for additional information
4 # regarding copyright ownership. The ASF licenses this file
5 # to you under the Apache License, Version 2.0 (the
6 # "License"); you may not use this file except in compliance
7 # with the License. You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing,
12 # software distributed under the License is distributed on an
13 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 # KIND, either express or implied. See the License for the
15 # specific language governing permissions and limitations
16 # under the License.
17
#' Write data in the Feather format
#'
#' Feather provides binary columnar serialization for data frames.
#' It is designed to make reading and writing data frames efficient,
#' and to make sharing data across data analysis languages easy.
#' This function writes both the original, limited specification of the format
#' and the version 2 specification, which is the Apache Arrow IPC file format.
#'
#' @param x `data.frame`, [RecordBatch], or [Table]
#' @param sink A string file path, URI, or [OutputStream], or path in a file
#' system (`SubTreeFileSystem`)
#' @param version integer Feather file version, either 1 or 2. Version 2 is
#' the current version. Version 1 is the more limited legacy format.
#' @param chunk_size For V2 files, the number of rows that each chunk of data
#' should have in the file. Use a smaller `chunk_size` when you need faster
#' random row access. Default is 64K. This option is not supported for V1.
#' @param compression Name of compression codec to use, if any. Default is
#' "lz4" if LZ4 is available in your build of the Arrow C++ library, otherwise
#' "uncompressed". "zstd" is the other available codec and generally has better
#' compression ratios in exchange for slower read and write performance.
#' See [codec_is_available()]. This option is not supported for V1.
#' @param compression_level If `compression` is "zstd", you may
#' specify an integer compression level. If omitted, the compression codec's
#' default compression level is used. Supplying a level together with any
#' other codec is an error.
#'
#' @return The input `x`, invisibly. Note that if `sink` is an [OutputStream],
#' the stream will be left open.
#' @export
#' @seealso [RecordBatchWriter] for lower-level access to writing Arrow IPC data.
#' @seealso [Schema] for information about schemas and metadata handling.
#' @examplesIf arrow_available()
#' tf <- tempfile()
#' on.exit(unlink(tf))
#' write_feather(mtcars, tf)
#' @include arrow-package.R
write_feather <- function(x,
                          sink,
                          version = 2,
                          chunk_size = 65536L,
                          compression = c("default", "lz4", "uncompressed", "zstd"),
                          compression_level = NULL) {
  # Coerce and validate each option on its own before touching the data.
  # The assert_that() expressions are kept verbatim because assertion failure
  # messages can be derived from the expression text.
  version <- as.integer(version)
  assert_that(version %in% 1:2)
  compression <- match.arg(compression)
  chunk_size <- as.integer(chunk_size)
  assert_that(chunk_size > 0)

  # Resolve "default": LZ4 for V2 when the codec is available, otherwise
  # write uncompressed
  if (compression == "default") {
    lz4_usable <- version == 2 && codec_is_available("lz4")
    compression <- if (lz4_usable) "lz4" else "uncompressed"
  }

  # -1 is the sentinel for "use the codec's default level"
  compression_level <- if (is.null(compression_level)) {
    -1L
  } else {
    as.integer(compression_level)
  }

  # Now check that the options make sense together: V1 accepts none of the
  # tuning options, so anything other than the defaults is rejected
  if (version == 1) {
    if (chunk_size != 65536L) {
      stop("Feather version 1 does not support the 'chunk_size' option", call. = FALSE)
    }
    if (compression != "uncompressed") {
      stop("Feather version 1 does not support the 'compression' option", call. = FALSE)
    }
    if (compression_level != -1L) {
      stop("Feather version 1 does not support the 'compression_level' option", call. = FALSE)
    }
  }
  if (compression != "zstd" && compression_level != -1L) {
    stop("Can only specify a 'compression_level' when 'compression' is 'zstd'", call. = FALSE)
  }

  # The writer numbers the formats one higher: 2 means V1 and 3 means V2
  ipc_version <- version + 1L

  # "lz4" is a convenience alias for the "lz4_frame" codec
  codec_name <- if (compression == "lz4") "lz4_frame" else compression
  codec <- compression_from_name(codec_name)

  # Keep the caller's object so it can be returned unchanged
  input <- x
  if (is.data.frame(x) || inherits(x, "RecordBatch")) {
    x <- Table$create(x)
  }
  assert_that(is_writable_table(x))

  # Only a stream opened here is closed here; a caller-supplied OutputStream
  # is left open
  if (!inherits(sink, "OutputStream")) {
    sink <- make_output_stream(sink)
    on.exit(sink$close())
  }
  ipc___WriteFeather__Table(sink, x, ipc_version, chunk_size, codec, compression_level)
  invisible(input)
}
116
#' Read a Feather file
#'
#' Feather provides binary columnar serialization for data frames.
#' It is designed to make reading and writing data frames efficient,
#' and to make sharing data across data analysis languages easy.
#' This function reads both the original, limited specification of the format
#' and the version 2 specification, which is the Apache Arrow IPC file format.
#'
#' If `file` is not already a `RandomAccessFile`, it is opened with
#' [make_readable_file()] and closed again when the function exits. A
#' `RandomAccessFile` supplied by the caller is left open.
#'
#' @inheritParams read_ipc_stream
#' @inheritParams read_delim_arrow
#' @param ... additional parameters, passed to [make_readable_file()].
#'
#' @return A `data.frame` if `as_data_frame` is `TRUE` (the default), or an
#' Arrow [Table] otherwise
#'
#' @export
#' @seealso [FeatherReader] and [RecordBatchReader] for lower-level access to reading Arrow IPC data.
#' @examplesIf arrow_available()
#' tf <- tempfile()
#' on.exit(unlink(tf))
#' write_feather(mtcars, tf)
#' df <- read_feather(tf)
#' dim(df)
#' # Can select columns
#' df <- read_feather(tf, col_select = starts_with("d"))
read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, ...) {
  # Open the input ourselves when needed, and close only what we opened
  if (!inherits(file, "RandomAccessFile")) {
    file <- make_readable_file(file, ...)
    on.exit(file$close())
  }
  reader <- FeatherReader$create(file)

  # Resolve the tidyselect expression against the file's column names;
  # NULL means no selection was requested
  selection <- enquo(col_select)
  columns <- NULL
  if (!quo_is_null(selection)) {
    columns <- vars_select(names(reader), !!selection)
  }

  # Errors raised while reading are handed to read_compressed_error
  tbl <- tryCatch(
    reader$Read(columns),
    error = read_compressed_error
  )

  if (isTRUE(as_data_frame)) {
    as.data.frame(tbl)
  } else {
    tbl
  }
}
164
#' @title FeatherReader class
#' @rdname FeatherReader
#' @name FeatherReader
#' @docType class
#' @usage NULL
#' @format NULL
#' @description This class enables you to interact with Feather files. Create
#' one to connect to a file or other InputStream, and call `Read()` on it to
#' make an `arrow::Table`. See its usage in [`read_feather()`].
#'
#' @section Factory:
#'
#' The `FeatherReader$create()` factory method instantiates the object and
#' takes the following argument:
#'
#' - `file` an Arrow file connection object inheriting from `RandomAccessFile`.
#'
#' @section Methods:
#'
#' - `$Read(columns = NULL)`: Returns a `Table` of the selected columns.
#'   `columns` is a vector of integer indices; [`read_feather()`] also passes
#'   the column names it selected. With `NULL` (the default) no column
#'   selection is applied, which is what [`read_feather()`] passes when no
#'   `col_select` is given.
#' - `$print(...)`: Prints a header line followed by the schema of the file,
#'   and returns the reader invisibly
#' - `$column_names`: Active binding, returns the column names in the Feather file
#' - `$schema`: Active binding, returns the schema of the Feather file
#' - `$version`: Active binding, returns `1` or `2`, according to the Feather
#'   file version
#'
#' @export
#' @include arrow-package.R
FeatherReader <- R6Class("FeatherReader",
  inherit = ArrowObject,
  public = list(
    # `columns` defaults to NULL so that `reader$Read()` works without an
    # explicit argument; NULL is the value read_feather() passes when the
    # caller makes no column selection.
    Read = function(columns = NULL) {
      ipc___feather___Reader__Read(self, columns)
    },
    print = function(...) {
      cat("FeatherReader:\n")
      print(self$schema)
      invisible(self)
    }
  ),
  active = list(
    # versions are officially 2 for V1 and 3 for V2, so shift down by one to
    # report the user-facing 1 or 2
    version = function() ipc___feather___Reader__version(self) - 1L,
    column_names = function() names(self$schema),
    schema = function() ipc___feather___Reader__schema(self)
  )
)
212
#' @export
names.FeatherReader <- function(x) {
  # S3 names() method: delegate to the reader's `column_names` field
  x$column_names
}
215
# Factory: open a FeatherReader on `file`, which must inherit from
# RandomAccessFile.
FeatherReader$create <- function(file) {
  # Reject anything that is not a RandomAccessFile before opening the reader
  assert_is(file, "RandomAccessFile")
  reader <- ipc___feather___Reader__Open(file)
  reader
}