]> git.proxmox.com Git - ceph.git/blob - ceph/src/arrow/r/R/metadata.R
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / r / R / metadata.R
1 # Licensed to the Apache Software Foundation (ASF) under one
2 # or more contributor license agreements. See the NOTICE file
3 # distributed with this work for additional information
4 # regarding copyright ownership. The ASF licenses this file
5 # to you under the Apache License, Version 2.0 (the
6 # "License"); you may not use this file except in compliance
7 # with the License. You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing,
12 # software distributed under the License is distributed on an
13 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 # KIND, either express or implied. See the License for the
15 # specific language governing permissions and limitations
16 # under the License.
17
18 #' @importFrom utils object.size
19 .serialize_arrow_r_metadata <- function(x) {
20 assert_is(x, "list")
21
22 # drop problems attributes (most likely from readr)
23 x[["attributes"]][["problems"]] <- NULL
24
25 out <- serialize(x, NULL, ascii = TRUE)
26
27 # if the metadata is over 100 kB, compress
28 if (option_compress_metadata() && object.size(out) > 100000) {
29 out_comp <- serialize(memCompress(out, type = "gzip"), NULL, ascii = TRUE)
30
31 # but ensure that the compression+serialization is effective.
32 if (object.size(out) > object.size(out_comp)) out <- out_comp
33 }
34
35 rawToChar(out)
36 }
37
38 .unserialize_arrow_r_metadata <- function(x) {
39 tryCatch(
40 expr = {
41 out <- unserialize(charToRaw(x))
42
43 # if this is still raw, try decompressing
44 if (is.raw(out)) {
45 out <- unserialize(memDecompress(out, type = "gzip"))
46 }
47 out
48 },
49 error = function(e) {
50 warning("Invalid metadata$r", call. = FALSE)
51 NULL
52 }
53 )
54 }
55
56 #' @importFrom rlang trace_back
57 apply_arrow_r_metadata <- function(x, r_metadata) {
58 tryCatch(
59 expr = {
60 columns_metadata <- r_metadata$columns
61 if (is.data.frame(x)) {
62 if (length(names(x)) && !is.null(columns_metadata)) {
63 for (name in intersect(names(columns_metadata), names(x))) {
64 x[[name]] <- apply_arrow_r_metadata(x[[name]], columns_metadata[[name]])
65 }
66 }
67 } else if (is.list(x) && !inherits(x, "POSIXlt") && !is.null(columns_metadata)) {
68 # If we have a list and "columns_metadata" this applies row-level metadata
69 # inside of a column in a dataframe.
70
71 # However, if we are inside of a dplyr collection (including all datasets),
72 # we cannot apply this row-level metadata, since the order of the rows is
73 # not guaranteed to be the same, so don't even try, but warn what's going on
74 trace <- trace_back()
75 # TODO: remove `trace$calls %||% trace$call` once rlang > 0.4.11 is released
76 in_dplyr_collect <- any(map_lgl(trace$calls %||% trace$call, function(x) {
77 grepl("collect.arrow_dplyr_query", x, fixed = TRUE)[[1]]
78 }))
79 if (in_dplyr_collect) {
80 warning(
81 "Row-level metadata is not compatible with this operation and has ",
82 "been ignored",
83 call. = FALSE
84 )
85 } else {
86 x <- map2(x, columns_metadata, function(.x, .y) {
87 apply_arrow_r_metadata(.x, .y)
88 })
89 }
90 x
91 }
92
93 if (!is.null(r_metadata$attributes)) {
94 attributes(x)[names(r_metadata$attributes)] <- r_metadata$attributes
95 if (inherits(x, "POSIXlt")) {
96 # We store POSIXlt as a StructArray, which is translated back to R
97 # as a data.frame, but while data frames have a row.names = c(NA, nrow(x))
98 # attribute, POSIXlt does not, so since this is now no longer an object
99 # of class data.frame, remove the extraneous attribute
100 attr(x, "row.names") <- NULL
101 }
102 if (!is.null(attr(x, ".group_vars")) && requireNamespace("dplyr", quietly = TRUE)) {
103 x <- dplyr::group_by(x, !!!syms(attr(x, ".group_vars")))
104 attr(x, ".group_vars") <- NULL
105 }
106 }
107 },
108 error = function(e) {
109 warning("Invalid metadata$r", call. = FALSE)
110 }
111 )
112 x
113 }
114
115 remove_attributes <- function(x) {
116 removed_attributes <- character()
117 if (identical(class(x), c("tbl_df", "tbl", "data.frame"))) {
118 removed_attributes <- c("class", "row.names", "names")
119 } else if (inherits(x, "data.frame")) {
120 removed_attributes <- c("row.names", "names")
121 } else if (inherits(x, "factor")) {
122 removed_attributes <- c("class", "levels")
123 } else if (inherits(x, c("integer64", "Date", "arrow_binary", "arrow_large_binary"))) {
124 removed_attributes <- c("class")
125 } else if (inherits(x, "arrow_fixed_size_binary")) {
126 removed_attributes <- c("class", "byte_width")
127 } else if (inherits(x, "POSIXct")) {
128 removed_attributes <- c("class", "tzone")
129 } else if (inherits(x, "hms") || inherits(x, "difftime")) {
130 removed_attributes <- c("class", "units")
131 }
132 removed_attributes
133 }
134
135 arrow_attributes <- function(x, only_top_level = FALSE) {
136 if (inherits(x, "grouped_df")) {
137 # Keep only the group var names, not the rest of the cached data that dplyr
138 # uses, which may be large
139 if (requireNamespace("dplyr", quietly = TRUE)) {
140 gv <- dplyr::group_vars(x)
141 x <- dplyr::ungroup(x)
142 # ungroup() first, then set attribute, bc ungroup() would erase it
143 attr(x, ".group_vars") <- gv
144 } else {
145 # Regardless, we shouldn't keep groups around
146 attr(x, "groups") <- NULL
147 }
148 }
149 att <- attributes(x)
150
151 removed_attributes <- remove_attributes(x)
152
153 att <- att[setdiff(names(att), removed_attributes)]
154 if (isTRUE(only_top_level)) {
155 return(att)
156 }
157
158 if (is.data.frame(x)) {
159 columns <- map(x, arrow_attributes)
160 out <- if (length(att) || !all(map_lgl(columns, is.null))) {
161 list(attributes = att, columns = columns)
162 }
163 return(out)
164 }
165
166 columns <- NULL
167 attempt_to_save_row_level <- getOption("arrow.preserve_row_level_metadata", FALSE) &&
168 is.list(x) && !inherits(x, "POSIXlt")
169 if (attempt_to_save_row_level) {
170 # However, if we are inside of a dplyr collection (including all datasets),
171 # we cannot apply this row-level metadata, since the order of the rows is
172 # not guaranteed to be the same, so don't even try, but warn what's going on
173 trace <- trace_back()
174 # TODO: remove `trace$calls %||% trace$call` once rlang > 0.4.11 is released
175 in_dataset_write <- any(map_lgl(trace$calls %||% trace$call, function(x) {
176 grepl("write_dataset", x, fixed = TRUE)[[1]]
177 }))
178 if (in_dataset_write) {
179 warning(
180 "Row-level metadata is not compatible with datasets and will be discarded",
181 call. = FALSE
182 )
183 } else {
184 # for list columns, we also keep attributes of each
185 # element in columns
186 columns <- map(x, arrow_attributes)
187 }
188 if (all(map_lgl(columns, is.null))) {
189 columns <- NULL
190 }
191 } else if (inherits(x, c("sfc", "sf"))) {
192 # Check if there are any columns that look like sf columns, warn that we will
193 # not be saving this data for now (but only if arrow.preserve_row_level_metadata
194 # is set to FALSE)
195 warning(
196 "One of the columns given appears to be an `sfc` SF column. Due to their unique ",
197 "nature, these columns do not convert to Arrow well. We are working on ",
198 "better ways to do this, but in the interim we recommend converting any `sfc` ",
199 "columns to WKB (well-known binary) columns before using them with Arrow ",
200 "(for example, with `sf::st_as_binary(col)`).",
201 call. = FALSE
202 )
203 }
204
205 if (length(att) || !is.null(columns)) {
206 list(attributes = att, columns = columns)
207 } else {
208 NULL
209 }
210 }