1 # Licensed to the Apache Software Foundation (ASF) under one
2 # or more contributor license agreements. See the NOTICE file
3 # distributed with this work for additional information
4 # regarding copyright ownership. The ASF licenses this file
5 # to you under the Apache License, Version 2.0 (the
6 # "License"); you may not use this file except in compliance
7 # with the License. You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing,
12 # software distributed under the License is distributed on an
13 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 # KIND, either express or implied. See the License for the
15 # specific language governing permissions and limitations
18 #' @importFrom utils object.size
19 .serialize_arrow_r_metadata <- function(x) {
22 # drop problems attributes (most likely from readr)
23 x[["attributes"]][["problems"]] <- NULL
25 out <- serialize(x, NULL, ascii = TRUE)
27 # if the metadata is over 100 kB, compress
28 if (option_compress_metadata() && object.size(out) > 100000) {
29 out_comp <- serialize(memCompress(out, type = "gzip"), NULL, ascii = TRUE)
31 # but ensure that the compression+serialization is effective.
32 if (object.size(out) > object.size(out_comp)) out <- out_comp
38 .unserialize_arrow_r_metadata <- function(x) {
41 out <- unserialize(charToRaw(x))
43 # if this is still raw, try decompressing
45 out <- unserialize(memDecompress(out, type = "gzip"))
50 warning("Invalid metadata$r", call. = FALSE)
56 #' @importFrom rlang trace_back
57 apply_arrow_r_metadata <- function(x, r_metadata) {
60 columns_metadata <- r_metadata$columns
61 if (is.data.frame(x)) {
62 if (length(names(x)) && !is.null(columns_metadata)) {
63 for (name in intersect(names(columns_metadata), names(x))) {
64 x[[name]] <- apply_arrow_r_metadata(x[[name]], columns_metadata[[name]])
67 } else if (is.list(x) && !inherits(x, "POSIXlt") && !is.null(columns_metadata)) {
68 # If we have a list and "columns_metadata" this applies row-level metadata
69 # inside of a column in a dataframe.
71 # However, if we are inside of a dplyr collection (including all datasets),
72 # we cannot apply this row-level metadata, since the order of the rows is
73 # not guaranteed to be the same, so don't even try, but warn what's going on
75 # TODO: remove `trace$calls %||% trace$call` once rlang > 0.4.11 is released
76 in_dplyr_collect <- any(map_lgl(trace$calls %||% trace$call, function(x) {
77 grepl("collect.arrow_dplyr_query", x, fixed = TRUE)[[1]]
79 if (in_dplyr_collect) {
81 "Row-level metadata is not compatible with this operation and has ",
86 x <- map2(x, columns_metadata, function(.x, .y) {
87 apply_arrow_r_metadata(.x, .y)
93 if (!is.null(r_metadata$attributes)) {
94 attributes(x)[names(r_metadata$attributes)] <- r_metadata$attributes
95 if (inherits(x, "POSIXlt")) {
96 # We store POSIXlt as a StructArray, which is translated back to R
97 # as a data.frame, but while data frames have a row.names = c(NA, nrow(x))
98 # attribute, POSIXlt does not, so since this is now no longer an object
99 # of class data.frame, remove the extraneous attribute
100 attr(x, "row.names") <- NULL
102 if (!is.null(attr(x, ".group_vars")) && requireNamespace("dplyr", quietly = TRUE)) {
103 x <- dplyr::group_by(x, !!!syms(attr(x, ".group_vars")))
104 attr(x, ".group_vars") <- NULL
108 error = function(e) {
109 warning("Invalid metadata$r", call. = FALSE)
115 remove_attributes <- function(x) {
116 removed_attributes <- character()
117 if (identical(class(x), c("tbl_df", "tbl", "data.frame"))) {
118 removed_attributes <- c("class", "row.names", "names")
119 } else if (inherits(x, "data.frame")) {
120 removed_attributes <- c("row.names", "names")
121 } else if (inherits(x, "factor")) {
122 removed_attributes <- c("class", "levels")
123 } else if (inherits(x, c("integer64", "Date", "arrow_binary", "arrow_large_binary"))) {
124 removed_attributes <- c("class")
125 } else if (inherits(x, "arrow_fixed_size_binary")) {
126 removed_attributes <- c("class", "byte_width")
127 } else if (inherits(x, "POSIXct")) {
128 removed_attributes <- c("class", "tzone")
129 } else if (inherits(x, "hms") || inherits(x, "difftime")) {
130 removed_attributes <- c("class", "units")
135 arrow_attributes <- function(x, only_top_level = FALSE) {
136 if (inherits(x, "grouped_df")) {
137 # Keep only the group var names, not the rest of the cached data that dplyr
138 # uses, which may be large
139 if (requireNamespace("dplyr", quietly = TRUE)) {
140 gv <- dplyr::group_vars(x)
141 x <- dplyr::ungroup(x)
142 # ungroup() first, then set attribute, bc ungroup() would erase it
143 attr(x, ".group_vars") <- gv
145 # Regardless, we shouldn't keep groups around
146 attr(x, "groups") <- NULL
151 removed_attributes <- remove_attributes(x)
153 att <- att[setdiff(names(att), removed_attributes)]
154 if (isTRUE(only_top_level)) {
158 if (is.data.frame(x)) {
159 columns <- map(x, arrow_attributes)
160 out <- if (length(att) || !all(map_lgl(columns, is.null))) {
161 list(attributes = att, columns = columns)
167 attempt_to_save_row_level <- getOption("arrow.preserve_row_level_metadata", FALSE) &&
168 is.list(x) && !inherits(x, "POSIXlt")
169 if (attempt_to_save_row_level) {
170 # However, if we are inside of a dplyr collection (including all datasets),
171 # we cannot apply this row-level metadata, since the order of the rows is
172 # not guaranteed to be the same, so don't even try, but warn what's going on
173 trace <- trace_back()
174 # TODO: remove `trace$calls %||% trace$call` once rlang > 0.4.11 is released
175 in_dataset_write <- any(map_lgl(trace$calls %||% trace$call, function(x) {
176 grepl("write_dataset", x, fixed = TRUE)[[1]]
178 if (in_dataset_write) {
180 "Row-level metadata is not compatible with datasets and will be discarded",
184 # for list columns, we also keep attributes of each
186 columns <- map(x, arrow_attributes)
188 if (all(map_lgl(columns, is.null))) {
191 } else if (inherits(x, c("sfc", "sf"))) {
192 # Check if there are any columns that look like sf columns, warn that we will
193 # not be saving this data for now (but only if arrow.preserve_row_level_metadata
196 "One of the columns given appears to be an `sfc` SF column. Due to their unique ",
197 "nature, these columns do not convert to Arrow well. We are working on ",
198 "better ways to do this, but in the interim we recommend converting any `sfc` ",
199 "columns to WKB (well-known binary) columns before using them with Arrow ",
200 "(for example, with `sf::st_as_binary(col)`).",
205 if (length(att) || !is.null(columns)) {
206 list(attributes = att, columns = columns)