]> git.proxmox.com Git - ceph.git/blame - ceph/src/arrow/cpp/src/parquet/arrow/schema.h
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / cpp / src / parquet / arrow / schema.h
CommitLineData
1d09f67e
TL
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#pragma once
19
20#include <cassert>
21#include <memory>
22#include <unordered_map>
23#include <unordered_set>
24#include <vector>
25
26#include "arrow/result.h"
27#include "arrow/status.h"
28#include "arrow/type.h"
29#include "arrow/type_fwd.h"
30
31#include "parquet/level_conversion.h"
32#include "parquet/platform.h"
33#include "parquet/schema.h"
34
35namespace parquet {
36
37class ArrowReaderProperties;
38class ArrowWriterProperties;
39class WriterProperties;
40
41namespace arrow {
42
43/// \defgroup arrow-to-parquet-schema-conversion Functions to convert an Arrow
44/// schema into a Parquet schema.
45///
46/// @{
47
48PARQUET_EXPORT
49::arrow::Status FieldToNode(const std::shared_ptr<::arrow::Field>& field,
50 const WriterProperties& properties,
51 const ArrowWriterProperties& arrow_properties,
52 schema::NodePtr* out);
53
54PARQUET_EXPORT
55::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
56 const WriterProperties& properties,
57 const ArrowWriterProperties& arrow_properties,
58 std::shared_ptr<SchemaDescriptor>* out);
59
60PARQUET_EXPORT
61::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
62 const WriterProperties& properties,
63 std::shared_ptr<SchemaDescriptor>* out);
64
65/// @}
66
67/// \defgroup parquet-to-arrow-schema-conversion Functions to convert a Parquet
68/// schema into an Arrow schema.
69///
70/// @{
71
72PARQUET_EXPORT
73::arrow::Status FromParquetSchema(
74 const SchemaDescriptor* parquet_schema, const ArrowReaderProperties& properties,
75 const std::shared_ptr<const ::arrow::KeyValueMetadata>& key_value_metadata,
76 std::shared_ptr<::arrow::Schema>* out);
77
78PARQUET_EXPORT
79::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
80 const ArrowReaderProperties& properties,
81 std::shared_ptr<::arrow::Schema>* out);
82
83PARQUET_EXPORT
84::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
85 std::shared_ptr<::arrow::Schema>* out);
86
87/// @}
88
89/// \brief Bridge between an arrow::Field and parquet column indices.
90struct PARQUET_EXPORT SchemaField {
91 std::shared_ptr<::arrow::Field> field;
92 std::vector<SchemaField> children;
93
94 // Only set for leaf nodes
95 int column_index = -1;
96
97 parquet::internal::LevelInfo level_info;
98
99 bool is_leaf() const { return column_index != -1; }
100};
101
102/// \brief Bridge between a parquet Schema and an arrow Schema.
103///
104/// Expose parquet columns as a tree structure. Useful traverse and link
105/// between arrow's Schema and parquet's Schema.
106struct PARQUET_EXPORT SchemaManifest {
107 static ::arrow::Status Make(
108 const SchemaDescriptor* schema,
109 const std::shared_ptr<const ::arrow::KeyValueMetadata>& metadata,
110 const ArrowReaderProperties& properties, SchemaManifest* manifest);
111
112 const SchemaDescriptor* descr;
113 std::shared_ptr<::arrow::Schema> origin_schema;
114 std::shared_ptr<const ::arrow::KeyValueMetadata> schema_metadata;
115 std::vector<SchemaField> schema_fields;
116
117 std::unordered_map<int, const SchemaField*> column_index_to_field;
118 std::unordered_map<const SchemaField*, const SchemaField*> child_to_parent;
119
120 ::arrow::Status GetColumnField(int column_index, const SchemaField** out) const {
121 auto it = column_index_to_field.find(column_index);
122 if (it == column_index_to_field.end()) {
123 return ::arrow::Status::KeyError("Column index ", column_index,
124 " not found in schema manifest, may be malformed");
125 }
126 *out = it->second;
127 return ::arrow::Status::OK();
128 }
129
130 const SchemaField* GetParent(const SchemaField* field) const {
131 // Returns nullptr also if not found
132 auto it = child_to_parent.find(field);
133 if (it == child_to_parent.end()) {
134 return NULLPTR;
135 }
136 return it->second;
137 }
138
139 /// Coalesce a list of field indices (relative to the equivalent arrow::Schema) which
140 /// correspond to the column root (first node below the parquet schema's root group) of
141 /// each leaf referenced in column_indices.
142 ///
143 /// For example, for leaves `a.b.c`, `a.b.d.e`, and `i.j.k` (column_indices=[0,1,3])
144 /// the roots are `a` and `i` (return=[0,2]).
145 ///
146 /// root
147 /// -- a <------
148 /// -- -- b | |
149 /// -- -- -- c |
150 /// -- -- -- d |
151 /// -- -- -- -- e
152 /// -- f
153 /// -- -- g
154 /// -- -- -- h
155 /// -- i <---
156 /// -- -- j |
157 /// -- -- -- k
158 ::arrow::Result<std::vector<int>> GetFieldIndices(
159 const std::vector<int>& column_indices) const {
160 const schema::GroupNode* group = descr->group_node();
161 std::unordered_set<int> already_added;
162
163 std::vector<int> out;
164 for (int column_idx : column_indices) {
165 if (column_idx < 0 || column_idx >= descr->num_columns()) {
166 return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid");
167 }
168
169 auto field_node = descr->GetColumnRoot(column_idx);
170 auto field_idx = group->FieldIndex(*field_node);
171 if (field_idx == -1) {
172 return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid");
173 }
174
175 if (already_added.insert(field_idx).second) {
176 out.push_back(field_idx);
177 }
178 }
179 return out;
180 }
181};
182
183} // namespace arrow
184} // namespace parquet