]>
Commit | Line | Data |
---|---|---|
1d09f67e TL |
1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file | |
3 | // distributed with this work for additional information | |
4 | // regarding copyright ownership. The ASF licenses this file | |
5 | // to you under the Apache License, Version 2.0 (the | |
6 | // "License"); you may not use this file except in compliance | |
7 | // with the License. You may obtain a copy of the License at | |
8 | // | |
9 | // http://www.apache.org/licenses/LICENSE-2.0 | |
10 | // | |
11 | // Unless required by applicable law or agreed to in writing, | |
12 | // software distributed under the License is distributed on an | |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
14 | // KIND, either express or implied. See the License for the | |
15 | // specific language governing permissions and limitations | |
16 | // under the License. | |
17 | ||
18 | #pragma once | |
19 | ||
20 | #include <cassert> | |
21 | #include <memory> | |
22 | #include <unordered_map> | |
23 | #include <unordered_set> | |
24 | #include <vector> | |
25 | ||
26 | #include "arrow/result.h" | |
27 | #include "arrow/status.h" | |
28 | #include "arrow/type.h" | |
29 | #include "arrow/type_fwd.h" | |
30 | ||
31 | #include "parquet/level_conversion.h" | |
32 | #include "parquet/platform.h" | |
33 | #include "parquet/schema.h" | |
34 | ||
35 | namespace parquet { | |
36 | ||
37 | class ArrowReaderProperties; | |
38 | class ArrowWriterProperties; | |
39 | class WriterProperties; | |
40 | ||
41 | namespace arrow { | |
42 | ||
43 | /// \defgroup arrow-to-parquet-schema-conversion Functions to convert an Arrow | |
44 | /// schema into a Parquet schema. | |
45 | /// | |
46 | /// @{ | |
47 | ||
48 | PARQUET_EXPORT | |
49 | ::arrow::Status FieldToNode(const std::shared_ptr<::arrow::Field>& field, | |
50 | const WriterProperties& properties, | |
51 | const ArrowWriterProperties& arrow_properties, | |
52 | schema::NodePtr* out); | |
53 | ||
54 | PARQUET_EXPORT | |
55 | ::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema, | |
56 | const WriterProperties& properties, | |
57 | const ArrowWriterProperties& arrow_properties, | |
58 | std::shared_ptr<SchemaDescriptor>* out); | |
59 | ||
60 | PARQUET_EXPORT | |
61 | ::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema, | |
62 | const WriterProperties& properties, | |
63 | std::shared_ptr<SchemaDescriptor>* out); | |
64 | ||
65 | /// @} | |
66 | ||
67 | /// \defgroup parquet-to-arrow-schema-conversion Functions to convert a Parquet | |
68 | /// schema into an Arrow schema. | |
69 | /// | |
70 | /// @{ | |
71 | ||
72 | PARQUET_EXPORT | |
73 | ::arrow::Status FromParquetSchema( | |
74 | const SchemaDescriptor* parquet_schema, const ArrowReaderProperties& properties, | |
75 | const std::shared_ptr<const ::arrow::KeyValueMetadata>& key_value_metadata, | |
76 | std::shared_ptr<::arrow::Schema>* out); | |
77 | ||
78 | PARQUET_EXPORT | |
79 | ::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema, | |
80 | const ArrowReaderProperties& properties, | |
81 | std::shared_ptr<::arrow::Schema>* out); | |
82 | ||
83 | PARQUET_EXPORT | |
84 | ::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema, | |
85 | std::shared_ptr<::arrow::Schema>* out); | |
86 | ||
87 | /// @} | |
88 | ||
89 | /// \brief Bridge between an arrow::Field and parquet column indices. | |
90 | struct PARQUET_EXPORT SchemaField { | |
91 | std::shared_ptr<::arrow::Field> field; | |
92 | std::vector<SchemaField> children; | |
93 | ||
94 | // Only set for leaf nodes | |
95 | int column_index = -1; | |
96 | ||
97 | parquet::internal::LevelInfo level_info; | |
98 | ||
99 | bool is_leaf() const { return column_index != -1; } | |
100 | }; | |
101 | ||
102 | /// \brief Bridge between a parquet Schema and an arrow Schema. | |
103 | /// | |
104 | /// Expose parquet columns as a tree structure. Useful traverse and link | |
105 | /// between arrow's Schema and parquet's Schema. | |
106 | struct PARQUET_EXPORT SchemaManifest { | |
107 | static ::arrow::Status Make( | |
108 | const SchemaDescriptor* schema, | |
109 | const std::shared_ptr<const ::arrow::KeyValueMetadata>& metadata, | |
110 | const ArrowReaderProperties& properties, SchemaManifest* manifest); | |
111 | ||
112 | const SchemaDescriptor* descr; | |
113 | std::shared_ptr<::arrow::Schema> origin_schema; | |
114 | std::shared_ptr<const ::arrow::KeyValueMetadata> schema_metadata; | |
115 | std::vector<SchemaField> schema_fields; | |
116 | ||
117 | std::unordered_map<int, const SchemaField*> column_index_to_field; | |
118 | std::unordered_map<const SchemaField*, const SchemaField*> child_to_parent; | |
119 | ||
120 | ::arrow::Status GetColumnField(int column_index, const SchemaField** out) const { | |
121 | auto it = column_index_to_field.find(column_index); | |
122 | if (it == column_index_to_field.end()) { | |
123 | return ::arrow::Status::KeyError("Column index ", column_index, | |
124 | " not found in schema manifest, may be malformed"); | |
125 | } | |
126 | *out = it->second; | |
127 | return ::arrow::Status::OK(); | |
128 | } | |
129 | ||
130 | const SchemaField* GetParent(const SchemaField* field) const { | |
131 | // Returns nullptr also if not found | |
132 | auto it = child_to_parent.find(field); | |
133 | if (it == child_to_parent.end()) { | |
134 | return NULLPTR; | |
135 | } | |
136 | return it->second; | |
137 | } | |
138 | ||
139 | /// Coalesce a list of field indices (relative to the equivalent arrow::Schema) which | |
140 | /// correspond to the column root (first node below the parquet schema's root group) of | |
141 | /// each leaf referenced in column_indices. | |
142 | /// | |
143 | /// For example, for leaves `a.b.c`, `a.b.d.e`, and `i.j.k` (column_indices=[0,1,3]) | |
144 | /// the roots are `a` and `i` (return=[0,2]). | |
145 | /// | |
146 | /// root | |
147 | /// -- a <------ | |
148 | /// -- -- b | | | |
149 | /// -- -- -- c | | |
150 | /// -- -- -- d | | |
151 | /// -- -- -- -- e | |
152 | /// -- f | |
153 | /// -- -- g | |
154 | /// -- -- -- h | |
155 | /// -- i <--- | |
156 | /// -- -- j | | |
157 | /// -- -- -- k | |
158 | ::arrow::Result<std::vector<int>> GetFieldIndices( | |
159 | const std::vector<int>& column_indices) const { | |
160 | const schema::GroupNode* group = descr->group_node(); | |
161 | std::unordered_set<int> already_added; | |
162 | ||
163 | std::vector<int> out; | |
164 | for (int column_idx : column_indices) { | |
165 | if (column_idx < 0 || column_idx >= descr->num_columns()) { | |
166 | return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid"); | |
167 | } | |
168 | ||
169 | auto field_node = descr->GetColumnRoot(column_idx); | |
170 | auto field_idx = group->FieldIndex(*field_node); | |
171 | if (field_idx == -1) { | |
172 | return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid"); | |
173 | } | |
174 | ||
175 | if (already_added.insert(field_idx).second) { | |
176 | out.push_back(field_idx); | |
177 | } | |
178 | } | |
179 | return out; | |
180 | } | |
181 | }; | |
182 | ||
183 | } // namespace arrow | |
184 | } // namespace parquet |