]> git.proxmox.com Git - ceph.git/blame - ceph/src/arrow/cpp/src/parquet/arrow/schema_internal.cc
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / cpp / src / parquet / arrow / schema_internal.cc
CommitLineData
1d09f67e
TL
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#include "parquet/arrow/schema_internal.h"
19
20#include "arrow/type.h"
21
22using ArrowType = ::arrow::DataType;
23using ArrowTypeId = ::arrow::Type;
24using ParquetType = parquet::Type;
25
26namespace parquet {
27
28namespace arrow {
29
30using ::arrow::Result;
31using ::arrow::Status;
32using ::arrow::internal::checked_cast;
33
34Result<std::shared_ptr<ArrowType>> MakeArrowDecimal(const LogicalType& logical_type) {
35 const auto& decimal = checked_cast<const DecimalLogicalType&>(logical_type);
36 if (decimal.precision() <= ::arrow::Decimal128Type::kMaxPrecision) {
37 return ::arrow::Decimal128Type::Make(decimal.precision(), decimal.scale());
38 }
39 return ::arrow::Decimal256Type::Make(decimal.precision(), decimal.scale());
40}
41
42Result<std::shared_ptr<ArrowType>> MakeArrowInt(const LogicalType& logical_type) {
43 const auto& integer = checked_cast<const IntLogicalType&>(logical_type);
44 switch (integer.bit_width()) {
45 case 8:
46 return integer.is_signed() ? ::arrow::int8() : ::arrow::uint8();
47 case 16:
48 return integer.is_signed() ? ::arrow::int16() : ::arrow::uint16();
49 case 32:
50 return integer.is_signed() ? ::arrow::int32() : ::arrow::uint32();
51 default:
52 return Status::TypeError(logical_type.ToString(),
53 " can not annotate physical type Int32");
54 }
55}
56
57Result<std::shared_ptr<ArrowType>> MakeArrowInt64(const LogicalType& logical_type) {
58 const auto& integer = checked_cast<const IntLogicalType&>(logical_type);
59 switch (integer.bit_width()) {
60 case 64:
61 return integer.is_signed() ? ::arrow::int64() : ::arrow::uint64();
62 default:
63 return Status::TypeError(logical_type.ToString(),
64 " can not annotate physical type Int64");
65 }
66}
67
68Result<std::shared_ptr<ArrowType>> MakeArrowTime32(const LogicalType& logical_type) {
69 const auto& time = checked_cast<const TimeLogicalType&>(logical_type);
70 switch (time.time_unit()) {
71 case LogicalType::TimeUnit::MILLIS:
72 return ::arrow::time32(::arrow::TimeUnit::MILLI);
73 default:
74 return Status::TypeError(logical_type.ToString(),
75 " can not annotate physical type Time32");
76 }
77}
78
79Result<std::shared_ptr<ArrowType>> MakeArrowTime64(const LogicalType& logical_type) {
80 const auto& time = checked_cast<const TimeLogicalType&>(logical_type);
81 switch (time.time_unit()) {
82 case LogicalType::TimeUnit::MICROS:
83 return ::arrow::time64(::arrow::TimeUnit::MICRO);
84 case LogicalType::TimeUnit::NANOS:
85 return ::arrow::time64(::arrow::TimeUnit::NANO);
86 default:
87 return Status::TypeError(logical_type.ToString(),
88 " can not annotate physical type Time64");
89 }
90}
91
92Result<std::shared_ptr<ArrowType>> MakeArrowTimestamp(const LogicalType& logical_type) {
93 const auto& timestamp = checked_cast<const TimestampLogicalType&>(logical_type);
94 const bool utc_normalized =
95 timestamp.is_from_converted_type() ? false : timestamp.is_adjusted_to_utc();
96 static const char* utc_timezone = "UTC";
97 switch (timestamp.time_unit()) {
98 case LogicalType::TimeUnit::MILLIS:
99 return (utc_normalized ? ::arrow::timestamp(::arrow::TimeUnit::MILLI, utc_timezone)
100 : ::arrow::timestamp(::arrow::TimeUnit::MILLI));
101 case LogicalType::TimeUnit::MICROS:
102 return (utc_normalized ? ::arrow::timestamp(::arrow::TimeUnit::MICRO, utc_timezone)
103 : ::arrow::timestamp(::arrow::TimeUnit::MICRO));
104 case LogicalType::TimeUnit::NANOS:
105 return (utc_normalized ? ::arrow::timestamp(::arrow::TimeUnit::NANO, utc_timezone)
106 : ::arrow::timestamp(::arrow::TimeUnit::NANO));
107 default:
108 return Status::TypeError("Unrecognized time unit in timestamp logical_type: ",
109 logical_type.ToString());
110 }
111}
112
113Result<std::shared_ptr<ArrowType>> FromByteArray(const LogicalType& logical_type) {
114 switch (logical_type.type()) {
115 case LogicalType::Type::STRING:
116 return ::arrow::utf8();
117 case LogicalType::Type::DECIMAL:
118 return MakeArrowDecimal(logical_type);
119 case LogicalType::Type::NONE:
120 case LogicalType::Type::ENUM:
121 case LogicalType::Type::JSON:
122 case LogicalType::Type::BSON:
123 return ::arrow::binary();
124 default:
125 return Status::NotImplemented("Unhandled logical logical_type ",
126 logical_type.ToString(), " for binary array");
127 }
128}
129
130Result<std::shared_ptr<ArrowType>> FromFLBA(const LogicalType& logical_type,
131 int32_t physical_length) {
132 switch (logical_type.type()) {
133 case LogicalType::Type::DECIMAL:
134 return MakeArrowDecimal(logical_type);
135 case LogicalType::Type::NONE:
136 case LogicalType::Type::INTERVAL:
137 case LogicalType::Type::UUID:
138 return ::arrow::fixed_size_binary(physical_length);
139 default:
140 return Status::NotImplemented("Unhandled logical logical_type ",
141 logical_type.ToString(),
142 " for fixed-length binary array");
143 }
144}
145
146::arrow::Result<std::shared_ptr<ArrowType>> FromInt32(const LogicalType& logical_type) {
147 switch (logical_type.type()) {
148 case LogicalType::Type::INT:
149 return MakeArrowInt(logical_type);
150 case LogicalType::Type::DATE:
151 return ::arrow::date32();
152 case LogicalType::Type::TIME:
153 return MakeArrowTime32(logical_type);
154 case LogicalType::Type::DECIMAL:
155 return MakeArrowDecimal(logical_type);
156 case LogicalType::Type::NONE:
157 return ::arrow::int32();
158 default:
159 return Status::NotImplemented("Unhandled logical type ", logical_type.ToString(),
160 " for INT32");
161 }
162}
163
164Result<std::shared_ptr<ArrowType>> FromInt64(const LogicalType& logical_type) {
165 switch (logical_type.type()) {
166 case LogicalType::Type::INT:
167 return MakeArrowInt64(logical_type);
168 case LogicalType::Type::DECIMAL:
169 return MakeArrowDecimal(logical_type);
170 case LogicalType::Type::TIMESTAMP:
171 return MakeArrowTimestamp(logical_type);
172 case LogicalType::Type::TIME:
173 return MakeArrowTime64(logical_type);
174 case LogicalType::Type::NONE:
175 return ::arrow::int64();
176 default:
177 return Status::NotImplemented("Unhandled logical type ", logical_type.ToString(),
178 " for INT64");
179 }
180}
181
182Result<std::shared_ptr<ArrowType>> GetArrowType(
183 Type::type physical_type, const LogicalType& logical_type, int type_length,
184 const ::arrow::TimeUnit::type int96_arrow_time_unit) {
185 if (logical_type.is_invalid() || logical_type.is_null()) {
186 return ::arrow::null();
187 }
188
189 switch (physical_type) {
190 case ParquetType::BOOLEAN:
191 return ::arrow::boolean();
192 case ParquetType::INT32:
193 return FromInt32(logical_type);
194 case ParquetType::INT64:
195 return FromInt64(logical_type);
196 case ParquetType::INT96:
197 return ::arrow::timestamp(int96_arrow_time_unit);
198 case ParquetType::FLOAT:
199 return ::arrow::float32();
200 case ParquetType::DOUBLE:
201 return ::arrow::float64();
202 case ParquetType::BYTE_ARRAY:
203 return FromByteArray(logical_type);
204 case ParquetType::FIXED_LEN_BYTE_ARRAY:
205 return FromFLBA(logical_type, type_length);
206 default: {
207 // PARQUET-1565: This can occur if the file is corrupt
208 return Status::IOError("Invalid physical column type: ",
209 TypeToString(physical_type));
210 }
211 }
212}
213
214Result<std::shared_ptr<ArrowType>> GetArrowType(
215 const schema::PrimitiveNode& primitive,
216 const ::arrow::TimeUnit::type int96_arrow_time_unit) {
217 return GetArrowType(primitive.physical_type(), *primitive.logical_type(),
218 primitive.type_length(), int96_arrow_time_unit);
219}
220
221} // namespace arrow
222} // namespace parquet