]>
Commit | Line | Data |
---|---|---|
1d09f67e TL |
1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file | |
3 | // distributed with this work for additional information | |
4 | // regarding copyright ownership. The ASF licenses this file | |
5 | // to you under the Apache License, Version 2.0 (the | |
6 | // "License"); you may not use this file except in compliance | |
7 | // with the License. You may obtain a copy of the License at | |
8 | // | |
9 | // http://www.apache.org/licenses/LICENSE-2.0 | |
10 | // | |
11 | // Unless required by applicable law or agreed to in writing, | |
12 | // software distributed under the License is distributed on an | |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
14 | // KIND, either express or implied. See the License for the | |
15 | // specific language governing permissions and limitations | |
16 | // under the License. | |
17 | ||
18 | #include "parquet/arrow/schema_internal.h" | |
19 | ||
20 | #include "arrow/type.h" | |
21 | ||
22 | using ArrowType = ::arrow::DataType; | |
23 | using ArrowTypeId = ::arrow::Type; | |
24 | using ParquetType = parquet::Type; | |
25 | ||
26 | namespace parquet { | |
27 | ||
28 | namespace arrow { | |
29 | ||
30 | using ::arrow::Result; | |
31 | using ::arrow::Status; | |
32 | using ::arrow::internal::checked_cast; | |
33 | ||
34 | Result<std::shared_ptr<ArrowType>> MakeArrowDecimal(const LogicalType& logical_type) { | |
35 | const auto& decimal = checked_cast<const DecimalLogicalType&>(logical_type); | |
36 | if (decimal.precision() <= ::arrow::Decimal128Type::kMaxPrecision) { | |
37 | return ::arrow::Decimal128Type::Make(decimal.precision(), decimal.scale()); | |
38 | } | |
39 | return ::arrow::Decimal256Type::Make(decimal.precision(), decimal.scale()); | |
40 | } | |
41 | ||
42 | Result<std::shared_ptr<ArrowType>> MakeArrowInt(const LogicalType& logical_type) { | |
43 | const auto& integer = checked_cast<const IntLogicalType&>(logical_type); | |
44 | switch (integer.bit_width()) { | |
45 | case 8: | |
46 | return integer.is_signed() ? ::arrow::int8() : ::arrow::uint8(); | |
47 | case 16: | |
48 | return integer.is_signed() ? ::arrow::int16() : ::arrow::uint16(); | |
49 | case 32: | |
50 | return integer.is_signed() ? ::arrow::int32() : ::arrow::uint32(); | |
51 | default: | |
52 | return Status::TypeError(logical_type.ToString(), | |
53 | " can not annotate physical type Int32"); | |
54 | } | |
55 | } | |
56 | ||
57 | Result<std::shared_ptr<ArrowType>> MakeArrowInt64(const LogicalType& logical_type) { | |
58 | const auto& integer = checked_cast<const IntLogicalType&>(logical_type); | |
59 | switch (integer.bit_width()) { | |
60 | case 64: | |
61 | return integer.is_signed() ? ::arrow::int64() : ::arrow::uint64(); | |
62 | default: | |
63 | return Status::TypeError(logical_type.ToString(), | |
64 | " can not annotate physical type Int64"); | |
65 | } | |
66 | } | |
67 | ||
68 | Result<std::shared_ptr<ArrowType>> MakeArrowTime32(const LogicalType& logical_type) { | |
69 | const auto& time = checked_cast<const TimeLogicalType&>(logical_type); | |
70 | switch (time.time_unit()) { | |
71 | case LogicalType::TimeUnit::MILLIS: | |
72 | return ::arrow::time32(::arrow::TimeUnit::MILLI); | |
73 | default: | |
74 | return Status::TypeError(logical_type.ToString(), | |
75 | " can not annotate physical type Time32"); | |
76 | } | |
77 | } | |
78 | ||
79 | Result<std::shared_ptr<ArrowType>> MakeArrowTime64(const LogicalType& logical_type) { | |
80 | const auto& time = checked_cast<const TimeLogicalType&>(logical_type); | |
81 | switch (time.time_unit()) { | |
82 | case LogicalType::TimeUnit::MICROS: | |
83 | return ::arrow::time64(::arrow::TimeUnit::MICRO); | |
84 | case LogicalType::TimeUnit::NANOS: | |
85 | return ::arrow::time64(::arrow::TimeUnit::NANO); | |
86 | default: | |
87 | return Status::TypeError(logical_type.ToString(), | |
88 | " can not annotate physical type Time64"); | |
89 | } | |
90 | } | |
91 | ||
92 | Result<std::shared_ptr<ArrowType>> MakeArrowTimestamp(const LogicalType& logical_type) { | |
93 | const auto& timestamp = checked_cast<const TimestampLogicalType&>(logical_type); | |
94 | const bool utc_normalized = | |
95 | timestamp.is_from_converted_type() ? false : timestamp.is_adjusted_to_utc(); | |
96 | static const char* utc_timezone = "UTC"; | |
97 | switch (timestamp.time_unit()) { | |
98 | case LogicalType::TimeUnit::MILLIS: | |
99 | return (utc_normalized ? ::arrow::timestamp(::arrow::TimeUnit::MILLI, utc_timezone) | |
100 | : ::arrow::timestamp(::arrow::TimeUnit::MILLI)); | |
101 | case LogicalType::TimeUnit::MICROS: | |
102 | return (utc_normalized ? ::arrow::timestamp(::arrow::TimeUnit::MICRO, utc_timezone) | |
103 | : ::arrow::timestamp(::arrow::TimeUnit::MICRO)); | |
104 | case LogicalType::TimeUnit::NANOS: | |
105 | return (utc_normalized ? ::arrow::timestamp(::arrow::TimeUnit::NANO, utc_timezone) | |
106 | : ::arrow::timestamp(::arrow::TimeUnit::NANO)); | |
107 | default: | |
108 | return Status::TypeError("Unrecognized time unit in timestamp logical_type: ", | |
109 | logical_type.ToString()); | |
110 | } | |
111 | } | |
112 | ||
113 | Result<std::shared_ptr<ArrowType>> FromByteArray(const LogicalType& logical_type) { | |
114 | switch (logical_type.type()) { | |
115 | case LogicalType::Type::STRING: | |
116 | return ::arrow::utf8(); | |
117 | case LogicalType::Type::DECIMAL: | |
118 | return MakeArrowDecimal(logical_type); | |
119 | case LogicalType::Type::NONE: | |
120 | case LogicalType::Type::ENUM: | |
121 | case LogicalType::Type::JSON: | |
122 | case LogicalType::Type::BSON: | |
123 | return ::arrow::binary(); | |
124 | default: | |
125 | return Status::NotImplemented("Unhandled logical logical_type ", | |
126 | logical_type.ToString(), " for binary array"); | |
127 | } | |
128 | } | |
129 | ||
130 | Result<std::shared_ptr<ArrowType>> FromFLBA(const LogicalType& logical_type, | |
131 | int32_t physical_length) { | |
132 | switch (logical_type.type()) { | |
133 | case LogicalType::Type::DECIMAL: | |
134 | return MakeArrowDecimal(logical_type); | |
135 | case LogicalType::Type::NONE: | |
136 | case LogicalType::Type::INTERVAL: | |
137 | case LogicalType::Type::UUID: | |
138 | return ::arrow::fixed_size_binary(physical_length); | |
139 | default: | |
140 | return Status::NotImplemented("Unhandled logical logical_type ", | |
141 | logical_type.ToString(), | |
142 | " for fixed-length binary array"); | |
143 | } | |
144 | } | |
145 | ||
146 | ::arrow::Result<std::shared_ptr<ArrowType>> FromInt32(const LogicalType& logical_type) { | |
147 | switch (logical_type.type()) { | |
148 | case LogicalType::Type::INT: | |
149 | return MakeArrowInt(logical_type); | |
150 | case LogicalType::Type::DATE: | |
151 | return ::arrow::date32(); | |
152 | case LogicalType::Type::TIME: | |
153 | return MakeArrowTime32(logical_type); | |
154 | case LogicalType::Type::DECIMAL: | |
155 | return MakeArrowDecimal(logical_type); | |
156 | case LogicalType::Type::NONE: | |
157 | return ::arrow::int32(); | |
158 | default: | |
159 | return Status::NotImplemented("Unhandled logical type ", logical_type.ToString(), | |
160 | " for INT32"); | |
161 | } | |
162 | } | |
163 | ||
164 | Result<std::shared_ptr<ArrowType>> FromInt64(const LogicalType& logical_type) { | |
165 | switch (logical_type.type()) { | |
166 | case LogicalType::Type::INT: | |
167 | return MakeArrowInt64(logical_type); | |
168 | case LogicalType::Type::DECIMAL: | |
169 | return MakeArrowDecimal(logical_type); | |
170 | case LogicalType::Type::TIMESTAMP: | |
171 | return MakeArrowTimestamp(logical_type); | |
172 | case LogicalType::Type::TIME: | |
173 | return MakeArrowTime64(logical_type); | |
174 | case LogicalType::Type::NONE: | |
175 | return ::arrow::int64(); | |
176 | default: | |
177 | return Status::NotImplemented("Unhandled logical type ", logical_type.ToString(), | |
178 | " for INT64"); | |
179 | } | |
180 | } | |
181 | ||
182 | Result<std::shared_ptr<ArrowType>> GetArrowType( | |
183 | Type::type physical_type, const LogicalType& logical_type, int type_length, | |
184 | const ::arrow::TimeUnit::type int96_arrow_time_unit) { | |
185 | if (logical_type.is_invalid() || logical_type.is_null()) { | |
186 | return ::arrow::null(); | |
187 | } | |
188 | ||
189 | switch (physical_type) { | |
190 | case ParquetType::BOOLEAN: | |
191 | return ::arrow::boolean(); | |
192 | case ParquetType::INT32: | |
193 | return FromInt32(logical_type); | |
194 | case ParquetType::INT64: | |
195 | return FromInt64(logical_type); | |
196 | case ParquetType::INT96: | |
197 | return ::arrow::timestamp(int96_arrow_time_unit); | |
198 | case ParquetType::FLOAT: | |
199 | return ::arrow::float32(); | |
200 | case ParquetType::DOUBLE: | |
201 | return ::arrow::float64(); | |
202 | case ParquetType::BYTE_ARRAY: | |
203 | return FromByteArray(logical_type); | |
204 | case ParquetType::FIXED_LEN_BYTE_ARRAY: | |
205 | return FromFLBA(logical_type, type_length); | |
206 | default: { | |
207 | // PARQUET-1565: This can occur if the file is corrupt | |
208 | return Status::IOError("Invalid physical column type: ", | |
209 | TypeToString(physical_type)); | |
210 | } | |
211 | } | |
212 | } | |
213 | ||
214 | Result<std::shared_ptr<ArrowType>> GetArrowType( | |
215 | const schema::PrimitiveNode& primitive, | |
216 | const ::arrow::TimeUnit::type int96_arrow_time_unit) { | |
217 | return GetArrowType(primitive.physical_type(), *primitive.logical_type(), | |
218 | primitive.type_length(), int96_arrow_time_unit); | |
219 | } | |
220 | ||
221 | } // namespace arrow | |
222 | } // namespace parquet |