]>
Commit | Line | Data |
---|---|---|
1d09f67e TL |
1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file | |
3 | // distributed with this work for additional information | |
4 | // regarding copyright ownership. The ASF licenses this file | |
5 | // to you under the Apache License, Version 2.0 (the | |
6 | // "License"); you may not use this file except in compliance | |
7 | // with the License. You may obtain a copy of the License at | |
8 | // | |
9 | // http://www.apache.org/licenses/LICENSE-2.0 | |
10 | // | |
11 | // Unless required by applicable law or agreed to in writing, | |
12 | // software distributed under the License is distributed on an | |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
14 | // KIND, either express or implied. See the License for the | |
15 | // specific language governing permissions and limitations | |
16 | // under the License. | |
17 | ||
18 | #include "arrow/json/converter.h" | |
19 | ||
20 | #include <memory> | |
21 | #include <utility> | |
22 | ||
23 | #include "arrow/array.h" | |
24 | #include "arrow/array/builder_binary.h" | |
25 | #include "arrow/array/builder_decimal.h" | |
26 | #include "arrow/array/builder_primitive.h" | |
27 | #include "arrow/array/builder_time.h" | |
28 | #include "arrow/json/parser.h" | |
29 | #include "arrow/type.h" | |
30 | #include "arrow/util/checked_cast.h" | |
31 | #include "arrow/util/decimal.h" | |
32 | #include "arrow/util/logging.h" | |
33 | #include "arrow/util/string_view.h" | |
34 | #include "arrow/util/value_parsing.h" | |
35 | ||
36 | namespace arrow { | |
37 | ||
38 | using internal::checked_cast; | |
39 | using util::string_view; | |
40 | ||
41 | namespace json { | |
42 | ||
43 | template <typename... Args> | |
44 | Status GenericConversionError(const DataType& type, Args&&... args) { | |
45 | return Status::Invalid("Failed of conversion of JSON to ", type, | |
46 | std::forward<Args>(args)...); | |
47 | } | |
48 | ||
49 | namespace { | |
50 | ||
51 | const DictionaryArray& GetDictionaryArray(const std::shared_ptr<Array>& in) { | |
52 | DCHECK_EQ(in->type_id(), Type::DICTIONARY); | |
53 | auto dict_type = checked_cast<const DictionaryType*>(in->type().get()); | |
54 | DCHECK_EQ(dict_type->index_type()->id(), Type::INT32); | |
55 | DCHECK_EQ(dict_type->value_type()->id(), Type::STRING); | |
56 | return checked_cast<const DictionaryArray&>(*in); | |
57 | } | |
58 | ||
59 | template <typename ValidVisitor, typename NullVisitor> | |
60 | Status VisitDictionaryEntries(const DictionaryArray& dict_array, | |
61 | ValidVisitor&& visit_valid, NullVisitor&& visit_null) { | |
62 | const StringArray& dict = checked_cast<const StringArray&>(*dict_array.dictionary()); | |
63 | const Int32Array& indices = checked_cast<const Int32Array&>(*dict_array.indices()); | |
64 | for (int64_t i = 0; i < indices.length(); ++i) { | |
65 | if (indices.IsValid(i)) { | |
66 | RETURN_NOT_OK(visit_valid(dict.GetView(indices.GetView(i)))); | |
67 | } else { | |
68 | RETURN_NOT_OK(visit_null()); | |
69 | } | |
70 | } | |
71 | return Status::OK(); | |
72 | } | |
73 | ||
74 | } // namespace | |
75 | ||
76 | // base class for types which accept and output non-nested types | |
77 | class PrimitiveConverter : public Converter { | |
78 | public: | |
79 | PrimitiveConverter(MemoryPool* pool, std::shared_ptr<DataType> out_type) | |
80 | : Converter(pool, out_type) {} | |
81 | }; | |
82 | ||
83 | class NullConverter : public PrimitiveConverter { | |
84 | public: | |
85 | using PrimitiveConverter::PrimitiveConverter; | |
86 | ||
87 | Status Convert(const std::shared_ptr<Array>& in, std::shared_ptr<Array>* out) override { | |
88 | if (in->type_id() != Type::NA) { | |
89 | return GenericConversionError(*out_type_, " from ", *in->type()); | |
90 | } | |
91 | *out = in; | |
92 | return Status::OK(); | |
93 | } | |
94 | }; | |
95 | ||
96 | class BooleanConverter : public PrimitiveConverter { | |
97 | public: | |
98 | using PrimitiveConverter::PrimitiveConverter; | |
99 | ||
100 | Status Convert(const std::shared_ptr<Array>& in, std::shared_ptr<Array>* out) override { | |
101 | if (in->type_id() == Type::NA) { | |
102 | return MakeArrayOfNull(boolean(), in->length(), pool_).Value(out); | |
103 | } | |
104 | if (in->type_id() != Type::BOOL) { | |
105 | return GenericConversionError(*out_type_, " from ", *in->type()); | |
106 | } | |
107 | *out = in; | |
108 | return Status::OK(); | |
109 | } | |
110 | }; | |
111 | ||
112 | template <typename T> | |
113 | class NumericConverter : public PrimitiveConverter { | |
114 | public: | |
115 | using value_type = typename T::c_type; | |
116 | ||
117 | NumericConverter(MemoryPool* pool, const std::shared_ptr<DataType>& type) | |
118 | : PrimitiveConverter(pool, type), numeric_type_(checked_cast<const T&>(*type)) {} | |
119 | ||
120 | Status Convert(const std::shared_ptr<Array>& in, std::shared_ptr<Array>* out) override { | |
121 | if (in->type_id() == Type::NA) { | |
122 | return MakeArrayOfNull(out_type_, in->length(), pool_).Value(out); | |
123 | } | |
124 | const auto& dict_array = GetDictionaryArray(in); | |
125 | ||
126 | using Builder = typename TypeTraits<T>::BuilderType; | |
127 | Builder builder(out_type_, pool_); | |
128 | RETURN_NOT_OK(builder.Resize(dict_array.indices()->length())); | |
129 | ||
130 | auto visit_valid = [&](string_view repr) { | |
131 | value_type value; | |
132 | if (!arrow::internal::ParseValue(numeric_type_, repr.data(), repr.size(), &value)) { | |
133 | return GenericConversionError(*out_type_, ", couldn't parse:", repr); | |
134 | } | |
135 | ||
136 | builder.UnsafeAppend(value); | |
137 | return Status::OK(); | |
138 | }; | |
139 | ||
140 | auto visit_null = [&]() { | |
141 | builder.UnsafeAppendNull(); | |
142 | return Status::OK(); | |
143 | }; | |
144 | ||
145 | RETURN_NOT_OK(VisitDictionaryEntries(dict_array, visit_valid, visit_null)); | |
146 | return builder.Finish(out); | |
147 | } | |
148 | ||
149 | const T& numeric_type_; | |
150 | }; | |
151 | ||
152 | template <typename T> | |
153 | class DecimalConverter : public PrimitiveConverter { | |
154 | public: | |
155 | using value_type = typename TypeTraits<T>::BuilderType::ValueType; | |
156 | ||
157 | DecimalConverter(MemoryPool* pool, const std::shared_ptr<DataType>& type) | |
158 | : PrimitiveConverter(pool, type) {} | |
159 | ||
160 | Status Convert(const std::shared_ptr<Array>& in, std::shared_ptr<Array>* out) override { | |
161 | if (in->type_id() == Type::NA) { | |
162 | return MakeArrayOfNull(out_type_, in->length(), pool_).Value(out); | |
163 | } | |
164 | const auto& dict_array = GetDictionaryArray(in); | |
165 | ||
166 | using Builder = typename TypeTraits<T>::BuilderType; | |
167 | Builder builder(out_type_, pool_); | |
168 | RETURN_NOT_OK(builder.Resize(dict_array.indices()->length())); | |
169 | ||
170 | auto visit_valid = [&builder](string_view repr) { | |
171 | ARROW_ASSIGN_OR_RAISE(value_type value, | |
172 | TypeTraits<T>::BuilderType::ValueType::FromString(repr)); | |
173 | builder.UnsafeAppend(value); | |
174 | return Status::OK(); | |
175 | }; | |
176 | ||
177 | auto visit_null = [&builder]() { | |
178 | builder.UnsafeAppendNull(); | |
179 | return Status::OK(); | |
180 | }; | |
181 | ||
182 | RETURN_NOT_OK(VisitDictionaryEntries(dict_array, visit_valid, visit_null)); | |
183 | return builder.Finish(out); | |
184 | } | |
185 | }; | |
186 | ||
187 | template <typename DateTimeType> | |
188 | class DateTimeConverter : public PrimitiveConverter { | |
189 | public: | |
190 | DateTimeConverter(MemoryPool* pool, const std::shared_ptr<DataType>& type) | |
191 | : PrimitiveConverter(pool, type), converter_(pool, repr_type()) {} | |
192 | ||
193 | Status Convert(const std::shared_ptr<Array>& in, std::shared_ptr<Array>* out) override { | |
194 | if (in->type_id() == Type::NA) { | |
195 | return MakeArrayOfNull(out_type_, in->length(), pool_).Value(out); | |
196 | } | |
197 | ||
198 | std::shared_ptr<Array> repr; | |
199 | RETURN_NOT_OK(converter_.Convert(in, &repr)); | |
200 | ||
201 | auto out_data = repr->data()->Copy(); | |
202 | out_data->type = out_type_; | |
203 | *out = MakeArray(out_data); | |
204 | ||
205 | return Status::OK(); | |
206 | } | |
207 | ||
208 | private: | |
209 | using ReprType = typename CTypeTraits<typename DateTimeType::c_type>::ArrowType; | |
210 | static std::shared_ptr<DataType> repr_type() { | |
211 | return TypeTraits<ReprType>::type_singleton(); | |
212 | } | |
213 | NumericConverter<ReprType> converter_; | |
214 | }; | |
215 | ||
216 | template <typename T> | |
217 | class BinaryConverter : public PrimitiveConverter { | |
218 | public: | |
219 | using PrimitiveConverter::PrimitiveConverter; | |
220 | ||
221 | Status Convert(const std::shared_ptr<Array>& in, std::shared_ptr<Array>* out) override { | |
222 | if (in->type_id() == Type::NA) { | |
223 | return MakeArrayOfNull(out_type_, in->length(), pool_).Value(out); | |
224 | } | |
225 | const auto& dict_array = GetDictionaryArray(in); | |
226 | ||
227 | using Builder = typename TypeTraits<T>::BuilderType; | |
228 | Builder builder(out_type_, pool_); | |
229 | RETURN_NOT_OK(builder.Resize(dict_array.indices()->length())); | |
230 | ||
231 | // TODO(bkietz) this can be computed during parsing at low cost | |
232 | int64_t data_length = 0; | |
233 | auto visit_lengths_valid = [&](string_view value) { | |
234 | data_length += value.size(); | |
235 | return Status::OK(); | |
236 | }; | |
237 | ||
238 | auto visit_lengths_null = [&]() { | |
239 | // no-op | |
240 | return Status::OK(); | |
241 | }; | |
242 | ||
243 | RETURN_NOT_OK( | |
244 | VisitDictionaryEntries(dict_array, visit_lengths_valid, visit_lengths_null)); | |
245 | RETURN_NOT_OK(builder.ReserveData(data_length)); | |
246 | ||
247 | auto visit_valid = [&](string_view value) { | |
248 | builder.UnsafeAppend(value); | |
249 | return Status::OK(); | |
250 | }; | |
251 | ||
252 | auto visit_null = [&]() { | |
253 | builder.UnsafeAppendNull(); | |
254 | return Status::OK(); | |
255 | }; | |
256 | ||
257 | RETURN_NOT_OK(VisitDictionaryEntries(dict_array, visit_valid, visit_null)); | |
258 | return builder.Finish(out); | |
259 | } | |
260 | }; | |
261 | ||
262 | Status MakeConverter(const std::shared_ptr<DataType>& out_type, MemoryPool* pool, | |
263 | std::shared_ptr<Converter>* out) { | |
264 | switch (out_type->id()) { | |
265 | #define CONVERTER_CASE(TYPE_ID, CONVERTER_TYPE) \ | |
266 | case TYPE_ID: \ | |
267 | *out = std::make_shared<CONVERTER_TYPE>(pool, out_type); \ | |
268 | break | |
269 | CONVERTER_CASE(Type::NA, NullConverter); | |
270 | CONVERTER_CASE(Type::BOOL, BooleanConverter); | |
271 | CONVERTER_CASE(Type::INT8, NumericConverter<Int8Type>); | |
272 | CONVERTER_CASE(Type::INT16, NumericConverter<Int16Type>); | |
273 | CONVERTER_CASE(Type::INT32, NumericConverter<Int32Type>); | |
274 | CONVERTER_CASE(Type::INT64, NumericConverter<Int64Type>); | |
275 | CONVERTER_CASE(Type::UINT8, NumericConverter<UInt8Type>); | |
276 | CONVERTER_CASE(Type::UINT16, NumericConverter<UInt16Type>); | |
277 | CONVERTER_CASE(Type::UINT32, NumericConverter<UInt32Type>); | |
278 | CONVERTER_CASE(Type::UINT64, NumericConverter<UInt64Type>); | |
279 | CONVERTER_CASE(Type::FLOAT, NumericConverter<FloatType>); | |
280 | CONVERTER_CASE(Type::DOUBLE, NumericConverter<DoubleType>); | |
281 | CONVERTER_CASE(Type::TIMESTAMP, NumericConverter<TimestampType>); | |
282 | CONVERTER_CASE(Type::TIME32, DateTimeConverter<Time32Type>); | |
283 | CONVERTER_CASE(Type::TIME64, DateTimeConverter<Time64Type>); | |
284 | CONVERTER_CASE(Type::DATE32, DateTimeConverter<Date32Type>); | |
285 | CONVERTER_CASE(Type::DATE64, DateTimeConverter<Date64Type>); | |
286 | CONVERTER_CASE(Type::BINARY, BinaryConverter<BinaryType>); | |
287 | CONVERTER_CASE(Type::STRING, BinaryConverter<StringType>); | |
288 | CONVERTER_CASE(Type::LARGE_BINARY, BinaryConverter<LargeBinaryType>); | |
289 | CONVERTER_CASE(Type::LARGE_STRING, BinaryConverter<LargeStringType>); | |
290 | CONVERTER_CASE(Type::DECIMAL128, DecimalConverter<Decimal128Type>); | |
291 | CONVERTER_CASE(Type::DECIMAL256, DecimalConverter<Decimal256Type>); | |
292 | default: | |
293 | return Status::NotImplemented("JSON conversion to ", *out_type, | |
294 | " is not supported"); | |
295 | #undef CONVERTER_CASE | |
296 | } | |
297 | return Status::OK(); | |
298 | } | |
299 | ||
300 | const PromotionGraph* GetPromotionGraph() { | |
301 | static struct : PromotionGraph { | |
302 | std::shared_ptr<Field> Null(const std::string& name) const override { | |
303 | return field(name, null(), true, Kind::Tag(Kind::kNull)); | |
304 | } | |
305 | ||
306 | std::shared_ptr<DataType> Infer( | |
307 | const std::shared_ptr<Field>& unexpected_field) const override { | |
308 | auto kind = Kind::FromTag(unexpected_field->metadata()); | |
309 | switch (kind) { | |
310 | case Kind::kNull: | |
311 | return null(); | |
312 | ||
313 | case Kind::kBoolean: | |
314 | return boolean(); | |
315 | ||
316 | case Kind::kNumber: | |
317 | return int64(); | |
318 | ||
319 | case Kind::kString: | |
320 | return timestamp(TimeUnit::SECOND); | |
321 | ||
322 | case Kind::kArray: { | |
323 | const auto& type = checked_cast<const ListType&>(*unexpected_field->type()); | |
324 | auto value_field = type.value_field(); | |
325 | return list(value_field->WithType(Infer(value_field))); | |
326 | } | |
327 | case Kind::kObject: { | |
328 | auto fields = unexpected_field->type()->fields(); | |
329 | for (auto& field : fields) { | |
330 | field = field->WithType(Infer(field)); | |
331 | } | |
332 | return struct_(std::move(fields)); | |
333 | } | |
334 | default: | |
335 | return nullptr; | |
336 | } | |
337 | } | |
338 | ||
339 | std::shared_ptr<DataType> Promote( | |
340 | const std::shared_ptr<DataType>& failed, | |
341 | const std::shared_ptr<Field>& unexpected_field) const override { | |
342 | switch (failed->id()) { | |
343 | case Type::NA: | |
344 | return Infer(unexpected_field); | |
345 | ||
346 | case Type::TIMESTAMP: | |
347 | return utf8(); | |
348 | ||
349 | case Type::INT64: | |
350 | return float64(); | |
351 | ||
352 | default: | |
353 | return nullptr; | |
354 | } | |
355 | } | |
356 | } impl; | |
357 | ||
358 | return &impl; | |
359 | } | |
360 | ||
361 | } // namespace json | |
362 | } // namespace arrow |