]> git.proxmox.com Git - ceph.git/blame - ceph/src/arrow/cpp/src/arrow/json/converter.cc
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / cpp / src / arrow / json / converter.cc
CommitLineData
1d09f67e
TL
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#include "arrow/json/converter.h"
19
20#include <memory>
21#include <utility>
22
23#include "arrow/array.h"
24#include "arrow/array/builder_binary.h"
25#include "arrow/array/builder_decimal.h"
26#include "arrow/array/builder_primitive.h"
27#include "arrow/array/builder_time.h"
28#include "arrow/json/parser.h"
29#include "arrow/type.h"
30#include "arrow/util/checked_cast.h"
31#include "arrow/util/decimal.h"
32#include "arrow/util/logging.h"
33#include "arrow/util/string_view.h"
34#include "arrow/util/value_parsing.h"
35
36namespace arrow {
37
38using internal::checked_cast;
39using util::string_view;
40
41namespace json {
42
43template <typename... Args>
44Status GenericConversionError(const DataType& type, Args&&... args) {
45 return Status::Invalid("Failed of conversion of JSON to ", type,
46 std::forward<Args>(args)...);
47}
48
49namespace {
50
51const DictionaryArray& GetDictionaryArray(const std::shared_ptr<Array>& in) {
52 DCHECK_EQ(in->type_id(), Type::DICTIONARY);
53 auto dict_type = checked_cast<const DictionaryType*>(in->type().get());
54 DCHECK_EQ(dict_type->index_type()->id(), Type::INT32);
55 DCHECK_EQ(dict_type->value_type()->id(), Type::STRING);
56 return checked_cast<const DictionaryArray&>(*in);
57}
58
59template <typename ValidVisitor, typename NullVisitor>
60Status VisitDictionaryEntries(const DictionaryArray& dict_array,
61 ValidVisitor&& visit_valid, NullVisitor&& visit_null) {
62 const StringArray& dict = checked_cast<const StringArray&>(*dict_array.dictionary());
63 const Int32Array& indices = checked_cast<const Int32Array&>(*dict_array.indices());
64 for (int64_t i = 0; i < indices.length(); ++i) {
65 if (indices.IsValid(i)) {
66 RETURN_NOT_OK(visit_valid(dict.GetView(indices.GetView(i))));
67 } else {
68 RETURN_NOT_OK(visit_null());
69 }
70 }
71 return Status::OK();
72}
73
74} // namespace
75
76// base class for types which accept and output non-nested types
77class PrimitiveConverter : public Converter {
78 public:
79 PrimitiveConverter(MemoryPool* pool, std::shared_ptr<DataType> out_type)
80 : Converter(pool, out_type) {}
81};
82
83class NullConverter : public PrimitiveConverter {
84 public:
85 using PrimitiveConverter::PrimitiveConverter;
86
87 Status Convert(const std::shared_ptr<Array>& in, std::shared_ptr<Array>* out) override {
88 if (in->type_id() != Type::NA) {
89 return GenericConversionError(*out_type_, " from ", *in->type());
90 }
91 *out = in;
92 return Status::OK();
93 }
94};
95
96class BooleanConverter : public PrimitiveConverter {
97 public:
98 using PrimitiveConverter::PrimitiveConverter;
99
100 Status Convert(const std::shared_ptr<Array>& in, std::shared_ptr<Array>* out) override {
101 if (in->type_id() == Type::NA) {
102 return MakeArrayOfNull(boolean(), in->length(), pool_).Value(out);
103 }
104 if (in->type_id() != Type::BOOL) {
105 return GenericConversionError(*out_type_, " from ", *in->type());
106 }
107 *out = in;
108 return Status::OK();
109 }
110};
111
112template <typename T>
113class NumericConverter : public PrimitiveConverter {
114 public:
115 using value_type = typename T::c_type;
116
117 NumericConverter(MemoryPool* pool, const std::shared_ptr<DataType>& type)
118 : PrimitiveConverter(pool, type), numeric_type_(checked_cast<const T&>(*type)) {}
119
120 Status Convert(const std::shared_ptr<Array>& in, std::shared_ptr<Array>* out) override {
121 if (in->type_id() == Type::NA) {
122 return MakeArrayOfNull(out_type_, in->length(), pool_).Value(out);
123 }
124 const auto& dict_array = GetDictionaryArray(in);
125
126 using Builder = typename TypeTraits<T>::BuilderType;
127 Builder builder(out_type_, pool_);
128 RETURN_NOT_OK(builder.Resize(dict_array.indices()->length()));
129
130 auto visit_valid = [&](string_view repr) {
131 value_type value;
132 if (!arrow::internal::ParseValue(numeric_type_, repr.data(), repr.size(), &value)) {
133 return GenericConversionError(*out_type_, ", couldn't parse:", repr);
134 }
135
136 builder.UnsafeAppend(value);
137 return Status::OK();
138 };
139
140 auto visit_null = [&]() {
141 builder.UnsafeAppendNull();
142 return Status::OK();
143 };
144
145 RETURN_NOT_OK(VisitDictionaryEntries(dict_array, visit_valid, visit_null));
146 return builder.Finish(out);
147 }
148
149 const T& numeric_type_;
150};
151
152template <typename T>
153class DecimalConverter : public PrimitiveConverter {
154 public:
155 using value_type = typename TypeTraits<T>::BuilderType::ValueType;
156
157 DecimalConverter(MemoryPool* pool, const std::shared_ptr<DataType>& type)
158 : PrimitiveConverter(pool, type) {}
159
160 Status Convert(const std::shared_ptr<Array>& in, std::shared_ptr<Array>* out) override {
161 if (in->type_id() == Type::NA) {
162 return MakeArrayOfNull(out_type_, in->length(), pool_).Value(out);
163 }
164 const auto& dict_array = GetDictionaryArray(in);
165
166 using Builder = typename TypeTraits<T>::BuilderType;
167 Builder builder(out_type_, pool_);
168 RETURN_NOT_OK(builder.Resize(dict_array.indices()->length()));
169
170 auto visit_valid = [&builder](string_view repr) {
171 ARROW_ASSIGN_OR_RAISE(value_type value,
172 TypeTraits<T>::BuilderType::ValueType::FromString(repr));
173 builder.UnsafeAppend(value);
174 return Status::OK();
175 };
176
177 auto visit_null = [&builder]() {
178 builder.UnsafeAppendNull();
179 return Status::OK();
180 };
181
182 RETURN_NOT_OK(VisitDictionaryEntries(dict_array, visit_valid, visit_null));
183 return builder.Finish(out);
184 }
185};
186
187template <typename DateTimeType>
188class DateTimeConverter : public PrimitiveConverter {
189 public:
190 DateTimeConverter(MemoryPool* pool, const std::shared_ptr<DataType>& type)
191 : PrimitiveConverter(pool, type), converter_(pool, repr_type()) {}
192
193 Status Convert(const std::shared_ptr<Array>& in, std::shared_ptr<Array>* out) override {
194 if (in->type_id() == Type::NA) {
195 return MakeArrayOfNull(out_type_, in->length(), pool_).Value(out);
196 }
197
198 std::shared_ptr<Array> repr;
199 RETURN_NOT_OK(converter_.Convert(in, &repr));
200
201 auto out_data = repr->data()->Copy();
202 out_data->type = out_type_;
203 *out = MakeArray(out_data);
204
205 return Status::OK();
206 }
207
208 private:
209 using ReprType = typename CTypeTraits<typename DateTimeType::c_type>::ArrowType;
210 static std::shared_ptr<DataType> repr_type() {
211 return TypeTraits<ReprType>::type_singleton();
212 }
213 NumericConverter<ReprType> converter_;
214};
215
216template <typename T>
217class BinaryConverter : public PrimitiveConverter {
218 public:
219 using PrimitiveConverter::PrimitiveConverter;
220
221 Status Convert(const std::shared_ptr<Array>& in, std::shared_ptr<Array>* out) override {
222 if (in->type_id() == Type::NA) {
223 return MakeArrayOfNull(out_type_, in->length(), pool_).Value(out);
224 }
225 const auto& dict_array = GetDictionaryArray(in);
226
227 using Builder = typename TypeTraits<T>::BuilderType;
228 Builder builder(out_type_, pool_);
229 RETURN_NOT_OK(builder.Resize(dict_array.indices()->length()));
230
231 // TODO(bkietz) this can be computed during parsing at low cost
232 int64_t data_length = 0;
233 auto visit_lengths_valid = [&](string_view value) {
234 data_length += value.size();
235 return Status::OK();
236 };
237
238 auto visit_lengths_null = [&]() {
239 // no-op
240 return Status::OK();
241 };
242
243 RETURN_NOT_OK(
244 VisitDictionaryEntries(dict_array, visit_lengths_valid, visit_lengths_null));
245 RETURN_NOT_OK(builder.ReserveData(data_length));
246
247 auto visit_valid = [&](string_view value) {
248 builder.UnsafeAppend(value);
249 return Status::OK();
250 };
251
252 auto visit_null = [&]() {
253 builder.UnsafeAppendNull();
254 return Status::OK();
255 };
256
257 RETURN_NOT_OK(VisitDictionaryEntries(dict_array, visit_valid, visit_null));
258 return builder.Finish(out);
259 }
260};
261
262Status MakeConverter(const std::shared_ptr<DataType>& out_type, MemoryPool* pool,
263 std::shared_ptr<Converter>* out) {
264 switch (out_type->id()) {
265#define CONVERTER_CASE(TYPE_ID, CONVERTER_TYPE) \
266 case TYPE_ID: \
267 *out = std::make_shared<CONVERTER_TYPE>(pool, out_type); \
268 break
269 CONVERTER_CASE(Type::NA, NullConverter);
270 CONVERTER_CASE(Type::BOOL, BooleanConverter);
271 CONVERTER_CASE(Type::INT8, NumericConverter<Int8Type>);
272 CONVERTER_CASE(Type::INT16, NumericConverter<Int16Type>);
273 CONVERTER_CASE(Type::INT32, NumericConverter<Int32Type>);
274 CONVERTER_CASE(Type::INT64, NumericConverter<Int64Type>);
275 CONVERTER_CASE(Type::UINT8, NumericConverter<UInt8Type>);
276 CONVERTER_CASE(Type::UINT16, NumericConverter<UInt16Type>);
277 CONVERTER_CASE(Type::UINT32, NumericConverter<UInt32Type>);
278 CONVERTER_CASE(Type::UINT64, NumericConverter<UInt64Type>);
279 CONVERTER_CASE(Type::FLOAT, NumericConverter<FloatType>);
280 CONVERTER_CASE(Type::DOUBLE, NumericConverter<DoubleType>);
281 CONVERTER_CASE(Type::TIMESTAMP, NumericConverter<TimestampType>);
282 CONVERTER_CASE(Type::TIME32, DateTimeConverter<Time32Type>);
283 CONVERTER_CASE(Type::TIME64, DateTimeConverter<Time64Type>);
284 CONVERTER_CASE(Type::DATE32, DateTimeConverter<Date32Type>);
285 CONVERTER_CASE(Type::DATE64, DateTimeConverter<Date64Type>);
286 CONVERTER_CASE(Type::BINARY, BinaryConverter<BinaryType>);
287 CONVERTER_CASE(Type::STRING, BinaryConverter<StringType>);
288 CONVERTER_CASE(Type::LARGE_BINARY, BinaryConverter<LargeBinaryType>);
289 CONVERTER_CASE(Type::LARGE_STRING, BinaryConverter<LargeStringType>);
290 CONVERTER_CASE(Type::DECIMAL128, DecimalConverter<Decimal128Type>);
291 CONVERTER_CASE(Type::DECIMAL256, DecimalConverter<Decimal256Type>);
292 default:
293 return Status::NotImplemented("JSON conversion to ", *out_type,
294 " is not supported");
295#undef CONVERTER_CASE
296 }
297 return Status::OK();
298}
299
300const PromotionGraph* GetPromotionGraph() {
301 static struct : PromotionGraph {
302 std::shared_ptr<Field> Null(const std::string& name) const override {
303 return field(name, null(), true, Kind::Tag(Kind::kNull));
304 }
305
306 std::shared_ptr<DataType> Infer(
307 const std::shared_ptr<Field>& unexpected_field) const override {
308 auto kind = Kind::FromTag(unexpected_field->metadata());
309 switch (kind) {
310 case Kind::kNull:
311 return null();
312
313 case Kind::kBoolean:
314 return boolean();
315
316 case Kind::kNumber:
317 return int64();
318
319 case Kind::kString:
320 return timestamp(TimeUnit::SECOND);
321
322 case Kind::kArray: {
323 const auto& type = checked_cast<const ListType&>(*unexpected_field->type());
324 auto value_field = type.value_field();
325 return list(value_field->WithType(Infer(value_field)));
326 }
327 case Kind::kObject: {
328 auto fields = unexpected_field->type()->fields();
329 for (auto& field : fields) {
330 field = field->WithType(Infer(field));
331 }
332 return struct_(std::move(fields));
333 }
334 default:
335 return nullptr;
336 }
337 }
338
339 std::shared_ptr<DataType> Promote(
340 const std::shared_ptr<DataType>& failed,
341 const std::shared_ptr<Field>& unexpected_field) const override {
342 switch (failed->id()) {
343 case Type::NA:
344 return Infer(unexpected_field);
345
346 case Type::TIMESTAMP:
347 return utf8();
348
349 case Type::INT64:
350 return float64();
351
352 default:
353 return nullptr;
354 }
355 }
356 } impl;
357
358 return &impl;
359}
360
361} // namespace json
362} // namespace arrow