]> git.proxmox.com Git - ceph.git/blob - ceph/src/arrow/cpp/src/arrow/csv/converter.cc
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / cpp / src / arrow / csv / converter.cc
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17
18 #include "arrow/csv/converter.h"
19
20 #include <array>
21 #include <cstring>
22 #include <limits>
23 #include <sstream>
24 #include <string>
25 #include <type_traits>
26 #include <vector>
27
28 #include "arrow/array/builder_binary.h"
29 #include "arrow/array/builder_decimal.h"
30 #include "arrow/array/builder_dict.h"
31 #include "arrow/array/builder_primitive.h"
32 #include "arrow/csv/parser.h"
33 #include "arrow/status.h"
34 #include "arrow/type.h"
35 #include "arrow/type_fwd.h"
36 #include "arrow/type_traits.h"
37 #include "arrow/util/checked_cast.h"
38 #include "arrow/util/decimal.h"
39 #include "arrow/util/trie.h"
40 #include "arrow/util/utf8.h"
41 #include "arrow/util/value_parsing.h" // IWYU pragma: keep
42
43 namespace arrow {
44 namespace csv {
45
46 using internal::checked_cast;
47 using internal::Trie;
48 using internal::TrieBuilder;
49
50 namespace {
51
52 Status GenericConversionError(const std::shared_ptr<DataType>& type, const uint8_t* data,
53 uint32_t size) {
54 return Status::Invalid("CSV conversion error to ", type->ToString(),
55 ": invalid value '",
56 std::string(reinterpret_cast<const char*>(data), size), "'");
57 }
58
59 inline bool IsWhitespace(uint8_t c) {
60 if (ARROW_PREDICT_TRUE(c > ' ')) {
61 return false;
62 }
63 return c == ' ' || c == '\t';
64 }
65
66 // Updates data_inout and size_inout to not include leading/trailing whitespace
67 // characters.
68 inline void TrimWhiteSpace(const uint8_t** data_inout, uint32_t* size_inout) {
69 const uint8_t*& data = *data_inout;
70 uint32_t& size = *size_inout;
71 // Skip trailing whitespace
72 if (ARROW_PREDICT_TRUE(size > 0) && ARROW_PREDICT_FALSE(IsWhitespace(data[size - 1]))) {
73 const uint8_t* p = data + size - 1;
74 while (size > 0 && IsWhitespace(*p)) {
75 --size;
76 --p;
77 }
78 }
79 // Skip leading whitespace
80 if (ARROW_PREDICT_TRUE(size > 0) && ARROW_PREDICT_FALSE(IsWhitespace(data[0]))) {
81 while (size > 0 && IsWhitespace(*data)) {
82 --size;
83 ++data;
84 }
85 }
86 }
87
88 Status InitializeTrie(const std::vector<std::string>& inputs, Trie* trie) {
89 TrieBuilder builder;
90 for (const auto& s : inputs) {
91 RETURN_NOT_OK(builder.Append(s, true /* allow_duplicates */));
92 }
93 *trie = builder.Finish();
94 return Status::OK();
95 }
96
97 // Presize a builder based on parser contents
98 template <typename BuilderType>
99 enable_if_t<!is_base_binary_type<typename BuilderType::TypeClass>::value, Status>
100 PresizeBuilder(const BlockParser& parser, BuilderType* builder) {
101 return builder->Resize(parser.num_rows());
102 }
103
104 // Same, for variable-sized binary builders
105 template <typename T>
106 Status PresizeBuilder(const BlockParser& parser, BaseBinaryBuilder<T>* builder) {
107 RETURN_NOT_OK(builder->Resize(parser.num_rows()));
108 return builder->ReserveData(parser.num_bytes());
109 }
110
111 /////////////////////////////////////////////////////////////////////////
112 // Per-type value decoders
113
114 struct ValueDecoder {
115 explicit ValueDecoder(const std::shared_ptr<DataType>& type,
116 const ConvertOptions& options)
117 : type_(type), options_(options) {}
118
119 Status Initialize() {
120 // TODO no need to build a separate Trie for each instance
121 return InitializeTrie(options_.null_values, &null_trie_);
122 }
123
124 bool IsNull(const uint8_t* data, uint32_t size, bool quoted) {
125 if (quoted && !options_.quoted_strings_can_be_null) {
126 return false;
127 }
128 return null_trie_.Find(
129 util::string_view(reinterpret_cast<const char*>(data), size)) >= 0;
130 }
131
132 protected:
133 Trie null_trie_;
134 const std::shared_ptr<DataType> type_;
135 const ConvertOptions& options_;
136 };
137
138 //
139 // Value decoder for fixed-size binary
140 //
141
142 struct FixedSizeBinaryValueDecoder : public ValueDecoder {
143 using value_type = const uint8_t*;
144
145 explicit FixedSizeBinaryValueDecoder(const std::shared_ptr<DataType>& type,
146 const ConvertOptions& options)
147 : ValueDecoder(type, options),
148 byte_width_(checked_cast<const FixedSizeBinaryType&>(*type).byte_width()) {}
149
150 Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) {
151 if (ARROW_PREDICT_FALSE(size != byte_width_)) {
152 return Status::Invalid("CSV conversion error to ", type_->ToString(), ": got a ",
153 size, "-byte long string");
154 }
155 *out = data;
156 return Status::OK();
157 }
158
159 protected:
160 const uint32_t byte_width_;
161 };
162
163 //
164 // Value decoder for variable-size binary
165 //
166
167 template <bool CheckUTF8>
168 struct BinaryValueDecoder : public ValueDecoder {
169 using value_type = util::string_view;
170
171 using ValueDecoder::ValueDecoder;
172
173 Status Initialize() {
174 util::InitializeUTF8();
175 return ValueDecoder::Initialize();
176 }
177
178 Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) {
179 if (CheckUTF8 && ARROW_PREDICT_FALSE(!util::ValidateUTF8(data, size))) {
180 return Status::Invalid("CSV conversion error to ", type_->ToString(),
181 ": invalid UTF8 data");
182 }
183 *out = {reinterpret_cast<const char*>(data), size};
184 return Status::OK();
185 }
186
187 bool IsNull(const uint8_t* data, uint32_t size, bool quoted) {
188 return options_.strings_can_be_null &&
189 (!quoted || options_.quoted_strings_can_be_null) &&
190 ValueDecoder::IsNull(data, size, false /* quoted */);
191 }
192 };
193
194 //
195 // Value decoder for integers, floats and temporals
196 //
197
198 template <typename T>
199 struct NumericValueDecoder : public ValueDecoder {
200 using value_type = typename T::c_type;
201
202 explicit NumericValueDecoder(const std::shared_ptr<DataType>& type,
203 const ConvertOptions& options)
204 : ValueDecoder(type, options), concrete_type_(checked_cast<const T&>(*type)) {}
205
206 Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) {
207 // XXX should quoted values be allowed at all?
208 TrimWhiteSpace(&data, &size);
209 if (ARROW_PREDICT_FALSE(!internal::ParseValue<T>(
210 concrete_type_, reinterpret_cast<const char*>(data), size, out))) {
211 return GenericConversionError(type_, data, size);
212 }
213 return Status::OK();
214 }
215
216 protected:
217 const T& concrete_type_;
218 };
219
220 //
221 // Value decoder for booleans
222 //
223
224 struct BooleanValueDecoder : public ValueDecoder {
225 using value_type = bool;
226
227 using ValueDecoder::ValueDecoder;
228
229 Status Initialize() {
230 // TODO no need to build separate Tries for each instance
231 RETURN_NOT_OK(InitializeTrie(options_.true_values, &true_trie_));
232 RETURN_NOT_OK(InitializeTrie(options_.false_values, &false_trie_));
233 return ValueDecoder::Initialize();
234 }
235
236 Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) {
237 // XXX should quoted values be allowed at all?
238 if (false_trie_.Find(util::string_view(reinterpret_cast<const char*>(data), size)) >=
239 0) {
240 *out = false;
241 return Status::OK();
242 }
243 if (ARROW_PREDICT_TRUE(true_trie_.Find(util::string_view(
244 reinterpret_cast<const char*>(data), size)) >= 0)) {
245 *out = true;
246 return Status::OK();
247 }
248 return GenericConversionError(type_, data, size);
249 }
250
251 protected:
252 Trie true_trie_;
253 Trie false_trie_;
254 };
255
256 //
257 // Value decoder for decimals
258 //
259
260 struct DecimalValueDecoder : public ValueDecoder {
261 using value_type = Decimal128;
262
263 explicit DecimalValueDecoder(const std::shared_ptr<DataType>& type,
264 const ConvertOptions& options)
265 : ValueDecoder(type, options),
266 decimal_type_(internal::checked_cast<const DecimalType&>(*type_)),
267 type_precision_(decimal_type_.precision()),
268 type_scale_(decimal_type_.scale()) {}
269
270 Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) {
271 TrimWhiteSpace(&data, &size);
272 Decimal128 decimal;
273 int32_t precision, scale;
274 util::string_view view(reinterpret_cast<const char*>(data), size);
275 RETURN_NOT_OK(Decimal128::FromString(view, &decimal, &precision, &scale));
276 if (precision > type_precision_) {
277 return Status::Invalid("Error converting '", view, "' to ", type_->ToString(),
278 ": precision not supported by type.");
279 }
280 if (scale != type_scale_) {
281 ARROW_ASSIGN_OR_RAISE(*out, decimal.Rescale(scale, type_scale_));
282 } else {
283 *out = std::move(decimal);
284 }
285 return Status::OK();
286 }
287
288 protected:
289 const DecimalType& decimal_type_;
290 const int32_t type_precision_;
291 const int32_t type_scale_;
292 };
293
294 //
295 // Value decoder wrapper for floating-point and decimals
296 // with a non-default decimal point
297 //
298
299 template <typename WrappedDecoder>
300 struct CustomDecimalPointValueDecoder : public ValueDecoder {
301 using value_type = typename WrappedDecoder::value_type;
302
303 explicit CustomDecimalPointValueDecoder(const std::shared_ptr<DataType>& type,
304 const ConvertOptions& options)
305 : ValueDecoder(type, options), wrapped_decoder_(type, options) {}
306
307 Status Initialize() {
308 RETURN_NOT_OK(wrapped_decoder_.Initialize());
309 for (int i = 0; i < 256; ++i) {
310 mapping_[i] = i;
311 }
312 mapping_[options_.decimal_point] = '.';
313 mapping_['.'] = options_.decimal_point; // error out on standard decimal point
314 temp_.resize(30);
315 return Status::OK();
316 }
317
318 Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) {
319 if (ARROW_PREDICT_FALSE(size > temp_.size())) {
320 temp_.resize(size);
321 }
322 uint8_t* temp_data = temp_.data();
323 for (uint32_t i = 0; i < size; ++i) {
324 temp_data[i] = mapping_[data[i]];
325 }
326 if (ARROW_PREDICT_FALSE(
327 !wrapped_decoder_.Decode(temp_data, size, quoted, out).ok())) {
328 return GenericConversionError(type_, data, size);
329 }
330 return Status::OK();
331 }
332
333 bool IsNull(const uint8_t* data, uint32_t size, bool quoted) {
334 return wrapped_decoder_.IsNull(data, size, quoted);
335 }
336
337 protected:
338 WrappedDecoder wrapped_decoder_;
339 std::array<uint8_t, 256> mapping_;
340 std::vector<uint8_t> temp_;
341 };
342
343 //
344 // Value decoders for timestamps
345 //
346
347 struct InlineISO8601ValueDecoder : public ValueDecoder {
348 using value_type = int64_t;
349
350 explicit InlineISO8601ValueDecoder(const std::shared_ptr<DataType>& type,
351 const ConvertOptions& options)
352 : ValueDecoder(type, options),
353 unit_(checked_cast<const TimestampType&>(*type_).unit()) {}
354
355 Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) {
356 if (ARROW_PREDICT_FALSE(!internal::ParseTimestampISO8601(
357 reinterpret_cast<const char*>(data), size, unit_, out))) {
358 return GenericConversionError(type_, data, size);
359 }
360 return Status::OK();
361 }
362
363 protected:
364 TimeUnit::type unit_;
365 };
366
367 struct SingleParserTimestampValueDecoder : public ValueDecoder {
368 using value_type = int64_t;
369
370 explicit SingleParserTimestampValueDecoder(const std::shared_ptr<DataType>& type,
371 const ConvertOptions& options)
372 : ValueDecoder(type, options),
373 unit_(checked_cast<const TimestampType&>(*type_).unit()),
374 parser_(*options_.timestamp_parsers[0]) {}
375
376 Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) {
377 if (ARROW_PREDICT_FALSE(
378 !parser_(reinterpret_cast<const char*>(data), size, unit_, out))) {
379 return GenericConversionError(type_, data, size);
380 }
381 return Status::OK();
382 }
383
384 protected:
385 TimeUnit::type unit_;
386 const TimestampParser& parser_;
387 };
388
389 struct MultipleParsersTimestampValueDecoder : public ValueDecoder {
390 using value_type = int64_t;
391
392 explicit MultipleParsersTimestampValueDecoder(const std::shared_ptr<DataType>& type,
393 const ConvertOptions& options)
394 : ValueDecoder(type, options),
395 unit_(checked_cast<const TimestampType&>(*type_).unit()),
396 parsers_(GetParsers(options_)) {}
397
398 Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) {
399 for (const auto& parser : parsers_) {
400 if (parser->operator()(reinterpret_cast<const char*>(data), size, unit_, out)) {
401 return Status::OK();
402 }
403 }
404 return GenericConversionError(type_, data, size);
405 }
406
407 protected:
408 using ParserVector = std::vector<const TimestampParser*>;
409
410 static ParserVector GetParsers(const ConvertOptions& options) {
411 ParserVector parsers(options.timestamp_parsers.size());
412 for (size_t i = 0; i < options.timestamp_parsers.size(); ++i) {
413 parsers[i] = options.timestamp_parsers[i].get();
414 }
415 return parsers;
416 }
417
418 TimeUnit::type unit_;
419 std::vector<const TimestampParser*> parsers_;
420 };
421
422 /////////////////////////////////////////////////////////////////////////
423 // Concrete Converter hierarchy
424
425 class ConcreteConverter : public Converter {
426 public:
427 using Converter::Converter;
428 };
429
430 class ConcreteDictionaryConverter : public DictionaryConverter {
431 public:
432 using DictionaryConverter::DictionaryConverter;
433 };
434
435 //
436 // Concrete Converter for nulls
437 //
438
439 class NullConverter : public ConcreteConverter {
440 public:
441 NullConverter(const std::shared_ptr<DataType>& type, const ConvertOptions& options,
442 MemoryPool* pool)
443 : ConcreteConverter(type, options, pool), decoder_(type_, options_) {}
444
445 Result<std::shared_ptr<Array>> Convert(const BlockParser& parser,
446 int32_t col_index) override {
447 NullBuilder builder(pool_);
448
449 auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
450 if (ARROW_PREDICT_TRUE(decoder_.IsNull(data, size, quoted))) {
451 return builder.AppendNull();
452 } else {
453 return GenericConversionError(type_, data, size);
454 }
455 };
456 RETURN_NOT_OK(parser.VisitColumn(col_index, visit));
457 std::shared_ptr<Array> res;
458 RETURN_NOT_OK(builder.Finish(&res));
459 return res;
460 }
461
462 protected:
463 Status Initialize() override { return decoder_.Initialize(); }
464
465 ValueDecoder decoder_;
466 };
467
468 //
469 // Concrete Converter for primitives
470 //
471
472 template <typename T, typename ValueDecoderType>
473 class PrimitiveConverter : public ConcreteConverter {
474 public:
475 PrimitiveConverter(const std::shared_ptr<DataType>& type, const ConvertOptions& options,
476 MemoryPool* pool)
477 : ConcreteConverter(type, options, pool), decoder_(type_, options_) {}
478
479 Result<std::shared_ptr<Array>> Convert(const BlockParser& parser,
480 int32_t col_index) override {
481 using BuilderType = typename TypeTraits<T>::BuilderType;
482 using value_type = typename ValueDecoderType::value_type;
483
484 BuilderType builder(type_, pool_);
485 RETURN_NOT_OK(PresizeBuilder(parser, &builder));
486
487 auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
488 if (decoder_.IsNull(data, size, quoted /* quoted */)) {
489 return builder.AppendNull();
490 }
491 value_type value{};
492 RETURN_NOT_OK(decoder_.Decode(data, size, quoted, &value));
493 builder.UnsafeAppend(value);
494 return Status::OK();
495 };
496 RETURN_NOT_OK(parser.VisitColumn(col_index, visit));
497
498 std::shared_ptr<Array> res;
499 RETURN_NOT_OK(builder.Finish(&res));
500 return res;
501 }
502
503 protected:
504 Status Initialize() override { return decoder_.Initialize(); }
505
506 ValueDecoderType decoder_;
507 };
508
509 //
510 // Concrete Converter for dictionaries
511 //
512
513 template <typename T, typename ValueDecoderType>
514 class TypedDictionaryConverter : public ConcreteDictionaryConverter {
515 public:
516 TypedDictionaryConverter(const std::shared_ptr<DataType>& value_type,
517 const ConvertOptions& options, MemoryPool* pool)
518 : ConcreteDictionaryConverter(value_type, options, pool),
519 decoder_(value_type, options_) {}
520
521 Result<std::shared_ptr<Array>> Convert(const BlockParser& parser,
522 int32_t col_index) override {
523 // We use a fixed index width so that all column chunks get the same index type
524 using BuilderType = Dictionary32Builder<T>;
525 using value_type = typename ValueDecoderType::value_type;
526
527 BuilderType builder(value_type_, pool_);
528 RETURN_NOT_OK(PresizeBuilder(parser, &builder));
529
530 auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
531 if (decoder_.IsNull(data, size, quoted /* quoted */)) {
532 return builder.AppendNull();
533 }
534 if (ARROW_PREDICT_FALSE(builder.dictionary_length() > max_cardinality_)) {
535 return Status::IndexError("Dictionary length exceeded max cardinality");
536 }
537 value_type value{};
538 RETURN_NOT_OK(decoder_.Decode(data, size, quoted, &value));
539 return builder.Append(value);
540 };
541 RETURN_NOT_OK(parser.VisitColumn(col_index, visit));
542
543 std::shared_ptr<Array> res;
544 RETURN_NOT_OK(builder.Finish(&res));
545 return res;
546 }
547
548 void SetMaxCardinality(int32_t max_length) override { max_cardinality_ = max_length; }
549
550 protected:
551 Status Initialize() override {
552 util::InitializeUTF8();
553 return decoder_.Initialize();
554 }
555
556 ValueDecoderType decoder_;
557 int32_t max_cardinality_ = std::numeric_limits<int32_t>::max();
558 };
559
560 //
561 // Concrete Converter factory for timestamps
562 //
563
564 template <template <typename, typename> class ConverterType>
565 std::shared_ptr<Converter> MakeTimestampConverter(const std::shared_ptr<DataType>& type,
566 const ConvertOptions& options,
567 MemoryPool* pool) {
568 if (options.timestamp_parsers.size() == 0) {
569 // Default to ISO-8601
570 return std::make_shared<ConverterType<TimestampType, InlineISO8601ValueDecoder>>(
571 type, options, pool);
572 } else if (options.timestamp_parsers.size() == 1) {
573 // Single user-supplied converter
574 return std::make_shared<
575 ConverterType<TimestampType, SingleParserTimestampValueDecoder>>(type, options,
576 pool);
577 } else {
578 // Multiple converters, must iterate for each value
579 return std::make_shared<
580 ConverterType<TimestampType, MultipleParsersTimestampValueDecoder>>(type, options,
581 pool);
582 }
583 }
584
585 //
586 // Concrete Converter factory for reals
587 //
588
589 template <typename ConverterType, template <typename...> class ConcreteConverterType,
590 typename Type, typename DecoderType>
591 std::shared_ptr<ConverterType> MakeRealConverter(const std::shared_ptr<DataType>& type,
592 const ConvertOptions& options,
593 MemoryPool* pool) {
594 if (options.decimal_point == '.') {
595 return std::make_shared<ConcreteConverterType<Type, DecoderType>>(type, options,
596 pool);
597 }
598 return std::make_shared<
599 ConcreteConverterType<Type, CustomDecimalPointValueDecoder<DecoderType>>>(
600 type, options, pool);
601 }
602
603 } // namespace
604
605 /////////////////////////////////////////////////////////////////////////
606 // Base Converter class implementation
607
608 Converter::Converter(const std::shared_ptr<DataType>& type, const ConvertOptions& options,
609 MemoryPool* pool)
610 : options_(options), pool_(pool), type_(type) {}
611
612 DictionaryConverter::DictionaryConverter(const std::shared_ptr<DataType>& value_type,
613 const ConvertOptions& options, MemoryPool* pool)
614 : Converter(dictionary(int32(), value_type), options, pool),
615 value_type_(value_type) {}
616
617 Result<std::shared_ptr<Converter>> Converter::Make(const std::shared_ptr<DataType>& type,
618 const ConvertOptions& options,
619 MemoryPool* pool) {
620 std::shared_ptr<Converter> ptr;
621
622 switch (type->id()) {
623 #define CONVERTER_CASE(TYPE_ID, CONVERTER_TYPE) \
624 case TYPE_ID: \
625 ptr.reset(new CONVERTER_TYPE(type, options, pool)); \
626 break;
627
628 #define NUMERIC_CONVERTER_CASE(TYPE_ID, TYPE_CLASS) \
629 CONVERTER_CASE(TYPE_ID, \
630 (PrimitiveConverter<TYPE_CLASS, NumericValueDecoder<TYPE_CLASS>>))
631
632 #define REAL_CONVERTER_CASE(TYPE_ID, TYPE_CLASS, DECODER) \
633 case TYPE_ID: \
634 ptr = MakeRealConverter<Converter, PrimitiveConverter, TYPE_CLASS, DECODER>( \
635 type, options, pool); \
636 break;
637
638 CONVERTER_CASE(Type::NA, NullConverter)
639 NUMERIC_CONVERTER_CASE(Type::INT8, Int8Type)
640 NUMERIC_CONVERTER_CASE(Type::INT16, Int16Type)
641 NUMERIC_CONVERTER_CASE(Type::INT32, Int32Type)
642 NUMERIC_CONVERTER_CASE(Type::INT64, Int64Type)
643 NUMERIC_CONVERTER_CASE(Type::UINT8, UInt8Type)
644 NUMERIC_CONVERTER_CASE(Type::UINT16, UInt16Type)
645 NUMERIC_CONVERTER_CASE(Type::UINT32, UInt32Type)
646 NUMERIC_CONVERTER_CASE(Type::UINT64, UInt64Type)
647 REAL_CONVERTER_CASE(Type::FLOAT, FloatType, NumericValueDecoder<FloatType>)
648 REAL_CONVERTER_CASE(Type::DOUBLE, DoubleType, NumericValueDecoder<DoubleType>)
649 REAL_CONVERTER_CASE(Type::DECIMAL, Decimal128Type, DecimalValueDecoder)
650 NUMERIC_CONVERTER_CASE(Type::DATE32, Date32Type)
651 NUMERIC_CONVERTER_CASE(Type::DATE64, Date64Type)
652 NUMERIC_CONVERTER_CASE(Type::TIME32, Time32Type)
653 NUMERIC_CONVERTER_CASE(Type::TIME64, Time64Type)
654 CONVERTER_CASE(Type::BOOL, (PrimitiveConverter<BooleanType, BooleanValueDecoder>))
655 CONVERTER_CASE(Type::BINARY,
656 (PrimitiveConverter<BinaryType, BinaryValueDecoder<false>>))
657 CONVERTER_CASE(Type::LARGE_BINARY,
658 (PrimitiveConverter<LargeBinaryType, BinaryValueDecoder<false>>))
659 CONVERTER_CASE(Type::FIXED_SIZE_BINARY,
660 (PrimitiveConverter<FixedSizeBinaryType, FixedSizeBinaryValueDecoder>))
661
662 case Type::TIMESTAMP:
663 ptr = MakeTimestampConverter<PrimitiveConverter>(type, options, pool);
664 break;
665
666 case Type::STRING:
667 if (options.check_utf8) {
668 ptr = std::make_shared<PrimitiveConverter<StringType, BinaryValueDecoder<true>>>(
669 type, options, pool);
670 } else {
671 ptr = std::make_shared<PrimitiveConverter<StringType, BinaryValueDecoder<false>>>(
672 type, options, pool);
673 }
674 break;
675
676 case Type::LARGE_STRING:
677 if (options.check_utf8) {
678 ptr = std::make_shared<
679 PrimitiveConverter<LargeStringType, BinaryValueDecoder<true>>>(type, options,
680 pool);
681 } else {
682 ptr = std::make_shared<
683 PrimitiveConverter<LargeStringType, BinaryValueDecoder<false>>>(type, options,
684 pool);
685 }
686 break;
687
688 case Type::DICTIONARY: {
689 const auto& dict_type = checked_cast<const DictionaryType&>(*type);
690 if (dict_type.index_type()->id() != Type::INT32) {
691 return Status::NotImplemented(
692 "CSV conversion to dictionary only supported for int32 indices, "
693 "got ",
694 type->ToString());
695 }
696 return DictionaryConverter::Make(dict_type.value_type(), options, pool);
697 }
698
699 default: {
700 return Status::NotImplemented("CSV conversion to ", type->ToString(),
701 " is not supported");
702 }
703
704 #undef CONVERTER_CASE
705 #undef NUMERIC_CONVERTER_CASE
706 #undef REAL_CONVERTER_CASE
707 }
708 RETURN_NOT_OK(ptr->Initialize());
709 return ptr;
710 }
711
712 Result<std::shared_ptr<DictionaryConverter>> DictionaryConverter::Make(
713 const std::shared_ptr<DataType>& type, const ConvertOptions& options,
714 MemoryPool* pool) {
715 std::shared_ptr<DictionaryConverter> ptr;
716
717 switch (type->id()) {
718 #define CONVERTER_CASE(TYPE_ID, TYPE, VALUE_DECODER_TYPE) \
719 case TYPE_ID: \
720 ptr.reset( \
721 new TypedDictionaryConverter<TYPE, VALUE_DECODER_TYPE>(type, options, pool)); \
722 break;
723
724 #define REAL_CONVERTER_CASE(TYPE_ID, TYPE_CLASS, DECODER) \
725 case TYPE_ID: \
726 ptr = MakeRealConverter<DictionaryConverter, TypedDictionaryConverter, TYPE_CLASS, \
727 DECODER>(type, options, pool); \
728 break;
729
730 // XXX Are 32-bit types useful?
731 CONVERTER_CASE(Type::INT32, Int32Type, NumericValueDecoder<Int32Type>)
732 CONVERTER_CASE(Type::INT64, Int64Type, NumericValueDecoder<Int64Type>)
733 CONVERTER_CASE(Type::UINT32, UInt32Type, NumericValueDecoder<UInt32Type>)
734 CONVERTER_CASE(Type::UINT64, UInt64Type, NumericValueDecoder<UInt64Type>)
735 REAL_CONVERTER_CASE(Type::FLOAT, FloatType, NumericValueDecoder<FloatType>)
736 REAL_CONVERTER_CASE(Type::DOUBLE, DoubleType, NumericValueDecoder<DoubleType>)
737 REAL_CONVERTER_CASE(Type::DECIMAL, Decimal128Type, DecimalValueDecoder)
738 CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryType,
739 FixedSizeBinaryValueDecoder)
740 CONVERTER_CASE(Type::BINARY, BinaryType, BinaryValueDecoder<false>)
741 CONVERTER_CASE(Type::LARGE_BINARY, LargeBinaryType, BinaryValueDecoder<false>)
742
743 case Type::STRING:
744 if (options.check_utf8) {
745 ptr = std::make_shared<
746 TypedDictionaryConverter<StringType, BinaryValueDecoder<true>>>(type, options,
747 pool);
748 } else {
749 ptr = std::make_shared<
750 TypedDictionaryConverter<StringType, BinaryValueDecoder<false>>>(
751 type, options, pool);
752 }
753 break;
754
755 case Type::LARGE_STRING:
756 if (options.check_utf8) {
757 ptr = std::make_shared<
758 TypedDictionaryConverter<LargeStringType, BinaryValueDecoder<true>>>(
759 type, options, pool);
760 } else {
761 ptr = std::make_shared<
762 TypedDictionaryConverter<LargeStringType, BinaryValueDecoder<false>>>(
763 type, options, pool);
764 }
765 break;
766
767 default: {
768 return Status::NotImplemented("CSV dictionary conversion to ", type->ToString(),
769 " is not supported");
770 }
771
772 #undef CONVERTER_CASE
773 #undef REAL_CONVERTER_CASE
774 }
775 RETURN_NOT_OK(ptr->Initialize());
776 return ptr;
777 }
778
779 } // namespace csv
780 } // namespace arrow