1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
9 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
18 #include "arrow/csv/converter.h"
25 #include <type_traits>
28 #include "arrow/array/builder_binary.h"
29 #include "arrow/array/builder_decimal.h"
30 #include "arrow/array/builder_dict.h"
31 #include "arrow/array/builder_primitive.h"
32 #include "arrow/csv/parser.h"
33 #include "arrow/status.h"
34 #include "arrow/type.h"
35 #include "arrow/type_fwd.h"
36 #include "arrow/type_traits.h"
37 #include "arrow/util/checked_cast.h"
38 #include "arrow/util/decimal.h"
39 #include "arrow/util/trie.h"
40 #include "arrow/util/utf8.h"
41 #include "arrow/util/value_parsing.h" // IWYU pragma: keep
46 using internal::checked_cast
;
48 using internal::TrieBuilder
;
52 Status
GenericConversionError(const std::shared_ptr
<DataType
>& type
, const uint8_t* data
,
54 return Status::Invalid("CSV conversion error to ", type
->ToString(),
56 std::string(reinterpret_cast<const char*>(data
), size
), "'");
59 inline bool IsWhitespace(uint8_t c
) {
60 if (ARROW_PREDICT_TRUE(c
> ' ')) {
63 return c
== ' ' || c
== '\t';
66 // Updates data_inout and size_inout to not include leading/trailing whitespace
68 inline void TrimWhiteSpace(const uint8_t** data_inout
, uint32_t* size_inout
) {
69 const uint8_t*& data
= *data_inout
;
70 uint32_t& size
= *size_inout
;
71 // Skip trailing whitespace
72 if (ARROW_PREDICT_TRUE(size
> 0) && ARROW_PREDICT_FALSE(IsWhitespace(data
[size
- 1]))) {
73 const uint8_t* p
= data
+ size
- 1;
74 while (size
> 0 && IsWhitespace(*p
)) {
79 // Skip leading whitespace
80 if (ARROW_PREDICT_TRUE(size
> 0) && ARROW_PREDICT_FALSE(IsWhitespace(data
[0]))) {
81 while (size
> 0 && IsWhitespace(*data
)) {
88 Status
InitializeTrie(const std::vector
<std::string
>& inputs
, Trie
* trie
) {
90 for (const auto& s
: inputs
) {
91 RETURN_NOT_OK(builder
.Append(s
, true /* allow_duplicates */));
93 *trie
= builder
.Finish();
97 // Presize a builder based on parser contents
98 template <typename BuilderType
>
99 enable_if_t
<!is_base_binary_type
<typename
BuilderType::TypeClass
>::value
, Status
>
100 PresizeBuilder(const BlockParser
& parser
, BuilderType
* builder
) {
101 return builder
->Resize(parser
.num_rows());
104 // Same, for variable-sized binary builders
105 template <typename T
>
106 Status
PresizeBuilder(const BlockParser
& parser
, BaseBinaryBuilder
<T
>* builder
) {
107 RETURN_NOT_OK(builder
->Resize(parser
.num_rows()));
108 return builder
->ReserveData(parser
.num_bytes());
111 /////////////////////////////////////////////////////////////////////////
112 // Per-type value decoders
114 struct ValueDecoder
{
115 explicit ValueDecoder(const std::shared_ptr
<DataType
>& type
,
116 const ConvertOptions
& options
)
117 : type_(type
), options_(options
) {}
119 Status
Initialize() {
120 // TODO no need to build a separate Trie for each instance
121 return InitializeTrie(options_
.null_values
, &null_trie_
);
124 bool IsNull(const uint8_t* data
, uint32_t size
, bool quoted
) {
125 if (quoted
&& !options_
.quoted_strings_can_be_null
) {
128 return null_trie_
.Find(
129 util::string_view(reinterpret_cast<const char*>(data
), size
)) >= 0;
134 const std::shared_ptr
<DataType
> type_
;
135 const ConvertOptions
& options_
;
139 // Value decoder for fixed-size binary
142 struct FixedSizeBinaryValueDecoder
: public ValueDecoder
{
143 using value_type
= const uint8_t*;
145 explicit FixedSizeBinaryValueDecoder(const std::shared_ptr
<DataType
>& type
,
146 const ConvertOptions
& options
)
147 : ValueDecoder(type
, options
),
148 byte_width_(checked_cast
<const FixedSizeBinaryType
&>(*type
).byte_width()) {}
150 Status
Decode(const uint8_t* data
, uint32_t size
, bool quoted
, value_type
* out
) {
151 if (ARROW_PREDICT_FALSE(size
!= byte_width_
)) {
152 return Status::Invalid("CSV conversion error to ", type_
->ToString(), ": got a ",
153 size
, "-byte long string");
160 const uint32_t byte_width_
;
164 // Value decoder for variable-size binary
167 template <bool CheckUTF8
>
168 struct BinaryValueDecoder
: public ValueDecoder
{
169 using value_type
= util::string_view
;
171 using ValueDecoder::ValueDecoder
;
173 Status
Initialize() {
174 util::InitializeUTF8();
175 return ValueDecoder::Initialize();
178 Status
Decode(const uint8_t* data
, uint32_t size
, bool quoted
, value_type
* out
) {
179 if (CheckUTF8
&& ARROW_PREDICT_FALSE(!util::ValidateUTF8(data
, size
))) {
180 return Status::Invalid("CSV conversion error to ", type_
->ToString(),
181 ": invalid UTF8 data");
183 *out
= {reinterpret_cast<const char*>(data
), size
};
187 bool IsNull(const uint8_t* data
, uint32_t size
, bool quoted
) {
188 return options_
.strings_can_be_null
&&
189 (!quoted
|| options_
.quoted_strings_can_be_null
) &&
190 ValueDecoder::IsNull(data
, size
, false /* quoted */);
195 // Value decoder for integers, floats and temporals
198 template <typename T
>
199 struct NumericValueDecoder
: public ValueDecoder
{
200 using value_type
= typename
T::c_type
;
202 explicit NumericValueDecoder(const std::shared_ptr
<DataType
>& type
,
203 const ConvertOptions
& options
)
204 : ValueDecoder(type
, options
), concrete_type_(checked_cast
<const T
&>(*type
)) {}
206 Status
Decode(const uint8_t* data
, uint32_t size
, bool quoted
, value_type
* out
) {
207 // XXX should quoted values be allowed at all?
208 TrimWhiteSpace(&data
, &size
);
209 if (ARROW_PREDICT_FALSE(!internal::ParseValue
<T
>(
210 concrete_type_
, reinterpret_cast<const char*>(data
), size
, out
))) {
211 return GenericConversionError(type_
, data
, size
);
217 const T
& concrete_type_
;
221 // Value decoder for booleans
224 struct BooleanValueDecoder
: public ValueDecoder
{
225 using value_type
= bool;
227 using ValueDecoder::ValueDecoder
;
229 Status
Initialize() {
230 // TODO no need to build separate Tries for each instance
231 RETURN_NOT_OK(InitializeTrie(options_
.true_values
, &true_trie_
));
232 RETURN_NOT_OK(InitializeTrie(options_
.false_values
, &false_trie_
));
233 return ValueDecoder::Initialize();
236 Status
Decode(const uint8_t* data
, uint32_t size
, bool quoted
, value_type
* out
) {
237 // XXX should quoted values be allowed at all?
238 if (false_trie_
.Find(util::string_view(reinterpret_cast<const char*>(data
), size
)) >=
243 if (ARROW_PREDICT_TRUE(true_trie_
.Find(util::string_view(
244 reinterpret_cast<const char*>(data
), size
)) >= 0)) {
248 return GenericConversionError(type_
, data
, size
);
257 // Value decoder for decimals
260 struct DecimalValueDecoder
: public ValueDecoder
{
261 using value_type
= Decimal128
;
263 explicit DecimalValueDecoder(const std::shared_ptr
<DataType
>& type
,
264 const ConvertOptions
& options
)
265 : ValueDecoder(type
, options
),
266 decimal_type_(internal::checked_cast
<const DecimalType
&>(*type_
)),
267 type_precision_(decimal_type_
.precision()),
268 type_scale_(decimal_type_
.scale()) {}
270 Status
Decode(const uint8_t* data
, uint32_t size
, bool quoted
, value_type
* out
) {
271 TrimWhiteSpace(&data
, &size
);
273 int32_t precision
, scale
;
274 util::string_view
view(reinterpret_cast<const char*>(data
), size
);
275 RETURN_NOT_OK(Decimal128::FromString(view
, &decimal
, &precision
, &scale
));
276 if (precision
> type_precision_
) {
277 return Status::Invalid("Error converting '", view
, "' to ", type_
->ToString(),
278 ": precision not supported by type.");
280 if (scale
!= type_scale_
) {
281 ARROW_ASSIGN_OR_RAISE(*out
, decimal
.Rescale(scale
, type_scale_
));
283 *out
= std::move(decimal
);
289 const DecimalType
& decimal_type_
;
290 const int32_t type_precision_
;
291 const int32_t type_scale_
;
295 // Value decoder wrapper for floating-point and decimals
296 // with a non-default decimal point
299 template <typename WrappedDecoder
>
300 struct CustomDecimalPointValueDecoder
: public ValueDecoder
{
301 using value_type
= typename
WrappedDecoder::value_type
;
303 explicit CustomDecimalPointValueDecoder(const std::shared_ptr
<DataType
>& type
,
304 const ConvertOptions
& options
)
305 : ValueDecoder(type
, options
), wrapped_decoder_(type
, options
) {}
307 Status
Initialize() {
308 RETURN_NOT_OK(wrapped_decoder_
.Initialize());
309 for (int i
= 0; i
< 256; ++i
) {
312 mapping_
[options_
.decimal_point
] = '.';
313 mapping_
['.'] = options_
.decimal_point
; // error out on standard decimal point
318 Status
Decode(const uint8_t* data
, uint32_t size
, bool quoted
, value_type
* out
) {
319 if (ARROW_PREDICT_FALSE(size
> temp_
.size())) {
322 uint8_t* temp_data
= temp_
.data();
323 for (uint32_t i
= 0; i
< size
; ++i
) {
324 temp_data
[i
] = mapping_
[data
[i
]];
326 if (ARROW_PREDICT_FALSE(
327 !wrapped_decoder_
.Decode(temp_data
, size
, quoted
, out
).ok())) {
328 return GenericConversionError(type_
, data
, size
);
333 bool IsNull(const uint8_t* data
, uint32_t size
, bool quoted
) {
334 return wrapped_decoder_
.IsNull(data
, size
, quoted
);
338 WrappedDecoder wrapped_decoder_
;
339 std::array
<uint8_t, 256> mapping_
;
340 std::vector
<uint8_t> temp_
;
344 // Value decoders for timestamps
347 struct InlineISO8601ValueDecoder
: public ValueDecoder
{
348 using value_type
= int64_t;
350 explicit InlineISO8601ValueDecoder(const std::shared_ptr
<DataType
>& type
,
351 const ConvertOptions
& options
)
352 : ValueDecoder(type
, options
),
353 unit_(checked_cast
<const TimestampType
&>(*type_
).unit()) {}
355 Status
Decode(const uint8_t* data
, uint32_t size
, bool quoted
, value_type
* out
) {
356 if (ARROW_PREDICT_FALSE(!internal::ParseTimestampISO8601(
357 reinterpret_cast<const char*>(data
), size
, unit_
, out
))) {
358 return GenericConversionError(type_
, data
, size
);
364 TimeUnit::type unit_
;
367 struct SingleParserTimestampValueDecoder
: public ValueDecoder
{
368 using value_type
= int64_t;
370 explicit SingleParserTimestampValueDecoder(const std::shared_ptr
<DataType
>& type
,
371 const ConvertOptions
& options
)
372 : ValueDecoder(type
, options
),
373 unit_(checked_cast
<const TimestampType
&>(*type_
).unit()),
374 parser_(*options_
.timestamp_parsers
[0]) {}
376 Status
Decode(const uint8_t* data
, uint32_t size
, bool quoted
, value_type
* out
) {
377 if (ARROW_PREDICT_FALSE(
378 !parser_(reinterpret_cast<const char*>(data
), size
, unit_
, out
))) {
379 return GenericConversionError(type_
, data
, size
);
385 TimeUnit::type unit_
;
386 const TimestampParser
& parser_
;
389 struct MultipleParsersTimestampValueDecoder
: public ValueDecoder
{
390 using value_type
= int64_t;
392 explicit MultipleParsersTimestampValueDecoder(const std::shared_ptr
<DataType
>& type
,
393 const ConvertOptions
& options
)
394 : ValueDecoder(type
, options
),
395 unit_(checked_cast
<const TimestampType
&>(*type_
).unit()),
396 parsers_(GetParsers(options_
)) {}
398 Status
Decode(const uint8_t* data
, uint32_t size
, bool quoted
, value_type
* out
) {
399 for (const auto& parser
: parsers_
) {
400 if (parser
->operator()(reinterpret_cast<const char*>(data
), size
, unit_
, out
)) {
404 return GenericConversionError(type_
, data
, size
);
408 using ParserVector
= std::vector
<const TimestampParser
*>;
410 static ParserVector
GetParsers(const ConvertOptions
& options
) {
411 ParserVector
parsers(options
.timestamp_parsers
.size());
412 for (size_t i
= 0; i
< options
.timestamp_parsers
.size(); ++i
) {
413 parsers
[i
] = options
.timestamp_parsers
[i
].get();
418 TimeUnit::type unit_
;
419 std::vector
<const TimestampParser
*> parsers_
;
422 /////////////////////////////////////////////////////////////////////////
423 // Concrete Converter hierarchy
// Base of the concrete converters defined in this file. It adds nothing to
// Converter except re-exposing its constructors publicly.
class ConcreteConverter : public Converter {
 public:
  using Converter::Converter;
};
// Base of the concrete dictionary converters defined in this file. It adds
// nothing to DictionaryConverter except re-exposing its constructors publicly.
class ConcreteDictionaryConverter : public DictionaryConverter {
 public:
  using DictionaryConverter::DictionaryConverter;
};
436 // Concrete Converter for nulls
439 class NullConverter
: public ConcreteConverter
{
441 NullConverter(const std::shared_ptr
<DataType
>& type
, const ConvertOptions
& options
,
443 : ConcreteConverter(type
, options
, pool
), decoder_(type_
, options_
) {}
445 Result
<std::shared_ptr
<Array
>> Convert(const BlockParser
& parser
,
446 int32_t col_index
) override
{
447 NullBuilder
builder(pool_
);
449 auto visit
= [&](const uint8_t* data
, uint32_t size
, bool quoted
) -> Status
{
450 if (ARROW_PREDICT_TRUE(decoder_
.IsNull(data
, size
, quoted
))) {
451 return builder
.AppendNull();
453 return GenericConversionError(type_
, data
, size
);
456 RETURN_NOT_OK(parser
.VisitColumn(col_index
, visit
));
457 std::shared_ptr
<Array
> res
;
458 RETURN_NOT_OK(builder
.Finish(&res
));
463 Status
Initialize() override
{ return decoder_
.Initialize(); }
465 ValueDecoder decoder_
;
469 // Concrete Converter for primitives
472 template <typename T
, typename ValueDecoderType
>
473 class PrimitiveConverter
: public ConcreteConverter
{
475 PrimitiveConverter(const std::shared_ptr
<DataType
>& type
, const ConvertOptions
& options
,
477 : ConcreteConverter(type
, options
, pool
), decoder_(type_
, options_
) {}
479 Result
<std::shared_ptr
<Array
>> Convert(const BlockParser
& parser
,
480 int32_t col_index
) override
{
481 using BuilderType
= typename TypeTraits
<T
>::BuilderType
;
482 using value_type
= typename
ValueDecoderType::value_type
;
484 BuilderType
builder(type_
, pool_
);
485 RETURN_NOT_OK(PresizeBuilder(parser
, &builder
));
487 auto visit
= [&](const uint8_t* data
, uint32_t size
, bool quoted
) -> Status
{
488 if (decoder_
.IsNull(data
, size
, quoted
/* quoted */)) {
489 return builder
.AppendNull();
492 RETURN_NOT_OK(decoder_
.Decode(data
, size
, quoted
, &value
));
493 builder
.UnsafeAppend(value
);
496 RETURN_NOT_OK(parser
.VisitColumn(col_index
, visit
));
498 std::shared_ptr
<Array
> res
;
499 RETURN_NOT_OK(builder
.Finish(&res
));
504 Status
Initialize() override
{ return decoder_
.Initialize(); }
506 ValueDecoderType decoder_
;
510 // Concrete Converter for dictionaries
513 template <typename T
, typename ValueDecoderType
>
514 class TypedDictionaryConverter
: public ConcreteDictionaryConverter
{
516 TypedDictionaryConverter(const std::shared_ptr
<DataType
>& value_type
,
517 const ConvertOptions
& options
, MemoryPool
* pool
)
518 : ConcreteDictionaryConverter(value_type
, options
, pool
),
519 decoder_(value_type
, options_
) {}
521 Result
<std::shared_ptr
<Array
>> Convert(const BlockParser
& parser
,
522 int32_t col_index
) override
{
523 // We use a fixed index width so that all column chunks get the same index type
524 using BuilderType
= Dictionary32Builder
<T
>;
525 using value_type
= typename
ValueDecoderType::value_type
;
527 BuilderType
builder(value_type_
, pool_
);
528 RETURN_NOT_OK(PresizeBuilder(parser
, &builder
));
530 auto visit
= [&](const uint8_t* data
, uint32_t size
, bool quoted
) -> Status
{
531 if (decoder_
.IsNull(data
, size
, quoted
/* quoted */)) {
532 return builder
.AppendNull();
534 if (ARROW_PREDICT_FALSE(builder
.dictionary_length() > max_cardinality_
)) {
535 return Status::IndexError("Dictionary length exceeded max cardinality");
538 RETURN_NOT_OK(decoder_
.Decode(data
, size
, quoted
, &value
));
539 return builder
.Append(value
);
541 RETURN_NOT_OK(parser
.VisitColumn(col_index
, visit
));
543 std::shared_ptr
<Array
> res
;
544 RETURN_NOT_OK(builder
.Finish(&res
));
548 void SetMaxCardinality(int32_t max_length
) override
{ max_cardinality_
= max_length
; }
551 Status
Initialize() override
{
552 util::InitializeUTF8();
553 return decoder_
.Initialize();
556 ValueDecoderType decoder_
;
557 int32_t max_cardinality_
= std::numeric_limits
<int32_t>::max();
561 // Concrete Converter factory for timestamps
564 template <template <typename
, typename
> class ConverterType
>
565 std::shared_ptr
<Converter
> MakeTimestampConverter(const std::shared_ptr
<DataType
>& type
,
566 const ConvertOptions
& options
,
568 if (options
.timestamp_parsers
.size() == 0) {
569 // Default to ISO-8601
570 return std::make_shared
<ConverterType
<TimestampType
, InlineISO8601ValueDecoder
>>(
571 type
, options
, pool
);
572 } else if (options
.timestamp_parsers
.size() == 1) {
573 // Single user-supplied converter
574 return std::make_shared
<
575 ConverterType
<TimestampType
, SingleParserTimestampValueDecoder
>>(type
, options
,
578 // Multiple converters, must iterate for each value
579 return std::make_shared
<
580 ConverterType
<TimestampType
, MultipleParsersTimestampValueDecoder
>>(type
, options
,
586 // Concrete Converter factory for reals
589 template <typename ConverterType
, template <typename
...> class ConcreteConverterType
,
590 typename Type
, typename DecoderType
>
591 std::shared_ptr
<ConverterType
> MakeRealConverter(const std::shared_ptr
<DataType
>& type
,
592 const ConvertOptions
& options
,
594 if (options
.decimal_point
== '.') {
595 return std::make_shared
<ConcreteConverterType
<Type
, DecoderType
>>(type
, options
,
598 return std::make_shared
<
599 ConcreteConverterType
<Type
, CustomDecimalPointValueDecoder
<DecoderType
>>>(
600 type
, options
, pool
);
605 /////////////////////////////////////////////////////////////////////////
606 // Base Converter class implementation
// Stores the conversion options, the memory pool and the target type for use
// by the concrete converters.
Converter::Converter(const std::shared_ptr<DataType>& type, const ConvertOptions& options,
                     MemoryPool* pool)
    : options_(options), pool_(pool), type_(type) {}
// The converter's own type is dictionary(int32, value_type); the dictionary's
// value type is additionally kept in value_type_.
DictionaryConverter::DictionaryConverter(const std::shared_ptr<DataType>& value_type,
                                         const ConvertOptions& options, MemoryPool* pool)
    : Converter(dictionary(int32(), value_type), options, pool),
      value_type_(value_type) {}
// Creates and initializes the converter matching `type`.
//
// Dictionary types are forwarded to DictionaryConverter::Make (int32 indices
// only). Types without a case below yield Status::NotImplemented. For every
// other type the freshly built converter is initialized before being
// returned, and an initialization failure is propagated.
Result<std::shared_ptr<Converter>> Converter::Make(const std::shared_ptr<DataType>& type,
                                                   const ConvertOptions& options,
                                                   MemoryPool* pool) {
  std::shared_ptr<Converter> ptr;

  switch (type->id()) {
// Emits a `case` that instantiates CONVERTER_TYPE directly.
#define CONVERTER_CASE(TYPE_ID, CONVERTER_TYPE)            \
  case TYPE_ID:                                            \
    ptr.reset(new CONVERTER_TYPE(type, options, pool));    \
    break;

// Emits a `case` for a PrimitiveConverter driven by NumericValueDecoder.
#define NUMERIC_CONVERTER_CASE(TYPE_ID, TYPE_CLASS) \
  CONVERTER_CASE(TYPE_ID,                           \
                 (PrimitiveConverter<TYPE_CLASS, NumericValueDecoder<TYPE_CLASS>>))

// Emits a `case` that goes through MakeRealConverter, which takes
// options.decimal_point into account.
#define REAL_CONVERTER_CASE(TYPE_ID, TYPE_CLASS, DECODER)                          \
  case TYPE_ID:                                                                    \
    ptr = MakeRealConverter<Converter, PrimitiveConverter, TYPE_CLASS, DECODER>(   \
        type, options, pool);                                                      \
    break;

    CONVERTER_CASE(Type::NA, NullConverter)
    NUMERIC_CONVERTER_CASE(Type::INT8, Int8Type)
    NUMERIC_CONVERTER_CASE(Type::INT16, Int16Type)
    NUMERIC_CONVERTER_CASE(Type::INT32, Int32Type)
    NUMERIC_CONVERTER_CASE(Type::INT64, Int64Type)
    NUMERIC_CONVERTER_CASE(Type::UINT8, UInt8Type)
    NUMERIC_CONVERTER_CASE(Type::UINT16, UInt16Type)
    NUMERIC_CONVERTER_CASE(Type::UINT32, UInt32Type)
    NUMERIC_CONVERTER_CASE(Type::UINT64, UInt64Type)
    REAL_CONVERTER_CASE(Type::FLOAT, FloatType, NumericValueDecoder<FloatType>)
    REAL_CONVERTER_CASE(Type::DOUBLE, DoubleType, NumericValueDecoder<DoubleType>)
    REAL_CONVERTER_CASE(Type::DECIMAL, Decimal128Type, DecimalValueDecoder)
    NUMERIC_CONVERTER_CASE(Type::DATE32, Date32Type)
    NUMERIC_CONVERTER_CASE(Type::DATE64, Date64Type)
    NUMERIC_CONVERTER_CASE(Type::TIME32, Time32Type)
    NUMERIC_CONVERTER_CASE(Type::TIME64, Time64Type)
    CONVERTER_CASE(Type::BOOL, (PrimitiveConverter<BooleanType, BooleanValueDecoder>))
    CONVERTER_CASE(Type::BINARY,
                   (PrimitiveConverter<BinaryType, BinaryValueDecoder<false>>))
    CONVERTER_CASE(Type::LARGE_BINARY,
                   (PrimitiveConverter<LargeBinaryType, BinaryValueDecoder<false>>))
    CONVERTER_CASE(Type::FIXED_SIZE_BINARY,
                   (PrimitiveConverter<FixedSizeBinaryType, FixedSizeBinaryValueDecoder>))

    case Type::TIMESTAMP:
      ptr = MakeTimestampConverter<PrimitiveConverter>(type, options, pool);
      break;

    // String types choose the UTF-8-validating decoder only when
    // options.check_utf8 is set.
    case Type::STRING:
      if (options.check_utf8) {
        ptr = std::make_shared<PrimitiveConverter<StringType, BinaryValueDecoder<true>>>(
            type, options, pool);
      } else {
        ptr = std::make_shared<PrimitiveConverter<StringType, BinaryValueDecoder<false>>>(
            type, options, pool);
      }
      break;

    case Type::LARGE_STRING:
      if (options.check_utf8) {
        ptr = std::make_shared<
            PrimitiveConverter<LargeStringType, BinaryValueDecoder<true>>>(type, options,
                                                                           pool);
      } else {
        ptr = std::make_shared<
            PrimitiveConverter<LargeStringType, BinaryValueDecoder<false>>>(type, options,
                                                                            pool);
      }
      break;

    case Type::DICTIONARY: {
      const auto& dict_type = checked_cast<const DictionaryType&>(*type);
      if (dict_type.index_type()->id() != Type::INT32) {
        return Status::NotImplemented(
            "CSV conversion to dictionary only supported for int32 indices, "
            "got ",
            type->ToString());
      }
      // Returned directly: this path does not reach the Initialize() call at
      // the end of this function.
      return DictionaryConverter::Make(dict_type.value_type(), options, pool);
    }

    default: {
      return Status::NotImplemented("CSV conversion to ", type->ToString(),
                                    " is not supported");
    }

#undef CONVERTER_CASE
#undef NUMERIC_CONVERTER_CASE
#undef REAL_CONVERTER_CASE
  }
  RETURN_NOT_OK(ptr->Initialize());
  return ptr;
}
712 Result
<std::shared_ptr
<DictionaryConverter
>> DictionaryConverter::Make(
713 const std::shared_ptr
<DataType
>& type
, const ConvertOptions
& options
,
715 std::shared_ptr
<DictionaryConverter
> ptr
;
717 switch (type
->id()) {
718 #define CONVERTER_CASE(TYPE_ID, TYPE, VALUE_DECODER_TYPE) \
721 new TypedDictionaryConverter<TYPE, VALUE_DECODER_TYPE>(type, options, pool)); \
724 #define REAL_CONVERTER_CASE(TYPE_ID, TYPE_CLASS, DECODER) \
726 ptr = MakeRealConverter<DictionaryConverter, TypedDictionaryConverter, TYPE_CLASS, \
727 DECODER>(type, options, pool); \
730 // XXX Are 32-bit types useful?
731 CONVERTER_CASE(Type::INT32
, Int32Type
, NumericValueDecoder
<Int32Type
>)
732 CONVERTER_CASE(Type::INT64
, Int64Type
, NumericValueDecoder
<Int64Type
>)
733 CONVERTER_CASE(Type::UINT32
, UInt32Type
, NumericValueDecoder
<UInt32Type
>)
734 CONVERTER_CASE(Type::UINT64
, UInt64Type
, NumericValueDecoder
<UInt64Type
>)
735 REAL_CONVERTER_CASE(Type::FLOAT
, FloatType
, NumericValueDecoder
<FloatType
>)
736 REAL_CONVERTER_CASE(Type::DOUBLE
, DoubleType
, NumericValueDecoder
<DoubleType
>)
737 REAL_CONVERTER_CASE(Type::DECIMAL
, Decimal128Type
, DecimalValueDecoder
)
738 CONVERTER_CASE(Type::FIXED_SIZE_BINARY
, FixedSizeBinaryType
,
739 FixedSizeBinaryValueDecoder
)
740 CONVERTER_CASE(Type::BINARY
, BinaryType
, BinaryValueDecoder
<false>)
741 CONVERTER_CASE(Type::LARGE_BINARY
, LargeBinaryType
, BinaryValueDecoder
<false>)
744 if (options
.check_utf8
) {
745 ptr
= std::make_shared
<
746 TypedDictionaryConverter
<StringType
, BinaryValueDecoder
<true>>>(type
, options
,
749 ptr
= std::make_shared
<
750 TypedDictionaryConverter
<StringType
, BinaryValueDecoder
<false>>>(
751 type
, options
, pool
);
755 case Type::LARGE_STRING
:
756 if (options
.check_utf8
) {
757 ptr
= std::make_shared
<
758 TypedDictionaryConverter
<LargeStringType
, BinaryValueDecoder
<true>>>(
759 type
, options
, pool
);
761 ptr
= std::make_shared
<
762 TypedDictionaryConverter
<LargeStringType
, BinaryValueDecoder
<false>>>(
763 type
, options
, pool
);
768 return Status::NotImplemented("CSV dictionary conversion to ", type
->ToString(),
769 " is not supported");
772 #undef CONVERTER_CASE
773 #undef REAL_CONVERTER_CASE
775 RETURN_NOT_OK(ptr
->Initialize());