1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
9 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
25 #include <gtest/gtest.h>
27 #include "arrow/array.h"
28 #include "arrow/array/builder_decimal.h"
29 #include "arrow/array/builder_dict.h"
30 #include "arrow/array/builder_nested.h"
31 #include "arrow/chunked_array.h"
32 #include "arrow/status.h"
33 #include "arrow/table.h"
34 #include "arrow/testing/extension_type.h"
35 #include "arrow/testing/gtest_common.h"
36 #include "arrow/testing/gtest_util.h"
37 #include "arrow/testing/util.h"
38 #include "arrow/type.h"
39 #include "arrow/util/checked_cast.h"
40 #include "arrow/util/decimal.h"
44 using internal::checked_cast
;
45 using internal::checked_pointer_cast
;
47 void CheckTransposeMap(const Buffer
& map
, std::vector
<int32_t> expected
) {
48 AssertBufferEqual(map
, *Buffer::Wrap(expected
));
51 void CheckDictionaryArray(const std::shared_ptr
<Array
>& array
,
52 const std::shared_ptr
<Array
>& expected_values
,
53 const std::shared_ptr
<Array
>& expected_indices
) {
54 const auto& dict_array
= checked_cast
<const DictionaryArray
&>(*array
);
55 AssertArraysEqual(*expected_values
, *dict_array
.dictionary(), /*verbose=*/true);
56 AssertArraysEqual(*expected_indices
, *dict_array
.indices(), /*verbose=*/true);
59 std::shared_ptr
<Array
> DictExtensionFromJSON(const std::shared_ptr
<DataType
>& type
,
60 const std::string
& json
) {
61 auto ext_type
= checked_pointer_cast
<ExtensionType
>(type
);
62 auto storage
= ArrayFromJSON(ext_type
->storage_type(), json
);
63 auto ext_data
= storage
->data()->Copy();
64 ext_data
->type
= ext_type
;
65 return MakeArray(ext_data
);
68 // ----------------------------------------------------------------------
71 template <typename Type
>
72 class TestDictionaryBuilder
: public TestBuilder
{};
74 typedef ::testing::Types
<Int8Type
, UInt8Type
, Int16Type
, UInt16Type
, Int32Type
,
75 UInt32Type
, Int64Type
, UInt64Type
, FloatType
, DoubleType
>
76 PrimitiveDictionaries
;
78 TYPED_TEST_SUITE(TestDictionaryBuilder
, PrimitiveDictionaries
);
80 TYPED_TEST(TestDictionaryBuilder
, Basic
) {
81 using c_type
= typename
TypeParam::c_type
;
83 DictionaryBuilder
<TypeParam
> builder
;
84 ASSERT_OK(builder
.Append(static_cast<c_type
>(1)));
85 ASSERT_OK(builder
.Append(static_cast<c_type
>(2)));
86 ASSERT_OK(builder
.Append(static_cast<c_type
>(1)));
87 ASSERT_OK(builder
.AppendNull());
89 ASSERT_EQ(builder
.length(), 4);
90 ASSERT_EQ(builder
.null_count(), 1);
92 // Build expected data
93 auto value_type
= std::make_shared
<TypeParam
>();
94 auto dict_type
= dictionary(int8(), value_type
);
96 std::shared_ptr
<Array
> result
;
97 ASSERT_OK(builder
.Finish(&result
));
99 DictionaryArray
expected(dict_type
, ArrayFromJSON(int8(), "[0, 1, 0, null]"),
100 ArrayFromJSON(value_type
, "[1, 2]"));
101 ASSERT_TRUE(expected
.Equals(result
));
104 TYPED_TEST(TestDictionaryBuilder
, ArrayInit
) {
105 using c_type
= typename
TypeParam::c_type
;
107 auto value_type
= std::make_shared
<TypeParam
>();
108 auto dict_array
= ArrayFromJSON(value_type
, "[1, 2]");
109 auto dict_type
= dictionary(int8(), value_type
);
111 DictionaryBuilder
<TypeParam
> builder(dict_array
);
112 ASSERT_OK(builder
.Append(static_cast<c_type
>(1)));
113 ASSERT_OK(builder
.Append(static_cast<c_type
>(2)));
114 ASSERT_OK(builder
.Append(static_cast<c_type
>(1)));
115 ASSERT_OK(builder
.AppendNull());
117 ASSERT_EQ(builder
.length(), 4);
118 ASSERT_EQ(builder
.null_count(), 1);
120 // Build expected data
122 std::shared_ptr
<Array
> result
;
123 ASSERT_OK(builder
.Finish(&result
));
125 auto indices
= ArrayFromJSON(int8(), "[0, 1, 0, null]");
126 DictionaryArray
expected(dict_type
, indices
, dict_array
);
128 AssertArraysEqual(expected
, *result
);
131 TYPED_TEST(TestDictionaryBuilder
, MakeBuilder
) {
132 using c_type
= typename
TypeParam::c_type
;
134 auto value_type
= std::make_shared
<TypeParam
>();
135 auto dict_array
= ArrayFromJSON(value_type
, "[1, 2]");
136 auto dict_type
= dictionary(int8(), value_type
);
137 std::unique_ptr
<ArrayBuilder
> boxed_builder
;
138 ASSERT_OK(MakeBuilder(default_memory_pool(), dict_type
, &boxed_builder
));
139 auto& builder
= checked_cast
<DictionaryBuilder
<TypeParam
>&>(*boxed_builder
);
141 ASSERT_OK(builder
.Append(static_cast<c_type
>(1)));
142 ASSERT_OK(builder
.Append(static_cast<c_type
>(2)));
143 ASSERT_OK(builder
.Append(static_cast<c_type
>(1)));
144 ASSERT_OK(builder
.AppendNull());
146 ASSERT_EQ(builder
.length(), 4);
147 ASSERT_EQ(builder
.null_count(), 1);
149 // Build expected data
151 std::shared_ptr
<Array
> result
;
152 ASSERT_OK(builder
.Finish(&result
));
154 auto int_array
= ArrayFromJSON(int8(), "[0, 1, 0, null]");
155 DictionaryArray
expected(dict_type
, int_array
, dict_array
);
157 AssertArraysEqual(expected
, *result
);
160 TYPED_TEST(TestDictionaryBuilder
, ArrayConversion
) {
161 auto type
= std::make_shared
<TypeParam
>();
163 auto intermediate_result
= ArrayFromJSON(type
, "[1, 2, 1]");
164 DictionaryBuilder
<TypeParam
> dictionary_builder
;
165 ASSERT_OK(dictionary_builder
.AppendArray(*intermediate_result
));
166 std::shared_ptr
<Array
> result
;
167 ASSERT_OK(dictionary_builder
.Finish(&result
));
169 // Build expected data
170 auto dict_array
= ArrayFromJSON(type
, "[1, 2]");
171 auto dict_type
= dictionary(int8(), type
);
173 auto int_array
= ArrayFromJSON(int8(), "[0, 1, 0]");
174 DictionaryArray
expected(dict_type
, int_array
, dict_array
);
176 ASSERT_TRUE(expected
.Equals(result
));
179 TYPED_TEST(TestDictionaryBuilder
, DoubleTableSize
) {
180 using Scalar
= typename
TypeParam::c_type
;
181 // Skip this test for (u)int8
182 if (sizeof(Scalar
) > 1) {
183 // Build the dictionary Array
184 DictionaryBuilder
<TypeParam
> builder
;
185 // Build expected data
186 NumericBuilder
<TypeParam
> dict_builder
;
187 Int16Builder int_builder
;
189 // Fill with 1024 different values
190 for (int64_t i
= 0; i
< 1024; i
++) {
191 ASSERT_OK(builder
.Append(static_cast<Scalar
>(i
)));
192 ASSERT_OK(dict_builder
.Append(static_cast<Scalar
>(i
)));
193 ASSERT_OK(int_builder
.Append(static_cast<uint16_t>(i
)));
195 // Fill with an already existing value
196 for (int64_t i
= 0; i
< 1024; i
++) {
197 ASSERT_OK(builder
.Append(static_cast<Scalar
>(1)));
198 ASSERT_OK(int_builder
.Append(1));
202 std::shared_ptr
<Array
> result
;
203 FinishAndCheckPadding(&builder
, &result
);
205 // Finalize expected data
206 std::shared_ptr
<Array
> dict_array
;
207 ASSERT_OK(dict_builder
.Finish(&dict_array
));
209 auto dtype
= dictionary(int16(), dict_array
->type());
210 std::shared_ptr
<Array
> int_array
;
211 ASSERT_OK(int_builder
.Finish(&int_array
));
213 DictionaryArray
expected(dtype
, int_array
, dict_array
);
214 AssertArraysEqual(expected
, *result
);
218 TYPED_TEST(TestDictionaryBuilder
, DeltaDictionary
) {
219 using c_type
= typename
TypeParam::c_type
;
220 auto type
= std::make_shared
<TypeParam
>();
222 DictionaryBuilder
<TypeParam
> builder
;
224 ASSERT_OK(builder
.Append(static_cast<c_type
>(1)));
225 ASSERT_OK(builder
.Append(static_cast<c_type
>(2)));
226 ASSERT_OK(builder
.Append(static_cast<c_type
>(1)));
227 ASSERT_OK(builder
.Append(static_cast<c_type
>(2)));
228 std::shared_ptr
<Array
> result
;
229 FinishAndCheckPadding(&builder
, &result
);
231 // Build expected data for the initial dictionary
232 auto ex_dict
= ArrayFromJSON(type
, "[1, 2]");
233 auto dict_type1
= dictionary(int8(), type
);
234 DictionaryArray
expected(dict_type1
, ArrayFromJSON(int8(), "[0, 1, 0, 1]"), ex_dict
);
236 ASSERT_TRUE(expected
.Equals(result
));
238 // extend the dictionary builder with new data
239 ASSERT_OK(builder
.Append(static_cast<c_type
>(2)));
240 ASSERT_OK(builder
.Append(static_cast<c_type
>(3)));
241 ASSERT_OK(builder
.Append(static_cast<c_type
>(3)));
242 ASSERT_OK(builder
.Append(static_cast<c_type
>(1)));
243 ASSERT_OK(builder
.Append(static_cast<c_type
>(3)));
245 std::shared_ptr
<Array
> result_indices
, result_delta
;
246 ASSERT_OK(builder
.FinishDelta(&result_indices
, &result_delta
));
247 AssertArraysEqual(*ArrayFromJSON(int8(), "[1, 2, 2, 0, 2]"), *result_indices
);
248 AssertArraysEqual(*ArrayFromJSON(type
, "[3]"), *result_delta
);
251 TYPED_TEST(TestDictionaryBuilder
, DoubleDeltaDictionary
) {
252 using c_type
= typename
TypeParam::c_type
;
253 auto type
= std::make_shared
<TypeParam
>();
254 auto dict_type
= dictionary(int8(), type
);
256 DictionaryBuilder
<TypeParam
> builder
;
258 ASSERT_OK(builder
.Append(static_cast<c_type
>(1)));
259 ASSERT_OK(builder
.Append(static_cast<c_type
>(2)));
260 ASSERT_OK(builder
.Append(static_cast<c_type
>(1)));
261 ASSERT_OK(builder
.Append(static_cast<c_type
>(2)));
262 std::shared_ptr
<Array
> result
;
263 FinishAndCheckPadding(&builder
, &result
);
265 // Build expected data for the initial dictionary
266 auto ex_dict1
= ArrayFromJSON(type
, "[1, 2]");
267 DictionaryArray
expected(dict_type
, ArrayFromJSON(int8(), "[0, 1, 0, 1]"), ex_dict1
);
269 ASSERT_TRUE(expected
.Equals(result
));
271 // extend the dictionary builder with new data
272 ASSERT_OK(builder
.Append(static_cast<c_type
>(2)));
273 ASSERT_OK(builder
.Append(static_cast<c_type
>(3)));
274 ASSERT_OK(builder
.Append(static_cast<c_type
>(3)));
275 ASSERT_OK(builder
.Append(static_cast<c_type
>(1)));
276 ASSERT_OK(builder
.Append(static_cast<c_type
>(3)));
278 std::shared_ptr
<Array
> result_indices1
, result_delta1
;
279 ASSERT_OK(builder
.FinishDelta(&result_indices1
, &result_delta1
));
280 AssertArraysEqual(*ArrayFromJSON(int8(), "[1, 2, 2, 0, 2]"), *result_indices1
);
281 AssertArraysEqual(*ArrayFromJSON(type
, "[3]"), *result_delta1
);
283 // extend the dictionary builder with new data again
284 ASSERT_OK(builder
.Append(static_cast<c_type
>(1)));
285 ASSERT_OK(builder
.Append(static_cast<c_type
>(2)));
286 ASSERT_OK(builder
.Append(static_cast<c_type
>(3)));
287 ASSERT_OK(builder
.Append(static_cast<c_type
>(4)));
288 ASSERT_OK(builder
.Append(static_cast<c_type
>(5)));
290 std::shared_ptr
<Array
> result_indices2
, result_delta2
;
291 ASSERT_OK(builder
.FinishDelta(&result_indices2
, &result_delta2
));
292 AssertArraysEqual(*ArrayFromJSON(int8(), "[0, 1, 2, 3, 4]"), *result_indices2
);
293 AssertArraysEqual(*ArrayFromJSON(type
, "[4, 5]"), *result_delta2
);
296 TYPED_TEST(TestDictionaryBuilder
, Dictionary32_BasicPrimitive
) {
297 using c_type
= typename
TypeParam::c_type
;
298 auto type
= std::make_shared
<TypeParam
>();
299 auto dict_type
= dictionary(int32(), type
);
301 Dictionary32Builder
<TypeParam
> builder
;
303 ASSERT_OK(builder
.Append(static_cast<c_type
>(1)));
304 ASSERT_OK(builder
.Append(static_cast<c_type
>(2)));
305 ASSERT_OK(builder
.Append(static_cast<c_type
>(1)));
306 ASSERT_OK(builder
.Append(static_cast<c_type
>(2)));
307 std::shared_ptr
<Array
> result
;
308 FinishAndCheckPadding(&builder
, &result
);
310 // Build expected data for the initial dictionary
311 auto ex_dict1
= ArrayFromJSON(type
, "[1, 2]");
312 DictionaryArray
expected(dict_type
, ArrayFromJSON(int32(), "[0, 1, 0, 1]"), ex_dict1
);
313 ASSERT_TRUE(expected
.Equals(result
));
316 TYPED_TEST(TestDictionaryBuilder
, FinishResetBehavior
) {
318 using c_type
= typename
TypeParam::c_type
;
319 auto type
= std::make_shared
<TypeParam
>();
321 Dictionary32Builder
<TypeParam
> builder
;
323 ASSERT_OK(builder
.Append(static_cast<c_type
>(1)));
324 ASSERT_OK(builder
.AppendNull());
325 ASSERT_OK(builder
.Append(static_cast<c_type
>(1)));
326 ASSERT_OK(builder
.Append(static_cast<c_type
>(2)));
328 // Properties from indices_builder propagated
329 ASSERT_LT(0, builder
.capacity());
330 ASSERT_LT(0, builder
.null_count());
331 ASSERT_EQ(4, builder
.length());
333 std::shared_ptr
<Array
> result
;
334 ASSERT_OK(builder
.Finish(&result
));
337 ASSERT_EQ(0, builder
.capacity());
338 ASSERT_EQ(0, builder
.length());
339 ASSERT_EQ(0, builder
.null_count());
341 // Use the builder again
342 ASSERT_OK(builder
.Append(static_cast<c_type
>(3)));
343 ASSERT_OK(builder
.AppendNull());
344 ASSERT_OK(builder
.Append(static_cast<c_type
>(4)));
346 ASSERT_OK(builder
.Finish(&result
));
348 // Dictionary has 4 elements because the dictionary memo was not reset
349 ASSERT_EQ(4, static_cast<const DictionaryArray
&>(*result
).dictionary()->length());
352 TYPED_TEST(TestDictionaryBuilder
, ResetFull
) {
353 using c_type
= typename
TypeParam::c_type
;
354 auto type
= std::make_shared
<TypeParam
>();
356 Dictionary32Builder
<TypeParam
> builder
;
358 ASSERT_OK(builder
.Append(static_cast<c_type
>(1)));
359 ASSERT_OK(builder
.AppendNull());
360 ASSERT_OK(builder
.Append(static_cast<c_type
>(1)));
361 ASSERT_OK(builder
.Append(static_cast<c_type
>(2)));
363 std::shared_ptr
<Array
> result
;
364 ASSERT_OK(builder
.Finish(&result
));
366 ASSERT_OK(builder
.Append(static_cast<c_type
>(3)));
367 ASSERT_OK(builder
.Finish(&result
));
369 // Dictionary expanded
370 const auto& dict_result
= static_cast<const DictionaryArray
&>(*result
);
371 AssertArraysEqual(*ArrayFromJSON(int32(), "[2]"), *dict_result
.indices());
372 AssertArraysEqual(*ArrayFromJSON(type
, "[1, 2, 3]"),
373 *static_cast<const DictionaryArray
&>(*result
).dictionary());
376 ASSERT_OK(builder
.Append(static_cast<c_type
>(4)));
377 ASSERT_OK(builder
.Finish(&result
));
378 const auto& dict_result2
= static_cast<const DictionaryArray
&>(*result
);
379 AssertArraysEqual(*ArrayFromJSON(int32(), "[0]"), *dict_result2
.indices());
380 AssertArraysEqual(*ArrayFromJSON(type
, "[4]"), *dict_result2
.dictionary());
383 TEST(TestDictionaryBuilderAdHoc
, AppendIndicesUpdateCapacity
) {
384 DictionaryBuilder
<Int32Type
> builder
;
385 Dictionary32Builder
<Int32Type
> builder32
;
387 std::vector
<int32_t> indices_i32
= {0, 1, 2};
388 std::vector
<int64_t> indices_i64
= {0, 1, 2};
390 ASSERT_OK(builder
.AppendIndices(indices_i64
.data(), 3));
391 ASSERT_OK(builder32
.AppendIndices(indices_i32
.data(), 3));
393 ASSERT_LT(0, builder
.capacity());
394 ASSERT_LT(0, builder32
.capacity());
397 TEST(TestStringDictionaryBuilder
, Basic
) {
398 // Build the dictionary Array
399 StringDictionaryBuilder builder
;
400 ASSERT_OK(builder
.Append("test"));
401 ASSERT_OK(builder
.Append("test2"));
402 ASSERT_OK(builder
.Append("test", 4));
404 std::shared_ptr
<Array
> result
;
405 ASSERT_OK(builder
.Finish(&result
));
407 // Build expected data
408 auto ex_dict
= ArrayFromJSON(utf8(), "[\"test\", \"test2\"]");
409 auto dtype
= dictionary(int8(), utf8());
410 auto int_array
= ArrayFromJSON(int8(), "[0, 1, 0]");
411 DictionaryArray
expected(dtype
, int_array
, ex_dict
);
413 ASSERT_TRUE(expected
.Equals(result
));
416 template <typename BuilderType
, typename IndexType
, typename AppendCType
>
417 void TestStringDictionaryAppendIndices() {
418 auto index_type
= TypeTraits
<IndexType
>::type_singleton();
420 auto ex_dict
= ArrayFromJSON(utf8(), R
"(["c
", "a
", "b
", "d
"])");
421 auto invalid_dict
= ArrayFromJSON(binary(), R
"(["e
", "f
"])");
424 ASSERT_OK(builder
.InsertMemoValues(*ex_dict
));
426 // Inserting again should have no effect
427 ASSERT_OK(builder
.InsertMemoValues(*ex_dict
));
430 ASSERT_RAISES(Invalid
, builder
.InsertMemoValues(*invalid_dict
));
432 std::vector
<AppendCType
> raw_indices
= {0, 1, 2, -1, 3};
433 std::vector
<uint8_t> is_valid
= {1, 1, 1, 0, 1};
434 for (int i
= 0; i
< 2; ++i
) {
435 ASSERT_OK(builder
.AppendIndices(
436 raw_indices
.data(), static_cast<int64_t>(raw_indices
.size()), is_valid
.data()));
439 ASSERT_EQ(10, builder
.length());
441 std::shared_ptr
<Array
> result
;
442 ASSERT_OK(builder
.Finish(&result
));
444 auto ex_indices
= ArrayFromJSON(index_type
, R
"([0, 1, 2, null, 3, 0, 1, 2, null, 3])");
445 auto dtype
= dictionary(index_type
, utf8());
446 DictionaryArray
expected(dtype
, ex_indices
, ex_dict
);
447 ASSERT_TRUE(expected
.Equals(result
));
450 TEST(TestStringDictionaryBuilder
, AppendIndices
) {
451 // Currently AdaptiveIntBuilder only accepts int64_t in bulk appends
452 TestStringDictionaryAppendIndices
<StringDictionaryBuilder
, Int8Type
, int64_t>();
454 TestStringDictionaryAppendIndices
<StringDictionary32Builder
, Int32Type
, int32_t>();
457 TEST(TestStringDictionaryBuilder
, ArrayInit
) {
458 auto dict_array
= ArrayFromJSON(utf8(), R
"(["test
", "test2
"])");
459 auto int_array
= ArrayFromJSON(int8(), "[0, 1, 0]");
461 // Build the dictionary Array
462 StringDictionaryBuilder
builder(dict_array
);
463 ASSERT_OK(builder
.Append("test"));
464 ASSERT_OK(builder
.Append("test2"));
465 ASSERT_OK(builder
.Append("test"));
467 std::shared_ptr
<Array
> result
;
468 ASSERT_OK(builder
.Finish(&result
));
470 // Build expected data
471 DictionaryArray
expected(dictionary(int8(), utf8()), int_array
, dict_array
);
473 AssertArraysEqual(expected
, *result
);
476 template <typename BuilderType
>
477 void TestStringDictionaryMakeBuilder(const std::shared_ptr
<DataType
>& value_type
) {
478 auto dict_array
= ArrayFromJSON(value_type
, R
"(["test
", "test2
"])");
479 auto dict_type
= dictionary(int8(), value_type
);
480 auto int_array
= ArrayFromJSON(int8(), "[0, 1, 0]");
481 std::unique_ptr
<ArrayBuilder
> boxed_builder
;
482 ASSERT_OK(MakeBuilder(default_memory_pool(), dict_type
, &boxed_builder
));
483 auto& builder
= checked_cast
<BuilderType
&>(*boxed_builder
);
485 // Build the dictionary Array
486 ASSERT_OK(builder
.Append("test"));
487 ASSERT_OK(builder
.Append("test2"));
488 ASSERT_OK(builder
.Append("test"));
490 std::shared_ptr
<Array
> result
;
491 ASSERT_OK(builder
.Finish(&result
));
493 // Build expected data
494 DictionaryArray
expected(dict_type
, int_array
, dict_array
);
496 AssertArraysEqual(expected
, *result
);
499 TEST(TestStringDictionaryBuilder
, MakeBuilder
) {
500 TestStringDictionaryMakeBuilder
<DictionaryBuilder
<StringType
>>(utf8());
503 TEST(TestLargeStringDictionaryBuilder
, MakeBuilder
) {
504 TestStringDictionaryMakeBuilder
<DictionaryBuilder
<LargeStringType
>>(large_utf8());
508 TEST(TestStringDictionaryBuilder
, OnlyNull
) {
509 // Build the dictionary Array
510 StringDictionaryBuilder builder
;
511 ASSERT_OK(builder
.AppendNull());
513 std::shared_ptr
<Array
> result
;
514 ASSERT_OK(builder
.Finish(&result
));
516 // Build expected data
517 auto dict
= ArrayFromJSON(utf8(), "[]");
518 auto dtype
= dictionary(int8(), utf8());
519 auto int_array
= ArrayFromJSON(int8(), "[null]");
520 DictionaryArray
expected(dtype
, int_array
, dict
);
522 ASSERT_TRUE(expected
.Equals(result
));
525 TEST(TestStringDictionaryBuilder
, DoubleTableSize
) {
526 // Build the dictionary Array
527 StringDictionaryBuilder builder
;
528 // Build expected data
529 StringBuilder str_builder
;
530 Int16Builder int_builder
;
532 // Fill with 1024 different values
533 for (int64_t i
= 0; i
< 1024; i
++) {
534 std::stringstream ss
;
536 ASSERT_OK(builder
.Append(ss
.str()));
537 ASSERT_OK(str_builder
.Append(ss
.str()));
538 ASSERT_OK(int_builder
.Append(static_cast<uint16_t>(i
)));
540 // Fill with an already existing value
541 for (int64_t i
= 0; i
< 1024; i
++) {
542 ASSERT_OK(builder
.Append("test1"));
543 ASSERT_OK(int_builder
.Append(1));
547 std::shared_ptr
<Array
> result
;
548 FinishAndCheckPadding(&builder
, &result
);
550 // Finalize expected data
551 std::shared_ptr
<Array
> str_array
;
552 ASSERT_OK(str_builder
.Finish(&str_array
));
553 auto dtype
= dictionary(int16(), utf8());
554 std::shared_ptr
<Array
> int_array
;
555 ASSERT_OK(int_builder
.Finish(&int_array
));
557 DictionaryArray
expected(dtype
, int_array
, str_array
);
558 ASSERT_TRUE(expected
.Equals(result
));
561 TEST(TestStringDictionaryBuilder
, DeltaDictionary
) {
562 // Build the dictionary Array
563 StringDictionaryBuilder builder
;
564 ASSERT_OK(builder
.Append("test"));
565 ASSERT_OK(builder
.Append("test2"));
566 ASSERT_OK(builder
.Append("test"));
568 std::shared_ptr
<Array
> result
;
569 ASSERT_OK(builder
.Finish(&result
));
571 // Build expected data
572 auto dict
= ArrayFromJSON(utf8(), "[\"test\", \"test2\"]");
573 auto dtype
= dictionary(int8(), utf8());
574 auto int_array
= ArrayFromJSON(int8(), "[0, 1, 0]");
575 DictionaryArray
expected(dtype
, int_array
, dict
);
577 ASSERT_TRUE(expected
.Equals(result
));
579 // build a delta dictionary
580 ASSERT_OK(builder
.Append("test2"));
581 ASSERT_OK(builder
.Append("test3"));
582 ASSERT_OK(builder
.Append("test2"));
584 std::shared_ptr
<Array
> result_indices
, result_delta
;
585 ASSERT_OK(builder
.FinishDelta(&result_indices
, &result_delta
));
587 // Build expected data
588 AssertArraysEqual(*ArrayFromJSON(int8(), "[1, 2, 1]"), *result_indices
);
589 AssertArraysEqual(*ArrayFromJSON(utf8(), "[\"test3\"]"), *result_delta
);
592 TEST(TestStringDictionaryBuilder
, BigDeltaDictionary
) {
593 constexpr int16_t kTestLength
= 2048;
594 // Build the dictionary Array
595 StringDictionaryBuilder builder
;
597 StringBuilder str_builder1
;
598 Int16Builder int_builder1
;
600 for (int16_t idx
= 0; idx
< kTestLength
; ++idx
) {
601 std::stringstream sstream
;
602 sstream
<< "test" << idx
;
603 ASSERT_OK(builder
.Append(sstream
.str()));
604 ASSERT_OK(str_builder1
.Append(sstream
.str()));
605 ASSERT_OK(int_builder1
.Append(idx
));
608 std::shared_ptr
<Array
> result
;
609 FinishAndCheckPadding(&builder
, &result
);
611 std::shared_ptr
<Array
> str_array1
;
612 ASSERT_OK(str_builder1
.Finish(&str_array1
));
614 auto dtype1
= dictionary(int16(), utf8());
616 std::shared_ptr
<Array
> int_array1
;
617 ASSERT_OK(int_builder1
.Finish(&int_array1
));
619 DictionaryArray
expected(dtype1
, int_array1
, str_array1
);
620 ASSERT_TRUE(expected
.Equals(result
));
623 StringBuilder str_builder2
;
624 Int16Builder int_builder2
;
626 for (int16_t idx
= 0; idx
< kTestLength
; ++idx
) {
627 ASSERT_OK(builder
.Append("test1"));
628 ASSERT_OK(int_builder2
.Append(1));
631 for (int16_t idx
= 0; idx
< kTestLength
; ++idx
) {
632 ASSERT_OK(builder
.Append("test_new_value1"));
633 ASSERT_OK(int_builder2
.Append(kTestLength
));
635 ASSERT_OK(str_builder2
.Append("test_new_value1"));
637 std::shared_ptr
<Array
> indices2
, delta2
;
638 ASSERT_OK(builder
.FinishDelta(&indices2
, &delta2
));
640 std::shared_ptr
<Array
> str_array2
;
641 ASSERT_OK(str_builder2
.Finish(&str_array2
));
643 std::shared_ptr
<Array
> int_array2
;
644 ASSERT_OK(int_builder2
.Finish(&int_array2
));
646 AssertArraysEqual(*int_array2
, *indices2
);
647 AssertArraysEqual(*str_array2
, *delta2
);
650 StringBuilder str_builder3
;
651 Int16Builder int_builder3
;
653 for (int16_t idx
= 0; idx
< kTestLength
; ++idx
) {
654 ASSERT_OK(builder
.Append("test2"));
655 ASSERT_OK(int_builder3
.Append(2));
658 for (int16_t idx
= 0; idx
< kTestLength
; ++idx
) {
659 ASSERT_OK(builder
.Append("test_new_value2"));
660 ASSERT_OK(int_builder3
.Append(kTestLength
+ 1));
662 ASSERT_OK(str_builder3
.Append("test_new_value2"));
664 std::shared_ptr
<Array
> indices3
, delta3
;
665 ASSERT_OK(builder
.FinishDelta(&indices3
, &delta3
));
667 std::shared_ptr
<Array
> str_array3
;
668 ASSERT_OK(str_builder3
.Finish(&str_array3
));
670 std::shared_ptr
<Array
> int_array3
;
671 ASSERT_OK(int_builder3
.Finish(&int_array3
));
673 AssertArraysEqual(*int_array3
, *indices3
);
674 AssertArraysEqual(*str_array3
, *delta3
);
677 TEST(TestFixedSizeBinaryDictionaryBuilder
, Basic
) {
678 // Build the dictionary Array
679 DictionaryBuilder
<FixedSizeBinaryType
> builder(arrow::fixed_size_binary(4));
680 std::vector
<uint8_t> test
{12, 12, 11, 12};
681 std::vector
<uint8_t> test2
{12, 12, 11, 11};
682 ASSERT_OK(builder
.Append(test
.data()));
683 ASSERT_OK(builder
.Append(test2
.data()));
684 ASSERT_OK(builder
.Append(test
.data()));
686 std::shared_ptr
<Array
> result
;
687 FinishAndCheckPadding(&builder
, &result
);
689 // Build expected data
690 auto value_type
= arrow::fixed_size_binary(4);
691 FixedSizeBinaryBuilder
fsb_builder(value_type
);
692 ASSERT_OK(fsb_builder
.Append(test
.data()));
693 ASSERT_OK(fsb_builder
.Append(test2
.data()));
694 std::shared_ptr
<Array
> fsb_array
;
695 ASSERT_OK(fsb_builder
.Finish(&fsb_array
));
697 auto dtype
= dictionary(int8(), value_type
);
699 Int8Builder int_builder
;
700 ASSERT_OK(int_builder
.Append(0));
701 ASSERT_OK(int_builder
.Append(1));
702 ASSERT_OK(int_builder
.Append(0));
703 std::shared_ptr
<Array
> int_array
;
704 ASSERT_OK(int_builder
.Finish(&int_array
));
706 DictionaryArray
expected(dtype
, int_array
, fsb_array
);
707 ASSERT_TRUE(expected
.Equals(result
));
710 TEST(TestFixedSizeBinaryDictionaryBuilder
, ArrayInit
) {
711 // Build the dictionary Array
712 auto value_type
= fixed_size_binary(4);
713 auto dict_array
= ArrayFromJSON(value_type
, R
"(["abcd
", "wxyz
"])");
714 util::string_view test
= "abcd", test2
= "wxyz";
715 DictionaryBuilder
<FixedSizeBinaryType
> builder(dict_array
);
716 ASSERT_OK(builder
.Append(test
));
717 ASSERT_OK(builder
.Append(test2
));
718 ASSERT_OK(builder
.Append(test
));
720 std::shared_ptr
<Array
> result
;
721 FinishAndCheckPadding(&builder
, &result
);
723 // Build expected data
724 auto indices
= ArrayFromJSON(int8(), "[0, 1, 0]");
725 DictionaryArray
expected(dictionary(int8(), value_type
), indices
, dict_array
);
726 AssertArraysEqual(expected
, *result
);
729 TEST(TestFixedSizeBinaryDictionaryBuilder
, MakeBuilder
) {
730 // Build the dictionary Array
731 auto value_type
= fixed_size_binary(4);
732 auto dict_array
= ArrayFromJSON(value_type
, R
"(["abcd
", "wxyz
"])");
733 auto dict_type
= dictionary(int8(), value_type
);
735 std::unique_ptr
<ArrayBuilder
> boxed_builder
;
736 ASSERT_OK(MakeBuilder(default_memory_pool(), dict_type
, &boxed_builder
));
737 auto& builder
= checked_cast
<DictionaryBuilder
<FixedSizeBinaryType
>&>(*boxed_builder
);
738 util::string_view test
= "abcd", test2
= "wxyz";
739 ASSERT_OK(builder
.Append(test
));
740 ASSERT_OK(builder
.Append(test2
));
741 ASSERT_OK(builder
.Append(test
));
743 std::shared_ptr
<Array
> result
;
744 FinishAndCheckPadding(&builder
, &result
);
746 // Build expected data
747 auto indices
= ArrayFromJSON(int8(), "[0, 1, 0]");
748 DictionaryArray
expected(dict_type
, indices
, dict_array
);
749 AssertArraysEqual(expected
, *result
);
752 TEST(TestFixedSizeBinaryDictionaryBuilder
, DeltaDictionary
) {
753 // Build the dictionary Array
754 auto value_type
= arrow::fixed_size_binary(4);
755 auto dict_type
= dictionary(int8(), value_type
);
757 DictionaryBuilder
<FixedSizeBinaryType
> builder(value_type
);
758 std::vector
<uint8_t> test
{12, 12, 11, 12};
759 std::vector
<uint8_t> test2
{12, 12, 11, 11};
760 std::vector
<uint8_t> test3
{12, 12, 11, 10};
762 ASSERT_OK(builder
.Append(test
.data()));
763 ASSERT_OK(builder
.Append(test2
.data()));
764 ASSERT_OK(builder
.Append(test
.data()));
766 std::shared_ptr
<Array
> result1
;
767 FinishAndCheckPadding(&builder
, &result1
);
769 // Build expected data
770 FixedSizeBinaryBuilder
fsb_builder1(value_type
);
771 ASSERT_OK(fsb_builder1
.Append(test
.data()));
772 ASSERT_OK(fsb_builder1
.Append(test2
.data()));
773 std::shared_ptr
<Array
> fsb_array1
;
774 ASSERT_OK(fsb_builder1
.Finish(&fsb_array1
));
776 Int8Builder int_builder1
;
777 ASSERT_OK(int_builder1
.Append(0));
778 ASSERT_OK(int_builder1
.Append(1));
779 ASSERT_OK(int_builder1
.Append(0));
780 std::shared_ptr
<Array
> int_array1
;
781 ASSERT_OK(int_builder1
.Finish(&int_array1
));
783 DictionaryArray
expected1(dict_type
, int_array1
, fsb_array1
);
784 ASSERT_TRUE(expected1
.Equals(result1
));
786 // build delta dictionary
787 ASSERT_OK(builder
.Append(test
.data()));
788 ASSERT_OK(builder
.Append(test2
.data()));
789 ASSERT_OK(builder
.Append(test3
.data()));
791 std::shared_ptr
<Array
> indices2
, delta2
;
792 ASSERT_OK(builder
.FinishDelta(&indices2
, &delta2
));
794 // Build expected data
795 FixedSizeBinaryBuilder
fsb_builder2(value_type
);
796 ASSERT_OK(fsb_builder2
.Append(test3
.data()));
797 std::shared_ptr
<Array
> fsb_array2
;
798 ASSERT_OK(fsb_builder2
.Finish(&fsb_array2
));
800 Int8Builder int_builder2
;
801 ASSERT_OK(int_builder2
.Append(0));
802 ASSERT_OK(int_builder2
.Append(1));
803 ASSERT_OK(int_builder2
.Append(2));
805 std::shared_ptr
<Array
> int_array2
;
806 ASSERT_OK(int_builder2
.Finish(&int_array2
));
808 AssertArraysEqual(*int_array2
, *indices2
);
809 AssertArraysEqual(*fsb_array2
, *delta2
);
812 TEST(TestFixedSizeBinaryDictionaryBuilder
, DoubleTableSize
) {
813 // Build the dictionary Array
814 auto value_type
= arrow::fixed_size_binary(4);
815 auto dict_type
= dictionary(int16(), value_type
);
817 DictionaryBuilder
<FixedSizeBinaryType
> builder(value_type
);
818 // Build expected data
819 FixedSizeBinaryBuilder
fsb_builder(value_type
);
820 Int16Builder int_builder
;
822 // Fill with 1024 different values
823 for (int64_t i
= 0; i
< 1024; i
++) {
824 std::vector
<uint8_t> value
{12, 12, static_cast<uint8_t>(i
/ 128),
825 static_cast<uint8_t>(i
% 128)};
826 ASSERT_OK(builder
.Append(value
.data()));
827 ASSERT_OK(fsb_builder
.Append(value
.data()));
828 ASSERT_OK(int_builder
.Append(static_cast<uint16_t>(i
)));
830 // Fill with an already existing value
831 std::vector
<uint8_t> known_value
{12, 12, 0, 1};
832 for (int64_t i
= 0; i
< 1024; i
++) {
833 ASSERT_OK(builder
.Append(known_value
.data()));
834 ASSERT_OK(int_builder
.Append(1));
838 std::shared_ptr
<Array
> result
;
839 ASSERT_OK(builder
.Finish(&result
));
841 // Finalize expected data
842 std::shared_ptr
<Array
> fsb_array
;
843 ASSERT_OK(fsb_builder
.Finish(&fsb_array
));
844 std::shared_ptr
<Array
> int_array
;
845 ASSERT_OK(int_builder
.Finish(&int_array
));
847 DictionaryArray
expected(dict_type
, int_array
, fsb_array
);
848 ASSERT_TRUE(expected
.Equals(result
));
852 TEST(TestFixedSizeBinaryDictionaryBuilder
, AppendArrayInvalidType
) {
853 // Build the dictionary Array
854 auto value_type
= fixed_size_binary(4);
855 DictionaryBuilder
<FixedSizeBinaryType
> builder(value_type
);
856 // Build an array with different byte width
857 auto fsb_array
= ArrayFromJSON(fixed_size_binary(3), R
"(["foo
", "bar
"])");
859 ASSERT_RAISES(TypeError
, builder
.AppendArray(*fsb_array
));
863 template <typename DecimalValue
>
864 void TestDecimalDictionaryBuilderBasic(std::shared_ptr
<DataType
> decimal_type
) {
865 // Build the dictionary Array
866 DictionaryBuilder
<FixedSizeBinaryType
> builder(decimal_type
);
869 std::vector
<DecimalValue
> test
{12, 12, 11, 12};
870 for (const auto& value
: test
) {
871 ASSERT_OK(builder
.Append(value
.ToBytes().data()));
874 std::shared_ptr
<Array
> result
;
875 ASSERT_OK(builder
.Finish(&result
));
877 // Build expected data
878 DictionaryArray
expected(dictionary(int8(), decimal_type
),
879 ArrayFromJSON(int8(), "[0, 0, 1, 0]"),
880 ArrayFromJSON(decimal_type
, "[\"12\", \"11\"]"));
882 ASSERT_TRUE(expected
.Equals(result
));
885 TEST(TestDecimal128DictionaryBuilder
, Basic
) {
886 TestDecimalDictionaryBuilderBasic
<Decimal128
>(arrow::decimal128(2, 0));
889 TEST(TestDecimal256DictionaryBuilder
, Basic
) {
890 TestDecimalDictionaryBuilderBasic
<Decimal256
>(arrow::decimal256(76, 0));
893 void TestDecimalDictionaryBuilderDoubleTableSize(
894 std::shared_ptr
<DataType
> decimal_type
, FixedSizeBinaryBuilder
& decimal_builder
) {
895 // Build the dictionary Array
896 DictionaryBuilder
<FixedSizeBinaryType
> dict_builder(decimal_type
);
898 // Build expected data
899 Int16Builder int_builder
;
901 // Fill with 1024 different values
902 for (int64_t i
= 0; i
< 1024; i
++) {
903 // Decimal256Builder takes 32 bytes, while Decimal128Builder takes only the first 16
905 const uint8_t bytes
[32] = {0,
919 static_cast<uint8_t>(i
/ 128),
920 static_cast<uint8_t>(i
% 128)};
921 ASSERT_OK(dict_builder
.Append(bytes
));
922 ASSERT_OK(decimal_builder
.Append(bytes
));
923 ASSERT_OK(int_builder
.Append(static_cast<uint16_t>(i
)));
925 // Fill with an already existing value
926 const uint8_t known_value
[32] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 0, 1};
927 for (int64_t i
= 0; i
< 1024; i
++) {
928 ASSERT_OK(dict_builder
.Append(known_value
));
929 ASSERT_OK(int_builder
.Append(1));
933 std::shared_ptr
<Array
> result
;
934 ASSERT_OK(dict_builder
.Finish(&result
));
936 // Finalize expected data
937 std::shared_ptr
<Array
> decimal_array
;
938 ASSERT_OK(decimal_builder
.Finish(&decimal_array
));
940 std::shared_ptr
<Array
> int_array
;
941 ASSERT_OK(int_builder
.Finish(&int_array
));
943 DictionaryArray
expected(dictionary(int16(), decimal_type
), int_array
, decimal_array
);
944 ASSERT_TRUE(expected
.Equals(result
));
947 TEST(TestDecimal128DictionaryBuilder
, DoubleTableSize
) {
948 const auto& decimal_type
= arrow::decimal128(21, 0);
949 Decimal128Builder
decimal_builder(decimal_type
);
950 TestDecimalDictionaryBuilderDoubleTableSize(decimal_type
, decimal_builder
);
953 TEST(TestDecimal256DictionaryBuilder
, DoubleTableSize
) {
954 const auto& decimal_type
= arrow::decimal256(21, 0);
955 Decimal256Builder
decimal_builder(decimal_type
);
956 TestDecimalDictionaryBuilderDoubleTableSize(decimal_type
, decimal_builder
);
959 TEST(TestNullDictionaryBuilder
, Basic
) {
961 auto dict_type
= dictionary(int8(), null());
962 std::unique_ptr
<ArrayBuilder
> boxed_builder
;
963 ASSERT_OK(MakeBuilder(default_memory_pool(), dict_type
, &boxed_builder
));
964 auto& builder
= checked_cast
<DictionaryBuilder
<NullType
>&>(*boxed_builder
);
966 ASSERT_OK(builder
.AppendNull());
967 ASSERT_OK(builder
.AppendNull());
968 ASSERT_OK(builder
.AppendNull());
969 ASSERT_EQ(3, builder
.length());
970 ASSERT_EQ(3, builder
.null_count());
972 ASSERT_OK(builder
.AppendNulls(4));
973 ASSERT_EQ(7, builder
.length());
974 ASSERT_EQ(7, builder
.null_count());
976 auto null_array
= ArrayFromJSON(null(), "[null, null, null, null]");
977 ASSERT_OK(builder
.AppendArray(*null_array
));
978 ASSERT_EQ(11, builder
.length());
979 ASSERT_EQ(11, builder
.null_count());
981 std::shared_ptr
<Array
> result
;
982 ASSERT_OK(builder
.Finish(&result
));
983 AssertTypeEqual(*dict_type
, *result
->type());
984 ASSERT_EQ(11, result
->length());
985 ASSERT_EQ(11, result
->null_count());
989 TEST(TestNullDictionaryBuilder
, AppendArrayInvalidType
) {
991 auto dict_type
= dictionary(int8(), null());
992 std::unique_ptr
<ArrayBuilder
> boxed_builder
;
993 ASSERT_OK(MakeBuilder(default_memory_pool(), dict_type
, &boxed_builder
));
994 auto& builder
= checked_cast
<DictionaryBuilder
<NullType
>&>(*boxed_builder
);
996 auto int8_array
= ArrayFromJSON(int8(), "[0, 1, 0, null]");
997 ASSERT_RAISES(TypeError
, builder
.AppendArray(*int8_array
));
1001 // ----------------------------------------------------------------------
1002 // Index byte width tests
1004 template <typename IndexType
, typename ValueType
>
1005 void AssertIndexByteWidth(const std::shared_ptr
<DataType
>& value_type
=
1006 TypeTraits
<ValueType
>::type_singleton()) {
1007 auto index_type
= TypeTraits
<IndexType
>::type_singleton();
1009 checked_pointer_cast
<DictionaryType
>(dictionary(index_type
, value_type
));
1010 std::unique_ptr
<ArrayBuilder
> builder
;
1011 ASSERT_OK(MakeBuilder(default_memory_pool(), dict_type
, &builder
));
1012 auto builder_dict_type
= checked_pointer_cast
<DictionaryType
>(builder
->type());
1013 AssertTypeEqual(dict_type
->index_type(), builder_dict_type
->index_type());
1016 typedef ::testing::Types
<Int8Type
, Int16Type
, Int32Type
, Int64Type
> IndexTypes
;
1018 template <typename Type
>
1019 class TestDictionaryBuilderIndexByteWidth
: public TestBuilder
{};
1021 TYPED_TEST_SUITE(TestDictionaryBuilderIndexByteWidth
, IndexTypes
);
1023 TYPED_TEST(TestDictionaryBuilderIndexByteWidth
, MakeBuilder
) {
1024 AssertIndexByteWidth
<TypeParam
, FloatType
>();
1025 AssertIndexByteWidth
<TypeParam
, BinaryType
>();
1026 AssertIndexByteWidth
<TypeParam
, StringType
>();
1027 AssertIndexByteWidth
<TypeParam
, FixedSizeBinaryType
>(fixed_size_binary(4));
1028 AssertIndexByteWidth
<TypeParam
, NullType
>();
1031 // ----------------------------------------------------------------------
1032 // DictionaryArray tests
1034 TEST(TestDictionary
, Equals
) {
1035 std::vector
<bool> is_valid
= {true, true, false, true, true, true};
1036 std::shared_ptr
<Array
> dict
, dict2
, indices
, indices2
, indices3
;
1038 dict
= ArrayFromJSON(utf8(), "[\"foo\", \"bar\", \"baz\"]");
1039 std::shared_ptr
<DataType
> dict_type
= dictionary(int16(), utf8());
1041 dict2
= ArrayFromJSON(utf8(), "[\"foo\", \"bar\", \"baz\", \"qux\"]");
1042 std::shared_ptr
<DataType
> dict2_type
= dictionary(int16(), utf8());
1044 std::vector
<int16_t> indices_values
= {1, 2, -1, 0, 2, 0};
1045 ArrayFromVector
<Int16Type
, int16_t>(is_valid
, indices_values
, &indices
);
1047 std::vector
<int16_t> indices2_values
= {1, 2, 0, 0, 2, 0};
1048 ArrayFromVector
<Int16Type
, int16_t>(is_valid
, indices2_values
, &indices2
);
1050 std::vector
<int16_t> indices3_values
= {1, 1, 0, 0, 2, 0};
1051 ArrayFromVector
<Int16Type
, int16_t>(is_valid
, indices3_values
, &indices3
);
1053 auto array
= std::make_shared
<DictionaryArray
>(dict_type
, indices
, dict
);
1054 auto array2
= std::make_shared
<DictionaryArray
>(dict_type
, indices2
, dict
);
1055 auto array3
= std::make_shared
<DictionaryArray
>(dict2_type
, indices
, dict2
);
1056 auto array4
= std::make_shared
<DictionaryArray
>(dict_type
, indices3
, dict
);
1058 ASSERT_TRUE(array
->Equals(array
));
1060 // Equal, because the unequal index is masked by null
1061 ASSERT_TRUE(array
->Equals(array2
));
1063 // Unequal dictionaries
1064 ASSERT_FALSE(array
->Equals(array3
));
1067 ASSERT_FALSE(array
->Equals(array4
));
1070 ASSERT_TRUE(array
->RangeEquals(3, 6, 3, array4
));
1071 ASSERT_FALSE(array
->RangeEquals(1, 3, 1, array4
));
1073 // ARROW-33 Test slices
1074 const int64_t size
= array
->length();
1076 std::shared_ptr
<Array
> slice
, slice2
;
1077 slice
= array
->Array::Slice(2);
1078 slice2
= array
->Array::Slice(2);
1079 ASSERT_EQ(size
- 2, slice
->length());
1081 ASSERT_TRUE(slice
->Equals(slice2
));
1082 ASSERT_TRUE(array
->RangeEquals(2, array
->length(), 0, slice
));
1085 slice2
= array
->Array::Slice(1)->Array::Slice(1);
1086 ASSERT_TRUE(slice
->Equals(slice2
));
1088 slice
= array
->Slice(1, 3);
1089 slice2
= array
->Slice(1, 3);
1090 ASSERT_EQ(3, slice
->length());
1092 ASSERT_TRUE(slice
->Equals(slice2
));
1093 ASSERT_TRUE(array
->RangeEquals(1, 4, 0, slice
));
// Checks ValidateFull() on dictionary arrays: it succeeds for an int16-indexed
// utf8 dictionary array, and raises Invalid when the dictionary is built from
// a utf8 ArrayData whose buffers are all null.
// NOTE(review): some lines of this test are not visible in this extract (the
// closing brace and the lines around the final `null_dict_arr` construction),
// so the intent of that last statement cannot be confirmed from here.
1096 TEST(TestDictionary
, Validate
) {
1097 auto dict
= ArrayFromJSON(utf8(), "[\"foo\", \"bar\", \"baz\"]");
1098 auto dict_type
= dictionary(int16(), utf8());
1100 auto indices
= ArrayFromJSON(int16(), "[1, 2, null, 0, 2, 0]");
1101 std::shared_ptr
<Array
> arr
=
1102 std::make_shared
<DictionaryArray
>(dict_type
, indices
, dict
);
1104 // Only checking index type for now
1105 ASSERT_OK(arr
->ValidateFull());
1107 // ARROW-7008: Invalid dict was not being validated
1108 std::vector
<std::shared_ptr
<Buffer
>> buffers
= {nullptr, nullptr, nullptr};
1109 auto invalid_data
= std::make_shared
<ArrayData
>(utf8(), 0, buffers
);
1111 indices
= ArrayFromJSON(int16(), "[]");
1112 arr
= std::make_shared
<DictionaryArray
>(dict_type
, indices
, MakeArray(invalid_data
));
1113 ASSERT_RAISES(Invalid
, arr
->ValidateFull());
1115 // Make the data buffer non-null
// NOTE(review): `invalid_data` was constructed from `buffers` before this
// assignment; if ArrayData keeps its own copy of the vector, the new buffer
// never reaches `invalid_data` and the check below repeats the previous one.
// Confirm against the ArrayData constructor.
1116 ASSERT_OK_AND_ASSIGN(buffers
[2], AllocateBuffer(0));
1117 arr
= std::make_shared
<DictionaryArray
>(dict_type
, indices
, MakeArray(invalid_data
));
1118 ASSERT_RAISES(Invalid
, arr
->ValidateFull());
// Constructs a DictionaryArray with a null dictionary pointer.
1122 std::shared_ptr
<Array
> null_dict_arr
=
1123 std::make_shared
<DictionaryArray
>(dict_type
, indices
, nullptr);
1128 TEST(TestDictionary
, FromArrays
) {
1129 auto dict
= ArrayFromJSON(utf8(), "[\"foo\", \"bar\", \"baz\"]");
1130 for (auto index_ty
: all_dictionary_index_types()) {
1131 auto dict_type
= dictionary(index_ty
, utf8());
1133 auto indices1
= ArrayFromJSON(index_ty
, "[1, 2, 0, 0, 2, 0]");
1134 // Index out of bounds
1135 auto indices2
= ArrayFromJSON(index_ty
, "[1, 2, 0, 3, 2, 0]");
1137 ASSERT_OK_AND_ASSIGN(auto arr1
,
1138 DictionaryArray::FromArrays(dict_type
, indices1
, dict
));
1139 ASSERT_RAISES(IndexError
, DictionaryArray::FromArrays(dict_type
, indices2
, dict
));
1141 if (checked_cast
<const IntegerType
&>(*index_ty
).is_signed()) {
1142 // Invalid index is masked by null, so it's OK
1143 auto indices3
= ArrayFromJSON(index_ty
, "[1, 2, -1, null, 2, 0]");
1144 BitUtil::ClearBit(indices3
->data()->buffers
[0]->mutable_data(), 2);
1145 ASSERT_OK_AND_ASSIGN(auto arr3
,
1146 DictionaryArray::FromArrays(dict_type
, indices3
, dict
));
1149 auto indices4
= ArrayFromJSON(index_ty
, "[1, 2, null, 3, 2, 0]");
1150 ASSERT_RAISES(IndexError
, DictionaryArray::FromArrays(dict_type
, indices4
, dict
));
1152 // Probe other validation checks
1153 ASSERT_RAISES(TypeError
, DictionaryArray::FromArrays(index_ty
, indices4
, dict
));
1155 auto different_index_ty
=
1156 dictionary(index_ty
->id() == Type::INT8
? uint8() : int8(), utf8());
1157 ASSERT_RAISES(TypeError
,
1158 DictionaryArray::FromArrays(different_index_ty
, indices4
, dict
));
1162 static void CheckTranspose(const std::shared_ptr
<Array
>& input
,
1163 const int32_t* transpose_map
,
1164 const std::shared_ptr
<DataType
>& out_dict_type
,
1165 const std::shared_ptr
<Array
>& out_dict
,
1166 const std::shared_ptr
<Array
>& expected_indices
) {
1167 ASSERT_OK_AND_ASSIGN(auto transposed
,
1168 internal::checked_cast
<const DictionaryArray
&>(*input
).Transpose(
1169 out_dict_type
, out_dict
, transpose_map
));
1170 ASSERT_OK(transposed
->ValidateFull());
1172 ASSERT_OK_AND_ASSIGN(auto expected
, DictionaryArray::FromArrays(
1173 out_dict_type
, expected_indices
, out_dict
));
1174 AssertArraysEqual(*transposed
, *expected
);
1177 TEST(TestDictionary
, TransposeBasic
) {
1178 auto dict
= ArrayFromJSON(utf8(), "[\"A\", \"B\", \"C\"]");
1180 auto CheckIndexType
= [&](const std::shared_ptr
<DataType
>& index_ty
) {
1181 auto dict_type
= dictionary(index_ty
, utf8());
1182 auto indices
= ArrayFromJSON(index_ty
, "[1, 2, 0, 0]");
1183 // ["B", "C", "A", "A"]
1184 ASSERT_OK_AND_ASSIGN(auto arr
, DictionaryArray::FromArrays(dict_type
, indices
, dict
));
1186 auto sliced
= arr
->Slice(1, 2);
1188 // Transpose to same index type
1190 auto out_dict_type
= dict_type
;
1191 auto out_dict
= ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]");
1192 auto expected_indices
= ArrayFromJSON(index_ty
, "[3, 2, 1, 1]");
1193 std::vector
<int32_t> transpose_map
= {1, 3, 2};
1194 CheckTranspose(arr
, transpose_map
.data(), out_dict_type
, out_dict
,
1198 expected_indices
= ArrayFromJSON(index_ty
, "[2, 1]");
1199 CheckTranspose(sliced
, transpose_map
.data(), out_dict_type
, out_dict
,
1203 // Transpose to other index type
1204 auto out_dict
= ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]");
1205 std::vector
<int32_t> transpose_map
= {1, 3, 2};
1206 for (auto other_ty
: all_dictionary_index_types()) {
1207 auto out_dict_type
= dictionary(other_ty
, utf8());
1208 auto expected_indices
= ArrayFromJSON(other_ty
, "[3, 2, 1, 1]");
1209 CheckTranspose(arr
, transpose_map
.data(), out_dict_type
, out_dict
,
1213 expected_indices
= ArrayFromJSON(other_ty
, "[2, 1]");
1214 CheckTranspose(sliced
, transpose_map
.data(), out_dict_type
, out_dict
,
1219 for (auto ty
: all_dictionary_index_types()) {
1224 TEST(TestDictionary
, TransposeTrivial
) {
1225 // Test a trivial transposition, possibly optimized away
1227 auto dict
= ArrayFromJSON(utf8(), "[\"A\", \"B\", \"C\"]");
1228 auto dict_type
= dictionary(int16(), utf8());
1229 auto indices
= ArrayFromJSON(int16(), "[1, 2, 0, 0]");
1230 // ["B", "C", "A", "A"]
1231 ASSERT_OK_AND_ASSIGN(auto arr
, DictionaryArray::FromArrays(dict_type
, indices
, dict
));
1233 auto sliced
= arr
->Slice(1, 2);
1235 std::vector
<int32_t> transpose_map
= {0, 1, 2};
1237 // Transpose to same index type
1239 auto out_dict_type
= dict_type
;
1240 auto out_dict
= ArrayFromJSON(utf8(), "[\"A\", \"B\", \"C\", \"D\"]");
1241 auto expected_indices
= ArrayFromJSON(int16(), "[1, 2, 0, 0]");
1242 CheckTranspose(arr
, transpose_map
.data(), out_dict_type
, out_dict
, expected_indices
);
1245 expected_indices
= ArrayFromJSON(int16(), "[2, 0]");
1246 CheckTranspose(sliced
, transpose_map
.data(), out_dict_type
, out_dict
,
1250 // Transpose to other index type
1252 auto out_dict_type
= dictionary(int8(), utf8());
1253 auto out_dict
= ArrayFromJSON(utf8(), "[\"A\", \"B\", \"C\", \"D\"]");
1254 auto expected_indices
= ArrayFromJSON(int8(), "[1, 2, 0, 0]");
1255 CheckTranspose(arr
, transpose_map
.data(), out_dict_type
, out_dict
, expected_indices
);
1258 expected_indices
= ArrayFromJSON(int8(), "[2, 0]");
1259 CheckTranspose(sliced
, transpose_map
.data(), out_dict_type
, out_dict
,
// For every dictionary index type, checks that GetValueIndex(i) on a
// dictionary array returns the same value as the int64 reference indices, both
// on the full array and on a slice starting at `offset`.
// NOTE(review): the definition of `offset` and the closing braces are not
// visible in this extract.
1264 TEST(TestDictionary
, GetValueIndex
) {
1265 const char* indices_json
= "[5, 0, 1, 3, 2, 4]";
1266 auto indices_int64
= ArrayFromJSON(int64(), indices_json
);
1267 auto dict
= ArrayFromJSON(int32(), "[10, 20, 30, 40, 50, 60]");
// Typed view of the reference indices, used as the expected values below
1269 const auto& typed_indices_int64
= checked_cast
<const Int64Array
&>(*indices_int64
);
1270 for (auto index_ty
: all_dictionary_index_types()) {
1271 auto indices
= ArrayFromJSON(index_ty
, indices_json
);
1272 auto dict_ty
= dictionary(index_ty
, int32());
1274 DictionaryArray
dict_arr(dict_ty
, indices
, dict
);
1277 auto sliced_dict_arr
= dict_arr
.Slice(offset
);
1279 for (int64_t i
= 0; i
< indices
->length(); ++i
) {
1280 ASSERT_EQ(dict_arr
.GetValueIndex(i
), typed_indices_int64
.Value(i
));
// The slice is only checked for positions that exist in it; position i of the
// slice is compared with position i + offset of the reference indices
1281 if (i
< sliced_dict_arr
->length()) {
1282 ASSERT_EQ(checked_cast
<const DictionaryArray
&>(*sliced_dict_arr
).GetValueIndex(i
),
1283 typed_indices_int64
.Value(i
+ offset
));
1289 TEST(TestDictionary
, TransposeNulls
) {
1290 auto dict
= ArrayFromJSON(utf8(), "[\"A\", \"B\", \"C\"]");
1291 auto dict_type
= dictionary(int16(), utf8());
1292 auto indices
= ArrayFromJSON(int16(), "[1, 2, null, 0]");
1293 // ["B", "C", null, "A"]
1294 ASSERT_OK_AND_ASSIGN(auto arr
, DictionaryArray::FromArrays(dict_type
, indices
, dict
));
1296 auto sliced
= arr
->Slice(1, 2);
1298 auto out_dict
= ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]");
1299 auto out_dict_type
= dictionary(int16(), utf8());
1300 auto expected_indices
= ArrayFromJSON(int16(), "[3, 2, null, 1]");
1302 std::vector
<int32_t> transpose_map
= {1, 3, 2};
1303 CheckTranspose(arr
, transpose_map
.data(), out_dict_type
, out_dict
, expected_indices
);
1306 expected_indices
= ArrayFromJSON(int16(), "[2, null]");
1307 CheckTranspose(sliced
, transpose_map
.data(), out_dict_type
, out_dict
, expected_indices
);
1310 TEST(TestDictionary
, ListOfDictionary
) {
1311 std::unique_ptr
<ArrayBuilder
> root_builder
;
1312 ASSERT_OK(MakeBuilder(default_memory_pool(), list(dictionary(int8(), utf8())),
1314 auto list_builder
= checked_cast
<ListBuilder
*>(root_builder
.get());
1316 checked_cast
<DictionaryBuilder
<StringType
>*>(list_builder
->value_builder());
1318 ASSERT_OK(list_builder
->Append());
1319 std::vector
<std::string
> expected
;
1320 for (char a
: util::string_view("abc")) {
1321 for (char d
: util::string_view("def")) {
1322 for (char g
: util::string_view("ghi")) {
1323 for (char j
: util::string_view("jkl")) {
1324 for (char m
: util::string_view("mno")) {
1325 for (char p
: util::string_view("pqr")) {
1326 if ((static_cast<int>(a
) + d
+ g
+ j
+ m
+ p
) % 16 == 0) {
1327 ASSERT_OK(list_builder
->Append());
1329 // 3**6 distinct strings; too large for int8
1330 char str
[] = {a
, d
, g
, j
, m
, p
, '\0'};
1331 ASSERT_OK(dict_builder
->Append(str
));
1332 expected
.push_back(str
);
1340 ASSERT_TRUE(list_builder
->type()->Equals(list(dictionary(int16(), utf8()))));
1342 std::shared_ptr
<Array
> expected_dict
;
1343 ArrayFromVector
<StringType
, std::string
>(expected
, &expected_dict
);
1345 std::shared_ptr
<Array
> array
;
1346 ASSERT_OK(root_builder
->Finish(&array
));
1347 ASSERT_OK(array
->ValidateFull());
1349 auto expected_type
= list(dictionary(int16(), utf8()));
1350 ASSERT_EQ(array
->type()->ToString(), expected_type
->ToString());
1352 auto list_array
= checked_cast
<const ListArray
*>(array
.get());
1354 checked_cast
<const DictionaryArray
&>(*list_array
->values()).dictionary();
1355 ASSERT_ARRAYS_EQUAL(*expected_dict
, *actual_dict
);
1358 TEST(TestDictionary
, CanCompareIndices
) {
1359 auto make_dict
= [](std::shared_ptr
<DataType
> index_type
,
1360 std::shared_ptr
<DataType
> value_type
, std::string dictionary_json
) {
1361 std::shared_ptr
<Array
> out
;
1363 DictionaryArray::FromArrays(dictionary(index_type
, value_type
),
1364 ArrayFromJSON(index_type
, "[]"),
1365 ArrayFromJSON(value_type
, dictionary_json
))
1367 return checked_pointer_cast
<DictionaryArray
>(out
);
1370 auto compare_and_swap
= [](const DictionaryArray
& l
, const DictionaryArray
& r
,
1372 ASSERT_EQ(l
.CanCompareIndices(r
), expected
)
1373 << "left: " << l
.ToString() << "\nright: " << r
.ToString();
1374 ASSERT_EQ(r
.CanCompareIndices(l
), expected
)
1375 << "left: " << r
.ToString() << "\nright: " << l
.ToString();
1379 auto array
= make_dict(int16(), utf8(), R
"(["foo
", "bar
"])");
1380 auto same
= make_dict(int16(), utf8(), R
"(["foo
", "bar
"])");
1381 compare_and_swap(*array
, *same
, true);
1385 auto array
= make_dict(int16(), utf8(), R
"(["foo
", "bar
", "quux
"])");
1386 auto prefix_dict
= make_dict(int16(), utf8(), R
"(["foo
", "bar
"])");
1387 compare_and_swap(*array
, *prefix_dict
, true);
1391 auto array
= make_dict(int16(), utf8(), R
"(["foo
", "bar
"])");
1392 auto indices_need_casting
= make_dict(int8(), utf8(), R
"(["foo
", "bar
"])");
1393 compare_and_swap(*array
, *indices_need_casting
, false);
1397 auto array
= make_dict(int16(), utf8(), R
"(["foo
", "bar
", "quux
"])");
1398 auto non_prefix_dict
= make_dict(int16(), utf8(), R
"(["foo
", "blink
"])");
1399 compare_and_swap(*array
, *non_prefix_dict
, false);
1403 TEST(TestDictionary
, IndicesArray
) {
1404 auto dict
= ArrayFromJSON(utf8(), "[\"foo\", \"bar\", \"baz\"]");
1405 auto dict_type
= dictionary(int16(), utf8());
1406 auto indices
= ArrayFromJSON(int16(), "[1, 2, null, 0, 2, 0]");
1407 auto arr
= std::make_shared
<DictionaryArray
>(dict_type
, indices
, dict
);
1409 // The indices array should not have dictionary data
1410 ASSERT_EQ(arr
->indices()->data()->dictionary
, nullptr);
1412 // Validate the indices array
1413 ASSERT_OK(arr
->indices()->ValidateFull());
1416 TEST(TestDictionaryUnifier
, Numeric
) {
1417 auto dict_ty
= int64();
1419 auto d1
= ArrayFromJSON(dict_ty
, "[3, 4, 7]");
1420 auto d2
= ArrayFromJSON(dict_ty
, "[1, 7, 4, 8]");
1421 auto d3
= ArrayFromJSON(dict_ty
, "[1, -200]");
1423 auto expected
= dictionary(int8(), dict_ty
);
1424 auto expected_dict
= ArrayFromJSON(dict_ty
, "[3, 4, 7, 1, 8, -200]");
1426 ASSERT_OK_AND_ASSIGN(auto unifier
, DictionaryUnifier::Make(dict_ty
));
1428 std::shared_ptr
<DataType
> out_type
;
1429 std::shared_ptr
<Array
> out_dict
;
1431 ASSERT_OK(unifier
->Unify(*d1
));
1432 ASSERT_OK(unifier
->Unify(*d2
));
1433 ASSERT_OK(unifier
->Unify(*d3
));
1435 ASSERT_RAISES(Invalid
, unifier
->Unify(*ArrayFromJSON(int32(), "[1, -200]")));
1437 ASSERT_OK(unifier
->GetResult(&out_type
, &out_dict
));
1438 ASSERT_TRUE(out_type
->Equals(*expected
));
1439 ASSERT_TRUE(out_dict
->Equals(*expected_dict
));
1441 std::shared_ptr
<Buffer
> b1
, b2
, b3
;
1443 ASSERT_OK(unifier
->Unify(*d1
, &b1
));
1444 ASSERT_OK(unifier
->Unify(*d2
, &b2
));
1445 ASSERT_OK(unifier
->Unify(*d3
, &b3
));
1446 ASSERT_OK(unifier
->GetResult(&out_type
, &out_dict
));
1447 ASSERT_TRUE(out_type
->Equals(*expected
));
1448 ASSERT_TRUE(out_dict
->Equals(*expected_dict
));
1450 CheckTransposeMap(*b1
, {0, 1, 2});
1451 CheckTransposeMap(*b2
, {3, 2, 1, 4});
1452 CheckTransposeMap(*b3
, {3, 5});
1455 TEST(TestDictionaryUnifier
, String
) {
1456 auto dict_ty
= utf8();
1458 auto t1
= dictionary(int16(), dict_ty
);
1459 auto d1
= ArrayFromJSON(dict_ty
, "[\"foo\", \"bar\"]");
1461 auto t2
= dictionary(int32(), dict_ty
);
1462 auto d2
= ArrayFromJSON(dict_ty
, "[\"quux\", \"foo\"]");
1464 auto expected
= dictionary(int8(), dict_ty
);
1465 auto expected_dict
= ArrayFromJSON(dict_ty
, "[\"foo\", \"bar\", \"quux\"]");
1467 ASSERT_OK_AND_ASSIGN(auto unifier
, DictionaryUnifier::Make(dict_ty
));
1469 std::shared_ptr
<DataType
> out_type
;
1470 std::shared_ptr
<Array
> out_dict
;
1471 ASSERT_OK(unifier
->Unify(*d1
));
1472 ASSERT_OK(unifier
->Unify(*d2
));
1473 ASSERT_OK(unifier
->GetResult(&out_type
, &out_dict
));
1474 ASSERT_TRUE(out_type
->Equals(*expected
));
1475 ASSERT_TRUE(out_dict
->Equals(*expected_dict
));
1477 std::shared_ptr
<Buffer
> b1
, b2
;
1479 ASSERT_OK(unifier
->Unify(*d1
, &b1
));
1480 ASSERT_OK(unifier
->Unify(*d2
, &b2
));
1481 ASSERT_OK(unifier
->GetResult(&out_type
, &out_dict
));
1482 ASSERT_TRUE(out_type
->Equals(*expected
));
1483 ASSERT_TRUE(out_dict
->Equals(*expected_dict
));
1485 CheckTransposeMap(*b1
, {0, 1});
1486 CheckTransposeMap(*b2
, {2, 0});
1489 TEST(TestDictionaryUnifier
, FixedSizeBinary
) {
1490 auto type
= fixed_size_binary(3);
1492 std::string data
= "foobarbazqux";
1493 auto buf
= std::make_shared
<Buffer
>(data
);
1495 auto dict1
= std::make_shared
<FixedSizeBinaryArray
>(type
, 2, SliceBuffer(buf
, 0, 6));
1496 auto t1
= dictionary(int16(), type
);
1497 // ["bar", "baz", "qux"]
1498 auto dict2
= std::make_shared
<FixedSizeBinaryArray
>(type
, 3, SliceBuffer(buf
, 3, 9));
1499 auto t2
= dictionary(int16(), type
);
1501 // ["foo", "bar", "baz", "qux"]
1502 auto expected_dict
= std::make_shared
<FixedSizeBinaryArray
>(type
, 4, buf
);
1503 auto expected
= dictionary(int8(), type
);
1505 ASSERT_OK_AND_ASSIGN(auto unifier
, DictionaryUnifier::Make(type
));
1506 std::shared_ptr
<DataType
> out_type
;
1507 std::shared_ptr
<Array
> out_dict
;
1508 ASSERT_OK(unifier
->Unify(*dict1
));
1509 ASSERT_OK(unifier
->Unify(*dict2
));
1510 ASSERT_OK(unifier
->GetResult(&out_type
, &out_dict
));
1511 ASSERT_TRUE(out_type
->Equals(*expected
));
1512 ASSERT_TRUE(out_dict
->Equals(*expected_dict
));
1514 std::shared_ptr
<Buffer
> b1
, b2
;
1515 ASSERT_OK(unifier
->Unify(*dict1
, &b1
));
1516 ASSERT_OK(unifier
->Unify(*dict2
, &b2
));
1517 ASSERT_OK(unifier
->GetResult(&out_type
, &out_dict
));
1518 ASSERT_TRUE(out_type
->Equals(*expected
));
1519 ASSERT_TRUE(out_dict
->Equals(*expected_dict
));
1521 CheckTransposeMap(*b1
, {0, 1});
1522 CheckTransposeMap(*b2
, {1, 2, 3});
1525 TEST(TestDictionaryUnifier
, Large
) {
1526 // Unifying "large" dictionary types should choose the right index type
1527 std::shared_ptr
<Array
> dict1
, dict2
, expected_dict
;
1529 Int32Builder builder
;
1530 ASSERT_OK(builder
.Reserve(120));
1531 for (int32_t i
= 0; i
< 120; ++i
) {
1532 builder
.UnsafeAppend(i
);
1534 ASSERT_OK(builder
.Finish(&dict1
));
1535 ASSERT_EQ(dict1
->length(), 120);
1536 auto t1
= dictionary(int8(), int32());
1538 ASSERT_OK(builder
.Reserve(30));
1539 for (int32_t i
= 110; i
< 140; ++i
) {
1540 builder
.UnsafeAppend(i
);
1542 ASSERT_OK(builder
.Finish(&dict2
));
1543 ASSERT_EQ(dict2
->length(), 30);
1544 auto t2
= dictionary(int8(), int32());
1546 ASSERT_OK(builder
.Reserve(140));
1547 for (int32_t i
= 0; i
< 140; ++i
) {
1548 builder
.UnsafeAppend(i
);
1550 ASSERT_OK(builder
.Finish(&expected_dict
));
1551 ASSERT_EQ(expected_dict
->length(), 140);
1553 // int8 would be too narrow to hold all possible index values
1554 auto expected
= dictionary(int16(), int32());
1556 ASSERT_OK_AND_ASSIGN(auto unifier
, DictionaryUnifier::Make(int32()));
1557 std::shared_ptr
<DataType
> out_type
;
1558 std::shared_ptr
<Array
> out_dict
;
1559 ASSERT_OK(unifier
->Unify(*dict1
));
1560 ASSERT_OK(unifier
->Unify(*dict2
));
1561 ASSERT_OK(unifier
->GetResult(&out_type
, &out_dict
));
1562 ASSERT_TRUE(out_type
->Equals(*expected
));
1563 ASSERT_TRUE(out_dict
->Equals(*expected_dict
));
1566 TEST(TestDictionaryUnifier
, ChunkedArraySimple
) {
1567 auto type
= dictionary(int8(), utf8());
1568 auto chunk1
= ArrayFromJSON(type
, R
"(["ab
", "cd
", null, "cd
"])");
1569 auto chunk2
= ArrayFromJSON(type
, R
"(["ef
", "cd
", "ef
"])");
1570 auto chunk3
= ArrayFromJSON(type
, R
"(["ef
", "ab
", null, "ab
"])");
1571 auto chunk4
= ArrayFromJSON(type
, "[]");
1572 ASSERT_OK_AND_ASSIGN(auto chunked
,
1573 ChunkedArray::Make({chunk1
, chunk2
, chunk3
, chunk4
}));
1575 ASSERT_OK_AND_ASSIGN(auto unified
, DictionaryUnifier::UnifyChunkedArray(chunked
));
1576 ASSERT_EQ(unified
->num_chunks(), 4);
1577 auto expected_dict
= ArrayFromJSON(utf8(), R
"(["ab
", "cd
", "ef
"])");
1578 CheckDictionaryArray(unified
->chunk(0), expected_dict
,
1579 ArrayFromJSON(int8(), "[0, 1, null, 1]"));
1580 CheckDictionaryArray(unified
->chunk(1), expected_dict
,
1581 ArrayFromJSON(int8(), "[2, 1, 2]"));
1582 CheckDictionaryArray(unified
->chunk(2), expected_dict
,
1583 ArrayFromJSON(int8(), "[2, 0, null, 0]"));
1584 CheckDictionaryArray(unified
->chunk(3), expected_dict
, ArrayFromJSON(int8(), "[]"));
1587 TEST(TestDictionaryUnifier
, ChunkedArrayZeroChunk
) {
1588 auto type
= dictionary(int8(), utf8());
1589 ASSERT_OK_AND_ASSIGN(auto chunked
, ChunkedArray::Make(ArrayVector
{}, type
));
1590 ASSERT_OK_AND_ASSIGN(auto unified
, DictionaryUnifier::UnifyChunkedArray(chunked
));
1591 AssertChunkedEqual(*chunked
, *unified
);
1594 TEST(TestDictionaryUnifier
, ChunkedArrayOneChunk
) {
1595 auto type
= dictionary(int8(), utf8());
1596 auto chunk1
= ArrayFromJSON(type
, R
"(["ab
", "cd
", null, "cd
"])");
1597 ASSERT_OK_AND_ASSIGN(auto chunked
, ChunkedArray::Make({chunk1
}));
1598 ASSERT_OK_AND_ASSIGN(auto unified
, DictionaryUnifier::UnifyChunkedArray(chunked
));
1599 AssertChunkedEqual(*chunked
, *unified
);
// Builds a two-chunk chunked array from JSON integer lists, runs
// DictionaryUnifier::UnifyChunkedArray on it, and expects the result to equal
// the input.
// NOTE(review): the definition of `type` is not visible in this extract;
// judging by the test name it is presumably a non-dictionary type. Confirm.
1602 TEST(TestDictionaryUnifier
, ChunkedArrayNoDict
) {
1604 auto chunk1
= ArrayFromJSON(type
, "[1, 1, 2, 3]");
1605 auto chunk2
= ArrayFromJSON(type
, "[5, 8, 13]");
1606 ASSERT_OK_AND_ASSIGN(auto chunked
, ChunkedArray::Make({chunk1
, chunk2
}));
1607 ASSERT_OK_AND_ASSIGN(auto unified
, DictionaryUnifier::UnifyChunkedArray(chunked
));
1608 AssertChunkedEqual(*chunked
, *unified
);
1611 TEST(TestDictionaryUnifier
, ChunkedArrayNested
) {
1612 // Dict in a nested type: ok
1613 auto type
= list(dictionary(int16(), utf8()));
1614 auto chunk1
= ArrayFromJSON(type
, R
"([["ab
", "cd
"], ["cd
"]])");
1615 auto chunk2
= ArrayFromJSON(type
, R
"([[], ["ef
", "cd
", "ef
"]])");
1616 ASSERT_OK_AND_ASSIGN(auto chunked
, ChunkedArray::Make({chunk1
, chunk2
}));
1618 ASSERT_OK_AND_ASSIGN(auto unified
, DictionaryUnifier::UnifyChunkedArray(chunked
));
1619 ASSERT_EQ(unified
->num_chunks(), 2);
1620 auto expected_dict
= ArrayFromJSON(utf8(), R
"(["ab
", "cd
", "ef
"])");
1621 auto unified1
= checked_pointer_cast
<ListArray
>(unified
->chunk(0));
1622 AssertArraysEqual(*unified1
->offsets(), *ArrayFromJSON(int32(), "[0, 2, 3]"));
1623 CheckDictionaryArray(unified1
->values(), expected_dict
,
1624 ArrayFromJSON(int16(), "[0, 1, 1]"));
1625 auto unified2
= checked_pointer_cast
<ListArray
>(unified
->chunk(1));
1626 AssertArraysEqual(*unified2
->offsets(), *ArrayFromJSON(int32(), "[0, 0, 3]"));
1627 CheckDictionaryArray(unified2
->values(), expected_dict
,
1628 ArrayFromJSON(int16(), "[2, 1, 2]"));
1631 TEST(TestDictionaryUnifier
, ChunkedArrayExtension
) {
1632 // Dict in an extension type: ok
1633 auto type
= dict_extension_type();
1634 auto chunk1
= DictExtensionFromJSON(type
, R
"(["ab
", null, "cd
", "ab
"])");
1635 auto chunk2
= DictExtensionFromJSON(type
, R
"(["ef
", "ab
", "ab
"])");
1636 ASSERT_OK_AND_ASSIGN(auto chunked
, ChunkedArray::Make({chunk1
, chunk2
}));
1638 ASSERT_OK_AND_ASSIGN(auto unified
, DictionaryUnifier::UnifyChunkedArray(chunked
));
1639 ASSERT_EQ(unified
->num_chunks(), 2);
1641 auto expected_dict
= ArrayFromJSON(utf8(), R
"(["ab
", "cd
", "ef
"])");
1642 auto unified1
= checked_pointer_cast
<ExtensionArray
>(unified
->chunk(0));
1643 AssertTypeEqual(*type
, *unified1
->type());
1644 CheckDictionaryArray(unified1
->storage(), expected_dict
,
1645 ArrayFromJSON(int8(), "[0, null, 1, 0]"));
1646 auto unified2
= checked_pointer_cast
<ExtensionArray
>(unified
->chunk(1));
1647 AssertTypeEqual(*type
, *unified2
->type());
1648 CheckDictionaryArray(unified2
->storage(), expected_dict
,
1649 ArrayFromJSON(int8(), "[2, 0, 0]"));
1652 TEST(TestDictionaryUnifier
, ChunkedArrayNestedDict
) {
1653 // Dict in a dict type: unsupported
1654 auto inner_type
= list(dictionary(uint32(), utf8()));
1655 auto inner_dict1
= ArrayFromJSON(inner_type
, R
"([["ab
", "cd
"], [], ["cd
", null]])");
1656 ASSERT_OK_AND_ASSIGN(
1657 auto chunk1
, DictionaryArray::FromArrays(ArrayFromJSON(int32(), "[2, 1, 0, 1, 2]"),
1659 auto inner_dict2
= ArrayFromJSON(inner_type
, R
"([["cd
", "ef
"], ["cd
", null], []])");
1660 ASSERT_OK_AND_ASSIGN(
1662 DictionaryArray::FromArrays(ArrayFromJSON(int32(), "[1, 2, 2, 0]"), inner_dict2
));
1663 ASSERT_OK_AND_ASSIGN(auto chunked
, ChunkedArray::Make({chunk1
, chunk2
}));
1665 ASSERT_RAISES(NotImplemented
, DictionaryUnifier::UnifyChunkedArray(chunked
));
1668 TEST(TestDictionaryUnifier
, TableZeroColumns
) {
1669 auto schema
= ::arrow::schema(FieldVector
{});
1670 auto table
= Table::Make(schema
, ArrayVector
{}, /*num_rows=*/42);
1672 ASSERT_OK_AND_ASSIGN(auto unified
, DictionaryUnifier::UnifyTable(*table
));
1673 AssertSchemaEqual(*schema
, *unified
->schema());
1674 ASSERT_EQ(unified
->num_rows(), 42);
1675 AssertTablesEqual(*table
, *unified
);
1678 } // namespace arrow