]> git.proxmox.com Git - ceph.git/blob - ceph/src/arrow/cpp/src/arrow/array/array_dict_test.cc
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / cpp / src / arrow / array / array_dict_test.cc
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17
18 #include <array>
19 #include <cstdint>
20 #include <memory>
21 #include <ostream>
22 #include <string>
23 #include <vector>
24
25 #include <gtest/gtest.h>
26
27 #include "arrow/array.h"
28 #include "arrow/array/builder_decimal.h"
29 #include "arrow/array/builder_dict.h"
30 #include "arrow/array/builder_nested.h"
31 #include "arrow/chunked_array.h"
32 #include "arrow/status.h"
33 #include "arrow/table.h"
34 #include "arrow/testing/extension_type.h"
35 #include "arrow/testing/gtest_common.h"
36 #include "arrow/testing/gtest_util.h"
37 #include "arrow/testing/util.h"
38 #include "arrow/type.h"
39 #include "arrow/util/checked_cast.h"
40 #include "arrow/util/decimal.h"
41
42 namespace arrow {
43
44 using internal::checked_cast;
45 using internal::checked_pointer_cast;
46
47 void CheckTransposeMap(const Buffer& map, std::vector<int32_t> expected) {
48 AssertBufferEqual(map, *Buffer::Wrap(expected));
49 }
50
51 void CheckDictionaryArray(const std::shared_ptr<Array>& array,
52 const std::shared_ptr<Array>& expected_values,
53 const std::shared_ptr<Array>& expected_indices) {
54 const auto& dict_array = checked_cast<const DictionaryArray&>(*array);
55 AssertArraysEqual(*expected_values, *dict_array.dictionary(), /*verbose=*/true);
56 AssertArraysEqual(*expected_indices, *dict_array.indices(), /*verbose=*/true);
57 }
58
59 std::shared_ptr<Array> DictExtensionFromJSON(const std::shared_ptr<DataType>& type,
60 const std::string& json) {
61 auto ext_type = checked_pointer_cast<ExtensionType>(type);
62 auto storage = ArrayFromJSON(ext_type->storage_type(), json);
63 auto ext_data = storage->data()->Copy();
64 ext_data->type = ext_type;
65 return MakeArray(ext_data);
66 }
67
68 // ----------------------------------------------------------------------
69 // Dictionary tests
70
71 template <typename Type>
72 class TestDictionaryBuilder : public TestBuilder {};
73
74 typedef ::testing::Types<Int8Type, UInt8Type, Int16Type, UInt16Type, Int32Type,
75 UInt32Type, Int64Type, UInt64Type, FloatType, DoubleType>
76 PrimitiveDictionaries;
77
78 TYPED_TEST_SUITE(TestDictionaryBuilder, PrimitiveDictionaries);
79
80 TYPED_TEST(TestDictionaryBuilder, Basic) {
81 using c_type = typename TypeParam::c_type;
82
83 DictionaryBuilder<TypeParam> builder;
84 ASSERT_OK(builder.Append(static_cast<c_type>(1)));
85 ASSERT_OK(builder.Append(static_cast<c_type>(2)));
86 ASSERT_OK(builder.Append(static_cast<c_type>(1)));
87 ASSERT_OK(builder.AppendNull());
88
89 ASSERT_EQ(builder.length(), 4);
90 ASSERT_EQ(builder.null_count(), 1);
91
92 // Build expected data
93 auto value_type = std::make_shared<TypeParam>();
94 auto dict_type = dictionary(int8(), value_type);
95
96 std::shared_ptr<Array> result;
97 ASSERT_OK(builder.Finish(&result));
98
99 DictionaryArray expected(dict_type, ArrayFromJSON(int8(), "[0, 1, 0, null]"),
100 ArrayFromJSON(value_type, "[1, 2]"));
101 ASSERT_TRUE(expected.Equals(result));
102 }
103
104 TYPED_TEST(TestDictionaryBuilder, ArrayInit) {
105 using c_type = typename TypeParam::c_type;
106
107 auto value_type = std::make_shared<TypeParam>();
108 auto dict_array = ArrayFromJSON(value_type, "[1, 2]");
109 auto dict_type = dictionary(int8(), value_type);
110
111 DictionaryBuilder<TypeParam> builder(dict_array);
112 ASSERT_OK(builder.Append(static_cast<c_type>(1)));
113 ASSERT_OK(builder.Append(static_cast<c_type>(2)));
114 ASSERT_OK(builder.Append(static_cast<c_type>(1)));
115 ASSERT_OK(builder.AppendNull());
116
117 ASSERT_EQ(builder.length(), 4);
118 ASSERT_EQ(builder.null_count(), 1);
119
120 // Build expected data
121
122 std::shared_ptr<Array> result;
123 ASSERT_OK(builder.Finish(&result));
124
125 auto indices = ArrayFromJSON(int8(), "[0, 1, 0, null]");
126 DictionaryArray expected(dict_type, indices, dict_array);
127
128 AssertArraysEqual(expected, *result);
129 }
130
131 TYPED_TEST(TestDictionaryBuilder, MakeBuilder) {
132 using c_type = typename TypeParam::c_type;
133
134 auto value_type = std::make_shared<TypeParam>();
135 auto dict_array = ArrayFromJSON(value_type, "[1, 2]");
136 auto dict_type = dictionary(int8(), value_type);
137 std::unique_ptr<ArrayBuilder> boxed_builder;
138 ASSERT_OK(MakeBuilder(default_memory_pool(), dict_type, &boxed_builder));
139 auto& builder = checked_cast<DictionaryBuilder<TypeParam>&>(*boxed_builder);
140
141 ASSERT_OK(builder.Append(static_cast<c_type>(1)));
142 ASSERT_OK(builder.Append(static_cast<c_type>(2)));
143 ASSERT_OK(builder.Append(static_cast<c_type>(1)));
144 ASSERT_OK(builder.AppendNull());
145
146 ASSERT_EQ(builder.length(), 4);
147 ASSERT_EQ(builder.null_count(), 1);
148
149 // Build expected data
150
151 std::shared_ptr<Array> result;
152 ASSERT_OK(builder.Finish(&result));
153
154 auto int_array = ArrayFromJSON(int8(), "[0, 1, 0, null]");
155 DictionaryArray expected(dict_type, int_array, dict_array);
156
157 AssertArraysEqual(expected, *result);
158 }
159
160 TYPED_TEST(TestDictionaryBuilder, ArrayConversion) {
161 auto type = std::make_shared<TypeParam>();
162
163 auto intermediate_result = ArrayFromJSON(type, "[1, 2, 1]");
164 DictionaryBuilder<TypeParam> dictionary_builder;
165 ASSERT_OK(dictionary_builder.AppendArray(*intermediate_result));
166 std::shared_ptr<Array> result;
167 ASSERT_OK(dictionary_builder.Finish(&result));
168
169 // Build expected data
170 auto dict_array = ArrayFromJSON(type, "[1, 2]");
171 auto dict_type = dictionary(int8(), type);
172
173 auto int_array = ArrayFromJSON(int8(), "[0, 1, 0]");
174 DictionaryArray expected(dict_type, int_array, dict_array);
175
176 ASSERT_TRUE(expected.Equals(result));
177 }
178
179 TYPED_TEST(TestDictionaryBuilder, DoubleTableSize) {
180 using Scalar = typename TypeParam::c_type;
181 // Skip this test for (u)int8
182 if (sizeof(Scalar) > 1) {
183 // Build the dictionary Array
184 DictionaryBuilder<TypeParam> builder;
185 // Build expected data
186 NumericBuilder<TypeParam> dict_builder;
187 Int16Builder int_builder;
188
189 // Fill with 1024 different values
190 for (int64_t i = 0; i < 1024; i++) {
191 ASSERT_OK(builder.Append(static_cast<Scalar>(i)));
192 ASSERT_OK(dict_builder.Append(static_cast<Scalar>(i)));
193 ASSERT_OK(int_builder.Append(static_cast<uint16_t>(i)));
194 }
195 // Fill with an already existing value
196 for (int64_t i = 0; i < 1024; i++) {
197 ASSERT_OK(builder.Append(static_cast<Scalar>(1)));
198 ASSERT_OK(int_builder.Append(1));
199 }
200
201 // Finalize result
202 std::shared_ptr<Array> result;
203 FinishAndCheckPadding(&builder, &result);
204
205 // Finalize expected data
206 std::shared_ptr<Array> dict_array;
207 ASSERT_OK(dict_builder.Finish(&dict_array));
208
209 auto dtype = dictionary(int16(), dict_array->type());
210 std::shared_ptr<Array> int_array;
211 ASSERT_OK(int_builder.Finish(&int_array));
212
213 DictionaryArray expected(dtype, int_array, dict_array);
214 AssertArraysEqual(expected, *result);
215 }
216 }
217
218 TYPED_TEST(TestDictionaryBuilder, DeltaDictionary) {
219 using c_type = typename TypeParam::c_type;
220 auto type = std::make_shared<TypeParam>();
221
222 DictionaryBuilder<TypeParam> builder;
223
224 ASSERT_OK(builder.Append(static_cast<c_type>(1)));
225 ASSERT_OK(builder.Append(static_cast<c_type>(2)));
226 ASSERT_OK(builder.Append(static_cast<c_type>(1)));
227 ASSERT_OK(builder.Append(static_cast<c_type>(2)));
228 std::shared_ptr<Array> result;
229 FinishAndCheckPadding(&builder, &result);
230
231 // Build expected data for the initial dictionary
232 auto ex_dict = ArrayFromJSON(type, "[1, 2]");
233 auto dict_type1 = dictionary(int8(), type);
234 DictionaryArray expected(dict_type1, ArrayFromJSON(int8(), "[0, 1, 0, 1]"), ex_dict);
235
236 ASSERT_TRUE(expected.Equals(result));
237
238 // extend the dictionary builder with new data
239 ASSERT_OK(builder.Append(static_cast<c_type>(2)));
240 ASSERT_OK(builder.Append(static_cast<c_type>(3)));
241 ASSERT_OK(builder.Append(static_cast<c_type>(3)));
242 ASSERT_OK(builder.Append(static_cast<c_type>(1)));
243 ASSERT_OK(builder.Append(static_cast<c_type>(3)));
244
245 std::shared_ptr<Array> result_indices, result_delta;
246 ASSERT_OK(builder.FinishDelta(&result_indices, &result_delta));
247 AssertArraysEqual(*ArrayFromJSON(int8(), "[1, 2, 2, 0, 2]"), *result_indices);
248 AssertArraysEqual(*ArrayFromJSON(type, "[3]"), *result_delta);
249 }
250
251 TYPED_TEST(TestDictionaryBuilder, DoubleDeltaDictionary) {
252 using c_type = typename TypeParam::c_type;
253 auto type = std::make_shared<TypeParam>();
254 auto dict_type = dictionary(int8(), type);
255
256 DictionaryBuilder<TypeParam> builder;
257
258 ASSERT_OK(builder.Append(static_cast<c_type>(1)));
259 ASSERT_OK(builder.Append(static_cast<c_type>(2)));
260 ASSERT_OK(builder.Append(static_cast<c_type>(1)));
261 ASSERT_OK(builder.Append(static_cast<c_type>(2)));
262 std::shared_ptr<Array> result;
263 FinishAndCheckPadding(&builder, &result);
264
265 // Build expected data for the initial dictionary
266 auto ex_dict1 = ArrayFromJSON(type, "[1, 2]");
267 DictionaryArray expected(dict_type, ArrayFromJSON(int8(), "[0, 1, 0, 1]"), ex_dict1);
268
269 ASSERT_TRUE(expected.Equals(result));
270
271 // extend the dictionary builder with new data
272 ASSERT_OK(builder.Append(static_cast<c_type>(2)));
273 ASSERT_OK(builder.Append(static_cast<c_type>(3)));
274 ASSERT_OK(builder.Append(static_cast<c_type>(3)));
275 ASSERT_OK(builder.Append(static_cast<c_type>(1)));
276 ASSERT_OK(builder.Append(static_cast<c_type>(3)));
277
278 std::shared_ptr<Array> result_indices1, result_delta1;
279 ASSERT_OK(builder.FinishDelta(&result_indices1, &result_delta1));
280 AssertArraysEqual(*ArrayFromJSON(int8(), "[1, 2, 2, 0, 2]"), *result_indices1);
281 AssertArraysEqual(*ArrayFromJSON(type, "[3]"), *result_delta1);
282
283 // extend the dictionary builder with new data again
284 ASSERT_OK(builder.Append(static_cast<c_type>(1)));
285 ASSERT_OK(builder.Append(static_cast<c_type>(2)));
286 ASSERT_OK(builder.Append(static_cast<c_type>(3)));
287 ASSERT_OK(builder.Append(static_cast<c_type>(4)));
288 ASSERT_OK(builder.Append(static_cast<c_type>(5)));
289
290 std::shared_ptr<Array> result_indices2, result_delta2;
291 ASSERT_OK(builder.FinishDelta(&result_indices2, &result_delta2));
292 AssertArraysEqual(*ArrayFromJSON(int8(), "[0, 1, 2, 3, 4]"), *result_indices2);
293 AssertArraysEqual(*ArrayFromJSON(type, "[4, 5]"), *result_delta2);
294 }
295
296 TYPED_TEST(TestDictionaryBuilder, Dictionary32_BasicPrimitive) {
297 using c_type = typename TypeParam::c_type;
298 auto type = std::make_shared<TypeParam>();
299 auto dict_type = dictionary(int32(), type);
300
301 Dictionary32Builder<TypeParam> builder;
302
303 ASSERT_OK(builder.Append(static_cast<c_type>(1)));
304 ASSERT_OK(builder.Append(static_cast<c_type>(2)));
305 ASSERT_OK(builder.Append(static_cast<c_type>(1)));
306 ASSERT_OK(builder.Append(static_cast<c_type>(2)));
307 std::shared_ptr<Array> result;
308 FinishAndCheckPadding(&builder, &result);
309
310 // Build expected data for the initial dictionary
311 auto ex_dict1 = ArrayFromJSON(type, "[1, 2]");
312 DictionaryArray expected(dict_type, ArrayFromJSON(int32(), "[0, 1, 0, 1]"), ex_dict1);
313 ASSERT_TRUE(expected.Equals(result));
314 }
315
316 TYPED_TEST(TestDictionaryBuilder, FinishResetBehavior) {
317 // ARROW-6861
318 using c_type = typename TypeParam::c_type;
319 auto type = std::make_shared<TypeParam>();
320
321 Dictionary32Builder<TypeParam> builder;
322
323 ASSERT_OK(builder.Append(static_cast<c_type>(1)));
324 ASSERT_OK(builder.AppendNull());
325 ASSERT_OK(builder.Append(static_cast<c_type>(1)));
326 ASSERT_OK(builder.Append(static_cast<c_type>(2)));
327
328 // Properties from indices_builder propagated
329 ASSERT_LT(0, builder.capacity());
330 ASSERT_LT(0, builder.null_count());
331 ASSERT_EQ(4, builder.length());
332
333 std::shared_ptr<Array> result;
334 ASSERT_OK(builder.Finish(&result));
335
336 // Everything reset
337 ASSERT_EQ(0, builder.capacity());
338 ASSERT_EQ(0, builder.length());
339 ASSERT_EQ(0, builder.null_count());
340
341 // Use the builder again
342 ASSERT_OK(builder.Append(static_cast<c_type>(3)));
343 ASSERT_OK(builder.AppendNull());
344 ASSERT_OK(builder.Append(static_cast<c_type>(4)));
345
346 ASSERT_OK(builder.Finish(&result));
347
348 // Dictionary has 4 elements because the dictionary memo was not reset
349 ASSERT_EQ(4, static_cast<const DictionaryArray&>(*result).dictionary()->length());
350 }
351
352 TYPED_TEST(TestDictionaryBuilder, ResetFull) {
353 using c_type = typename TypeParam::c_type;
354 auto type = std::make_shared<TypeParam>();
355
356 Dictionary32Builder<TypeParam> builder;
357
358 ASSERT_OK(builder.Append(static_cast<c_type>(1)));
359 ASSERT_OK(builder.AppendNull());
360 ASSERT_OK(builder.Append(static_cast<c_type>(1)));
361 ASSERT_OK(builder.Append(static_cast<c_type>(2)));
362
363 std::shared_ptr<Array> result;
364 ASSERT_OK(builder.Finish(&result));
365
366 ASSERT_OK(builder.Append(static_cast<c_type>(3)));
367 ASSERT_OK(builder.Finish(&result));
368
369 // Dictionary expanded
370 const auto& dict_result = static_cast<const DictionaryArray&>(*result);
371 AssertArraysEqual(*ArrayFromJSON(int32(), "[2]"), *dict_result.indices());
372 AssertArraysEqual(*ArrayFromJSON(type, "[1, 2, 3]"),
373 *static_cast<const DictionaryArray&>(*result).dictionary());
374
375 builder.ResetFull();
376 ASSERT_OK(builder.Append(static_cast<c_type>(4)));
377 ASSERT_OK(builder.Finish(&result));
378 const auto& dict_result2 = static_cast<const DictionaryArray&>(*result);
379 AssertArraysEqual(*ArrayFromJSON(int32(), "[0]"), *dict_result2.indices());
380 AssertArraysEqual(*ArrayFromJSON(type, "[4]"), *dict_result2.dictionary());
381 }
382
383 TEST(TestDictionaryBuilderAdHoc, AppendIndicesUpdateCapacity) {
384 DictionaryBuilder<Int32Type> builder;
385 Dictionary32Builder<Int32Type> builder32;
386
387 std::vector<int32_t> indices_i32 = {0, 1, 2};
388 std::vector<int64_t> indices_i64 = {0, 1, 2};
389
390 ASSERT_OK(builder.AppendIndices(indices_i64.data(), 3));
391 ASSERT_OK(builder32.AppendIndices(indices_i32.data(), 3));
392
393 ASSERT_LT(0, builder.capacity());
394 ASSERT_LT(0, builder32.capacity());
395 }
396
397 TEST(TestStringDictionaryBuilder, Basic) {
398 // Build the dictionary Array
399 StringDictionaryBuilder builder;
400 ASSERT_OK(builder.Append("test"));
401 ASSERT_OK(builder.Append("test2"));
402 ASSERT_OK(builder.Append("test", 4));
403
404 std::shared_ptr<Array> result;
405 ASSERT_OK(builder.Finish(&result));
406
407 // Build expected data
408 auto ex_dict = ArrayFromJSON(utf8(), "[\"test\", \"test2\"]");
409 auto dtype = dictionary(int8(), utf8());
410 auto int_array = ArrayFromJSON(int8(), "[0, 1, 0]");
411 DictionaryArray expected(dtype, int_array, ex_dict);
412
413 ASSERT_TRUE(expected.Equals(result));
414 }
415
416 template <typename BuilderType, typename IndexType, typename AppendCType>
417 void TestStringDictionaryAppendIndices() {
418 auto index_type = TypeTraits<IndexType>::type_singleton();
419
420 auto ex_dict = ArrayFromJSON(utf8(), R"(["c", "a", "b", "d"])");
421 auto invalid_dict = ArrayFromJSON(binary(), R"(["e", "f"])");
422
423 BuilderType builder;
424 ASSERT_OK(builder.InsertMemoValues(*ex_dict));
425
426 // Inserting again should have no effect
427 ASSERT_OK(builder.InsertMemoValues(*ex_dict));
428
429 // Type mismatch
430 ASSERT_RAISES(Invalid, builder.InsertMemoValues(*invalid_dict));
431
432 std::vector<AppendCType> raw_indices = {0, 1, 2, -1, 3};
433 std::vector<uint8_t> is_valid = {1, 1, 1, 0, 1};
434 for (int i = 0; i < 2; ++i) {
435 ASSERT_OK(builder.AppendIndices(
436 raw_indices.data(), static_cast<int64_t>(raw_indices.size()), is_valid.data()));
437 }
438
439 ASSERT_EQ(10, builder.length());
440
441 std::shared_ptr<Array> result;
442 ASSERT_OK(builder.Finish(&result));
443
444 auto ex_indices = ArrayFromJSON(index_type, R"([0, 1, 2, null, 3, 0, 1, 2, null, 3])");
445 auto dtype = dictionary(index_type, utf8());
446 DictionaryArray expected(dtype, ex_indices, ex_dict);
447 ASSERT_TRUE(expected.Equals(result));
448 }
449
450 TEST(TestStringDictionaryBuilder, AppendIndices) {
451 // Currently AdaptiveIntBuilder only accepts int64_t in bulk appends
452 TestStringDictionaryAppendIndices<StringDictionaryBuilder, Int8Type, int64_t>();
453
454 TestStringDictionaryAppendIndices<StringDictionary32Builder, Int32Type, int32_t>();
455 }
456
457 TEST(TestStringDictionaryBuilder, ArrayInit) {
458 auto dict_array = ArrayFromJSON(utf8(), R"(["test", "test2"])");
459 auto int_array = ArrayFromJSON(int8(), "[0, 1, 0]");
460
461 // Build the dictionary Array
462 StringDictionaryBuilder builder(dict_array);
463 ASSERT_OK(builder.Append("test"));
464 ASSERT_OK(builder.Append("test2"));
465 ASSERT_OK(builder.Append("test"));
466
467 std::shared_ptr<Array> result;
468 ASSERT_OK(builder.Finish(&result));
469
470 // Build expected data
471 DictionaryArray expected(dictionary(int8(), utf8()), int_array, dict_array);
472
473 AssertArraysEqual(expected, *result);
474 }
475
476 template <typename BuilderType>
477 void TestStringDictionaryMakeBuilder(const std::shared_ptr<DataType>& value_type) {
478 auto dict_array = ArrayFromJSON(value_type, R"(["test", "test2"])");
479 auto dict_type = dictionary(int8(), value_type);
480 auto int_array = ArrayFromJSON(int8(), "[0, 1, 0]");
481 std::unique_ptr<ArrayBuilder> boxed_builder;
482 ASSERT_OK(MakeBuilder(default_memory_pool(), dict_type, &boxed_builder));
483 auto& builder = checked_cast<BuilderType&>(*boxed_builder);
484
485 // Build the dictionary Array
486 ASSERT_OK(builder.Append("test"));
487 ASSERT_OK(builder.Append("test2"));
488 ASSERT_OK(builder.Append("test"));
489
490 std::shared_ptr<Array> result;
491 ASSERT_OK(builder.Finish(&result));
492
493 // Build expected data
494 DictionaryArray expected(dict_type, int_array, dict_array);
495
496 AssertArraysEqual(expected, *result);
497 }
498
499 TEST(TestStringDictionaryBuilder, MakeBuilder) {
500 TestStringDictionaryMakeBuilder<DictionaryBuilder<StringType>>(utf8());
501 }
502
503 TEST(TestLargeStringDictionaryBuilder, MakeBuilder) {
504 TestStringDictionaryMakeBuilder<DictionaryBuilder<LargeStringType>>(large_utf8());
505 }
506
507 // ARROW-4367
508 TEST(TestStringDictionaryBuilder, OnlyNull) {
509 // Build the dictionary Array
510 StringDictionaryBuilder builder;
511 ASSERT_OK(builder.AppendNull());
512
513 std::shared_ptr<Array> result;
514 ASSERT_OK(builder.Finish(&result));
515
516 // Build expected data
517 auto dict = ArrayFromJSON(utf8(), "[]");
518 auto dtype = dictionary(int8(), utf8());
519 auto int_array = ArrayFromJSON(int8(), "[null]");
520 DictionaryArray expected(dtype, int_array, dict);
521
522 ASSERT_TRUE(expected.Equals(result));
523 }
524
525 TEST(TestStringDictionaryBuilder, DoubleTableSize) {
526 // Build the dictionary Array
527 StringDictionaryBuilder builder;
528 // Build expected data
529 StringBuilder str_builder;
530 Int16Builder int_builder;
531
532 // Fill with 1024 different values
533 for (int64_t i = 0; i < 1024; i++) {
534 std::stringstream ss;
535 ss << "test" << i;
536 ASSERT_OK(builder.Append(ss.str()));
537 ASSERT_OK(str_builder.Append(ss.str()));
538 ASSERT_OK(int_builder.Append(static_cast<uint16_t>(i)));
539 }
540 // Fill with an already existing value
541 for (int64_t i = 0; i < 1024; i++) {
542 ASSERT_OK(builder.Append("test1"));
543 ASSERT_OK(int_builder.Append(1));
544 }
545
546 // Finalize result
547 std::shared_ptr<Array> result;
548 FinishAndCheckPadding(&builder, &result);
549
550 // Finalize expected data
551 std::shared_ptr<Array> str_array;
552 ASSERT_OK(str_builder.Finish(&str_array));
553 auto dtype = dictionary(int16(), utf8());
554 std::shared_ptr<Array> int_array;
555 ASSERT_OK(int_builder.Finish(&int_array));
556
557 DictionaryArray expected(dtype, int_array, str_array);
558 ASSERT_TRUE(expected.Equals(result));
559 }
560
561 TEST(TestStringDictionaryBuilder, DeltaDictionary) {
562 // Build the dictionary Array
563 StringDictionaryBuilder builder;
564 ASSERT_OK(builder.Append("test"));
565 ASSERT_OK(builder.Append("test2"));
566 ASSERT_OK(builder.Append("test"));
567
568 std::shared_ptr<Array> result;
569 ASSERT_OK(builder.Finish(&result));
570
571 // Build expected data
572 auto dict = ArrayFromJSON(utf8(), "[\"test\", \"test2\"]");
573 auto dtype = dictionary(int8(), utf8());
574 auto int_array = ArrayFromJSON(int8(), "[0, 1, 0]");
575 DictionaryArray expected(dtype, int_array, dict);
576
577 ASSERT_TRUE(expected.Equals(result));
578
579 // build a delta dictionary
580 ASSERT_OK(builder.Append("test2"));
581 ASSERT_OK(builder.Append("test3"));
582 ASSERT_OK(builder.Append("test2"));
583
584 std::shared_ptr<Array> result_indices, result_delta;
585 ASSERT_OK(builder.FinishDelta(&result_indices, &result_delta));
586
587 // Build expected data
588 AssertArraysEqual(*ArrayFromJSON(int8(), "[1, 2, 1]"), *result_indices);
589 AssertArraysEqual(*ArrayFromJSON(utf8(), "[\"test3\"]"), *result_delta);
590 }
591
592 TEST(TestStringDictionaryBuilder, BigDeltaDictionary) {
593 constexpr int16_t kTestLength = 2048;
594 // Build the dictionary Array
595 StringDictionaryBuilder builder;
596
597 StringBuilder str_builder1;
598 Int16Builder int_builder1;
599
600 for (int16_t idx = 0; idx < kTestLength; ++idx) {
601 std::stringstream sstream;
602 sstream << "test" << idx;
603 ASSERT_OK(builder.Append(sstream.str()));
604 ASSERT_OK(str_builder1.Append(sstream.str()));
605 ASSERT_OK(int_builder1.Append(idx));
606 }
607
608 std::shared_ptr<Array> result;
609 FinishAndCheckPadding(&builder, &result);
610
611 std::shared_ptr<Array> str_array1;
612 ASSERT_OK(str_builder1.Finish(&str_array1));
613
614 auto dtype1 = dictionary(int16(), utf8());
615
616 std::shared_ptr<Array> int_array1;
617 ASSERT_OK(int_builder1.Finish(&int_array1));
618
619 DictionaryArray expected(dtype1, int_array1, str_array1);
620 ASSERT_TRUE(expected.Equals(result));
621
622 // build delta 1
623 StringBuilder str_builder2;
624 Int16Builder int_builder2;
625
626 for (int16_t idx = 0; idx < kTestLength; ++idx) {
627 ASSERT_OK(builder.Append("test1"));
628 ASSERT_OK(int_builder2.Append(1));
629 }
630
631 for (int16_t idx = 0; idx < kTestLength; ++idx) {
632 ASSERT_OK(builder.Append("test_new_value1"));
633 ASSERT_OK(int_builder2.Append(kTestLength));
634 }
635 ASSERT_OK(str_builder2.Append("test_new_value1"));
636
637 std::shared_ptr<Array> indices2, delta2;
638 ASSERT_OK(builder.FinishDelta(&indices2, &delta2));
639
640 std::shared_ptr<Array> str_array2;
641 ASSERT_OK(str_builder2.Finish(&str_array2));
642
643 std::shared_ptr<Array> int_array2;
644 ASSERT_OK(int_builder2.Finish(&int_array2));
645
646 AssertArraysEqual(*int_array2, *indices2);
647 AssertArraysEqual(*str_array2, *delta2);
648
649 // build delta 2
650 StringBuilder str_builder3;
651 Int16Builder int_builder3;
652
653 for (int16_t idx = 0; idx < kTestLength; ++idx) {
654 ASSERT_OK(builder.Append("test2"));
655 ASSERT_OK(int_builder3.Append(2));
656 }
657
658 for (int16_t idx = 0; idx < kTestLength; ++idx) {
659 ASSERT_OK(builder.Append("test_new_value2"));
660 ASSERT_OK(int_builder3.Append(kTestLength + 1));
661 }
662 ASSERT_OK(str_builder3.Append("test_new_value2"));
663
664 std::shared_ptr<Array> indices3, delta3;
665 ASSERT_OK(builder.FinishDelta(&indices3, &delta3));
666
667 std::shared_ptr<Array> str_array3;
668 ASSERT_OK(str_builder3.Finish(&str_array3));
669
670 std::shared_ptr<Array> int_array3;
671 ASSERT_OK(int_builder3.Finish(&int_array3));
672
673 AssertArraysEqual(*int_array3, *indices3);
674 AssertArraysEqual(*str_array3, *delta3);
675 }
676
677 TEST(TestFixedSizeBinaryDictionaryBuilder, Basic) {
678 // Build the dictionary Array
679 DictionaryBuilder<FixedSizeBinaryType> builder(arrow::fixed_size_binary(4));
680 std::vector<uint8_t> test{12, 12, 11, 12};
681 std::vector<uint8_t> test2{12, 12, 11, 11};
682 ASSERT_OK(builder.Append(test.data()));
683 ASSERT_OK(builder.Append(test2.data()));
684 ASSERT_OK(builder.Append(test.data()));
685
686 std::shared_ptr<Array> result;
687 FinishAndCheckPadding(&builder, &result);
688
689 // Build expected data
690 auto value_type = arrow::fixed_size_binary(4);
691 FixedSizeBinaryBuilder fsb_builder(value_type);
692 ASSERT_OK(fsb_builder.Append(test.data()));
693 ASSERT_OK(fsb_builder.Append(test2.data()));
694 std::shared_ptr<Array> fsb_array;
695 ASSERT_OK(fsb_builder.Finish(&fsb_array));
696
697 auto dtype = dictionary(int8(), value_type);
698
699 Int8Builder int_builder;
700 ASSERT_OK(int_builder.Append(0));
701 ASSERT_OK(int_builder.Append(1));
702 ASSERT_OK(int_builder.Append(0));
703 std::shared_ptr<Array> int_array;
704 ASSERT_OK(int_builder.Finish(&int_array));
705
706 DictionaryArray expected(dtype, int_array, fsb_array);
707 ASSERT_TRUE(expected.Equals(result));
708 }
709
710 TEST(TestFixedSizeBinaryDictionaryBuilder, ArrayInit) {
711 // Build the dictionary Array
712 auto value_type = fixed_size_binary(4);
713 auto dict_array = ArrayFromJSON(value_type, R"(["abcd", "wxyz"])");
714 util::string_view test = "abcd", test2 = "wxyz";
715 DictionaryBuilder<FixedSizeBinaryType> builder(dict_array);
716 ASSERT_OK(builder.Append(test));
717 ASSERT_OK(builder.Append(test2));
718 ASSERT_OK(builder.Append(test));
719
720 std::shared_ptr<Array> result;
721 FinishAndCheckPadding(&builder, &result);
722
723 // Build expected data
724 auto indices = ArrayFromJSON(int8(), "[0, 1, 0]");
725 DictionaryArray expected(dictionary(int8(), value_type), indices, dict_array);
726 AssertArraysEqual(expected, *result);
727 }
728
729 TEST(TestFixedSizeBinaryDictionaryBuilder, MakeBuilder) {
730 // Build the dictionary Array
731 auto value_type = fixed_size_binary(4);
732 auto dict_array = ArrayFromJSON(value_type, R"(["abcd", "wxyz"])");
733 auto dict_type = dictionary(int8(), value_type);
734
735 std::unique_ptr<ArrayBuilder> boxed_builder;
736 ASSERT_OK(MakeBuilder(default_memory_pool(), dict_type, &boxed_builder));
737 auto& builder = checked_cast<DictionaryBuilder<FixedSizeBinaryType>&>(*boxed_builder);
738 util::string_view test = "abcd", test2 = "wxyz";
739 ASSERT_OK(builder.Append(test));
740 ASSERT_OK(builder.Append(test2));
741 ASSERT_OK(builder.Append(test));
742
743 std::shared_ptr<Array> result;
744 FinishAndCheckPadding(&builder, &result);
745
746 // Build expected data
747 auto indices = ArrayFromJSON(int8(), "[0, 1, 0]");
748 DictionaryArray expected(dict_type, indices, dict_array);
749 AssertArraysEqual(expected, *result);
750 }
751
752 TEST(TestFixedSizeBinaryDictionaryBuilder, DeltaDictionary) {
753 // Build the dictionary Array
754 auto value_type = arrow::fixed_size_binary(4);
755 auto dict_type = dictionary(int8(), value_type);
756
757 DictionaryBuilder<FixedSizeBinaryType> builder(value_type);
758 std::vector<uint8_t> test{12, 12, 11, 12};
759 std::vector<uint8_t> test2{12, 12, 11, 11};
760 std::vector<uint8_t> test3{12, 12, 11, 10};
761
762 ASSERT_OK(builder.Append(test.data()));
763 ASSERT_OK(builder.Append(test2.data()));
764 ASSERT_OK(builder.Append(test.data()));
765
766 std::shared_ptr<Array> result1;
767 FinishAndCheckPadding(&builder, &result1);
768
769 // Build expected data
770 FixedSizeBinaryBuilder fsb_builder1(value_type);
771 ASSERT_OK(fsb_builder1.Append(test.data()));
772 ASSERT_OK(fsb_builder1.Append(test2.data()));
773 std::shared_ptr<Array> fsb_array1;
774 ASSERT_OK(fsb_builder1.Finish(&fsb_array1));
775
776 Int8Builder int_builder1;
777 ASSERT_OK(int_builder1.Append(0));
778 ASSERT_OK(int_builder1.Append(1));
779 ASSERT_OK(int_builder1.Append(0));
780 std::shared_ptr<Array> int_array1;
781 ASSERT_OK(int_builder1.Finish(&int_array1));
782
783 DictionaryArray expected1(dict_type, int_array1, fsb_array1);
784 ASSERT_TRUE(expected1.Equals(result1));
785
786 // build delta dictionary
787 ASSERT_OK(builder.Append(test.data()));
788 ASSERT_OK(builder.Append(test2.data()));
789 ASSERT_OK(builder.Append(test3.data()));
790
791 std::shared_ptr<Array> indices2, delta2;
792 ASSERT_OK(builder.FinishDelta(&indices2, &delta2));
793
794 // Build expected data
795 FixedSizeBinaryBuilder fsb_builder2(value_type);
796 ASSERT_OK(fsb_builder2.Append(test3.data()));
797 std::shared_ptr<Array> fsb_array2;
798 ASSERT_OK(fsb_builder2.Finish(&fsb_array2));
799
800 Int8Builder int_builder2;
801 ASSERT_OK(int_builder2.Append(0));
802 ASSERT_OK(int_builder2.Append(1));
803 ASSERT_OK(int_builder2.Append(2));
804
805 std::shared_ptr<Array> int_array2;
806 ASSERT_OK(int_builder2.Finish(&int_array2));
807
808 AssertArraysEqual(*int_array2, *indices2);
809 AssertArraysEqual(*fsb_array2, *delta2);
810 }
811
812 TEST(TestFixedSizeBinaryDictionaryBuilder, DoubleTableSize) {
813 // Build the dictionary Array
814 auto value_type = arrow::fixed_size_binary(4);
815 auto dict_type = dictionary(int16(), value_type);
816
817 DictionaryBuilder<FixedSizeBinaryType> builder(value_type);
818 // Build expected data
819 FixedSizeBinaryBuilder fsb_builder(value_type);
820 Int16Builder int_builder;
821
822 // Fill with 1024 different values
823 for (int64_t i = 0; i < 1024; i++) {
824 std::vector<uint8_t> value{12, 12, static_cast<uint8_t>(i / 128),
825 static_cast<uint8_t>(i % 128)};
826 ASSERT_OK(builder.Append(value.data()));
827 ASSERT_OK(fsb_builder.Append(value.data()));
828 ASSERT_OK(int_builder.Append(static_cast<uint16_t>(i)));
829 }
830 // Fill with an already existing value
831 std::vector<uint8_t> known_value{12, 12, 0, 1};
832 for (int64_t i = 0; i < 1024; i++) {
833 ASSERT_OK(builder.Append(known_value.data()));
834 ASSERT_OK(int_builder.Append(1));
835 }
836
837 // Finalize result
838 std::shared_ptr<Array> result;
839 ASSERT_OK(builder.Finish(&result));
840
841 // Finalize expected data
842 std::shared_ptr<Array> fsb_array;
843 ASSERT_OK(fsb_builder.Finish(&fsb_array));
844 std::shared_ptr<Array> int_array;
845 ASSERT_OK(int_builder.Finish(&int_array));
846
847 DictionaryArray expected(dict_type, int_array, fsb_array);
848 ASSERT_TRUE(expected.Equals(result));
849 }
850
851 #ifndef NDEBUG
852 TEST(TestFixedSizeBinaryDictionaryBuilder, AppendArrayInvalidType) {
853 // Build the dictionary Array
854 auto value_type = fixed_size_binary(4);
855 DictionaryBuilder<FixedSizeBinaryType> builder(value_type);
856 // Build an array with different byte width
857 auto fsb_array = ArrayFromJSON(fixed_size_binary(3), R"(["foo", "bar"])");
858
859 ASSERT_RAISES(TypeError, builder.AppendArray(*fsb_array));
860 }
861 #endif
862
863 template <typename DecimalValue>
864 void TestDecimalDictionaryBuilderBasic(std::shared_ptr<DataType> decimal_type) {
865 // Build the dictionary Array
866 DictionaryBuilder<FixedSizeBinaryType> builder(decimal_type);
867
868 // Test data
869 std::vector<DecimalValue> test{12, 12, 11, 12};
870 for (const auto& value : test) {
871 ASSERT_OK(builder.Append(value.ToBytes().data()));
872 }
873
874 std::shared_ptr<Array> result;
875 ASSERT_OK(builder.Finish(&result));
876
877 // Build expected data
878 DictionaryArray expected(dictionary(int8(), decimal_type),
879 ArrayFromJSON(int8(), "[0, 0, 1, 0]"),
880 ArrayFromJSON(decimal_type, "[\"12\", \"11\"]"));
881
882 ASSERT_TRUE(expected.Equals(result));
883 }
884
885 TEST(TestDecimal128DictionaryBuilder, Basic) {
886 TestDecimalDictionaryBuilderBasic<Decimal128>(arrow::decimal128(2, 0));
887 }
888
889 TEST(TestDecimal256DictionaryBuilder, Basic) {
890 TestDecimalDictionaryBuilderBasic<Decimal256>(arrow::decimal256(76, 0));
891 }
892
893 void TestDecimalDictionaryBuilderDoubleTableSize(
894 std::shared_ptr<DataType> decimal_type, FixedSizeBinaryBuilder& decimal_builder) {
895 // Build the dictionary Array
896 DictionaryBuilder<FixedSizeBinaryType> dict_builder(decimal_type);
897
898 // Build expected data
899 Int16Builder int_builder;
900
901 // Fill with 1024 different values
902 for (int64_t i = 0; i < 1024; i++) {
903 // Decimal256Builder takes 32 bytes, while Decimal128Builder takes only the first 16
904 // bytes.
905 const uint8_t bytes[32] = {0,
906 0,
907 0,
908 0,
909 0,
910 0,
911 0,
912 0,
913 0,
914 0,
915 0,
916 0,
917 12,
918 12,
919 static_cast<uint8_t>(i / 128),
920 static_cast<uint8_t>(i % 128)};
921 ASSERT_OK(dict_builder.Append(bytes));
922 ASSERT_OK(decimal_builder.Append(bytes));
923 ASSERT_OK(int_builder.Append(static_cast<uint16_t>(i)));
924 }
925 // Fill with an already existing value
926 const uint8_t known_value[32] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 0, 1};
927 for (int64_t i = 0; i < 1024; i++) {
928 ASSERT_OK(dict_builder.Append(known_value));
929 ASSERT_OK(int_builder.Append(1));
930 }
931
932 // Finalize result
933 std::shared_ptr<Array> result;
934 ASSERT_OK(dict_builder.Finish(&result));
935
936 // Finalize expected data
937 std::shared_ptr<Array> decimal_array;
938 ASSERT_OK(decimal_builder.Finish(&decimal_array));
939
940 std::shared_ptr<Array> int_array;
941 ASSERT_OK(int_builder.Finish(&int_array));
942
943 DictionaryArray expected(dictionary(int16(), decimal_type), int_array, decimal_array);
944 ASSERT_TRUE(expected.Equals(result));
945 }
946
947 TEST(TestDecimal128DictionaryBuilder, DoubleTableSize) {
948 const auto& decimal_type = arrow::decimal128(21, 0);
949 Decimal128Builder decimal_builder(decimal_type);
950 TestDecimalDictionaryBuilderDoubleTableSize(decimal_type, decimal_builder);
951 }
952
953 TEST(TestDecimal256DictionaryBuilder, DoubleTableSize) {
954 const auto& decimal_type = arrow::decimal256(21, 0);
955 Decimal256Builder decimal_builder(decimal_type);
956 TestDecimalDictionaryBuilderDoubleTableSize(decimal_type, decimal_builder);
957 }
958
959 TEST(TestNullDictionaryBuilder, Basic) {
960 // MakeBuilder
961 auto dict_type = dictionary(int8(), null());
962 std::unique_ptr<ArrayBuilder> boxed_builder;
963 ASSERT_OK(MakeBuilder(default_memory_pool(), dict_type, &boxed_builder));
964 auto& builder = checked_cast<DictionaryBuilder<NullType>&>(*boxed_builder);
965
966 ASSERT_OK(builder.AppendNull());
967 ASSERT_OK(builder.AppendNull());
968 ASSERT_OK(builder.AppendNull());
969 ASSERT_EQ(3, builder.length());
970 ASSERT_EQ(3, builder.null_count());
971
972 ASSERT_OK(builder.AppendNulls(4));
973 ASSERT_EQ(7, builder.length());
974 ASSERT_EQ(7, builder.null_count());
975
976 auto null_array = ArrayFromJSON(null(), "[null, null, null, null]");
977 ASSERT_OK(builder.AppendArray(*null_array));
978 ASSERT_EQ(11, builder.length());
979 ASSERT_EQ(11, builder.null_count());
980
981 std::shared_ptr<Array> result;
982 ASSERT_OK(builder.Finish(&result));
983 AssertTypeEqual(*dict_type, *result->type());
984 ASSERT_EQ(11, result->length());
985 ASSERT_EQ(11, result->null_count());
986 }
987
988 #ifndef NDEBUG
989 TEST(TestNullDictionaryBuilder, AppendArrayInvalidType) {
990 // MakeBuilder
991 auto dict_type = dictionary(int8(), null());
992 std::unique_ptr<ArrayBuilder> boxed_builder;
993 ASSERT_OK(MakeBuilder(default_memory_pool(), dict_type, &boxed_builder));
994 auto& builder = checked_cast<DictionaryBuilder<NullType>&>(*boxed_builder);
995
996 auto int8_array = ArrayFromJSON(int8(), "[0, 1, 0, null]");
997 ASSERT_RAISES(TypeError, builder.AppendArray(*int8_array));
998 }
999 #endif
1000
1001 // ----------------------------------------------------------------------
1002 // Index byte width tests
1003
1004 template <typename IndexType, typename ValueType>
1005 void AssertIndexByteWidth(const std::shared_ptr<DataType>& value_type =
1006 TypeTraits<ValueType>::type_singleton()) {
1007 auto index_type = TypeTraits<IndexType>::type_singleton();
1008 auto dict_type =
1009 checked_pointer_cast<DictionaryType>(dictionary(index_type, value_type));
1010 std::unique_ptr<ArrayBuilder> builder;
1011 ASSERT_OK(MakeBuilder(default_memory_pool(), dict_type, &builder));
1012 auto builder_dict_type = checked_pointer_cast<DictionaryType>(builder->type());
1013 AssertTypeEqual(dict_type->index_type(), builder_dict_type->index_type());
1014 }
1015
1016 typedef ::testing::Types<Int8Type, Int16Type, Int32Type, Int64Type> IndexTypes;
1017
1018 template <typename Type>
1019 class TestDictionaryBuilderIndexByteWidth : public TestBuilder {};
1020
1021 TYPED_TEST_SUITE(TestDictionaryBuilderIndexByteWidth, IndexTypes);
1022
1023 TYPED_TEST(TestDictionaryBuilderIndexByteWidth, MakeBuilder) {
1024 AssertIndexByteWidth<TypeParam, FloatType>();
1025 AssertIndexByteWidth<TypeParam, BinaryType>();
1026 AssertIndexByteWidth<TypeParam, StringType>();
1027 AssertIndexByteWidth<TypeParam, FixedSizeBinaryType>(fixed_size_binary(4));
1028 AssertIndexByteWidth<TypeParam, NullType>();
1029 }
1030
1031 // ----------------------------------------------------------------------
1032 // DictionaryArray tests
1033
1034 TEST(TestDictionary, Equals) {
1035 std::vector<bool> is_valid = {true, true, false, true, true, true};
1036 std::shared_ptr<Array> dict, dict2, indices, indices2, indices3;
1037
1038 dict = ArrayFromJSON(utf8(), "[\"foo\", \"bar\", \"baz\"]");
1039 std::shared_ptr<DataType> dict_type = dictionary(int16(), utf8());
1040
1041 dict2 = ArrayFromJSON(utf8(), "[\"foo\", \"bar\", \"baz\", \"qux\"]");
1042 std::shared_ptr<DataType> dict2_type = dictionary(int16(), utf8());
1043
1044 std::vector<int16_t> indices_values = {1, 2, -1, 0, 2, 0};
1045 ArrayFromVector<Int16Type, int16_t>(is_valid, indices_values, &indices);
1046
1047 std::vector<int16_t> indices2_values = {1, 2, 0, 0, 2, 0};
1048 ArrayFromVector<Int16Type, int16_t>(is_valid, indices2_values, &indices2);
1049
1050 std::vector<int16_t> indices3_values = {1, 1, 0, 0, 2, 0};
1051 ArrayFromVector<Int16Type, int16_t>(is_valid, indices3_values, &indices3);
1052
1053 auto array = std::make_shared<DictionaryArray>(dict_type, indices, dict);
1054 auto array2 = std::make_shared<DictionaryArray>(dict_type, indices2, dict);
1055 auto array3 = std::make_shared<DictionaryArray>(dict2_type, indices, dict2);
1056 auto array4 = std::make_shared<DictionaryArray>(dict_type, indices3, dict);
1057
1058 ASSERT_TRUE(array->Equals(array));
1059
1060 // Equal, because the unequal index is masked by null
1061 ASSERT_TRUE(array->Equals(array2));
1062
1063 // Unequal dictionaries
1064 ASSERT_FALSE(array->Equals(array3));
1065
1066 // Unequal indices
1067 ASSERT_FALSE(array->Equals(array4));
1068
1069 // RangeEquals
1070 ASSERT_TRUE(array->RangeEquals(3, 6, 3, array4));
1071 ASSERT_FALSE(array->RangeEquals(1, 3, 1, array4));
1072
1073 // ARROW-33 Test slices
1074 const int64_t size = array->length();
1075
1076 std::shared_ptr<Array> slice, slice2;
1077 slice = array->Array::Slice(2);
1078 slice2 = array->Array::Slice(2);
1079 ASSERT_EQ(size - 2, slice->length());
1080
1081 ASSERT_TRUE(slice->Equals(slice2));
1082 ASSERT_TRUE(array->RangeEquals(2, array->length(), 0, slice));
1083
1084 // Chained slices
1085 slice2 = array->Array::Slice(1)->Array::Slice(1);
1086 ASSERT_TRUE(slice->Equals(slice2));
1087
1088 slice = array->Slice(1, 3);
1089 slice2 = array->Slice(1, 3);
1090 ASSERT_EQ(3, slice->length());
1091
1092 ASSERT_TRUE(slice->Equals(slice2));
1093 ASSERT_TRUE(array->RangeEquals(1, 4, 0, slice));
1094 }
1095
// Checks ValidateFull() on dictionary arrays: a well-formed array passes,
// arrays whose dictionary is built from ArrayData with null buffers are
// reported as Invalid, and constructing a DictionaryArray with a null
// dictionary is expected to terminate the process.
TEST(TestDictionary, Validate) {
  auto dict = ArrayFromJSON(utf8(), "[\"foo\", \"bar\", \"baz\"]");
  auto dict_type = dictionary(int16(), utf8());

  auto indices = ArrayFromJSON(int16(), "[1, 2, null, 0, 2, 0]");
  std::shared_ptr<Array> arr =
      std::make_shared<DictionaryArray>(dict_type, indices, dict);

  // Only checking index type for now
  ASSERT_OK(arr->ValidateFull());

  // ARROW-7008: Invalid dict was not being validated
  // The dictionary below is a zero-length utf8 array with all three buffers null.
  std::vector<std::shared_ptr<Buffer>> buffers = {nullptr, nullptr, nullptr};
  auto invalid_data = std::make_shared<ArrayData>(utf8(), 0, buffers);

  indices = ArrayFromJSON(int16(), "[]");
  arr = std::make_shared<DictionaryArray>(dict_type, indices, MakeArray(invalid_data));
  ASSERT_RAISES(Invalid, arr->ValidateFull());

  // Make the data buffer non-null
  // NOTE(review): this assigns into the local `buffers` vector after it was
  // passed to the ArrayData constructor above. If that constructor copies the
  // vector (TODO confirm), `invalid_data` is unaffected and the check below
  // repeats the previous one rather than covering the non-null data buffer case.
  ASSERT_OK_AND_ASSIGN(buffers[2], AllocateBuffer(0));
  arr = std::make_shared<DictionaryArray>(dict_type, indices, MakeArray(invalid_data));
  ASSERT_RAISES(Invalid, arr->ValidateFull());

  // Death test: the empty pattern accepts any output written before termination.
  ASSERT_DEATH(
      {
        std::shared_ptr<Array> null_dict_arr =
            std::make_shared<DictionaryArray>(dict_type, indices, nullptr);
      },
      "");
}
1127
1128 TEST(TestDictionary, FromArrays) {
1129 auto dict = ArrayFromJSON(utf8(), "[\"foo\", \"bar\", \"baz\"]");
1130 for (auto index_ty : all_dictionary_index_types()) {
1131 auto dict_type = dictionary(index_ty, utf8());
1132
1133 auto indices1 = ArrayFromJSON(index_ty, "[1, 2, 0, 0, 2, 0]");
1134 // Index out of bounds
1135 auto indices2 = ArrayFromJSON(index_ty, "[1, 2, 0, 3, 2, 0]");
1136
1137 ASSERT_OK_AND_ASSIGN(auto arr1,
1138 DictionaryArray::FromArrays(dict_type, indices1, dict));
1139 ASSERT_RAISES(IndexError, DictionaryArray::FromArrays(dict_type, indices2, dict));
1140
1141 if (checked_cast<const IntegerType&>(*index_ty).is_signed()) {
1142 // Invalid index is masked by null, so it's OK
1143 auto indices3 = ArrayFromJSON(index_ty, "[1, 2, -1, null, 2, 0]");
1144 BitUtil::ClearBit(indices3->data()->buffers[0]->mutable_data(), 2);
1145 ASSERT_OK_AND_ASSIGN(auto arr3,
1146 DictionaryArray::FromArrays(dict_type, indices3, dict));
1147 }
1148
1149 auto indices4 = ArrayFromJSON(index_ty, "[1, 2, null, 3, 2, 0]");
1150 ASSERT_RAISES(IndexError, DictionaryArray::FromArrays(dict_type, indices4, dict));
1151
1152 // Probe other validation checks
1153 ASSERT_RAISES(TypeError, DictionaryArray::FromArrays(index_ty, indices4, dict));
1154
1155 auto different_index_ty =
1156 dictionary(index_ty->id() == Type::INT8 ? uint8() : int8(), utf8());
1157 ASSERT_RAISES(TypeError,
1158 DictionaryArray::FromArrays(different_index_ty, indices4, dict));
1159 }
1160 }
1161
1162 static void CheckTranspose(const std::shared_ptr<Array>& input,
1163 const int32_t* transpose_map,
1164 const std::shared_ptr<DataType>& out_dict_type,
1165 const std::shared_ptr<Array>& out_dict,
1166 const std::shared_ptr<Array>& expected_indices) {
1167 ASSERT_OK_AND_ASSIGN(auto transposed,
1168 internal::checked_cast<const DictionaryArray&>(*input).Transpose(
1169 out_dict_type, out_dict, transpose_map));
1170 ASSERT_OK(transposed->ValidateFull());
1171
1172 ASSERT_OK_AND_ASSIGN(auto expected, DictionaryArray::FromArrays(
1173 out_dict_type, expected_indices, out_dict));
1174 AssertArraysEqual(*transposed, *expected);
1175 }
1176
1177 TEST(TestDictionary, TransposeBasic) {
1178 auto dict = ArrayFromJSON(utf8(), "[\"A\", \"B\", \"C\"]");
1179
1180 auto CheckIndexType = [&](const std::shared_ptr<DataType>& index_ty) {
1181 auto dict_type = dictionary(index_ty, utf8());
1182 auto indices = ArrayFromJSON(index_ty, "[1, 2, 0, 0]");
1183 // ["B", "C", "A", "A"]
1184 ASSERT_OK_AND_ASSIGN(auto arr, DictionaryArray::FromArrays(dict_type, indices, dict));
1185 // ["C", "A"]
1186 auto sliced = arr->Slice(1, 2);
1187
1188 // Transpose to same index type
1189 {
1190 auto out_dict_type = dict_type;
1191 auto out_dict = ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]");
1192 auto expected_indices = ArrayFromJSON(index_ty, "[3, 2, 1, 1]");
1193 std::vector<int32_t> transpose_map = {1, 3, 2};
1194 CheckTranspose(arr, transpose_map.data(), out_dict_type, out_dict,
1195 expected_indices);
1196
1197 // Sliced
1198 expected_indices = ArrayFromJSON(index_ty, "[2, 1]");
1199 CheckTranspose(sliced, transpose_map.data(), out_dict_type, out_dict,
1200 expected_indices);
1201 }
1202
1203 // Transpose to other index type
1204 auto out_dict = ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]");
1205 std::vector<int32_t> transpose_map = {1, 3, 2};
1206 for (auto other_ty : all_dictionary_index_types()) {
1207 auto out_dict_type = dictionary(other_ty, utf8());
1208 auto expected_indices = ArrayFromJSON(other_ty, "[3, 2, 1, 1]");
1209 CheckTranspose(arr, transpose_map.data(), out_dict_type, out_dict,
1210 expected_indices);
1211
1212 // Sliced
1213 expected_indices = ArrayFromJSON(other_ty, "[2, 1]");
1214 CheckTranspose(sliced, transpose_map.data(), out_dict_type, out_dict,
1215 expected_indices);
1216 }
1217 };
1218
1219 for (auto ty : all_dictionary_index_types()) {
1220 CheckIndexType(ty);
1221 }
1222 }
1223
1224 TEST(TestDictionary, TransposeTrivial) {
1225 // Test a trivial transposition, possibly optimized away
1226
1227 auto dict = ArrayFromJSON(utf8(), "[\"A\", \"B\", \"C\"]");
1228 auto dict_type = dictionary(int16(), utf8());
1229 auto indices = ArrayFromJSON(int16(), "[1, 2, 0, 0]");
1230 // ["B", "C", "A", "A"]
1231 ASSERT_OK_AND_ASSIGN(auto arr, DictionaryArray::FromArrays(dict_type, indices, dict));
1232 // ["C", "A"]
1233 auto sliced = arr->Slice(1, 2);
1234
1235 std::vector<int32_t> transpose_map = {0, 1, 2};
1236
1237 // Transpose to same index type
1238 {
1239 auto out_dict_type = dict_type;
1240 auto out_dict = ArrayFromJSON(utf8(), "[\"A\", \"B\", \"C\", \"D\"]");
1241 auto expected_indices = ArrayFromJSON(int16(), "[1, 2, 0, 0]");
1242 CheckTranspose(arr, transpose_map.data(), out_dict_type, out_dict, expected_indices);
1243
1244 // Sliced
1245 expected_indices = ArrayFromJSON(int16(), "[2, 0]");
1246 CheckTranspose(sliced, transpose_map.data(), out_dict_type, out_dict,
1247 expected_indices);
1248 }
1249
1250 // Transpose to other index type
1251 {
1252 auto out_dict_type = dictionary(int8(), utf8());
1253 auto out_dict = ArrayFromJSON(utf8(), "[\"A\", \"B\", \"C\", \"D\"]");
1254 auto expected_indices = ArrayFromJSON(int8(), "[1, 2, 0, 0]");
1255 CheckTranspose(arr, transpose_map.data(), out_dict_type, out_dict, expected_indices);
1256
1257 // Sliced
1258 expected_indices = ArrayFromJSON(int8(), "[2, 0]");
1259 CheckTranspose(sliced, transpose_map.data(), out_dict_type, out_dict,
1260 expected_indices);
1261 }
1262 }
1263
1264 TEST(TestDictionary, GetValueIndex) {
1265 const char* indices_json = "[5, 0, 1, 3, 2, 4]";
1266 auto indices_int64 = ArrayFromJSON(int64(), indices_json);
1267 auto dict = ArrayFromJSON(int32(), "[10, 20, 30, 40, 50, 60]");
1268
1269 const auto& typed_indices_int64 = checked_cast<const Int64Array&>(*indices_int64);
1270 for (auto index_ty : all_dictionary_index_types()) {
1271 auto indices = ArrayFromJSON(index_ty, indices_json);
1272 auto dict_ty = dictionary(index_ty, int32());
1273
1274 DictionaryArray dict_arr(dict_ty, indices, dict);
1275
1276 int64_t offset = 1;
1277 auto sliced_dict_arr = dict_arr.Slice(offset);
1278
1279 for (int64_t i = 0; i < indices->length(); ++i) {
1280 ASSERT_EQ(dict_arr.GetValueIndex(i), typed_indices_int64.Value(i));
1281 if (i < sliced_dict_arr->length()) {
1282 ASSERT_EQ(checked_cast<const DictionaryArray&>(*sliced_dict_arr).GetValueIndex(i),
1283 typed_indices_int64.Value(i + offset));
1284 }
1285 }
1286 }
1287 }
1288
1289 TEST(TestDictionary, TransposeNulls) {
1290 auto dict = ArrayFromJSON(utf8(), "[\"A\", \"B\", \"C\"]");
1291 auto dict_type = dictionary(int16(), utf8());
1292 auto indices = ArrayFromJSON(int16(), "[1, 2, null, 0]");
1293 // ["B", "C", null, "A"]
1294 ASSERT_OK_AND_ASSIGN(auto arr, DictionaryArray::FromArrays(dict_type, indices, dict));
1295 // ["C", null]
1296 auto sliced = arr->Slice(1, 2);
1297
1298 auto out_dict = ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]");
1299 auto out_dict_type = dictionary(int16(), utf8());
1300 auto expected_indices = ArrayFromJSON(int16(), "[3, 2, null, 1]");
1301
1302 std::vector<int32_t> transpose_map = {1, 3, 2};
1303 CheckTranspose(arr, transpose_map.data(), out_dict_type, out_dict, expected_indices);
1304
1305 // Sliced
1306 expected_indices = ArrayFromJSON(int16(), "[2, null]");
1307 CheckTranspose(sliced, transpose_map.data(), out_dict_type, out_dict, expected_indices);
1308 }
1309
1310 TEST(TestDictionary, ListOfDictionary) {
1311 std::unique_ptr<ArrayBuilder> root_builder;
1312 ASSERT_OK(MakeBuilder(default_memory_pool(), list(dictionary(int8(), utf8())),
1313 &root_builder));
1314 auto list_builder = checked_cast<ListBuilder*>(root_builder.get());
1315 auto dict_builder =
1316 checked_cast<DictionaryBuilder<StringType>*>(list_builder->value_builder());
1317
1318 ASSERT_OK(list_builder->Append());
1319 std::vector<std::string> expected;
1320 for (char a : util::string_view("abc")) {
1321 for (char d : util::string_view("def")) {
1322 for (char g : util::string_view("ghi")) {
1323 for (char j : util::string_view("jkl")) {
1324 for (char m : util::string_view("mno")) {
1325 for (char p : util::string_view("pqr")) {
1326 if ((static_cast<int>(a) + d + g + j + m + p) % 16 == 0) {
1327 ASSERT_OK(list_builder->Append());
1328 }
1329 // 3**6 distinct strings; too large for int8
1330 char str[] = {a, d, g, j, m, p, '\0'};
1331 ASSERT_OK(dict_builder->Append(str));
1332 expected.push_back(str);
1333 }
1334 }
1335 }
1336 }
1337 }
1338 }
1339
1340 ASSERT_TRUE(list_builder->type()->Equals(list(dictionary(int16(), utf8()))));
1341
1342 std::shared_ptr<Array> expected_dict;
1343 ArrayFromVector<StringType, std::string>(expected, &expected_dict);
1344
1345 std::shared_ptr<Array> array;
1346 ASSERT_OK(root_builder->Finish(&array));
1347 ASSERT_OK(array->ValidateFull());
1348
1349 auto expected_type = list(dictionary(int16(), utf8()));
1350 ASSERT_EQ(array->type()->ToString(), expected_type->ToString());
1351
1352 auto list_array = checked_cast<const ListArray*>(array.get());
1353 auto actual_dict =
1354 checked_cast<const DictionaryArray&>(*list_array->values()).dictionary();
1355 ASSERT_ARRAYS_EQUAL(*expected_dict, *actual_dict);
1356 }
1357
1358 TEST(TestDictionary, CanCompareIndices) {
1359 auto make_dict = [](std::shared_ptr<DataType> index_type,
1360 std::shared_ptr<DataType> value_type, std::string dictionary_json) {
1361 std::shared_ptr<Array> out;
1362 ARROW_EXPECT_OK(
1363 DictionaryArray::FromArrays(dictionary(index_type, value_type),
1364 ArrayFromJSON(index_type, "[]"),
1365 ArrayFromJSON(value_type, dictionary_json))
1366 .Value(&out));
1367 return checked_pointer_cast<DictionaryArray>(out);
1368 };
1369
1370 auto compare_and_swap = [](const DictionaryArray& l, const DictionaryArray& r,
1371 bool expected) {
1372 ASSERT_EQ(l.CanCompareIndices(r), expected)
1373 << "left: " << l.ToString() << "\nright: " << r.ToString();
1374 ASSERT_EQ(r.CanCompareIndices(l), expected)
1375 << "left: " << r.ToString() << "\nright: " << l.ToString();
1376 };
1377
1378 {
1379 auto array = make_dict(int16(), utf8(), R"(["foo", "bar"])");
1380 auto same = make_dict(int16(), utf8(), R"(["foo", "bar"])");
1381 compare_and_swap(*array, *same, true);
1382 }
1383
1384 {
1385 auto array = make_dict(int16(), utf8(), R"(["foo", "bar", "quux"])");
1386 auto prefix_dict = make_dict(int16(), utf8(), R"(["foo", "bar"])");
1387 compare_and_swap(*array, *prefix_dict, true);
1388 }
1389
1390 {
1391 auto array = make_dict(int16(), utf8(), R"(["foo", "bar"])");
1392 auto indices_need_casting = make_dict(int8(), utf8(), R"(["foo", "bar"])");
1393 compare_and_swap(*array, *indices_need_casting, false);
1394 }
1395
1396 {
1397 auto array = make_dict(int16(), utf8(), R"(["foo", "bar", "quux"])");
1398 auto non_prefix_dict = make_dict(int16(), utf8(), R"(["foo", "blink"])");
1399 compare_and_swap(*array, *non_prefix_dict, false);
1400 }
1401 }
1402
1403 TEST(TestDictionary, IndicesArray) {
1404 auto dict = ArrayFromJSON(utf8(), "[\"foo\", \"bar\", \"baz\"]");
1405 auto dict_type = dictionary(int16(), utf8());
1406 auto indices = ArrayFromJSON(int16(), "[1, 2, null, 0, 2, 0]");
1407 auto arr = std::make_shared<DictionaryArray>(dict_type, indices, dict);
1408
1409 // The indices array should not have dictionary data
1410 ASSERT_EQ(arr->indices()->data()->dictionary, nullptr);
1411
1412 // Validate the indices array
1413 ASSERT_OK(arr->indices()->ValidateFull());
1414 }
1415
1416 TEST(TestDictionaryUnifier, Numeric) {
1417 auto dict_ty = int64();
1418
1419 auto d1 = ArrayFromJSON(dict_ty, "[3, 4, 7]");
1420 auto d2 = ArrayFromJSON(dict_ty, "[1, 7, 4, 8]");
1421 auto d3 = ArrayFromJSON(dict_ty, "[1, -200]");
1422
1423 auto expected = dictionary(int8(), dict_ty);
1424 auto expected_dict = ArrayFromJSON(dict_ty, "[3, 4, 7, 1, 8, -200]");
1425
1426 ASSERT_OK_AND_ASSIGN(auto unifier, DictionaryUnifier::Make(dict_ty));
1427
1428 std::shared_ptr<DataType> out_type;
1429 std::shared_ptr<Array> out_dict;
1430
1431 ASSERT_OK(unifier->Unify(*d1));
1432 ASSERT_OK(unifier->Unify(*d2));
1433 ASSERT_OK(unifier->Unify(*d3));
1434
1435 ASSERT_RAISES(Invalid, unifier->Unify(*ArrayFromJSON(int32(), "[1, -200]")));
1436
1437 ASSERT_OK(unifier->GetResult(&out_type, &out_dict));
1438 ASSERT_TRUE(out_type->Equals(*expected));
1439 ASSERT_TRUE(out_dict->Equals(*expected_dict));
1440
1441 std::shared_ptr<Buffer> b1, b2, b3;
1442
1443 ASSERT_OK(unifier->Unify(*d1, &b1));
1444 ASSERT_OK(unifier->Unify(*d2, &b2));
1445 ASSERT_OK(unifier->Unify(*d3, &b3));
1446 ASSERT_OK(unifier->GetResult(&out_type, &out_dict));
1447 ASSERT_TRUE(out_type->Equals(*expected));
1448 ASSERT_TRUE(out_dict->Equals(*expected_dict));
1449
1450 CheckTransposeMap(*b1, {0, 1, 2});
1451 CheckTransposeMap(*b2, {3, 2, 1, 4});
1452 CheckTransposeMap(*b3, {3, 5});
1453 }
1454
1455 TEST(TestDictionaryUnifier, String) {
1456 auto dict_ty = utf8();
1457
1458 auto t1 = dictionary(int16(), dict_ty);
1459 auto d1 = ArrayFromJSON(dict_ty, "[\"foo\", \"bar\"]");
1460
1461 auto t2 = dictionary(int32(), dict_ty);
1462 auto d2 = ArrayFromJSON(dict_ty, "[\"quux\", \"foo\"]");
1463
1464 auto expected = dictionary(int8(), dict_ty);
1465 auto expected_dict = ArrayFromJSON(dict_ty, "[\"foo\", \"bar\", \"quux\"]");
1466
1467 ASSERT_OK_AND_ASSIGN(auto unifier, DictionaryUnifier::Make(dict_ty));
1468
1469 std::shared_ptr<DataType> out_type;
1470 std::shared_ptr<Array> out_dict;
1471 ASSERT_OK(unifier->Unify(*d1));
1472 ASSERT_OK(unifier->Unify(*d2));
1473 ASSERT_OK(unifier->GetResult(&out_type, &out_dict));
1474 ASSERT_TRUE(out_type->Equals(*expected));
1475 ASSERT_TRUE(out_dict->Equals(*expected_dict));
1476
1477 std::shared_ptr<Buffer> b1, b2;
1478
1479 ASSERT_OK(unifier->Unify(*d1, &b1));
1480 ASSERT_OK(unifier->Unify(*d2, &b2));
1481 ASSERT_OK(unifier->GetResult(&out_type, &out_dict));
1482 ASSERT_TRUE(out_type->Equals(*expected));
1483 ASSERT_TRUE(out_dict->Equals(*expected_dict));
1484
1485 CheckTransposeMap(*b1, {0, 1});
1486 CheckTransposeMap(*b2, {2, 0});
1487 }
1488
1489 TEST(TestDictionaryUnifier, FixedSizeBinary) {
1490 auto type = fixed_size_binary(3);
1491
1492 std::string data = "foobarbazqux";
1493 auto buf = std::make_shared<Buffer>(data);
1494 // ["foo", "bar"]
1495 auto dict1 = std::make_shared<FixedSizeBinaryArray>(type, 2, SliceBuffer(buf, 0, 6));
1496 auto t1 = dictionary(int16(), type);
1497 // ["bar", "baz", "qux"]
1498 auto dict2 = std::make_shared<FixedSizeBinaryArray>(type, 3, SliceBuffer(buf, 3, 9));
1499 auto t2 = dictionary(int16(), type);
1500
1501 // ["foo", "bar", "baz", "qux"]
1502 auto expected_dict = std::make_shared<FixedSizeBinaryArray>(type, 4, buf);
1503 auto expected = dictionary(int8(), type);
1504
1505 ASSERT_OK_AND_ASSIGN(auto unifier, DictionaryUnifier::Make(type));
1506 std::shared_ptr<DataType> out_type;
1507 std::shared_ptr<Array> out_dict;
1508 ASSERT_OK(unifier->Unify(*dict1));
1509 ASSERT_OK(unifier->Unify(*dict2));
1510 ASSERT_OK(unifier->GetResult(&out_type, &out_dict));
1511 ASSERT_TRUE(out_type->Equals(*expected));
1512 ASSERT_TRUE(out_dict->Equals(*expected_dict));
1513
1514 std::shared_ptr<Buffer> b1, b2;
1515 ASSERT_OK(unifier->Unify(*dict1, &b1));
1516 ASSERT_OK(unifier->Unify(*dict2, &b2));
1517 ASSERT_OK(unifier->GetResult(&out_type, &out_dict));
1518 ASSERT_TRUE(out_type->Equals(*expected));
1519 ASSERT_TRUE(out_dict->Equals(*expected_dict));
1520
1521 CheckTransposeMap(*b1, {0, 1});
1522 CheckTransposeMap(*b2, {1, 2, 3});
1523 }
1524
1525 TEST(TestDictionaryUnifier, Large) {
1526 // Unifying "large" dictionary types should choose the right index type
1527 std::shared_ptr<Array> dict1, dict2, expected_dict;
1528
1529 Int32Builder builder;
1530 ASSERT_OK(builder.Reserve(120));
1531 for (int32_t i = 0; i < 120; ++i) {
1532 builder.UnsafeAppend(i);
1533 }
1534 ASSERT_OK(builder.Finish(&dict1));
1535 ASSERT_EQ(dict1->length(), 120);
1536 auto t1 = dictionary(int8(), int32());
1537
1538 ASSERT_OK(builder.Reserve(30));
1539 for (int32_t i = 110; i < 140; ++i) {
1540 builder.UnsafeAppend(i);
1541 }
1542 ASSERT_OK(builder.Finish(&dict2));
1543 ASSERT_EQ(dict2->length(), 30);
1544 auto t2 = dictionary(int8(), int32());
1545
1546 ASSERT_OK(builder.Reserve(140));
1547 for (int32_t i = 0; i < 140; ++i) {
1548 builder.UnsafeAppend(i);
1549 }
1550 ASSERT_OK(builder.Finish(&expected_dict));
1551 ASSERT_EQ(expected_dict->length(), 140);
1552
1553 // int8 would be too narrow to hold all possible index values
1554 auto expected = dictionary(int16(), int32());
1555
1556 ASSERT_OK_AND_ASSIGN(auto unifier, DictionaryUnifier::Make(int32()));
1557 std::shared_ptr<DataType> out_type;
1558 std::shared_ptr<Array> out_dict;
1559 ASSERT_OK(unifier->Unify(*dict1));
1560 ASSERT_OK(unifier->Unify(*dict2));
1561 ASSERT_OK(unifier->GetResult(&out_type, &out_dict));
1562 ASSERT_TRUE(out_type->Equals(*expected));
1563 ASSERT_TRUE(out_dict->Equals(*expected_dict));
1564 }
1565
1566 TEST(TestDictionaryUnifier, ChunkedArraySimple) {
1567 auto type = dictionary(int8(), utf8());
1568 auto chunk1 = ArrayFromJSON(type, R"(["ab", "cd", null, "cd"])");
1569 auto chunk2 = ArrayFromJSON(type, R"(["ef", "cd", "ef"])");
1570 auto chunk3 = ArrayFromJSON(type, R"(["ef", "ab", null, "ab"])");
1571 auto chunk4 = ArrayFromJSON(type, "[]");
1572 ASSERT_OK_AND_ASSIGN(auto chunked,
1573 ChunkedArray::Make({chunk1, chunk2, chunk3, chunk4}));
1574
1575 ASSERT_OK_AND_ASSIGN(auto unified, DictionaryUnifier::UnifyChunkedArray(chunked));
1576 ASSERT_EQ(unified->num_chunks(), 4);
1577 auto expected_dict = ArrayFromJSON(utf8(), R"(["ab", "cd", "ef"])");
1578 CheckDictionaryArray(unified->chunk(0), expected_dict,
1579 ArrayFromJSON(int8(), "[0, 1, null, 1]"));
1580 CheckDictionaryArray(unified->chunk(1), expected_dict,
1581 ArrayFromJSON(int8(), "[2, 1, 2]"));
1582 CheckDictionaryArray(unified->chunk(2), expected_dict,
1583 ArrayFromJSON(int8(), "[2, 0, null, 0]"));
1584 CheckDictionaryArray(unified->chunk(3), expected_dict, ArrayFromJSON(int8(), "[]"));
1585 }
1586
1587 TEST(TestDictionaryUnifier, ChunkedArrayZeroChunk) {
1588 auto type = dictionary(int8(), utf8());
1589 ASSERT_OK_AND_ASSIGN(auto chunked, ChunkedArray::Make(ArrayVector{}, type));
1590 ASSERT_OK_AND_ASSIGN(auto unified, DictionaryUnifier::UnifyChunkedArray(chunked));
1591 AssertChunkedEqual(*chunked, *unified);
1592 }
1593
1594 TEST(TestDictionaryUnifier, ChunkedArrayOneChunk) {
1595 auto type = dictionary(int8(), utf8());
1596 auto chunk1 = ArrayFromJSON(type, R"(["ab", "cd", null, "cd"])");
1597 ASSERT_OK_AND_ASSIGN(auto chunked, ChunkedArray::Make({chunk1}));
1598 ASSERT_OK_AND_ASSIGN(auto unified, DictionaryUnifier::UnifyChunkedArray(chunked));
1599 AssertChunkedEqual(*chunked, *unified);
1600 }
1601
1602 TEST(TestDictionaryUnifier, ChunkedArrayNoDict) {
1603 auto type = int8();
1604 auto chunk1 = ArrayFromJSON(type, "[1, 1, 2, 3]");
1605 auto chunk2 = ArrayFromJSON(type, "[5, 8, 13]");
1606 ASSERT_OK_AND_ASSIGN(auto chunked, ChunkedArray::Make({chunk1, chunk2}));
1607 ASSERT_OK_AND_ASSIGN(auto unified, DictionaryUnifier::UnifyChunkedArray(chunked));
1608 AssertChunkedEqual(*chunked, *unified);
1609 }
1610
1611 TEST(TestDictionaryUnifier, ChunkedArrayNested) {
1612 // Dict in a nested type: ok
1613 auto type = list(dictionary(int16(), utf8()));
1614 auto chunk1 = ArrayFromJSON(type, R"([["ab", "cd"], ["cd"]])");
1615 auto chunk2 = ArrayFromJSON(type, R"([[], ["ef", "cd", "ef"]])");
1616 ASSERT_OK_AND_ASSIGN(auto chunked, ChunkedArray::Make({chunk1, chunk2}));
1617
1618 ASSERT_OK_AND_ASSIGN(auto unified, DictionaryUnifier::UnifyChunkedArray(chunked));
1619 ASSERT_EQ(unified->num_chunks(), 2);
1620 auto expected_dict = ArrayFromJSON(utf8(), R"(["ab", "cd", "ef"])");
1621 auto unified1 = checked_pointer_cast<ListArray>(unified->chunk(0));
1622 AssertArraysEqual(*unified1->offsets(), *ArrayFromJSON(int32(), "[0, 2, 3]"));
1623 CheckDictionaryArray(unified1->values(), expected_dict,
1624 ArrayFromJSON(int16(), "[0, 1, 1]"));
1625 auto unified2 = checked_pointer_cast<ListArray>(unified->chunk(1));
1626 AssertArraysEqual(*unified2->offsets(), *ArrayFromJSON(int32(), "[0, 0, 3]"));
1627 CheckDictionaryArray(unified2->values(), expected_dict,
1628 ArrayFromJSON(int16(), "[2, 1, 2]"));
1629 }
1630
1631 TEST(TestDictionaryUnifier, ChunkedArrayExtension) {
1632 // Dict in an extension type: ok
1633 auto type = dict_extension_type();
1634 auto chunk1 = DictExtensionFromJSON(type, R"(["ab", null, "cd", "ab"])");
1635 auto chunk2 = DictExtensionFromJSON(type, R"(["ef", "ab", "ab"])");
1636 ASSERT_OK_AND_ASSIGN(auto chunked, ChunkedArray::Make({chunk1, chunk2}));
1637
1638 ASSERT_OK_AND_ASSIGN(auto unified, DictionaryUnifier::UnifyChunkedArray(chunked));
1639 ASSERT_EQ(unified->num_chunks(), 2);
1640
1641 auto expected_dict = ArrayFromJSON(utf8(), R"(["ab", "cd", "ef"])");
1642 auto unified1 = checked_pointer_cast<ExtensionArray>(unified->chunk(0));
1643 AssertTypeEqual(*type, *unified1->type());
1644 CheckDictionaryArray(unified1->storage(), expected_dict,
1645 ArrayFromJSON(int8(), "[0, null, 1, 0]"));
1646 auto unified2 = checked_pointer_cast<ExtensionArray>(unified->chunk(1));
1647 AssertTypeEqual(*type, *unified2->type());
1648 CheckDictionaryArray(unified2->storage(), expected_dict,
1649 ArrayFromJSON(int8(), "[2, 0, 0]"));
1650 }
1651
1652 TEST(TestDictionaryUnifier, ChunkedArrayNestedDict) {
1653 // Dict in a dict type: unsupported
1654 auto inner_type = list(dictionary(uint32(), utf8()));
1655 auto inner_dict1 = ArrayFromJSON(inner_type, R"([["ab", "cd"], [], ["cd", null]])");
1656 ASSERT_OK_AND_ASSIGN(
1657 auto chunk1, DictionaryArray::FromArrays(ArrayFromJSON(int32(), "[2, 1, 0, 1, 2]"),
1658 inner_dict1));
1659 auto inner_dict2 = ArrayFromJSON(inner_type, R"([["cd", "ef"], ["cd", null], []])");
1660 ASSERT_OK_AND_ASSIGN(
1661 auto chunk2,
1662 DictionaryArray::FromArrays(ArrayFromJSON(int32(), "[1, 2, 2, 0]"), inner_dict2));
1663 ASSERT_OK_AND_ASSIGN(auto chunked, ChunkedArray::Make({chunk1, chunk2}));
1664
1665 ASSERT_RAISES(NotImplemented, DictionaryUnifier::UnifyChunkedArray(chunked));
1666 }
1667
1668 TEST(TestDictionaryUnifier, TableZeroColumns) {
1669 auto schema = ::arrow::schema(FieldVector{});
1670 auto table = Table::Make(schema, ArrayVector{}, /*num_rows=*/42);
1671
1672 ASSERT_OK_AND_ASSIGN(auto unified, DictionaryUnifier::UnifyTable(*table));
1673 AssertSchemaEqual(*schema, *unified->schema());
1674 ASSERT_EQ(unified->num_rows(), 42);
1675 AssertTablesEqual(*table, *unified);
1676 }
1677
1678 } // namespace arrow