/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
18 package org
.apache
.arrow
.vector
;
20 import static org
.apache
.arrow
.vector
.TestUtils
.newVarBinaryVector
;
21 import static org
.apache
.arrow
.vector
.TestUtils
.newVarCharVector
;
22 import static org
.apache
.arrow
.vector
.testing
.ValueVectorDataPopulator
.setVector
;
23 import static org
.junit
.Assert
.assertArrayEquals
;
24 import static org
.junit
.Assert
.assertEquals
;
25 import static org
.junit
.Assert
.assertFalse
;
26 import static org
.junit
.Assert
.assertTrue
;
28 import java
.nio
.charset
.StandardCharsets
;
29 import java
.util
.Arrays
;
30 import java
.util
.HashMap
;
31 import java
.util
.Iterator
;
32 import java
.util
.List
;
34 import java
.util
.function
.ToIntBiFunction
;
36 import org
.apache
.arrow
.memory
.ArrowBuf
;
37 import org
.apache
.arrow
.memory
.BufferAllocator
;
38 import org
.apache
.arrow
.vector
.complex
.FixedSizeListVector
;
39 import org
.apache
.arrow
.vector
.complex
.ListVector
;
40 import org
.apache
.arrow
.vector
.complex
.StructVector
;
41 import org
.apache
.arrow
.vector
.complex
.UnionVector
;
42 import org
.apache
.arrow
.vector
.complex
.impl
.NullableStructWriter
;
43 import org
.apache
.arrow
.vector
.complex
.impl
.UnionListWriter
;
44 import org
.apache
.arrow
.vector
.dictionary
.Dictionary
;
45 import org
.apache
.arrow
.vector
.dictionary
.DictionaryEncoder
;
46 import org
.apache
.arrow
.vector
.dictionary
.DictionaryProvider
;
47 import org
.apache
.arrow
.vector
.dictionary
.ListSubfieldEncoder
;
48 import org
.apache
.arrow
.vector
.dictionary
.StructSubfieldEncoder
;
49 import org
.apache
.arrow
.vector
.holders
.NullableIntHolder
;
50 import org
.apache
.arrow
.vector
.holders
.NullableUInt4Holder
;
51 import org
.apache
.arrow
.vector
.types
.Types
;
52 import org
.apache
.arrow
.vector
.types
.pojo
.ArrowType
;
53 import org
.apache
.arrow
.vector
.types
.pojo
.DictionaryEncoding
;
54 import org
.apache
.arrow
.vector
.types
.pojo
.FieldType
;
55 import org
.apache
.arrow
.vector
.util
.Text
;
56 import org
.junit
.After
;
57 import org
.junit
.Before
;
58 import org
.junit
.Test
;
60 public class TestDictionaryVector
{
62 private BufferAllocator allocator
;
64 byte[] zero
= "foo".getBytes(StandardCharsets
.UTF_8
);
65 byte[] one
= "bar".getBytes(StandardCharsets
.UTF_8
);
66 byte[] two
= "baz".getBytes(StandardCharsets
.UTF_8
);
68 byte[][] data
= new byte[][] {zero
, one
, two
};
72 allocator
= new DirtyRootAllocator(Long
.MAX_VALUE
, (byte) 100);
76 public void terminate() throws Exception
{
81 public void testEncodeStrings() {
82 // Create a new value vector
83 try (final VarCharVector vector
= newVarCharVector("foo", allocator
);
84 final VarCharVector dictionaryVector
= newVarCharVector("dict", allocator
);) {
86 setVector(vector
, zero
, one
, one
, two
, zero
);
87 setVector(dictionaryVector
, zero
, one
, two
);
89 Dictionary dictionary
=
90 new Dictionary(dictionaryVector
, new DictionaryEncoding(1L, false, null));
92 try (final ValueVector encoded
= DictionaryEncoder
.encode(vector
, dictionary
)) {
94 assertEquals(IntVector
.class, encoded
.getClass());
96 IntVector index
= ((IntVector
) encoded
);
97 assertEquals(5, index
.getValueCount());
98 assertEquals(0, index
.get(0));
99 assertEquals(1, index
.get(1));
100 assertEquals(1, index
.get(2));
101 assertEquals(2, index
.get(3));
102 assertEquals(0, index
.get(4));
104 // now run through the decoder and verify we get the original back
105 try (ValueVector decoded
= DictionaryEncoder
.decode(encoded
, dictionary
)) {
106 assertEquals(vector
.getClass(), decoded
.getClass());
107 assertEquals(vector
.getValueCount(), ((VarCharVector
) decoded
).getValueCount());
108 for (int i
= 0; i
< 5; i
++) {
109 assertEquals(vector
.getObject(i
), ((VarCharVector
) decoded
).getObject(i
));
117 public void testEncodeLargeVector() {
118 // Create a new value vector
119 try (final VarCharVector vector
= newVarCharVector("foo", allocator
);
120 final VarCharVector dictionaryVector
= newVarCharVector("dict", allocator
);) {
121 vector
.allocateNew();
125 for (int i
= 0; i
< 10000; ++i
) {
126 vector
.setSafe(i
, data
[i
% 3], 0, data
[i
% 3].length
);
128 vector
.setValueCount(count
);
130 setVector(dictionaryVector
, zero
, one
, two
);
132 Dictionary dictionary
=
133 new Dictionary(dictionaryVector
, new DictionaryEncoding(1L, false, null));
135 try (final ValueVector encoded
= DictionaryEncoder
.encode(vector
, dictionary
)) {
137 assertEquals(IntVector
.class, encoded
.getClass());
139 IntVector index
= ((IntVector
) encoded
);
140 assertEquals(count
, index
.getValueCount());
141 for (int i
= 0; i
< count
; ++i
) {
142 assertEquals(i
% 3, index
.get(i
));
145 // now run through the decoder and verify we get the original back
146 try (ValueVector decoded
= DictionaryEncoder
.decode(encoded
, dictionary
)) {
147 assertEquals(vector
.getClass(), decoded
.getClass());
148 assertEquals(vector
.getValueCount(), decoded
.getValueCount());
149 for (int i
= 0; i
< count
; ++i
) {
150 assertEquals(vector
.getObject(i
), decoded
.getObject(i
));
158 public void testEncodeList() {
159 // Create a new value vector
160 try (final ListVector vector
= ListVector
.empty("vector", allocator
);
161 final ListVector dictionaryVector
= ListVector
.empty("dict", allocator
);) {
163 UnionListWriter writer
= vector
.getWriter();
167 writeListVector(writer
, new int[]{10, 20});
168 writeListVector(writer
, new int[]{10, 20});
169 writeListVector(writer
, new int[]{10, 20});
170 writeListVector(writer
, new int[]{30, 40, 50});
171 writeListVector(writer
, new int[]{30, 40, 50});
172 writeListVector(writer
, new int[]{10, 20});
174 writer
.setValueCount(6);
176 UnionListWriter dictWriter
= dictionaryVector
.getWriter();
177 dictWriter
.allocate();
179 writeListVector(dictWriter
, new int[]{10, 20});
180 writeListVector(dictWriter
, new int[]{30, 40, 50});
182 dictWriter
.setValueCount(2);
184 Dictionary dictionary
= new Dictionary(dictionaryVector
, new DictionaryEncoding(1L, false, null));
186 try (final ValueVector encoded
= DictionaryEncoder
.encode(vector
, dictionary
)) {
188 assertEquals(IntVector
.class, encoded
.getClass());
190 IntVector index
= ((IntVector
) encoded
);
191 assertEquals(6, index
.getValueCount());
192 assertEquals(0, index
.get(0));
193 assertEquals(0, index
.get(1));
194 assertEquals(0, index
.get(2));
195 assertEquals(1, index
.get(3));
196 assertEquals(1, index
.get(4));
197 assertEquals(0, index
.get(5));
199 // now run through the decoder and verify we get the original back
200 try (ValueVector decoded
= DictionaryEncoder
.decode(encoded
, dictionary
)) {
201 assertEquals(vector
.getClass(), decoded
.getClass());
202 assertEquals(vector
.getValueCount(), decoded
.getValueCount());
203 for (int i
= 0; i
< 5; i
++) {
204 assertEquals(vector
.getObject(i
), decoded
.getObject(i
));
212 public void testEncodeStruct() {
213 // Create a new value vector
214 try (final StructVector vector
= StructVector
.empty("vector", allocator
);
215 final StructVector dictionaryVector
= StructVector
.empty("dict", allocator
);) {
216 vector
.addOrGet("f0", FieldType
.nullable(new ArrowType
.Int(32, true)), IntVector
.class);
217 vector
.addOrGet("f1", FieldType
.nullable(new ArrowType
.Int(64, true)), BigIntVector
.class);
218 dictionaryVector
.addOrGet("f0", FieldType
.nullable(new ArrowType
.Int(32, true)), IntVector
.class);
219 dictionaryVector
.addOrGet("f1", FieldType
.nullable(new ArrowType
.Int(64, true)), BigIntVector
.class);
221 NullableStructWriter writer
= vector
.getWriter();
224 writeStructVector(writer
, 1, 10L);
225 writeStructVector(writer
, 1, 10L);
226 writeStructVector(writer
, 1, 10L);
227 writeStructVector(writer
, 2, 20L);
228 writeStructVector(writer
, 2, 20L);
229 writeStructVector(writer
, 2, 20L);
230 writeStructVector(writer
, 1, 10L);
232 writer
.setValueCount(7);
234 NullableStructWriter dictWriter
= dictionaryVector
.getWriter();
235 dictWriter
.allocate();
237 writeStructVector(dictWriter
, 1, 10L);
238 writeStructVector(dictWriter
, 2, 20L);
241 dictionaryVector
.setValueCount(2);
243 Dictionary dictionary
= new Dictionary(dictionaryVector
, new DictionaryEncoding(1L, false, null));
245 try (final ValueVector encoded
= DictionaryEncoder
.encode(vector
, dictionary
)) {
247 assertEquals(IntVector
.class, encoded
.getClass());
249 IntVector index
= ((IntVector
) encoded
);
250 assertEquals(7, index
.getValueCount());
251 assertEquals(0, index
.get(0));
252 assertEquals(0, index
.get(1));
253 assertEquals(0, index
.get(2));
254 assertEquals(1, index
.get(3));
255 assertEquals(1, index
.get(4));
256 assertEquals(1, index
.get(5));
257 assertEquals(0, index
.get(6));
259 // now run through the decoder and verify we get the original back
260 try (ValueVector decoded
= DictionaryEncoder
.decode(encoded
, dictionary
)) {
261 assertEquals(vector
.getClass(), decoded
.getClass());
262 assertEquals(vector
.getValueCount(), decoded
.getValueCount());
263 for (int i
= 0; i
< 5; i
++) {
264 assertEquals(vector
.getObject(i
), decoded
.getObject(i
));
272 public void testEncodeBinaryVector() {
273 // Create a new value vector
274 try (final VarBinaryVector vector
= newVarBinaryVector("foo", allocator
);
275 final VarBinaryVector dictionaryVector
= newVarBinaryVector("dict", allocator
)) {
277 setVector(vector
, zero
, one
, one
, two
, zero
);
278 setVector(dictionaryVector
, zero
, one
, two
);
280 Dictionary dictionary
= new Dictionary(dictionaryVector
, new DictionaryEncoding(1L, false, null));
282 try (final ValueVector encoded
= DictionaryEncoder
.encode(vector
, dictionary
)) {
284 assertEquals(IntVector
.class, encoded
.getClass());
286 IntVector index
= ((IntVector
) encoded
);
287 assertEquals(5, index
.getValueCount());
288 assertEquals(0, index
.get(0));
289 assertEquals(1, index
.get(1));
290 assertEquals(1, index
.get(2));
291 assertEquals(2, index
.get(3));
292 assertEquals(0, index
.get(4));
294 // now run through the decoder and verify we get the original back
295 try (VarBinaryVector decoded
= (VarBinaryVector
) DictionaryEncoder
.decode(encoded
, dictionary
)) {
296 assertEquals(vector
.getClass(), decoded
.getClass());
297 assertEquals(vector
.getValueCount(), decoded
.getValueCount());
298 for (int i
= 0; i
< 5; i
++) {
299 assertTrue(Arrays
.equals(vector
.getObject(i
), decoded
.getObject(i
)));
307 public void testEncodeUnion() {
308 // Create a new value vector
309 try (final UnionVector vector
= new UnionVector("vector", allocator
, /* field type */ null, /* call-back */ null);
310 final UnionVector dictionaryVector
=
311 new UnionVector("dict", allocator
, /* field type */ null, /* call-back */ null);) {
313 final NullableUInt4Holder uintHolder1
= new NullableUInt4Holder();
314 uintHolder1
.value
= 10;
315 uintHolder1
.isSet
= 1;
317 final NullableIntHolder intHolder1
= new NullableIntHolder();
318 intHolder1
.value
= 10;
319 intHolder1
.isSet
= 1;
321 final NullableIntHolder intHolder2
= new NullableIntHolder();
322 intHolder2
.value
= 20;
323 intHolder2
.isSet
= 1;
326 vector
.setType(0, Types
.MinorType
.UINT4
);
327 vector
.setSafe(0, uintHolder1
);
329 vector
.setType(1, Types
.MinorType
.INT
);
330 vector
.setSafe(1, intHolder1
);
332 vector
.setType(2, Types
.MinorType
.INT
);
333 vector
.setSafe(2, intHolder1
);
335 vector
.setType(3, Types
.MinorType
.INT
);
336 vector
.setSafe(3, intHolder2
);
338 vector
.setType(4, Types
.MinorType
.INT
);
339 vector
.setSafe(4, intHolder2
);
341 vector
.setValueCount(5);
344 dictionaryVector
.setType(0, Types
.MinorType
.UINT4
);
345 dictionaryVector
.setSafe(0, uintHolder1
);
347 dictionaryVector
.setType(1, Types
.MinorType
.INT
);
348 dictionaryVector
.setSafe(1, intHolder1
);
350 dictionaryVector
.setType(2, Types
.MinorType
.INT
);
351 dictionaryVector
.setSafe(2, intHolder2
);
353 dictionaryVector
.setValueCount(3);
355 Dictionary dictionary
= new Dictionary(dictionaryVector
, new DictionaryEncoding(1L, false, null));
357 try (final ValueVector encoded
= DictionaryEncoder
.encode(vector
, dictionary
)) {
359 assertEquals(IntVector
.class, encoded
.getClass());
361 IntVector index
= ((IntVector
) encoded
);
362 assertEquals(5, index
.getValueCount());
363 assertEquals(0, index
.get(0));
364 assertEquals(1, index
.get(1));
365 assertEquals(1, index
.get(2));
366 assertEquals(2, index
.get(3));
367 assertEquals(2, index
.get(4));
369 // now run through the decoder and verify we get the original back
370 try (ValueVector decoded
= DictionaryEncoder
.decode(encoded
, dictionary
)) {
371 assertEquals(vector
.getClass(), decoded
.getClass());
372 assertEquals(vector
.getValueCount(), decoded
.getValueCount());
373 for (int i
= 0; i
< 5; i
++) {
374 assertEquals(vector
.getObject(i
), decoded
.getObject(i
));
382 public void testIntEquals() {
384 try (final IntVector vector1
= new IntVector("int", allocator
);
385 final IntVector vector2
= new IntVector("int", allocator
)) {
387 Dictionary dict1
= new Dictionary(vector1
, new DictionaryEncoding(1L, false, null));
388 Dictionary dict2
= new Dictionary(vector2
, new DictionaryEncoding(1L, false, null));
390 setVector(vector1
, 1, 2, 3);
391 setVector(vector2
, 1, 2, 0);
393 assertFalse(dict1
.equals(dict2
));
395 vector2
.setSafe(2, 3);
396 assertTrue(dict1
.equals(dict2
));
401 public void testVarcharEquals() {
402 try (final VarCharVector vector1
= new VarCharVector("varchar", allocator
);
403 final VarCharVector vector2
= new VarCharVector("varchar", allocator
)) {
405 Dictionary dict1
= new Dictionary(vector1
, new DictionaryEncoding(1L, false, null));
406 Dictionary dict2
= new Dictionary(vector2
, new DictionaryEncoding(1L, false, null));
408 setVector(vector1
, zero
, one
, two
);
409 setVector(vector2
, zero
, one
, one
);
411 assertFalse(dict1
.equals(dict2
));
413 vector2
.setSafe(2, two
, 0, two
.length
);
414 assertTrue(dict1
.equals(dict2
));
419 public void testVarBinaryEquals() {
420 try (final VarBinaryVector vector1
= new VarBinaryVector("binary", allocator
);
421 final VarBinaryVector vector2
= new VarBinaryVector("binary", allocator
)) {
423 Dictionary dict1
= new Dictionary(vector1
, new DictionaryEncoding(1L, false, null));
424 Dictionary dict2
= new Dictionary(vector2
, new DictionaryEncoding(1L, false, null));
426 setVector(vector1
, zero
, one
, two
);
427 setVector(vector2
, zero
, one
, one
);
429 assertFalse(dict1
.equals(dict2
));
431 vector2
.setSafe(2, two
, 0, two
.length
);
432 assertTrue(dict1
.equals(dict2
));
437 public void testListEquals() {
438 try (final ListVector vector1
= ListVector
.empty("list", allocator
);
439 final ListVector vector2
= ListVector
.empty("list", allocator
);) {
441 Dictionary dict1
= new Dictionary(vector1
, new DictionaryEncoding(1L, false, null));
442 Dictionary dict2
= new Dictionary(vector2
, new DictionaryEncoding(1L, false, null));
444 UnionListWriter writer1
= vector1
.getWriter();
448 writeListVector(writer1
, new int[] {1, 2});
449 writeListVector(writer1
, new int[] {3, 4});
450 writeListVector(writer1
, new int[] {5, 6});
451 writer1
.setValueCount(3);
453 UnionListWriter writer2
= vector2
.getWriter();
457 writeListVector(writer2
, new int[] {1, 2});
458 writeListVector(writer2
, new int[] {3, 4});
459 writeListVector(writer2
, new int[] {5, 6});
460 writer2
.setValueCount(3);
462 assertTrue(dict1
.equals(dict2
));
467 public void testStructEquals() {
468 try (final StructVector vector1
= StructVector
.empty("struct", allocator
);
469 final StructVector vector2
= StructVector
.empty("struct", allocator
);) {
470 vector1
.addOrGet("f0", FieldType
.nullable(new ArrowType
.Int(32, true)), IntVector
.class);
471 vector1
.addOrGet("f1", FieldType
.nullable(new ArrowType
.Int(64, true)), BigIntVector
.class);
472 vector2
.addOrGet("f0", FieldType
.nullable(new ArrowType
.Int(32, true)), IntVector
.class);
473 vector2
.addOrGet("f1", FieldType
.nullable(new ArrowType
.Int(64, true)), BigIntVector
.class);
475 Dictionary dict1
= new Dictionary(vector1
, new DictionaryEncoding(1L, false, null));
476 Dictionary dict2
= new Dictionary(vector2
, new DictionaryEncoding(1L, false, null));
478 NullableStructWriter writer1
= vector1
.getWriter();
481 writeStructVector(writer1
, 1, 10L);
482 writeStructVector(writer1
, 2, 20L);
483 writer1
.setValueCount(2);
485 NullableStructWriter writer2
= vector2
.getWriter();
488 writeStructVector(writer2
, 1, 10L);
489 writeStructVector(writer2
, 2, 20L);
490 writer2
.setValueCount(2);
492 assertTrue(dict1
.equals(dict2
));
497 public void testUnionEquals() {
498 try (final UnionVector vector1
= new UnionVector("union", allocator
, /* field type */ null, /* call-back */ null);
499 final UnionVector vector2
=
500 new UnionVector("union", allocator
, /* field type */ null, /* call-back */ null);) {
502 final NullableUInt4Holder uInt4Holder
= new NullableUInt4Holder();
503 uInt4Holder
.value
= 10;
504 uInt4Holder
.isSet
= 1;
506 final NullableIntHolder intHolder
= new NullableIntHolder();
507 uInt4Holder
.value
= 20;
508 uInt4Holder
.isSet
= 1;
510 vector1
.setType(0, Types
.MinorType
.UINT4
);
511 vector1
.setSafe(0, uInt4Holder
);
513 vector1
.setType(2, Types
.MinorType
.INT
);
514 vector1
.setSafe(2, intHolder
);
515 vector1
.setValueCount(3);
517 vector2
.setType(0, Types
.MinorType
.UINT4
);
518 vector2
.setSafe(0, uInt4Holder
);
520 vector2
.setType(2, Types
.MinorType
.INT
);
521 vector2
.setSafe(2, intHolder
);
522 vector2
.setValueCount(3);
524 Dictionary dict1
= new Dictionary(vector1
, new DictionaryEncoding(1L, false, null));
525 Dictionary dict2
= new Dictionary(vector2
, new DictionaryEncoding(1L, false, null));
527 assertTrue(dict1
.equals(dict2
));
532 public void testEncodeWithEncoderInstance() {
533 // Create a new value vector
534 try (final VarCharVector vector
= newVarCharVector("vector", allocator
);
535 final VarCharVector dictionaryVector
= newVarCharVector("dict", allocator
);) {
537 setVector(vector
, zero
, one
, one
, two
, zero
);
538 setVector(dictionaryVector
, zero
, one
, two
);
540 Dictionary dictionary
=
541 new Dictionary(dictionaryVector
, new DictionaryEncoding(1L, false, null));
542 DictionaryEncoder encoder
= new DictionaryEncoder(dictionary
, allocator
);
544 try (final ValueVector encoded
= encoder
.encode(vector
)) {
546 assertEquals(IntVector
.class, encoded
.getClass());
548 IntVector index
= ((IntVector
) encoded
);
549 assertEquals(5, index
.getValueCount());
550 assertEquals(0, index
.get(0));
551 assertEquals(1, index
.get(1));
552 assertEquals(1, index
.get(2));
553 assertEquals(2, index
.get(3));
554 assertEquals(0, index
.get(4));
556 // now run through the decoder and verify we get the original back
557 try (ValueVector decoded
= encoder
.decode(encoded
)) {
558 assertEquals(vector
.getClass(), decoded
.getClass());
559 assertEquals(vector
.getValueCount(), (decoded
).getValueCount());
560 for (int i
= 0; i
< 5; i
++) {
561 assertEquals(vector
.getObject(i
), ((VarCharVector
) decoded
).getObject(i
));
569 public void testEncodeMultiVectors() {
570 // Create a new value vector
571 try (final VarCharVector vector1
= newVarCharVector("vector1", allocator
);
572 final VarCharVector vector2
= newVarCharVector("vector2", allocator
);
573 final VarCharVector dictionaryVector
= newVarCharVector("dict", allocator
);) {
575 setVector(vector1
, zero
, one
, one
, two
, zero
);
576 setVector(vector2
, zero
, one
, one
);
577 setVector(dictionaryVector
, zero
, one
, two
);
579 Dictionary dictionary
=
580 new Dictionary(dictionaryVector
, new DictionaryEncoding(1L, false, null));
581 DictionaryEncoder encoder
= new DictionaryEncoder(dictionary
, allocator
);
583 try (final ValueVector encoded
= encoder
.encode(vector1
)) {
585 assertEquals(IntVector
.class, encoded
.getClass());
587 IntVector index
= ((IntVector
) encoded
);
588 assertEquals(5, index
.getValueCount());
589 assertEquals(0, index
.get(0));
590 assertEquals(1, index
.get(1));
591 assertEquals(1, index
.get(2));
592 assertEquals(2, index
.get(3));
593 assertEquals(0, index
.get(4));
595 // now run through the decoder and verify we get the original back
596 try (ValueVector decoded
= encoder
.decode(encoded
)) {
597 assertEquals(vector1
.getClass(), decoded
.getClass());
598 assertEquals(vector1
.getValueCount(), (decoded
).getValueCount());
599 for (int i
= 0; i
< 5; i
++) {
600 assertEquals(vector1
.getObject(i
), ((VarCharVector
) decoded
).getObject(i
));
605 try (final ValueVector encoded
= encoder
.encode(vector2
)) {
607 assertEquals(IntVector
.class, encoded
.getClass());
609 IntVector index
= ((IntVector
) encoded
);
610 assertEquals(3, index
.getValueCount());
611 assertEquals(0, index
.get(0));
612 assertEquals(1, index
.get(1));
613 assertEquals(1, index
.get(2));
615 // now run through the decoder and verify we get the original back
616 try (ValueVector decoded
= encoder
.decode(encoded
)) {
617 assertEquals(vector2
.getClass(), decoded
.getClass());
618 assertEquals(vector2
.getValueCount(), (decoded
).getValueCount());
619 for (int i
= 0; i
< 3; i
++) {
620 assertEquals(vector2
.getObject(i
), ((VarCharVector
) decoded
).getObject(i
));
628 public void testEncodeListSubField() {
629 // Create a new value vector
630 try (final ListVector vector
= ListVector
.empty("vector", allocator
);
631 final ListVector dictionaryVector
= ListVector
.empty("dict", allocator
);) {
633 UnionListWriter writer
= vector
.getWriter();
637 writeListVector(writer
, new int[]{10, 20});
638 writeListVector(writer
, new int[]{10, 20});
639 writeListVector(writer
, new int[]{10, 20});
640 writeListVector(writer
, new int[]{30, 40, 50});
641 writeListVector(writer
, new int[]{30, 40, 50});
642 writeListVector(writer
, new int[]{10, 20});
643 writer
.setValueCount(6);
645 UnionListWriter dictWriter
= dictionaryVector
.getWriter();
646 dictWriter
.allocate();
647 writeListVector(dictWriter
, new int[]{10, 20, 30, 40, 50});
648 dictionaryVector
.setValueCount(1);
650 Dictionary dictionary
= new Dictionary(dictionaryVector
, new DictionaryEncoding(1L, false, null));
651 ListSubfieldEncoder encoder
= new ListSubfieldEncoder(dictionary
, allocator
);
653 try (final ListVector encoded
= (ListVector
) encoder
.encodeListSubField(vector
)) {
655 assertEquals(ListVector
.class, encoded
.getClass());
657 assertEquals(6, encoded
.getValueCount());
658 int[] realValue1
= convertListToIntArray(encoded
.getObject(0));
659 assertTrue(Arrays
.equals(new int[] {0, 1}, realValue1
));
660 int[] realValue2
= convertListToIntArray(encoded
.getObject(1));
661 assertTrue(Arrays
.equals(new int[] {0, 1}, realValue2
));
662 int[] realValue3
= convertListToIntArray(encoded
.getObject(2));
663 assertTrue(Arrays
.equals(new int[] {0, 1}, realValue3
));
664 int[] realValue4
= convertListToIntArray(encoded
.getObject(3));
665 assertTrue(Arrays
.equals(new int[] {2, 3, 4}, realValue4
));
666 int[] realValue5
= convertListToIntArray(encoded
.getObject(4));
667 assertTrue(Arrays
.equals(new int[] {2, 3, 4}, realValue5
));
668 int[] realValue6
= convertListToIntArray(encoded
.getObject(5));
669 assertTrue(Arrays
.equals(new int[] {0, 1}, realValue6
));
671 // now run through the decoder and verify we get the original back
672 try (ValueVector decoded
= encoder
.decodeListSubField(encoded
)) {
673 assertEquals(vector
.getClass(), decoded
.getClass());
674 assertEquals(vector
.getValueCount(), decoded
.getValueCount());
675 for (int i
= 0; i
< 5; i
++) {
676 assertEquals(vector
.getObject(i
), decoded
.getObject(i
));
684 public void testEncodeFixedSizeListSubField() {
685 // Create a new value vector
686 try (final FixedSizeListVector vector
= FixedSizeListVector
.empty("vector", 2, allocator
);
687 final FixedSizeListVector dictionaryVector
= FixedSizeListVector
.empty("dict", 2, allocator
)) {
689 vector
.allocateNew();
690 vector
.setValueCount(4);
692 IntVector dataVector
=
693 (IntVector
) vector
.addOrGetVector(FieldType
.nullable(Types
.MinorType
.INT
.getType())).getVector();
694 dataVector
.allocateNew(8);
695 dataVector
.setValueCount(8);
696 // set value at index 0
697 vector
.setNotNull(0);
698 dataVector
.set(0, 10);
699 dataVector
.set(1, 20);
700 // set value at index 1
701 vector
.setNotNull(1);
702 dataVector
.set(2, 10);
703 dataVector
.set(3, 20);
704 // set value at index 2
705 vector
.setNotNull(2);
706 dataVector
.set(4, 30);
707 dataVector
.set(5, 40);
708 // set value at index 3
709 vector
.setNotNull(3);
710 dataVector
.set(6, 10);
711 dataVector
.set(7, 20);
713 dictionaryVector
.allocateNew();
714 dictionaryVector
.setValueCount(2);
715 IntVector dictDataVector
=
716 (IntVector
) dictionaryVector
.addOrGetVector(FieldType
.nullable(Types
.MinorType
.INT
.getType())).getVector();
717 dictDataVector
.allocateNew(4);
718 dictDataVector
.setValueCount(4);
720 dictionaryVector
.setNotNull(0);
721 dictDataVector
.set(0, 10);
722 dictDataVector
.set(1, 20);
723 dictionaryVector
.setNotNull(1);
724 dictDataVector
.set(2, 30);
725 dictDataVector
.set(3, 40);
727 Dictionary dictionary
= new Dictionary(dictionaryVector
, new DictionaryEncoding(1L, false, null));
728 ListSubfieldEncoder encoder
= new ListSubfieldEncoder(dictionary
, allocator
);
730 try (final FixedSizeListVector encoded
=
731 (FixedSizeListVector
) encoder
.encodeListSubField(vector
)) {
733 assertEquals(FixedSizeListVector
.class, encoded
.getClass());
735 assertEquals(4, encoded
.getValueCount());
736 int[] realValue1
= convertListToIntArray(encoded
.getObject(0));
737 assertTrue(Arrays
.equals(new int[] {0, 1}, realValue1
));
738 int[] realValue2
= convertListToIntArray(encoded
.getObject(1));
739 assertTrue(Arrays
.equals(new int[] {0, 1}, realValue2
));
740 int[] realValue3
= convertListToIntArray(encoded
.getObject(2));
741 assertTrue(Arrays
.equals(new int[] {2, 3}, realValue3
));
742 int[] realValue4
= convertListToIntArray(encoded
.getObject(3));
743 assertTrue(Arrays
.equals(new int[] {0, 1}, realValue4
));
745 // now run through the decoder and verify we get the original back
746 try (ValueVector decoded
= encoder
.decodeListSubField(encoded
)) {
747 assertEquals(vector
.getClass(), decoded
.getClass());
748 assertEquals(vector
.getValueCount(), decoded
.getValueCount());
749 for (int i
= 0; i
< 5; i
++) {
750 assertEquals(vector
.getObject(i
), decoded
.getObject(i
));
758 public void testEncodeStructSubField() {
759 try (final StructVector vector
= StructVector
.empty("vector", allocator
);
760 final VarCharVector dictVector1
= new VarCharVector("f0", allocator
);
761 final VarCharVector dictVector2
= new VarCharVector("f1", allocator
)) {
763 vector
.addOrGet("f0", FieldType
.nullable(ArrowType
.Utf8
.INSTANCE
), VarCharVector
.class);
764 vector
.addOrGet("f1", FieldType
.nullable(ArrowType
.Utf8
.INSTANCE
), VarCharVector
.class);
766 NullableStructWriter writer
= vector
.getWriter();
769 writeStructVector(writer
, "aa", "baz");
770 writeStructVector(writer
, "bb", "bar");
771 writeStructVector(writer
, "cc", "foo");
772 writeStructVector(writer
, "aa", "foo");
773 writeStructVector(writer
, "dd", "foo");
774 writer
.setValueCount(5);
776 // initialize dictionaries
777 DictionaryProvider
.MapDictionaryProvider provider
= new DictionaryProvider
.MapDictionaryProvider();
780 setVector(dictVector1
,
781 "aa".getBytes(StandardCharsets
.UTF_8
),
782 "bb".getBytes(StandardCharsets
.UTF_8
),
783 "cc".getBytes(StandardCharsets
.UTF_8
),
784 "dd".getBytes(StandardCharsets
.UTF_8
));
785 setVector(dictVector2
,
786 "foo".getBytes(StandardCharsets
.UTF_8
),
787 "baz".getBytes(StandardCharsets
.UTF_8
),
788 "bar".getBytes(StandardCharsets
.UTF_8
));
790 provider
.put(new Dictionary(dictVector1
, new DictionaryEncoding(1L, false, null)));
791 provider
.put(new Dictionary(dictVector2
, new DictionaryEncoding(2L, false, null)));
793 StructSubfieldEncoder encoder
= new StructSubfieldEncoder(allocator
, provider
);
794 Map
<Integer
, Long
> columnToDictionaryId
= new HashMap
<>();
795 columnToDictionaryId
.put(0, 1L);
796 columnToDictionaryId
.put(1, 2L);
798 try (final StructVector encoded
= (StructVector
) encoder
.encode(vector
, columnToDictionaryId
)) {
800 assertEquals(StructVector
.class, encoded
.getClass());
802 assertEquals(5, encoded
.getValueCount());
803 Object
[] realValue1
= convertMapValuesToArray(encoded
.getObject(0));
804 assertTrue(Arrays
.equals(new Object
[] {0, 1}, realValue1
));
805 Object
[] realValue2
= convertMapValuesToArray(encoded
.getObject(1));
806 assertTrue(Arrays
.equals(new Object
[] {1, 2}, realValue2
));
807 Object
[] realValue3
= convertMapValuesToArray(encoded
.getObject(2));
808 assertTrue(Arrays
.equals(new Object
[] {2, 0}, realValue3
));
809 Object
[] realValue4
= convertMapValuesToArray(encoded
.getObject(3));
810 assertTrue(Arrays
.equals(new Object
[] {0, 0}, realValue4
));
811 Object
[] realValue5
= convertMapValuesToArray(encoded
.getObject(4));
812 assertTrue(Arrays
.equals(new Object
[] {3, 0}, realValue5
));
814 // now run through the decoder and verify we get the original back
815 try (ValueVector decoded
= encoder
.decode(encoded
)) {
816 assertEquals(vector
.getClass(), decoded
.getClass());
817 assertEquals(vector
.getValueCount(), decoded
.getValueCount());
818 for (int i
= 0; i
< 5; i
++) {
819 assertEquals(vector
.getObject(i
), decoded
.getObject(i
));
827 public void testEncodeStructSubFieldWithCertainColumns() {
828 // in this case, some child vector is encoded and others are not
829 try (final StructVector vector
= StructVector
.empty("vector", allocator
);
830 final VarCharVector dictVector1
= new VarCharVector("f0", allocator
)) {
832 vector
.addOrGet("f0", FieldType
.nullable(ArrowType
.Utf8
.INSTANCE
), VarCharVector
.class);
833 vector
.addOrGet("f1", FieldType
.nullable(ArrowType
.Utf8
.INSTANCE
), VarCharVector
.class);
835 NullableStructWriter writer
= vector
.getWriter();
838 writeStructVector(writer
, "aa", "baz");
839 writeStructVector(writer
, "bb", "bar");
840 writeStructVector(writer
, "cc", "foo");
841 writeStructVector(writer
, "aa", "foo");
842 writeStructVector(writer
, "dd", "foo");
843 writer
.setValueCount(5);
845 // initialize dictionaries
846 DictionaryProvider
.MapDictionaryProvider provider
= new DictionaryProvider
.MapDictionaryProvider();
848 setVector(dictVector1
, "aa".getBytes(), "bb".getBytes(), "cc".getBytes(), "dd".getBytes());
850 provider
.put(new Dictionary(dictVector1
, new DictionaryEncoding(1L, false, null)));
851 StructSubfieldEncoder encoder
= new StructSubfieldEncoder(allocator
, provider
);
852 Map
<Integer
, Long
> columnToDictionaryId
= new HashMap
<>();
853 columnToDictionaryId
.put(0, 1L);
855 try (final StructVector encoded
= (StructVector
) encoder
.encode(vector
, columnToDictionaryId
)) {
857 assertEquals(StructVector
.class, encoded
.getClass());
859 assertEquals(5, encoded
.getValueCount());
860 Object
[] realValue1
= convertMapValuesToArray(encoded
.getObject(0));
861 assertTrue(Arrays
.equals(new Object
[] {0, new Text("baz")}, realValue1
));
862 Object
[] realValue2
= convertMapValuesToArray(encoded
.getObject(1));
863 assertTrue(Arrays
.equals(new Object
[] {1, new Text("bar")}, realValue2
));
864 Object
[] realValue3
= convertMapValuesToArray(encoded
.getObject(2));
865 assertTrue(Arrays
.equals(new Object
[] {2, new Text("foo")}, realValue3
));
866 Object
[] realValue4
= convertMapValuesToArray(encoded
.getObject(3));
867 assertTrue(Arrays
.equals(new Object
[] {0, new Text("foo")}, realValue4
));
868 Object
[] realValue5
= convertMapValuesToArray(encoded
.getObject(4));
869 assertTrue(Arrays
.equals(new Object
[] {3, new Text("foo")}, realValue5
));
871 // now run through the decoder and verify we get the original back
872 try (ValueVector decoded
= encoder
.decode(encoded
)) {
873 assertEquals(vector
.getClass(), decoded
.getClass());
874 assertEquals(vector
.getValueCount(), decoded
.getValueCount());
875 for (int i
= 0; i
< 5; i
++) {
876 assertEquals(vector
.getObject(i
), decoded
.getObject(i
));
884 private void testDictionary(Dictionary dictionary
, ToIntBiFunction
<ValueVector
, Integer
> valGetter
) {
885 try (VarCharVector vector
= new VarCharVector("vector", allocator
)) {
886 setVector(vector
, "1", "3", "5", "7", "9");
887 try (ValueVector encodedVector
= DictionaryEncoder
.encode(vector
, dictionary
)) {
889 // verify encoded result
890 assertEquals(vector
.getValueCount(), encodedVector
.getValueCount());
891 assertEquals(valGetter
.applyAsInt(encodedVector
, 0), 1);
892 assertEquals(valGetter
.applyAsInt(encodedVector
, 1), 3);
893 assertEquals(valGetter
.applyAsInt(encodedVector
, 2), 5);
894 assertEquals(valGetter
.applyAsInt(encodedVector
, 3), 7);
895 assertEquals(valGetter
.applyAsInt(encodedVector
, 4), 9);
897 try (ValueVector decodedVector
= DictionaryEncoder
.decode(encodedVector
, dictionary
)) {
898 assertTrue(decodedVector
instanceof VarCharVector
);
899 assertEquals(vector
.getValueCount(), decodedVector
.getValueCount());
900 assertArrayEquals("1".getBytes(), ((VarCharVector
) decodedVector
).get(0));
901 assertArrayEquals("3".getBytes(), ((VarCharVector
) decodedVector
).get(1));
902 assertArrayEquals("5".getBytes(), ((VarCharVector
) decodedVector
).get(2));
903 assertArrayEquals("7".getBytes(), ((VarCharVector
) decodedVector
).get(3));
904 assertArrayEquals("9".getBytes(), ((VarCharVector
) decodedVector
).get(4));
911 public void testDictionaryUInt1() {
912 try (VarCharVector dictionaryVector
= new VarCharVector("dict vector", allocator
)) {
913 setVector(dictionaryVector
, "0", "1", "2", "3", "4", "5", "6", "7", "8", "9");
914 Dictionary dictionary1
= new Dictionary(dictionaryVector
,
915 new DictionaryEncoding(/*id=*/10L, /*ordered=*/false,
916 /*indexType=*/new ArrowType
.Int(/*bitWidth*/8, /*isSigned*/false)));
917 testDictionary(dictionary1
, (vector
, index
) -> ((UInt1Vector
) vector
).get(index
));
922 public void testDictionaryUInt2() {
923 try (VarCharVector dictionaryVector
= new VarCharVector("dict vector", allocator
)) {
924 setVector(dictionaryVector
, "0", "1", "2", "3", "4", "5", "6", "7", "8", "9");
925 Dictionary dictionary2
= new Dictionary(dictionaryVector
,
926 new DictionaryEncoding(/*id=*/20L, /*ordered=*/false,
927 /*indexType=*/new ArrowType
.Int(/*indexType=*/16, /*isSigned*/false)));
928 testDictionary(dictionary2
, (vector
, index
) -> ((UInt2Vector
) vector
).get(index
));
933 public void testDictionaryUInt4() {
934 try (VarCharVector dictionaryVector
= new VarCharVector("dict vector", allocator
)) {
935 setVector(dictionaryVector
, "0", "1", "2", "3", "4", "5", "6", "7", "8", "9");
936 Dictionary dictionary4
= new Dictionary(dictionaryVector
,
937 new DictionaryEncoding(/*id=*/30L, /*ordered=*/false,
938 /*indexType=*/new ArrowType
.Int(/*indexType=*/32, /*isSigned*/false)));
939 testDictionary(dictionary4
, (vector
, index
) -> ((UInt4Vector
) vector
).get(index
));
944 public void testDictionaryUInt8() {
945 try (VarCharVector dictionaryVector
= new VarCharVector("dict vector", allocator
)) {
946 setVector(dictionaryVector
, "0", "1", "2", "3", "4", "5", "6", "7", "8", "9");
947 Dictionary dictionary8
= new Dictionary(dictionaryVector
,
948 new DictionaryEncoding(/*id=*/40L, /*ordered=*/false,
949 /*indexType=*/new ArrowType
.Int(/*indexType=*/64, /*isSigned*/false)));
950 testDictionary(dictionary8
, (vector
, index
) -> (int) ((UInt8Vector
) vector
).get(index
));
955 public void testDictionaryUIntOverflow() {
956 // the size is within the range of UInt1, but outside the range of TinyInt.
957 final int vecLength
= 256;
958 try (VarCharVector dictionaryVector
= new VarCharVector("dict vector", allocator
)) {
959 dictionaryVector
.allocateNew(vecLength
* 3, vecLength
);
960 for (int i
= 0; i
< vecLength
; i
++) {
961 dictionaryVector
.set(i
, String
.valueOf(i
).getBytes());
963 dictionaryVector
.setValueCount(vecLength
);
965 Dictionary dictionary
= new Dictionary(dictionaryVector
,
966 new DictionaryEncoding(/*id=*/10L, /*ordered=*/false,
967 /*indexType=*/new ArrowType
.Int(/*indexType=*/8, /*isSigned*/false)));
969 try (VarCharVector vector
= new VarCharVector("vector", allocator
)) {
970 setVector(vector
, "255");
971 try (UInt1Vector encodedVector
= (UInt1Vector
) DictionaryEncoder
.encode(vector
, dictionary
)) {
973 // verify encoded result
974 assertEquals(1, encodedVector
.getValueCount());
975 assertEquals(255, encodedVector
.getValueAsLong(0));
977 try (VarCharVector decodedVector
= (VarCharVector
) DictionaryEncoder
.decode(encodedVector
, dictionary
)) {
978 assertEquals(1, decodedVector
.getValueCount());
979 assertArrayEquals("255".getBytes(), decodedVector
.get(0));
986 private int[] convertListToIntArray(List list
) {
987 int[] values
= new int[list
.size()];
988 for (int i
= 0; i
< list
.size(); i
++) {
989 values
[i
] = (int) list
.get(i
);
994 private Object
[] convertMapValuesToArray(Map map
) {
995 Object
[] values
= new Object
[map
.size()];
996 Iterator valueIterator
= map
.values().iterator();
997 for (int i
= 0; i
< map
.size(); i
++) {
998 values
[i
] = valueIterator
.next();
1003 private void writeStructVector(NullableStructWriter writer
, String value1
, String value2
) {
1005 byte[] bytes1
= value1
.getBytes(StandardCharsets
.UTF_8
);
1006 byte[] bytes2
= value2
.getBytes(StandardCharsets
.UTF_8
);
1007 ArrowBuf temp
= allocator
.buffer(bytes1
.length
> bytes2
.length ? bytes1
.length
: bytes2
.length
);
1010 temp
.setBytes(0, bytes1
);
1011 writer
.varChar("f0").writeVarChar(0, bytes1
.length
, temp
);
1012 temp
.setBytes(0, bytes2
);
1013 writer
.varChar("f1").writeVarChar(0, bytes2
.length
, temp
);
1018 private void writeStructVector(NullableStructWriter writer
, int value1
, long value2
) {
1020 writer
.integer("f0").writeInt(value1
);
1021 writer
.bigInt("f1").writeBigInt(value2
);
1025 private void writeListVector(UnionListWriter writer
, int[] values
) {
1027 for (int v
: values
) {
1028 writer
.integer().writeInt(v
);