]> git.proxmox.com Git - ceph.git/blob - ceph/src/arrow/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / java / vector / src / test / java / org / apache / arrow / vector / TestDictionaryVector.java
1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.arrow.vector;
19
20 import static org.apache.arrow.vector.TestUtils.newVarBinaryVector;
21 import static org.apache.arrow.vector.TestUtils.newVarCharVector;
22 import static org.apache.arrow.vector.testing.ValueVectorDataPopulator.setVector;
23 import static org.junit.Assert.assertArrayEquals;
24 import static org.junit.Assert.assertEquals;
25 import static org.junit.Assert.assertFalse;
26 import static org.junit.Assert.assertTrue;
27
28 import java.nio.charset.StandardCharsets;
29 import java.util.Arrays;
30 import java.util.HashMap;
31 import java.util.Iterator;
32 import java.util.List;
33 import java.util.Map;
34 import java.util.function.ToIntBiFunction;
35
36 import org.apache.arrow.memory.ArrowBuf;
37 import org.apache.arrow.memory.BufferAllocator;
38 import org.apache.arrow.vector.complex.FixedSizeListVector;
39 import org.apache.arrow.vector.complex.ListVector;
40 import org.apache.arrow.vector.complex.StructVector;
41 import org.apache.arrow.vector.complex.UnionVector;
42 import org.apache.arrow.vector.complex.impl.NullableStructWriter;
43 import org.apache.arrow.vector.complex.impl.UnionListWriter;
44 import org.apache.arrow.vector.dictionary.Dictionary;
45 import org.apache.arrow.vector.dictionary.DictionaryEncoder;
46 import org.apache.arrow.vector.dictionary.DictionaryProvider;
47 import org.apache.arrow.vector.dictionary.ListSubfieldEncoder;
48 import org.apache.arrow.vector.dictionary.StructSubfieldEncoder;
49 import org.apache.arrow.vector.holders.NullableIntHolder;
50 import org.apache.arrow.vector.holders.NullableUInt4Holder;
51 import org.apache.arrow.vector.types.Types;
52 import org.apache.arrow.vector.types.pojo.ArrowType;
53 import org.apache.arrow.vector.types.pojo.DictionaryEncoding;
54 import org.apache.arrow.vector.types.pojo.FieldType;
55 import org.apache.arrow.vector.util.Text;
56 import org.junit.After;
57 import org.junit.Before;
58 import org.junit.Test;
59
60 public class TestDictionaryVector {
61
62 private BufferAllocator allocator;
63
64 byte[] zero = "foo".getBytes(StandardCharsets.UTF_8);
65 byte[] one = "bar".getBytes(StandardCharsets.UTF_8);
66 byte[] two = "baz".getBytes(StandardCharsets.UTF_8);
67
68 byte[][] data = new byte[][] {zero, one, two};
69
70 @Before
71 public void init() {
72 allocator = new DirtyRootAllocator(Long.MAX_VALUE, (byte) 100);
73 }
74
75 @After
76 public void terminate() throws Exception {
77 allocator.close();
78 }
79
80 @Test
81 public void testEncodeStrings() {
82 // Create a new value vector
83 try (final VarCharVector vector = newVarCharVector("foo", allocator);
84 final VarCharVector dictionaryVector = newVarCharVector("dict", allocator);) {
85
86 setVector(vector, zero, one, one, two, zero);
87 setVector(dictionaryVector, zero, one, two);
88
89 Dictionary dictionary =
90 new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null));
91
92 try (final ValueVector encoded = DictionaryEncoder.encode(vector, dictionary)) {
93 // verify indices
94 assertEquals(IntVector.class, encoded.getClass());
95
96 IntVector index = ((IntVector) encoded);
97 assertEquals(5, index.getValueCount());
98 assertEquals(0, index.get(0));
99 assertEquals(1, index.get(1));
100 assertEquals(1, index.get(2));
101 assertEquals(2, index.get(3));
102 assertEquals(0, index.get(4));
103
104 // now run through the decoder and verify we get the original back
105 try (ValueVector decoded = DictionaryEncoder.decode(encoded, dictionary)) {
106 assertEquals(vector.getClass(), decoded.getClass());
107 assertEquals(vector.getValueCount(), ((VarCharVector) decoded).getValueCount());
108 for (int i = 0; i < 5; i++) {
109 assertEquals(vector.getObject(i), ((VarCharVector) decoded).getObject(i));
110 }
111 }
112 }
113 }
114 }
115
116 @Test
117 public void testEncodeLargeVector() {
118 // Create a new value vector
119 try (final VarCharVector vector = newVarCharVector("foo", allocator);
120 final VarCharVector dictionaryVector = newVarCharVector("dict", allocator);) {
121 vector.allocateNew();
122
123 int count = 10000;
124
125 for (int i = 0; i < 10000; ++i) {
126 vector.setSafe(i, data[i % 3], 0, data[i % 3].length);
127 }
128 vector.setValueCount(count);
129
130 setVector(dictionaryVector, zero, one, two);
131
132 Dictionary dictionary =
133 new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null));
134
135 try (final ValueVector encoded = DictionaryEncoder.encode(vector, dictionary)) {
136 // verify indices
137 assertEquals(IntVector.class, encoded.getClass());
138
139 IntVector index = ((IntVector) encoded);
140 assertEquals(count, index.getValueCount());
141 for (int i = 0; i < count; ++i) {
142 assertEquals(i % 3, index.get(i));
143 }
144
145 // now run through the decoder and verify we get the original back
146 try (ValueVector decoded = DictionaryEncoder.decode(encoded, dictionary)) {
147 assertEquals(vector.getClass(), decoded.getClass());
148 assertEquals(vector.getValueCount(), decoded.getValueCount());
149 for (int i = 0; i < count; ++i) {
150 assertEquals(vector.getObject(i), decoded.getObject(i));
151 }
152 }
153 }
154 }
155 }
156
157 @Test
158 public void testEncodeList() {
159 // Create a new value vector
160 try (final ListVector vector = ListVector.empty("vector", allocator);
161 final ListVector dictionaryVector = ListVector.empty("dict", allocator);) {
162
163 UnionListWriter writer = vector.getWriter();
164 writer.allocate();
165
166 //set some values
167 writeListVector(writer, new int[]{10, 20});
168 writeListVector(writer, new int[]{10, 20});
169 writeListVector(writer, new int[]{10, 20});
170 writeListVector(writer, new int[]{30, 40, 50});
171 writeListVector(writer, new int[]{30, 40, 50});
172 writeListVector(writer, new int[]{10, 20});
173
174 writer.setValueCount(6);
175
176 UnionListWriter dictWriter = dictionaryVector.getWriter();
177 dictWriter.allocate();
178
179 writeListVector(dictWriter, new int[]{10, 20});
180 writeListVector(dictWriter, new int[]{30, 40, 50});
181
182 dictWriter.setValueCount(2);
183
184 Dictionary dictionary = new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null));
185
186 try (final ValueVector encoded = DictionaryEncoder.encode(vector, dictionary)) {
187 // verify indices
188 assertEquals(IntVector.class, encoded.getClass());
189
190 IntVector index = ((IntVector) encoded);
191 assertEquals(6, index.getValueCount());
192 assertEquals(0, index.get(0));
193 assertEquals(0, index.get(1));
194 assertEquals(0, index.get(2));
195 assertEquals(1, index.get(3));
196 assertEquals(1, index.get(4));
197 assertEquals(0, index.get(5));
198
199 // now run through the decoder and verify we get the original back
200 try (ValueVector decoded = DictionaryEncoder.decode(encoded, dictionary)) {
201 assertEquals(vector.getClass(), decoded.getClass());
202 assertEquals(vector.getValueCount(), decoded.getValueCount());
203 for (int i = 0; i < 5; i++) {
204 assertEquals(vector.getObject(i), decoded.getObject(i));
205 }
206 }
207 }
208 }
209 }
210
211 @Test
212 public void testEncodeStruct() {
213 // Create a new value vector
214 try (final StructVector vector = StructVector.empty("vector", allocator);
215 final StructVector dictionaryVector = StructVector.empty("dict", allocator);) {
216 vector.addOrGet("f0", FieldType.nullable(new ArrowType.Int(32, true)), IntVector.class);
217 vector.addOrGet("f1", FieldType.nullable(new ArrowType.Int(64, true)), BigIntVector.class);
218 dictionaryVector.addOrGet("f0", FieldType.nullable(new ArrowType.Int(32, true)), IntVector.class);
219 dictionaryVector.addOrGet("f1", FieldType.nullable(new ArrowType.Int(64, true)), BigIntVector.class);
220
221 NullableStructWriter writer = vector.getWriter();
222 writer.allocate();
223
224 writeStructVector(writer, 1, 10L);
225 writeStructVector(writer, 1, 10L);
226 writeStructVector(writer, 1, 10L);
227 writeStructVector(writer, 2, 20L);
228 writeStructVector(writer, 2, 20L);
229 writeStructVector(writer, 2, 20L);
230 writeStructVector(writer, 1, 10L);
231
232 writer.setValueCount(7);
233
234 NullableStructWriter dictWriter = dictionaryVector.getWriter();
235 dictWriter.allocate();
236
237 writeStructVector(dictWriter, 1, 10L);
238 writeStructVector(dictWriter, 2, 20L);
239
240
241 dictionaryVector.setValueCount(2);
242
243 Dictionary dictionary = new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null));
244
245 try (final ValueVector encoded = DictionaryEncoder.encode(vector, dictionary)) {
246 // verify indices
247 assertEquals(IntVector.class, encoded.getClass());
248
249 IntVector index = ((IntVector) encoded);
250 assertEquals(7, index.getValueCount());
251 assertEquals(0, index.get(0));
252 assertEquals(0, index.get(1));
253 assertEquals(0, index.get(2));
254 assertEquals(1, index.get(3));
255 assertEquals(1, index.get(4));
256 assertEquals(1, index.get(5));
257 assertEquals(0, index.get(6));
258
259 // now run through the decoder and verify we get the original back
260 try (ValueVector decoded = DictionaryEncoder.decode(encoded, dictionary)) {
261 assertEquals(vector.getClass(), decoded.getClass());
262 assertEquals(vector.getValueCount(), decoded.getValueCount());
263 for (int i = 0; i < 5; i++) {
264 assertEquals(vector.getObject(i), decoded.getObject(i));
265 }
266 }
267 }
268 }
269 }
270
271 @Test
272 public void testEncodeBinaryVector() {
273 // Create a new value vector
274 try (final VarBinaryVector vector = newVarBinaryVector("foo", allocator);
275 final VarBinaryVector dictionaryVector = newVarBinaryVector("dict", allocator)) {
276
277 setVector(vector, zero, one, one, two, zero);
278 setVector(dictionaryVector, zero, one, two);
279
280 Dictionary dictionary = new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null));
281
282 try (final ValueVector encoded = DictionaryEncoder.encode(vector, dictionary)) {
283 // verify indices
284 assertEquals(IntVector.class, encoded.getClass());
285
286 IntVector index = ((IntVector) encoded);
287 assertEquals(5, index.getValueCount());
288 assertEquals(0, index.get(0));
289 assertEquals(1, index.get(1));
290 assertEquals(1, index.get(2));
291 assertEquals(2, index.get(3));
292 assertEquals(0, index.get(4));
293
294 // now run through the decoder and verify we get the original back
295 try (VarBinaryVector decoded = (VarBinaryVector) DictionaryEncoder.decode(encoded, dictionary)) {
296 assertEquals(vector.getClass(), decoded.getClass());
297 assertEquals(vector.getValueCount(), decoded.getValueCount());
298 for (int i = 0; i < 5; i++) {
299 assertTrue(Arrays.equals(vector.getObject(i), decoded.getObject(i)));
300 }
301 }
302 }
303 }
304 }
305
306 @Test
307 public void testEncodeUnion() {
308 // Create a new value vector
309 try (final UnionVector vector = new UnionVector("vector", allocator, /* field type */ null, /* call-back */ null);
310 final UnionVector dictionaryVector =
311 new UnionVector("dict", allocator, /* field type */ null, /* call-back */ null);) {
312
313 final NullableUInt4Holder uintHolder1 = new NullableUInt4Holder();
314 uintHolder1.value = 10;
315 uintHolder1.isSet = 1;
316
317 final NullableIntHolder intHolder1 = new NullableIntHolder();
318 intHolder1.value = 10;
319 intHolder1.isSet = 1;
320
321 final NullableIntHolder intHolder2 = new NullableIntHolder();
322 intHolder2.value = 20;
323 intHolder2.isSet = 1;
324
325 //write data
326 vector.setType(0, Types.MinorType.UINT4);
327 vector.setSafe(0, uintHolder1);
328
329 vector.setType(1, Types.MinorType.INT);
330 vector.setSafe(1, intHolder1);
331
332 vector.setType(2, Types.MinorType.INT);
333 vector.setSafe(2, intHolder1);
334
335 vector.setType(3, Types.MinorType.INT);
336 vector.setSafe(3, intHolder2);
337
338 vector.setType(4, Types.MinorType.INT);
339 vector.setSafe(4, intHolder2);
340
341 vector.setValueCount(5);
342
343 //write dictionary
344 dictionaryVector.setType(0, Types.MinorType.UINT4);
345 dictionaryVector.setSafe(0, uintHolder1);
346
347 dictionaryVector.setType(1, Types.MinorType.INT);
348 dictionaryVector.setSafe(1, intHolder1);
349
350 dictionaryVector.setType(2, Types.MinorType.INT);
351 dictionaryVector.setSafe(2, intHolder2);
352
353 dictionaryVector.setValueCount(3);
354
355 Dictionary dictionary = new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null));
356
357 try (final ValueVector encoded = DictionaryEncoder.encode(vector, dictionary)) {
358 // verify indices
359 assertEquals(IntVector.class, encoded.getClass());
360
361 IntVector index = ((IntVector) encoded);
362 assertEquals(5, index.getValueCount());
363 assertEquals(0, index.get(0));
364 assertEquals(1, index.get(1));
365 assertEquals(1, index.get(2));
366 assertEquals(2, index.get(3));
367 assertEquals(2, index.get(4));
368
369 // now run through the decoder and verify we get the original back
370 try (ValueVector decoded = DictionaryEncoder.decode(encoded, dictionary)) {
371 assertEquals(vector.getClass(), decoded.getClass());
372 assertEquals(vector.getValueCount(), decoded.getValueCount());
373 for (int i = 0; i < 5; i++) {
374 assertEquals(vector.getObject(i), decoded.getObject(i));
375 }
376 }
377 }
378 }
379 }
380
381 @Test
382 public void testIntEquals() {
383 //test Int
384 try (final IntVector vector1 = new IntVector("int", allocator);
385 final IntVector vector2 = new IntVector("int", allocator)) {
386
387 Dictionary dict1 = new Dictionary(vector1, new DictionaryEncoding(1L, false, null));
388 Dictionary dict2 = new Dictionary(vector2, new DictionaryEncoding(1L, false, null));
389
390 setVector(vector1, 1, 2, 3);
391 setVector(vector2, 1, 2, 0);
392
393 assertFalse(dict1.equals(dict2));
394
395 vector2.setSafe(2, 3);
396 assertTrue(dict1.equals(dict2));
397 }
398 }
399
400 @Test
401 public void testVarcharEquals() {
402 try (final VarCharVector vector1 = new VarCharVector("varchar", allocator);
403 final VarCharVector vector2 = new VarCharVector("varchar", allocator)) {
404
405 Dictionary dict1 = new Dictionary(vector1, new DictionaryEncoding(1L, false, null));
406 Dictionary dict2 = new Dictionary(vector2, new DictionaryEncoding(1L, false, null));
407
408 setVector(vector1, zero, one, two);
409 setVector(vector2, zero, one, one);
410
411 assertFalse(dict1.equals(dict2));
412
413 vector2.setSafe(2, two, 0, two.length);
414 assertTrue(dict1.equals(dict2));
415 }
416 }
417
418 @Test
419 public void testVarBinaryEquals() {
420 try (final VarBinaryVector vector1 = new VarBinaryVector("binary", allocator);
421 final VarBinaryVector vector2 = new VarBinaryVector("binary", allocator)) {
422
423 Dictionary dict1 = new Dictionary(vector1, new DictionaryEncoding(1L, false, null));
424 Dictionary dict2 = new Dictionary(vector2, new DictionaryEncoding(1L, false, null));
425
426 setVector(vector1, zero, one, two);
427 setVector(vector2, zero, one, one);
428
429 assertFalse(dict1.equals(dict2));
430
431 vector2.setSafe(2, two, 0, two.length);
432 assertTrue(dict1.equals(dict2));
433 }
434 }
435
436 @Test
437 public void testListEquals() {
438 try (final ListVector vector1 = ListVector.empty("list", allocator);
439 final ListVector vector2 = ListVector.empty("list", allocator);) {
440
441 Dictionary dict1 = new Dictionary(vector1, new DictionaryEncoding(1L, false, null));
442 Dictionary dict2 = new Dictionary(vector2, new DictionaryEncoding(1L, false, null));
443
444 UnionListWriter writer1 = vector1.getWriter();
445 writer1.allocate();
446
447 //set some values
448 writeListVector(writer1, new int[] {1, 2});
449 writeListVector(writer1, new int[] {3, 4});
450 writeListVector(writer1, new int[] {5, 6});
451 writer1.setValueCount(3);
452
453 UnionListWriter writer2 = vector2.getWriter();
454 writer2.allocate();
455
456 //set some values
457 writeListVector(writer2, new int[] {1, 2});
458 writeListVector(writer2, new int[] {3, 4});
459 writeListVector(writer2, new int[] {5, 6});
460 writer2.setValueCount(3);
461
462 assertTrue(dict1.equals(dict2));
463 }
464 }
465
466 @Test
467 public void testStructEquals() {
468 try (final StructVector vector1 = StructVector.empty("struct", allocator);
469 final StructVector vector2 = StructVector.empty("struct", allocator);) {
470 vector1.addOrGet("f0", FieldType.nullable(new ArrowType.Int(32, true)), IntVector.class);
471 vector1.addOrGet("f1", FieldType.nullable(new ArrowType.Int(64, true)), BigIntVector.class);
472 vector2.addOrGet("f0", FieldType.nullable(new ArrowType.Int(32, true)), IntVector.class);
473 vector2.addOrGet("f1", FieldType.nullable(new ArrowType.Int(64, true)), BigIntVector.class);
474
475 Dictionary dict1 = new Dictionary(vector1, new DictionaryEncoding(1L, false, null));
476 Dictionary dict2 = new Dictionary(vector2, new DictionaryEncoding(1L, false, null));
477
478 NullableStructWriter writer1 = vector1.getWriter();
479 writer1.allocate();
480
481 writeStructVector(writer1, 1, 10L);
482 writeStructVector(writer1, 2, 20L);
483 writer1.setValueCount(2);
484
485 NullableStructWriter writer2 = vector2.getWriter();
486 writer2.allocate();
487
488 writeStructVector(writer2, 1, 10L);
489 writeStructVector(writer2, 2, 20L);
490 writer2.setValueCount(2);
491
492 assertTrue(dict1.equals(dict2));
493 }
494 }
495
496 @Test
497 public void testUnionEquals() {
498 try (final UnionVector vector1 = new UnionVector("union", allocator, /* field type */ null, /* call-back */ null);
499 final UnionVector vector2 =
500 new UnionVector("union", allocator, /* field type */ null, /* call-back */ null);) {
501
502 final NullableUInt4Holder uInt4Holder = new NullableUInt4Holder();
503 uInt4Holder.value = 10;
504 uInt4Holder.isSet = 1;
505
506 final NullableIntHolder intHolder = new NullableIntHolder();
507 uInt4Holder.value = 20;
508 uInt4Holder.isSet = 1;
509
510 vector1.setType(0, Types.MinorType.UINT4);
511 vector1.setSafe(0, uInt4Holder);
512
513 vector1.setType(2, Types.MinorType.INT);
514 vector1.setSafe(2, intHolder);
515 vector1.setValueCount(3);
516
517 vector2.setType(0, Types.MinorType.UINT4);
518 vector2.setSafe(0, uInt4Holder);
519
520 vector2.setType(2, Types.MinorType.INT);
521 vector2.setSafe(2, intHolder);
522 vector2.setValueCount(3);
523
524 Dictionary dict1 = new Dictionary(vector1, new DictionaryEncoding(1L, false, null));
525 Dictionary dict2 = new Dictionary(vector2, new DictionaryEncoding(1L, false, null));
526
527 assertTrue(dict1.equals(dict2));
528 }
529 }
530
531 @Test
532 public void testEncodeWithEncoderInstance() {
533 // Create a new value vector
534 try (final VarCharVector vector = newVarCharVector("vector", allocator);
535 final VarCharVector dictionaryVector = newVarCharVector("dict", allocator);) {
536
537 setVector(vector, zero, one, one, two, zero);
538 setVector(dictionaryVector, zero, one, two);
539
540 Dictionary dictionary =
541 new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null));
542 DictionaryEncoder encoder = new DictionaryEncoder(dictionary, allocator);
543
544 try (final ValueVector encoded = encoder.encode(vector)) {
545 // verify indices
546 assertEquals(IntVector.class, encoded.getClass());
547
548 IntVector index = ((IntVector) encoded);
549 assertEquals(5, index.getValueCount());
550 assertEquals(0, index.get(0));
551 assertEquals(1, index.get(1));
552 assertEquals(1, index.get(2));
553 assertEquals(2, index.get(3));
554 assertEquals(0, index.get(4));
555
556 // now run through the decoder and verify we get the original back
557 try (ValueVector decoded = encoder.decode(encoded)) {
558 assertEquals(vector.getClass(), decoded.getClass());
559 assertEquals(vector.getValueCount(), (decoded).getValueCount());
560 for (int i = 0; i < 5; i++) {
561 assertEquals(vector.getObject(i), ((VarCharVector) decoded).getObject(i));
562 }
563 }
564 }
565 }
566 }
567
568 @Test
569 public void testEncodeMultiVectors() {
570 // Create a new value vector
571 try (final VarCharVector vector1 = newVarCharVector("vector1", allocator);
572 final VarCharVector vector2 = newVarCharVector("vector2", allocator);
573 final VarCharVector dictionaryVector = newVarCharVector("dict", allocator);) {
574
575 setVector(vector1, zero, one, one, two, zero);
576 setVector(vector2, zero, one, one);
577 setVector(dictionaryVector, zero, one, two);
578
579 Dictionary dictionary =
580 new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null));
581 DictionaryEncoder encoder = new DictionaryEncoder(dictionary, allocator);
582
583 try (final ValueVector encoded = encoder.encode(vector1)) {
584 // verify indices
585 assertEquals(IntVector.class, encoded.getClass());
586
587 IntVector index = ((IntVector) encoded);
588 assertEquals(5, index.getValueCount());
589 assertEquals(0, index.get(0));
590 assertEquals(1, index.get(1));
591 assertEquals(1, index.get(2));
592 assertEquals(2, index.get(3));
593 assertEquals(0, index.get(4));
594
595 // now run through the decoder and verify we get the original back
596 try (ValueVector decoded = encoder.decode(encoded)) {
597 assertEquals(vector1.getClass(), decoded.getClass());
598 assertEquals(vector1.getValueCount(), (decoded).getValueCount());
599 for (int i = 0; i < 5; i++) {
600 assertEquals(vector1.getObject(i), ((VarCharVector) decoded).getObject(i));
601 }
602 }
603 }
604
605 try (final ValueVector encoded = encoder.encode(vector2)) {
606 // verify indices
607 assertEquals(IntVector.class, encoded.getClass());
608
609 IntVector index = ((IntVector) encoded);
610 assertEquals(3, index.getValueCount());
611 assertEquals(0, index.get(0));
612 assertEquals(1, index.get(1));
613 assertEquals(1, index.get(2));
614
615 // now run through the decoder and verify we get the original back
616 try (ValueVector decoded = encoder.decode(encoded)) {
617 assertEquals(vector2.getClass(), decoded.getClass());
618 assertEquals(vector2.getValueCount(), (decoded).getValueCount());
619 for (int i = 0; i < 3; i++) {
620 assertEquals(vector2.getObject(i), ((VarCharVector) decoded).getObject(i));
621 }
622 }
623 }
624 }
625 }
626
627 @Test
628 public void testEncodeListSubField() {
629 // Create a new value vector
630 try (final ListVector vector = ListVector.empty("vector", allocator);
631 final ListVector dictionaryVector = ListVector.empty("dict", allocator);) {
632
633 UnionListWriter writer = vector.getWriter();
634 writer.allocate();
635
636 //set some values
637 writeListVector(writer, new int[]{10, 20});
638 writeListVector(writer, new int[]{10, 20});
639 writeListVector(writer, new int[]{10, 20});
640 writeListVector(writer, new int[]{30, 40, 50});
641 writeListVector(writer, new int[]{30, 40, 50});
642 writeListVector(writer, new int[]{10, 20});
643 writer.setValueCount(6);
644
645 UnionListWriter dictWriter = dictionaryVector.getWriter();
646 dictWriter.allocate();
647 writeListVector(dictWriter, new int[]{10, 20, 30, 40, 50});
648 dictionaryVector.setValueCount(1);
649
650 Dictionary dictionary = new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null));
651 ListSubfieldEncoder encoder = new ListSubfieldEncoder(dictionary, allocator);
652
653 try (final ListVector encoded = (ListVector) encoder.encodeListSubField(vector)) {
654 // verify indices
655 assertEquals(ListVector.class, encoded.getClass());
656
657 assertEquals(6, encoded.getValueCount());
658 int[] realValue1 = convertListToIntArray(encoded.getObject(0));
659 assertTrue(Arrays.equals(new int[] {0, 1}, realValue1));
660 int[] realValue2 = convertListToIntArray(encoded.getObject(1));
661 assertTrue(Arrays.equals(new int[] {0, 1}, realValue2));
662 int[] realValue3 = convertListToIntArray(encoded.getObject(2));
663 assertTrue(Arrays.equals(new int[] {0, 1}, realValue3));
664 int[] realValue4 = convertListToIntArray(encoded.getObject(3));
665 assertTrue(Arrays.equals(new int[] {2, 3, 4}, realValue4));
666 int[] realValue5 = convertListToIntArray(encoded.getObject(4));
667 assertTrue(Arrays.equals(new int[] {2, 3, 4}, realValue5));
668 int[] realValue6 = convertListToIntArray(encoded.getObject(5));
669 assertTrue(Arrays.equals(new int[] {0, 1}, realValue6));
670
671 // now run through the decoder and verify we get the original back
672 try (ValueVector decoded = encoder.decodeListSubField(encoded)) {
673 assertEquals(vector.getClass(), decoded.getClass());
674 assertEquals(vector.getValueCount(), decoded.getValueCount());
675 for (int i = 0; i < 5; i++) {
676 assertEquals(vector.getObject(i), decoded.getObject(i));
677 }
678 }
679 }
680 }
681 }
682
683 @Test
684 public void testEncodeFixedSizeListSubField() {
685 // Create a new value vector
686 try (final FixedSizeListVector vector = FixedSizeListVector.empty("vector", 2, allocator);
687 final FixedSizeListVector dictionaryVector = FixedSizeListVector.empty("dict", 2, allocator)) {
688
689 vector.allocateNew();
690 vector.setValueCount(4);
691
692 IntVector dataVector =
693 (IntVector) vector.addOrGetVector(FieldType.nullable(Types.MinorType.INT.getType())).getVector();
694 dataVector.allocateNew(8);
695 dataVector.setValueCount(8);
696 // set value at index 0
697 vector.setNotNull(0);
698 dataVector.set(0, 10);
699 dataVector.set(1, 20);
700 // set value at index 1
701 vector.setNotNull(1);
702 dataVector.set(2, 10);
703 dataVector.set(3, 20);
704 // set value at index 2
705 vector.setNotNull(2);
706 dataVector.set(4, 30);
707 dataVector.set(5, 40);
708 // set value at index 3
709 vector.setNotNull(3);
710 dataVector.set(6, 10);
711 dataVector.set(7, 20);
712
713 dictionaryVector.allocateNew();
714 dictionaryVector.setValueCount(2);
715 IntVector dictDataVector =
716 (IntVector) dictionaryVector.addOrGetVector(FieldType.nullable(Types.MinorType.INT.getType())).getVector();
717 dictDataVector.allocateNew(4);
718 dictDataVector.setValueCount(4);
719
720 dictionaryVector.setNotNull(0);
721 dictDataVector.set(0, 10);
722 dictDataVector.set(1, 20);
723 dictionaryVector.setNotNull(1);
724 dictDataVector.set(2, 30);
725 dictDataVector.set(3, 40);
726
727 Dictionary dictionary = new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null));
728 ListSubfieldEncoder encoder = new ListSubfieldEncoder(dictionary, allocator);
729
730 try (final FixedSizeListVector encoded =
731 (FixedSizeListVector) encoder.encodeListSubField(vector)) {
732 // verify indices
733 assertEquals(FixedSizeListVector.class, encoded.getClass());
734
735 assertEquals(4, encoded.getValueCount());
736 int[] realValue1 = convertListToIntArray(encoded.getObject(0));
737 assertTrue(Arrays.equals(new int[] {0, 1}, realValue1));
738 int[] realValue2 = convertListToIntArray(encoded.getObject(1));
739 assertTrue(Arrays.equals(new int[] {0, 1}, realValue2));
740 int[] realValue3 = convertListToIntArray(encoded.getObject(2));
741 assertTrue(Arrays.equals(new int[] {2, 3}, realValue3));
742 int[] realValue4 = convertListToIntArray(encoded.getObject(3));
743 assertTrue(Arrays.equals(new int[] {0, 1}, realValue4));
744
745 // now run through the decoder and verify we get the original back
746 try (ValueVector decoded = encoder.decodeListSubField(encoded)) {
747 assertEquals(vector.getClass(), decoded.getClass());
748 assertEquals(vector.getValueCount(), decoded.getValueCount());
749 for (int i = 0; i < 5; i++) {
750 assertEquals(vector.getObject(i), decoded.getObject(i));
751 }
752 }
753 }
754 }
755 }
756
757 @Test
758 public void testEncodeStructSubField() {
759 try (final StructVector vector = StructVector.empty("vector", allocator);
760 final VarCharVector dictVector1 = new VarCharVector("f0", allocator);
761 final VarCharVector dictVector2 = new VarCharVector("f1", allocator)) {
762
763 vector.addOrGet("f0", FieldType.nullable(ArrowType.Utf8.INSTANCE), VarCharVector.class);
764 vector.addOrGet("f1", FieldType.nullable(ArrowType.Utf8.INSTANCE), VarCharVector.class);
765
766 NullableStructWriter writer = vector.getWriter();
767 writer.allocate();
768 //set some values
769 writeStructVector(writer, "aa", "baz");
770 writeStructVector(writer, "bb", "bar");
771 writeStructVector(writer, "cc", "foo");
772 writeStructVector(writer, "aa", "foo");
773 writeStructVector(writer, "dd", "foo");
774 writer.setValueCount(5);
775
776 // initialize dictionaries
777 DictionaryProvider.MapDictionaryProvider provider = new DictionaryProvider.MapDictionaryProvider();
778
779
780 setVector(dictVector1,
781 "aa".getBytes(StandardCharsets.UTF_8),
782 "bb".getBytes(StandardCharsets.UTF_8),
783 "cc".getBytes(StandardCharsets.UTF_8),
784 "dd".getBytes(StandardCharsets.UTF_8));
785 setVector(dictVector2,
786 "foo".getBytes(StandardCharsets.UTF_8),
787 "baz".getBytes(StandardCharsets.UTF_8),
788 "bar".getBytes(StandardCharsets.UTF_8));
789
790 provider.put(new Dictionary(dictVector1, new DictionaryEncoding(1L, false, null)));
791 provider.put(new Dictionary(dictVector2, new DictionaryEncoding(2L, false, null)));
792
793 StructSubfieldEncoder encoder = new StructSubfieldEncoder(allocator, provider);
794 Map<Integer, Long> columnToDictionaryId = new HashMap<>();
795 columnToDictionaryId.put(0, 1L);
796 columnToDictionaryId.put(1, 2L);
797
798 try (final StructVector encoded = (StructVector) encoder.encode(vector, columnToDictionaryId)) {
799 // verify indices
800 assertEquals(StructVector.class, encoded.getClass());
801
802 assertEquals(5, encoded.getValueCount());
803 Object[] realValue1 = convertMapValuesToArray(encoded.getObject(0));
804 assertTrue(Arrays.equals(new Object[] {0, 1}, realValue1));
805 Object[] realValue2 = convertMapValuesToArray(encoded.getObject(1));
806 assertTrue(Arrays.equals(new Object[] {1, 2}, realValue2));
807 Object[] realValue3 = convertMapValuesToArray(encoded.getObject(2));
808 assertTrue(Arrays.equals(new Object[] {2, 0}, realValue3));
809 Object[] realValue4 = convertMapValuesToArray(encoded.getObject(3));
810 assertTrue(Arrays.equals(new Object[] {0, 0}, realValue4));
811 Object[] realValue5 = convertMapValuesToArray(encoded.getObject(4));
812 assertTrue(Arrays.equals(new Object[] {3, 0}, realValue5));
813
814 // now run through the decoder and verify we get the original back
815 try (ValueVector decoded = encoder.decode(encoded)) {
816 assertEquals(vector.getClass(), decoded.getClass());
817 assertEquals(vector.getValueCount(), decoded.getValueCount());
818 for (int i = 0; i < 5; i++) {
819 assertEquals(vector.getObject(i), decoded.getObject(i));
820 }
821 }
822 }
823 }
824 }
825
826 @Test
827 public void testEncodeStructSubFieldWithCertainColumns() {
828 // in this case, some child vector is encoded and others are not
829 try (final StructVector vector = StructVector.empty("vector", allocator);
830 final VarCharVector dictVector1 = new VarCharVector("f0", allocator)) {
831
832 vector.addOrGet("f0", FieldType.nullable(ArrowType.Utf8.INSTANCE), VarCharVector.class);
833 vector.addOrGet("f1", FieldType.nullable(ArrowType.Utf8.INSTANCE), VarCharVector.class);
834
835 NullableStructWriter writer = vector.getWriter();
836 writer.allocate();
837 //set some values
838 writeStructVector(writer, "aa", "baz");
839 writeStructVector(writer, "bb", "bar");
840 writeStructVector(writer, "cc", "foo");
841 writeStructVector(writer, "aa", "foo");
842 writeStructVector(writer, "dd", "foo");
843 writer.setValueCount(5);
844
845 // initialize dictionaries
846 DictionaryProvider.MapDictionaryProvider provider = new DictionaryProvider.MapDictionaryProvider();
847
848 setVector(dictVector1, "aa".getBytes(), "bb".getBytes(), "cc".getBytes(), "dd".getBytes());
849
850 provider.put(new Dictionary(dictVector1, new DictionaryEncoding(1L, false, null)));
851 StructSubfieldEncoder encoder = new StructSubfieldEncoder(allocator, provider);
852 Map<Integer, Long> columnToDictionaryId = new HashMap<>();
853 columnToDictionaryId.put(0, 1L);
854
855 try (final StructVector encoded = (StructVector) encoder.encode(vector, columnToDictionaryId)) {
856 // verify indices
857 assertEquals(StructVector.class, encoded.getClass());
858
859 assertEquals(5, encoded.getValueCount());
860 Object[] realValue1 = convertMapValuesToArray(encoded.getObject(0));
861 assertTrue(Arrays.equals(new Object[] {0, new Text("baz")}, realValue1));
862 Object[] realValue2 = convertMapValuesToArray(encoded.getObject(1));
863 assertTrue(Arrays.equals(new Object[] {1, new Text("bar")}, realValue2));
864 Object[] realValue3 = convertMapValuesToArray(encoded.getObject(2));
865 assertTrue(Arrays.equals(new Object[] {2, new Text("foo")}, realValue3));
866 Object[] realValue4 = convertMapValuesToArray(encoded.getObject(3));
867 assertTrue(Arrays.equals(new Object[] {0, new Text("foo")}, realValue4));
868 Object[] realValue5 = convertMapValuesToArray(encoded.getObject(4));
869 assertTrue(Arrays.equals(new Object[] {3, new Text("foo")}, realValue5));
870
871 // now run through the decoder and verify we get the original back
872 try (ValueVector decoded = encoder.decode(encoded)) {
873 assertEquals(vector.getClass(), decoded.getClass());
874 assertEquals(vector.getValueCount(), decoded.getValueCount());
875 for (int i = 0; i < 5; i++) {
876 assertEquals(vector.getObject(i), decoded.getObject(i));
877 }
878 }
879 }
880
881 }
882 }
883
884 private void testDictionary(Dictionary dictionary, ToIntBiFunction<ValueVector, Integer> valGetter) {
885 try (VarCharVector vector = new VarCharVector("vector", allocator)) {
886 setVector(vector, "1", "3", "5", "7", "9");
887 try (ValueVector encodedVector = DictionaryEncoder.encode(vector, dictionary)) {
888
889 // verify encoded result
890 assertEquals(vector.getValueCount(), encodedVector.getValueCount());
891 assertEquals(valGetter.applyAsInt(encodedVector, 0), 1);
892 assertEquals(valGetter.applyAsInt(encodedVector, 1), 3);
893 assertEquals(valGetter.applyAsInt(encodedVector, 2), 5);
894 assertEquals(valGetter.applyAsInt(encodedVector, 3), 7);
895 assertEquals(valGetter.applyAsInt(encodedVector, 4), 9);
896
897 try (ValueVector decodedVector = DictionaryEncoder.decode(encodedVector, dictionary)) {
898 assertTrue(decodedVector instanceof VarCharVector);
899 assertEquals(vector.getValueCount(), decodedVector.getValueCount());
900 assertArrayEquals("1".getBytes(), ((VarCharVector) decodedVector).get(0));
901 assertArrayEquals("3".getBytes(), ((VarCharVector) decodedVector).get(1));
902 assertArrayEquals("5".getBytes(), ((VarCharVector) decodedVector).get(2));
903 assertArrayEquals("7".getBytes(), ((VarCharVector) decodedVector).get(3));
904 assertArrayEquals("9".getBytes(), ((VarCharVector) decodedVector).get(4));
905 }
906 }
907 }
908 }
909
910 @Test
911 public void testDictionaryUInt1() {
912 try (VarCharVector dictionaryVector = new VarCharVector("dict vector", allocator)) {
913 setVector(dictionaryVector, "0", "1", "2", "3", "4", "5", "6", "7", "8", "9");
914 Dictionary dictionary1 = new Dictionary(dictionaryVector,
915 new DictionaryEncoding(/*id=*/10L, /*ordered=*/false,
916 /*indexType=*/new ArrowType.Int(/*bitWidth*/8, /*isSigned*/false)));
917 testDictionary(dictionary1, (vector, index) -> ((UInt1Vector) vector).get(index));
918 }
919 }
920
921 @Test
922 public void testDictionaryUInt2() {
923 try (VarCharVector dictionaryVector = new VarCharVector("dict vector", allocator)) {
924 setVector(dictionaryVector, "0", "1", "2", "3", "4", "5", "6", "7", "8", "9");
925 Dictionary dictionary2 = new Dictionary(dictionaryVector,
926 new DictionaryEncoding(/*id=*/20L, /*ordered=*/false,
927 /*indexType=*/new ArrowType.Int(/*indexType=*/16, /*isSigned*/false)));
928 testDictionary(dictionary2, (vector, index) -> ((UInt2Vector) vector).get(index));
929 }
930 }
931
932 @Test
933 public void testDictionaryUInt4() {
934 try (VarCharVector dictionaryVector = new VarCharVector("dict vector", allocator)) {
935 setVector(dictionaryVector, "0", "1", "2", "3", "4", "5", "6", "7", "8", "9");
936 Dictionary dictionary4 = new Dictionary(dictionaryVector,
937 new DictionaryEncoding(/*id=*/30L, /*ordered=*/false,
938 /*indexType=*/new ArrowType.Int(/*indexType=*/32, /*isSigned*/false)));
939 testDictionary(dictionary4, (vector, index) -> ((UInt4Vector) vector).get(index));
940 }
941 }
942
943 @Test
944 public void testDictionaryUInt8() {
945 try (VarCharVector dictionaryVector = new VarCharVector("dict vector", allocator)) {
946 setVector(dictionaryVector, "0", "1", "2", "3", "4", "5", "6", "7", "8", "9");
947 Dictionary dictionary8 = new Dictionary(dictionaryVector,
948 new DictionaryEncoding(/*id=*/40L, /*ordered=*/false,
949 /*indexType=*/new ArrowType.Int(/*indexType=*/64, /*isSigned*/false)));
950 testDictionary(dictionary8, (vector, index) -> (int) ((UInt8Vector) vector).get(index));
951 }
952 }
953
954 @Test
955 public void testDictionaryUIntOverflow() {
956 // the size is within the range of UInt1, but outside the range of TinyInt.
957 final int vecLength = 256;
958 try (VarCharVector dictionaryVector = new VarCharVector("dict vector", allocator)) {
959 dictionaryVector.allocateNew(vecLength * 3, vecLength);
960 for (int i = 0; i < vecLength; i++) {
961 dictionaryVector.set(i, String.valueOf(i).getBytes());
962 }
963 dictionaryVector.setValueCount(vecLength);
964
965 Dictionary dictionary = new Dictionary(dictionaryVector,
966 new DictionaryEncoding(/*id=*/10L, /*ordered=*/false,
967 /*indexType=*/new ArrowType.Int(/*indexType=*/8, /*isSigned*/false)));
968
969 try (VarCharVector vector = new VarCharVector("vector", allocator)) {
970 setVector(vector, "255");
971 try (UInt1Vector encodedVector = (UInt1Vector) DictionaryEncoder.encode(vector, dictionary)) {
972
973 // verify encoded result
974 assertEquals(1, encodedVector.getValueCount());
975 assertEquals(255, encodedVector.getValueAsLong(0));
976
977 try (VarCharVector decodedVector = (VarCharVector) DictionaryEncoder.decode(encodedVector, dictionary)) {
978 assertEquals(1, decodedVector.getValueCount());
979 assertArrayEquals("255".getBytes(), decodedVector.get(0));
980 }
981 }
982 }
983 }
984 }
985
986 private int[] convertListToIntArray(List list) {
987 int[] values = new int[list.size()];
988 for (int i = 0; i < list.size(); i++) {
989 values[i] = (int) list.get(i);
990 }
991 return values;
992 }
993
994 private Object[] convertMapValuesToArray(Map map) {
995 Object[] values = new Object[map.size()];
996 Iterator valueIterator = map.values().iterator();
997 for (int i = 0; i < map.size(); i++) {
998 values[i] = valueIterator.next();
999 }
1000 return values;
1001 }
1002
1003 private void writeStructVector(NullableStructWriter writer, String value1, String value2) {
1004
1005 byte[] bytes1 = value1.getBytes(StandardCharsets.UTF_8);
1006 byte[] bytes2 = value2.getBytes(StandardCharsets.UTF_8);
1007 ArrowBuf temp = allocator.buffer(bytes1.length > bytes2.length ? bytes1.length : bytes2.length);
1008
1009 writer.start();
1010 temp.setBytes(0, bytes1);
1011 writer.varChar("f0").writeVarChar(0, bytes1.length, temp);
1012 temp.setBytes(0, bytes2);
1013 writer.varChar("f1").writeVarChar(0, bytes2.length, temp);
1014 writer.end();
1015 temp.close();
1016 }
1017
1018 private void writeStructVector(NullableStructWriter writer, int value1, long value2) {
1019 writer.start();
1020 writer.integer("f0").writeInt(value1);
1021 writer.bigInt("f1").writeBigInt(value2);
1022 writer.end();
1023 }
1024
1025 private void writeListVector(UnionListWriter writer, int[] values) {
1026 writer.startList();
1027 for (int v: values) {
1028 writer.integer().writeInt(v);
1029 }
1030 writer.endList();
1031 }
1032 }