2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
18 package org
.apache
.arrow
.algorithm
.dictionary
;
20 import static junit
.framework
.TestCase
.assertTrue
;
21 import static org
.junit
.Assert
.assertArrayEquals
;
22 import static org
.junit
.Assert
.assertEquals
;
23 import static org
.junit
.jupiter
.api
.Assertions
.assertThrows
;
25 import java
.nio
.charset
.StandardCharsets
;
26 import java
.util
.Arrays
;
27 import java
.util
.Random
;
29 import org
.apache
.arrow
.algorithm
.sort
.DefaultVectorComparators
;
30 import org
.apache
.arrow
.memory
.BufferAllocator
;
31 import org
.apache
.arrow
.memory
.RootAllocator
;
32 import org
.apache
.arrow
.vector
.IntVector
;
33 import org
.apache
.arrow
.vector
.VarBinaryVector
;
34 import org
.apache
.arrow
.vector
.VarCharVector
;
35 import org
.apache
.arrow
.vector
.dictionary
.Dictionary
;
36 import org
.apache
.arrow
.vector
.dictionary
.DictionaryEncoder
;
37 import org
.apache
.arrow
.vector
.types
.pojo
.DictionaryEncoding
;
38 import org
.junit
.After
;
39 import org
.junit
.Assert
;
40 import org
.junit
.Before
;
41 import org
.junit
.Test
;
44 * Test cases for {@link SearchDictionaryEncoder}.
46 public class TestSearchDictionaryEncoder
{
48 private final int VECTOR_LENGTH
= 50;
50 private final int DICTIONARY_LENGTH
= 10;
52 private BufferAllocator allocator
;
54 byte[] zero
= "000".getBytes(StandardCharsets
.UTF_8
);
55 byte[] one
= "111".getBytes(StandardCharsets
.UTF_8
);
56 byte[] two
= "222".getBytes(StandardCharsets
.UTF_8
);
58 byte[][] data
= new byte[][]{zero
, one
, two
};
61 public void prepare() {
62 allocator
= new RootAllocator(1024 * 1024);
66 public void shutdown() {
71 public void testEncodeAndDecode() {
72 Random random
= new Random();
73 try (VarCharVector rawVector
= new VarCharVector("original vector", allocator
);
74 IntVector encodedVector
= new IntVector("encoded vector", allocator
);
75 VarCharVector dictionary
= new VarCharVector("dictionary", allocator
)) {
78 dictionary
.allocateNew();
79 for (int i
= 0; i
< DICTIONARY_LENGTH
; i
++) {
81 dictionary
.setSafe(i
, String
.valueOf(i
).getBytes());
83 dictionary
.setValueCount(DICTIONARY_LENGTH
);
86 rawVector
.allocateNew(10 * VECTOR_LENGTH
, VECTOR_LENGTH
);
87 for (int i
= 0; i
< VECTOR_LENGTH
; i
++) {
88 int val
= (random
.nextInt() & Integer
.MAX_VALUE
) % DICTIONARY_LENGTH
;
89 rawVector
.set(i
, String
.valueOf(val
).getBytes());
91 rawVector
.setValueCount(VECTOR_LENGTH
);
93 SearchDictionaryEncoder
<IntVector
, VarCharVector
> encoder
=
94 new SearchDictionaryEncoder
<>(
95 dictionary
, DefaultVectorComparators
.createDefaultComparator(rawVector
), false);
98 encodedVector
.allocateNew();
99 encoder
.encode(rawVector
, encodedVector
);
101 // verify encoding results
102 assertEquals(rawVector
.getValueCount(), encodedVector
.getValueCount());
103 for (int i
= 0; i
< VECTOR_LENGTH
; i
++) {
104 assertArrayEquals(rawVector
.get(i
), String
.valueOf(encodedVector
.get(i
)).getBytes());
108 Dictionary dict
= new Dictionary(dictionary
, new DictionaryEncoding(1L, false, null));
109 try (VarCharVector decodedVector
= (VarCharVector
) DictionaryEncoder
.decode(encodedVector
, dict
)) {
111 // verify decoding results
112 assertEquals(encodedVector
.getValueCount(), decodedVector
.getValueCount());
113 for (int i
= 0; i
< VECTOR_LENGTH
; i
++) {
114 assertArrayEquals(String
.valueOf(encodedVector
.get(i
)).getBytes(), decodedVector
.get(i
));
121 public void testEncodeAndDecodeWithNull() {
122 Random random
= new Random();
123 try (VarCharVector rawVector
= new VarCharVector("original vector", allocator
);
124 IntVector encodedVector
= new IntVector("encoded vector", allocator
);
125 VarCharVector dictionary
= new VarCharVector("dictionary", allocator
)) {
128 dictionary
.allocateNew();
129 dictionary
.setNull(0);
130 for (int i
= 1; i
< DICTIONARY_LENGTH
; i
++) {
132 dictionary
.setSafe(i
, String
.valueOf(i
).getBytes());
134 dictionary
.setValueCount(DICTIONARY_LENGTH
);
137 rawVector
.allocateNew(10 * VECTOR_LENGTH
, VECTOR_LENGTH
);
138 for (int i
= 0; i
< VECTOR_LENGTH
; i
++) {
140 rawVector
.setNull(i
);
142 int val
= (random
.nextInt() & Integer
.MAX_VALUE
) % (DICTIONARY_LENGTH
- 1) + 1;
143 rawVector
.set(i
, String
.valueOf(val
).getBytes());
146 rawVector
.setValueCount(VECTOR_LENGTH
);
148 SearchDictionaryEncoder
<IntVector
, VarCharVector
> encoder
=
149 new SearchDictionaryEncoder
<>(
150 dictionary
, DefaultVectorComparators
.createDefaultComparator(rawVector
), true);
153 encodedVector
.allocateNew();
154 encoder
.encode(rawVector
, encodedVector
);
156 // verify encoding results
157 assertEquals(rawVector
.getValueCount(), encodedVector
.getValueCount());
158 for (int i
= 0; i
< VECTOR_LENGTH
; i
++) {
160 assertEquals(0, encodedVector
.get(i
));
162 assertArrayEquals(rawVector
.get(i
), String
.valueOf(encodedVector
.get(i
)).getBytes());
167 Dictionary dict
= new Dictionary(dictionary
, new DictionaryEncoding(1L, false, null));
168 try (VarCharVector decodedVector
= (VarCharVector
) DictionaryEncoder
.decode(encodedVector
, dict
)) {
170 // verify decoding results
171 assertEquals(encodedVector
.getValueCount(), decodedVector
.getValueCount());
172 for (int i
= 0; i
< VECTOR_LENGTH
; i
++) {
174 assertTrue(decodedVector
.isNull(i
));
176 assertArrayEquals(String
.valueOf(encodedVector
.get(i
)).getBytes(), decodedVector
.get(i
));
184 public void testEncodeNullWithoutNullInDictionary() {
185 try (VarCharVector rawVector
= new VarCharVector("original vector", allocator
);
186 IntVector encodedVector
= new IntVector("encoded vector", allocator
);
187 VarCharVector dictionary
= new VarCharVector("dictionary", allocator
)) {
189 // set up dictionary, with no null in it.
190 dictionary
.allocateNew();
191 for (int i
= 0; i
< DICTIONARY_LENGTH
; i
++) {
193 dictionary
.setSafe(i
, String
.valueOf(i
).getBytes());
195 dictionary
.setValueCount(DICTIONARY_LENGTH
);
197 // the vector to encode has a null inside.
198 rawVector
.allocateNew(1);
199 rawVector
.setNull(0);
200 rawVector
.setValueCount(1);
202 encodedVector
.allocateNew();
204 SearchDictionaryEncoder
<IntVector
, VarCharVector
> encoder
=
205 new SearchDictionaryEncoder
<>(
206 dictionary
, DefaultVectorComparators
.createDefaultComparator(rawVector
), true);
208 // the encoder should encode null, but no null in the dictionary,
209 // so an exception should be thrown.
210 assertThrows(IllegalArgumentException
.class, () -> {
211 encoder
.encode(rawVector
, encodedVector
);
217 public void testEncodeStrings() {
218 // Create a new value vector
219 try (final VarCharVector vector
= new VarCharVector("foo", allocator
);
220 final IntVector encoded
= new IntVector("encoded", allocator
);
221 final VarCharVector dictionaryVector
= new VarCharVector("dict", allocator
)) {
223 vector
.allocateNew(512, 5);
224 encoded
.allocateNew();
227 vector
.setSafe(0, zero
, 0, zero
.length
);
228 vector
.setSafe(1, one
, 0, one
.length
);
229 vector
.setSafe(2, one
, 0, one
.length
);
230 vector
.setSafe(3, two
, 0, two
.length
);
231 vector
.setSafe(4, zero
, 0, zero
.length
);
232 vector
.setValueCount(5);
234 // set some dictionary values
235 dictionaryVector
.allocateNew(512, 3);
236 dictionaryVector
.setSafe(0, zero
, 0, one
.length
);
237 dictionaryVector
.setSafe(1, one
, 0, two
.length
);
238 dictionaryVector
.setSafe(2, two
, 0, zero
.length
);
239 dictionaryVector
.setValueCount(3);
241 SearchDictionaryEncoder
<IntVector
, VarCharVector
> encoder
=
242 new SearchDictionaryEncoder
<>(
243 dictionaryVector
, DefaultVectorComparators
.createDefaultComparator(vector
));
244 encoder
.encode(vector
, encoded
);
247 assertEquals(5, encoded
.getValueCount());
248 assertEquals(0, encoded
.get(0));
249 assertEquals(1, encoded
.get(1));
250 assertEquals(1, encoded
.get(2));
251 assertEquals(2, encoded
.get(3));
252 assertEquals(0, encoded
.get(4));
254 // now run through the decoder and verify we get the original back
255 Dictionary dict
= new Dictionary(dictionaryVector
, new DictionaryEncoding(1L, false, null));
256 try (VarCharVector decoded
= (VarCharVector
) DictionaryEncoder
.decode(encoded
, dict
)) {
257 assertEquals(vector
.getValueCount(), decoded
.getValueCount());
258 for (int i
= 0; i
< 5; i
++) {
259 assertEquals(vector
.getObject(i
), decoded
.getObject(i
));
266 public void testEncodeLargeVector() {
267 // Create a new value vector
268 try (final VarCharVector vector
= new VarCharVector("foo", allocator
);
269 final IntVector encoded
= new IntVector("encoded", allocator
);
270 final VarCharVector dictionaryVector
= new VarCharVector("dict", allocator
)) {
271 vector
.allocateNew();
272 encoded
.allocateNew();
276 for (int i
= 0; i
< 10000; ++i
) {
277 vector
.setSafe(i
, data
[i
% 3], 0, data
[i
% 3].length
);
279 vector
.setValueCount(count
);
281 dictionaryVector
.allocateNew(512, 3);
282 dictionaryVector
.setSafe(0, zero
, 0, one
.length
);
283 dictionaryVector
.setSafe(1, one
, 0, two
.length
);
284 dictionaryVector
.setSafe(2, two
, 0, zero
.length
);
285 dictionaryVector
.setValueCount(3);
287 SearchDictionaryEncoder
<IntVector
, VarCharVector
> encoder
=
288 new SearchDictionaryEncoder
<>(
289 dictionaryVector
, DefaultVectorComparators
.createDefaultComparator(vector
));
290 encoder
.encode(vector
, encoded
);
292 assertEquals(count
, encoded
.getValueCount());
293 for (int i
= 0; i
< count
; ++i
) {
294 assertEquals(i
% 3, encoded
.get(i
));
297 // now run through the decoder and verify we get the original back
298 Dictionary dict
= new Dictionary(dictionaryVector
, new DictionaryEncoding(1L, false, null));
299 try (VarCharVector decoded
= (VarCharVector
) DictionaryEncoder
.decode(encoded
, dict
)) {
300 assertEquals(vector
.getClass(), decoded
.getClass());
301 assertEquals(vector
.getValueCount(), decoded
.getValueCount());
302 for (int i
= 0; i
< count
; ++i
) {
303 assertEquals(vector
.getObject(i
), decoded
.getObject(i
));
310 public void testEncodeBinaryVector() {
311 // Create a new value vector
312 try (final VarBinaryVector vector
= new VarBinaryVector("foo", allocator
);
313 final VarBinaryVector dictionaryVector
= new VarBinaryVector("dict", allocator
);
314 final IntVector encoded
= new IntVector("encoded", allocator
)) {
315 vector
.allocateNew(512, 5);
316 vector
.allocateNew();
317 encoded
.allocateNew();
320 vector
.setSafe(0, zero
, 0, zero
.length
);
321 vector
.setSafe(1, one
, 0, one
.length
);
322 vector
.setSafe(2, one
, 0, one
.length
);
323 vector
.setSafe(3, two
, 0, two
.length
);
324 vector
.setSafe(4, zero
, 0, zero
.length
);
325 vector
.setValueCount(5);
327 // set some dictionary values
328 dictionaryVector
.allocateNew(512, 3);
329 dictionaryVector
.setSafe(0, zero
, 0, one
.length
);
330 dictionaryVector
.setSafe(1, one
, 0, two
.length
);
331 dictionaryVector
.setSafe(2, two
, 0, zero
.length
);
332 dictionaryVector
.setValueCount(3);
334 SearchDictionaryEncoder
<IntVector
, VarBinaryVector
> encoder
=
335 new SearchDictionaryEncoder
<>(
336 dictionaryVector
, DefaultVectorComparators
.createDefaultComparator(vector
));
337 encoder
.encode(vector
, encoded
);
339 assertEquals(5, encoded
.getValueCount());
340 assertEquals(0, encoded
.get(0));
341 assertEquals(1, encoded
.get(1));
342 assertEquals(1, encoded
.get(2));
343 assertEquals(2, encoded
.get(3));
344 assertEquals(0, encoded
.get(4));
346 // now run through the decoder and verify we get the original back
347 Dictionary dict
= new Dictionary(dictionaryVector
, new DictionaryEncoding(1L, false, null));
348 try (VarBinaryVector decoded
= (VarBinaryVector
) DictionaryEncoder
.decode(encoded
, dict
)) {
349 assertEquals(vector
.getClass(), decoded
.getClass());
350 assertEquals(vector
.getValueCount(), decoded
.getValueCount());
351 for (int i
= 0; i
< 5; i
++) {
352 Assert
.assertTrue(Arrays
.equals(vector
.getObject(i
), decoded
.getObject(i
)));