]> git.proxmox.com Git - ceph.git/blob - ceph/src/arrow/java/algorithm/src/test/java/org/apache/arrow/algorithm/dictionary/TestSearchDictionaryEncoder.java
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / java / algorithm / src / test / java / org / apache / arrow / algorithm / dictionary / TestSearchDictionaryEncoder.java
1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.arrow.algorithm.dictionary;
19
20 import static junit.framework.TestCase.assertTrue;
21 import static org.junit.Assert.assertArrayEquals;
22 import static org.junit.Assert.assertEquals;
23 import static org.junit.jupiter.api.Assertions.assertThrows;
24
25 import java.nio.charset.StandardCharsets;
26 import java.util.Arrays;
27 import java.util.Random;
28
29 import org.apache.arrow.algorithm.sort.DefaultVectorComparators;
30 import org.apache.arrow.memory.BufferAllocator;
31 import org.apache.arrow.memory.RootAllocator;
32 import org.apache.arrow.vector.IntVector;
33 import org.apache.arrow.vector.VarBinaryVector;
34 import org.apache.arrow.vector.VarCharVector;
35 import org.apache.arrow.vector.dictionary.Dictionary;
36 import org.apache.arrow.vector.dictionary.DictionaryEncoder;
37 import org.apache.arrow.vector.types.pojo.DictionaryEncoding;
38 import org.junit.After;
39 import org.junit.Assert;
40 import org.junit.Before;
41 import org.junit.Test;
42
43 /**
44 * Test cases for {@link SearchDictionaryEncoder}.
45 */
46 public class TestSearchDictionaryEncoder {
47
48 private final int VECTOR_LENGTH = 50;
49
50 private final int DICTIONARY_LENGTH = 10;
51
52 private BufferAllocator allocator;
53
54 byte[] zero = "000".getBytes(StandardCharsets.UTF_8);
55 byte[] one = "111".getBytes(StandardCharsets.UTF_8);
56 byte[] two = "222".getBytes(StandardCharsets.UTF_8);
57
58 byte[][] data = new byte[][]{zero, one, two};
59
60 @Before
61 public void prepare() {
62 allocator = new RootAllocator(1024 * 1024);
63 }
64
65 @After
66 public void shutdown() {
67 allocator.close();
68 }
69
70 @Test
71 public void testEncodeAndDecode() {
72 Random random = new Random();
73 try (VarCharVector rawVector = new VarCharVector("original vector", allocator);
74 IntVector encodedVector = new IntVector("encoded vector", allocator);
75 VarCharVector dictionary = new VarCharVector("dictionary", allocator)) {
76
77 // set up dictionary
78 dictionary.allocateNew();
79 for (int i = 0; i < DICTIONARY_LENGTH; i++) {
80 // encode "i" as i
81 dictionary.setSafe(i, String.valueOf(i).getBytes());
82 }
83 dictionary.setValueCount(DICTIONARY_LENGTH);
84
85 // set up raw vector
86 rawVector.allocateNew(10 * VECTOR_LENGTH, VECTOR_LENGTH);
87 for (int i = 0; i < VECTOR_LENGTH; i++) {
88 int val = (random.nextInt() & Integer.MAX_VALUE) % DICTIONARY_LENGTH;
89 rawVector.set(i, String.valueOf(val).getBytes());
90 }
91 rawVector.setValueCount(VECTOR_LENGTH);
92
93 SearchDictionaryEncoder<IntVector, VarCharVector> encoder =
94 new SearchDictionaryEncoder<>(
95 dictionary, DefaultVectorComparators.createDefaultComparator(rawVector), false);
96
97 // perform encoding
98 encodedVector.allocateNew();
99 encoder.encode(rawVector, encodedVector);
100
101 // verify encoding results
102 assertEquals(rawVector.getValueCount(), encodedVector.getValueCount());
103 for (int i = 0; i < VECTOR_LENGTH; i++) {
104 assertArrayEquals(rawVector.get(i), String.valueOf(encodedVector.get(i)).getBytes());
105 }
106
107 // perform decoding
108 Dictionary dict = new Dictionary(dictionary, new DictionaryEncoding(1L, false, null));
109 try (VarCharVector decodedVector = (VarCharVector) DictionaryEncoder.decode(encodedVector, dict)) {
110
111 // verify decoding results
112 assertEquals(encodedVector.getValueCount(), decodedVector.getValueCount());
113 for (int i = 0; i < VECTOR_LENGTH; i++) {
114 assertArrayEquals(String.valueOf(encodedVector.get(i)).getBytes(), decodedVector.get(i));
115 }
116 }
117 }
118 }
119
120 @Test
121 public void testEncodeAndDecodeWithNull() {
122 Random random = new Random();
123 try (VarCharVector rawVector = new VarCharVector("original vector", allocator);
124 IntVector encodedVector = new IntVector("encoded vector", allocator);
125 VarCharVector dictionary = new VarCharVector("dictionary", allocator)) {
126
127 // set up dictionary
128 dictionary.allocateNew();
129 dictionary.setNull(0);
130 for (int i = 1; i < DICTIONARY_LENGTH; i++) {
131 // encode "i" as i
132 dictionary.setSafe(i, String.valueOf(i).getBytes());
133 }
134 dictionary.setValueCount(DICTIONARY_LENGTH);
135
136 // set up raw vector
137 rawVector.allocateNew(10 * VECTOR_LENGTH, VECTOR_LENGTH);
138 for (int i = 0; i < VECTOR_LENGTH; i++) {
139 if (i % 10 == 0) {
140 rawVector.setNull(i);
141 } else {
142 int val = (random.nextInt() & Integer.MAX_VALUE) % (DICTIONARY_LENGTH - 1) + 1;
143 rawVector.set(i, String.valueOf(val).getBytes());
144 }
145 }
146 rawVector.setValueCount(VECTOR_LENGTH);
147
148 SearchDictionaryEncoder<IntVector, VarCharVector> encoder =
149 new SearchDictionaryEncoder<>(
150 dictionary, DefaultVectorComparators.createDefaultComparator(rawVector), true);
151
152 // perform encoding
153 encodedVector.allocateNew();
154 encoder.encode(rawVector, encodedVector);
155
156 // verify encoding results
157 assertEquals(rawVector.getValueCount(), encodedVector.getValueCount());
158 for (int i = 0; i < VECTOR_LENGTH; i++) {
159 if (i % 10 == 0) {
160 assertEquals(0, encodedVector.get(i));
161 } else {
162 assertArrayEquals(rawVector.get(i), String.valueOf(encodedVector.get(i)).getBytes());
163 }
164 }
165
166 // perform decoding
167 Dictionary dict = new Dictionary(dictionary, new DictionaryEncoding(1L, false, null));
168 try (VarCharVector decodedVector = (VarCharVector) DictionaryEncoder.decode(encodedVector, dict)) {
169
170 // verify decoding results
171 assertEquals(encodedVector.getValueCount(), decodedVector.getValueCount());
172 for (int i = 0; i < VECTOR_LENGTH; i++) {
173 if (i % 10 == 0) {
174 assertTrue(decodedVector.isNull(i));
175 } else {
176 assertArrayEquals(String.valueOf(encodedVector.get(i)).getBytes(), decodedVector.get(i));
177 }
178 }
179 }
180 }
181 }
182
183 @Test
184 public void testEncodeNullWithoutNullInDictionary() {
185 try (VarCharVector rawVector = new VarCharVector("original vector", allocator);
186 IntVector encodedVector = new IntVector("encoded vector", allocator);
187 VarCharVector dictionary = new VarCharVector("dictionary", allocator)) {
188
189 // set up dictionary, with no null in it.
190 dictionary.allocateNew();
191 for (int i = 0; i < DICTIONARY_LENGTH; i++) {
192 // encode "i" as i
193 dictionary.setSafe(i, String.valueOf(i).getBytes());
194 }
195 dictionary.setValueCount(DICTIONARY_LENGTH);
196
197 // the vector to encode has a null inside.
198 rawVector.allocateNew(1);
199 rawVector.setNull(0);
200 rawVector.setValueCount(1);
201
202 encodedVector.allocateNew();
203
204 SearchDictionaryEncoder<IntVector, VarCharVector> encoder =
205 new SearchDictionaryEncoder<>(
206 dictionary, DefaultVectorComparators.createDefaultComparator(rawVector), true);
207
208 // the encoder should encode null, but no null in the dictionary,
209 // so an exception should be thrown.
210 assertThrows(IllegalArgumentException.class, () -> {
211 encoder.encode(rawVector, encodedVector);
212 });
213 }
214 }
215
216 @Test
217 public void testEncodeStrings() {
218 // Create a new value vector
219 try (final VarCharVector vector = new VarCharVector("foo", allocator);
220 final IntVector encoded = new IntVector("encoded", allocator);
221 final VarCharVector dictionaryVector = new VarCharVector("dict", allocator)) {
222
223 vector.allocateNew(512, 5);
224 encoded.allocateNew();
225
226 // set some values
227 vector.setSafe(0, zero, 0, zero.length);
228 vector.setSafe(1, one, 0, one.length);
229 vector.setSafe(2, one, 0, one.length);
230 vector.setSafe(3, two, 0, two.length);
231 vector.setSafe(4, zero, 0, zero.length);
232 vector.setValueCount(5);
233
234 // set some dictionary values
235 dictionaryVector.allocateNew(512, 3);
236 dictionaryVector.setSafe(0, zero, 0, one.length);
237 dictionaryVector.setSafe(1, one, 0, two.length);
238 dictionaryVector.setSafe(2, two, 0, zero.length);
239 dictionaryVector.setValueCount(3);
240
241 SearchDictionaryEncoder<IntVector, VarCharVector> encoder =
242 new SearchDictionaryEncoder<>(
243 dictionaryVector, DefaultVectorComparators.createDefaultComparator(vector));
244 encoder.encode(vector, encoded);
245
246 // verify indices
247 assertEquals(5, encoded.getValueCount());
248 assertEquals(0, encoded.get(0));
249 assertEquals(1, encoded.get(1));
250 assertEquals(1, encoded.get(2));
251 assertEquals(2, encoded.get(3));
252 assertEquals(0, encoded.get(4));
253
254 // now run through the decoder and verify we get the original back
255 Dictionary dict = new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null));
256 try (VarCharVector decoded = (VarCharVector) DictionaryEncoder.decode(encoded, dict)) {
257 assertEquals(vector.getValueCount(), decoded.getValueCount());
258 for (int i = 0; i < 5; i++) {
259 assertEquals(vector.getObject(i), decoded.getObject(i));
260 }
261 }
262 }
263 }
264
265 @Test
266 public void testEncodeLargeVector() {
267 // Create a new value vector
268 try (final VarCharVector vector = new VarCharVector("foo", allocator);
269 final IntVector encoded = new IntVector("encoded", allocator);
270 final VarCharVector dictionaryVector = new VarCharVector("dict", allocator)) {
271 vector.allocateNew();
272 encoded.allocateNew();
273
274 int count = 10000;
275
276 for (int i = 0; i < 10000; ++i) {
277 vector.setSafe(i, data[i % 3], 0, data[i % 3].length);
278 }
279 vector.setValueCount(count);
280
281 dictionaryVector.allocateNew(512, 3);
282 dictionaryVector.setSafe(0, zero, 0, one.length);
283 dictionaryVector.setSafe(1, one, 0, two.length);
284 dictionaryVector.setSafe(2, two, 0, zero.length);
285 dictionaryVector.setValueCount(3);
286
287 SearchDictionaryEncoder<IntVector, VarCharVector> encoder =
288 new SearchDictionaryEncoder<>(
289 dictionaryVector, DefaultVectorComparators.createDefaultComparator(vector));
290 encoder.encode(vector, encoded);
291
292 assertEquals(count, encoded.getValueCount());
293 for (int i = 0; i < count; ++i) {
294 assertEquals(i % 3, encoded.get(i));
295 }
296
297 // now run through the decoder and verify we get the original back
298 Dictionary dict = new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null));
299 try (VarCharVector decoded = (VarCharVector) DictionaryEncoder.decode(encoded, dict)) {
300 assertEquals(vector.getClass(), decoded.getClass());
301 assertEquals(vector.getValueCount(), decoded.getValueCount());
302 for (int i = 0; i < count; ++i) {
303 assertEquals(vector.getObject(i), decoded.getObject(i));
304 }
305 }
306 }
307 }
308
309 @Test
310 public void testEncodeBinaryVector() {
311 // Create a new value vector
312 try (final VarBinaryVector vector = new VarBinaryVector("foo", allocator);
313 final VarBinaryVector dictionaryVector = new VarBinaryVector("dict", allocator);
314 final IntVector encoded = new IntVector("encoded", allocator)) {
315 vector.allocateNew(512, 5);
316 vector.allocateNew();
317 encoded.allocateNew();
318
319 // set some values
320 vector.setSafe(0, zero, 0, zero.length);
321 vector.setSafe(1, one, 0, one.length);
322 vector.setSafe(2, one, 0, one.length);
323 vector.setSafe(3, two, 0, two.length);
324 vector.setSafe(4, zero, 0, zero.length);
325 vector.setValueCount(5);
326
327 // set some dictionary values
328 dictionaryVector.allocateNew(512, 3);
329 dictionaryVector.setSafe(0, zero, 0, one.length);
330 dictionaryVector.setSafe(1, one, 0, two.length);
331 dictionaryVector.setSafe(2, two, 0, zero.length);
332 dictionaryVector.setValueCount(3);
333
334 SearchDictionaryEncoder<IntVector, VarBinaryVector> encoder =
335 new SearchDictionaryEncoder<>(
336 dictionaryVector, DefaultVectorComparators.createDefaultComparator(vector));
337 encoder.encode(vector, encoded);
338
339 assertEquals(5, encoded.getValueCount());
340 assertEquals(0, encoded.get(0));
341 assertEquals(1, encoded.get(1));
342 assertEquals(1, encoded.get(2));
343 assertEquals(2, encoded.get(3));
344 assertEquals(0, encoded.get(4));
345
346 // now run through the decoder and verify we get the original back
347 Dictionary dict = new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null));
348 try (VarBinaryVector decoded = (VarBinaryVector) DictionaryEncoder.decode(encoded, dict)) {
349 assertEquals(vector.getClass(), decoded.getClass());
350 assertEquals(vector.getValueCount(), decoded.getValueCount());
351 for (int i = 0; i < 5; i++) {
352 Assert.assertTrue(Arrays.equals(vector.getObject(i), decoded.getObject(i)));
353 }
354 }
355 }
356 }
357 }