]> git.proxmox.com Git - ceph.git/blame - ceph/src/arrow/java/algorithm/src/test/java/org/apache/arrow/algorithm/dictionary/TestHashTableDictionaryEncoder.java
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / java / algorithm / src / test / java / org / apache / arrow / algorithm / dictionary / TestHashTableDictionaryEncoder.java
CommitLineData
1d09f67e
TL
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17
18package org.apache.arrow.algorithm.dictionary;
19
20import static junit.framework.TestCase.assertTrue;
21import static org.junit.jupiter.api.Assertions.assertArrayEquals;
22import static org.junit.jupiter.api.Assertions.assertEquals;
23import static org.junit.jupiter.api.Assertions.assertThrows;
24
25import java.nio.charset.StandardCharsets;
26import java.util.Arrays;
27import java.util.Random;
28
29import org.apache.arrow.memory.BufferAllocator;
30import org.apache.arrow.memory.RootAllocator;
31import org.apache.arrow.vector.IntVector;
32import org.apache.arrow.vector.VarBinaryVector;
33import org.apache.arrow.vector.VarCharVector;
34import org.apache.arrow.vector.dictionary.Dictionary;
35import org.apache.arrow.vector.dictionary.DictionaryEncoder;
36import org.apache.arrow.vector.types.pojo.DictionaryEncoding;
37import org.junit.After;
38import org.junit.Before;
39import org.junit.Test;
40
41/**
42 * Test cases for {@link HashTableDictionaryEncoder}.
43 */
44public class TestHashTableDictionaryEncoder {
45
46 private final int VECTOR_LENGTH = 50;
47
48 private final int DICTIONARY_LENGTH = 10;
49
50 private BufferAllocator allocator;
51
52 byte[] zero = "000".getBytes(StandardCharsets.UTF_8);
53 byte[] one = "111".getBytes(StandardCharsets.UTF_8);
54 byte[] two = "222".getBytes(StandardCharsets.UTF_8);
55
56 byte[][] data = new byte[][]{zero, one, two};
57
58 @Before
59 public void prepare() {
60 allocator = new RootAllocator(1024 * 1024);
61 }
62
63 @After
64 public void shutdown() {
65 allocator.close();
66 }
67
68 @Test
69 public void testEncodeAndDecode() {
70 Random random = new Random();
71 try (VarCharVector rawVector = new VarCharVector("original vector", allocator);
72 IntVector encodedVector = new IntVector("encoded vector", allocator);
73 VarCharVector dictionary = new VarCharVector("dictionary", allocator)) {
74
75 // set up dictionary
76 dictionary.allocateNew();
77 for (int i = 0; i < DICTIONARY_LENGTH; i++) {
78 // encode "i" as i
79 dictionary.setSafe(i, String.valueOf(i).getBytes());
80 }
81 dictionary.setValueCount(DICTIONARY_LENGTH);
82
83 // set up raw vector
84 rawVector.allocateNew(10 * VECTOR_LENGTH, VECTOR_LENGTH);
85 for (int i = 0; i < VECTOR_LENGTH; i++) {
86 int val = (random.nextInt() & Integer.MAX_VALUE) % DICTIONARY_LENGTH;
87 rawVector.set(i, String.valueOf(val).getBytes());
88 }
89 rawVector.setValueCount(VECTOR_LENGTH);
90
91 HashTableDictionaryEncoder<IntVector, VarCharVector> encoder =
92 new HashTableDictionaryEncoder<>(dictionary, false);
93
94 // perform encoding
95 encodedVector.allocateNew();
96 encoder.encode(rawVector, encodedVector);
97
98 // verify encoding results
99 assertEquals(rawVector.getValueCount(), encodedVector.getValueCount());
100 for (int i = 0; i < VECTOR_LENGTH; i++) {
101 assertArrayEquals(rawVector.get(i), String.valueOf(encodedVector.get(i)).getBytes());
102 }
103
104 // perform decoding
105 Dictionary dict = new Dictionary(dictionary, new DictionaryEncoding(1L, false, null));
106 try (VarCharVector decodedVector = (VarCharVector) DictionaryEncoder.decode(encodedVector, dict)) {
107
108 // verify decoding results
109 assertEquals(encodedVector.getValueCount(), decodedVector.getValueCount());
110 for (int i = 0; i < VECTOR_LENGTH; i++) {
111 assertArrayEquals(String.valueOf(encodedVector.get(i)).getBytes(), decodedVector.get(i));
112 }
113 }
114 }
115 }
116
117 @Test
118 public void testEncodeAndDecodeWithNull() {
119 Random random = new Random();
120 try (VarCharVector rawVector = new VarCharVector("original vector", allocator);
121 IntVector encodedVector = new IntVector("encoded vector", allocator);
122 VarCharVector dictionary = new VarCharVector("dictionary", allocator)) {
123
124 // set up dictionary
125 dictionary.allocateNew();
126 dictionary.setNull(0);
127 for (int i = 1; i < DICTIONARY_LENGTH; i++) {
128 // encode "i" as i
129 dictionary.setSafe(i, String.valueOf(i).getBytes());
130 }
131 dictionary.setValueCount(DICTIONARY_LENGTH);
132
133 // set up raw vector
134 rawVector.allocateNew(10 * VECTOR_LENGTH, VECTOR_LENGTH);
135 for (int i = 0; i < VECTOR_LENGTH; i++) {
136 if (i % 10 == 0) {
137 rawVector.setNull(i);
138 } else {
139 int val = (random.nextInt() & Integer.MAX_VALUE) % (DICTIONARY_LENGTH - 1) + 1;
140 rawVector.set(i, String.valueOf(val).getBytes());
141 }
142 }
143 rawVector.setValueCount(VECTOR_LENGTH);
144
145 HashTableDictionaryEncoder<IntVector, VarCharVector> encoder =
146 new HashTableDictionaryEncoder<>(dictionary, true);
147
148 // perform encoding
149 encodedVector.allocateNew();
150 encoder.encode(rawVector, encodedVector);
151
152 // verify encoding results
153 assertEquals(rawVector.getValueCount(), encodedVector.getValueCount());
154 for (int i = 0; i < VECTOR_LENGTH; i++) {
155 if (i % 10 == 0) {
156 assertEquals(0, encodedVector.get(i));
157 } else {
158 assertArrayEquals(rawVector.get(i), String.valueOf(encodedVector.get(i)).getBytes());
159 }
160 }
161
162 // perform decoding
163 Dictionary dict = new Dictionary(dictionary, new DictionaryEncoding(1L, false, null));
164 try (VarCharVector decodedVector = (VarCharVector) DictionaryEncoder.decode(encodedVector, dict)) {
165 // verify decoding results
166 assertEquals(encodedVector.getValueCount(), decodedVector.getValueCount());
167 for (int i = 0; i < VECTOR_LENGTH; i++) {
168 if (i % 10 == 0) {
169 assertTrue(decodedVector.isNull(i));
170 } else {
171 assertArrayEquals(String.valueOf(encodedVector.get(i)).getBytes(), decodedVector.get(i));
172 }
173 }
174 }
175 }
176 }
177
178 @Test
179 public void testEncodeNullWithoutNullInDictionary() {
180 try (VarCharVector rawVector = new VarCharVector("original vector", allocator);
181 IntVector encodedVector = new IntVector("encoded vector", allocator);
182 VarCharVector dictionary = new VarCharVector("dictionary", allocator)) {
183
184 // set up dictionary, with no null in it.
185 dictionary.allocateNew();
186 for (int i = 0; i < DICTIONARY_LENGTH; i++) {
187 // encode "i" as i
188 dictionary.setSafe(i, String.valueOf(i).getBytes());
189 }
190 dictionary.setValueCount(DICTIONARY_LENGTH);
191
192 // the vector to encode has a null inside.
193 rawVector.allocateNew(1);
194 rawVector.setNull(0);
195 rawVector.setValueCount(1);
196
197 encodedVector.allocateNew();
198
199 HashTableDictionaryEncoder<IntVector, VarCharVector> encoder =
200 new HashTableDictionaryEncoder<>(dictionary, true);
201
202 // the encoder should encode null, but no null in the dictionary,
203 // so an exception should be thrown.
204 assertThrows(IllegalArgumentException.class, () -> {
205 encoder.encode(rawVector, encodedVector);
206 });
207 }
208 }
209
210 @Test
211 public void testEncodeStrings() {
212 // Create a new value vector
213 try (final VarCharVector vector = new VarCharVector("foo", allocator);
214 final IntVector encoded = new IntVector("encoded", allocator);
215 final VarCharVector dictionaryVector = new VarCharVector("dict", allocator)) {
216
217 vector.allocateNew(512, 5);
218 encoded.allocateNew();
219
220 // set some values
221 vector.setSafe(0, zero, 0, zero.length);
222 vector.setSafe(1, one, 0, one.length);
223 vector.setSafe(2, one, 0, one.length);
224 vector.setSafe(3, two, 0, two.length);
225 vector.setSafe(4, zero, 0, zero.length);
226 vector.setValueCount(5);
227
228 // set some dictionary values
229 dictionaryVector.allocateNew(512, 3);
230 dictionaryVector.setSafe(0, zero, 0, one.length);
231 dictionaryVector.setSafe(1, one, 0, two.length);
232 dictionaryVector.setSafe(2, two, 0, zero.length);
233 dictionaryVector.setValueCount(3);
234
235 HashTableDictionaryEncoder<IntVector, VarCharVector> encoder =
236 new HashTableDictionaryEncoder<>(dictionaryVector);
237 encoder.encode(vector, encoded);
238
239 // verify indices
240 assertEquals(5, encoded.getValueCount());
241 assertEquals(0, encoded.get(0));
242 assertEquals(1, encoded.get(1));
243 assertEquals(1, encoded.get(2));
244 assertEquals(2, encoded.get(3));
245 assertEquals(0, encoded.get(4));
246
247 // now run through the decoder and verify we get the original back
248 Dictionary dict = new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null));
249 try (VarCharVector decoded = (VarCharVector) DictionaryEncoder.decode(encoded, dict)) {
250
251 assertEquals(vector.getValueCount(), decoded.getValueCount());
252 for (int i = 0; i < 5; i++) {
253 assertEquals(vector.getObject(i), decoded.getObject(i));
254 }
255 }
256 }
257 }
258
259 @Test
260 public void testEncodeLargeVector() {
261 // Create a new value vector
262 try (final VarCharVector vector = new VarCharVector("foo", allocator);
263 final IntVector encoded = new IntVector("encoded", allocator);
264 final VarCharVector dictionaryVector = new VarCharVector("dict", allocator)) {
265 vector.allocateNew();
266 encoded.allocateNew();
267
268 int count = 10000;
269
270 for (int i = 0; i < 10000; ++i) {
271 vector.setSafe(i, data[i % 3], 0, data[i % 3].length);
272 }
273 vector.setValueCount(count);
274
275 dictionaryVector.allocateNew(512, 3);
276 dictionaryVector.setSafe(0, zero, 0, one.length);
277 dictionaryVector.setSafe(1, one, 0, two.length);
278 dictionaryVector.setSafe(2, two, 0, zero.length);
279 dictionaryVector.setValueCount(3);
280
281 HashTableDictionaryEncoder<IntVector, VarCharVector> encoder =
282 new HashTableDictionaryEncoder<>(dictionaryVector);
283 encoder.encode(vector, encoded);
284
285 assertEquals(count, encoded.getValueCount());
286 for (int i = 0; i < count; ++i) {
287 assertEquals(i % 3, encoded.get(i));
288 }
289
290 // now run through the decoder and verify we get the original back
291 Dictionary dict = new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null));
292 try (VarCharVector decoded = (VarCharVector) DictionaryEncoder.decode(encoded, dict)) {
293 assertEquals(vector.getClass(), decoded.getClass());
294 assertEquals(vector.getValueCount(), decoded.getValueCount());
295 for (int i = 0; i < count; ++i) {
296 assertEquals(vector.getObject(i), decoded.getObject(i));
297 }
298 }
299 }
300 }
301
302 @Test
303 public void testEncodeBinaryVector() {
304 // Create a new value vector
305 try (final VarBinaryVector vector = new VarBinaryVector("foo", allocator);
306 final VarBinaryVector dictionaryVector = new VarBinaryVector("dict", allocator);
307 final IntVector encoded = new IntVector("encoded", allocator)) {
308 vector.allocateNew(512, 5);
309 vector.allocateNew();
310 encoded.allocateNew();
311
312 // set some values
313 vector.setSafe(0, zero, 0, zero.length);
314 vector.setSafe(1, one, 0, one.length);
315 vector.setSafe(2, one, 0, one.length);
316 vector.setSafe(3, two, 0, two.length);
317 vector.setSafe(4, zero, 0, zero.length);
318 vector.setValueCount(5);
319
320 // set some dictionary values
321 dictionaryVector.allocateNew(512, 3);
322 dictionaryVector.setSafe(0, zero, 0, one.length);
323 dictionaryVector.setSafe(1, one, 0, two.length);
324 dictionaryVector.setSafe(2, two, 0, zero.length);
325 dictionaryVector.setValueCount(3);
326
327 HashTableDictionaryEncoder<IntVector, VarBinaryVector> encoder =
328 new HashTableDictionaryEncoder<>(dictionaryVector);
329 encoder.encode(vector, encoded);
330
331 assertEquals(5, encoded.getValueCount());
332 assertEquals(0, encoded.get(0));
333 assertEquals(1, encoded.get(1));
334 assertEquals(1, encoded.get(2));
335 assertEquals(2, encoded.get(3));
336 assertEquals(0, encoded.get(4));
337
338 // now run through the decoder and verify we get the original back
339 Dictionary dict = new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null));
340 try (VarBinaryVector decoded = (VarBinaryVector) DictionaryEncoder.decode(encoded, dict)) {
341
342 assertEquals(vector.getClass(), decoded.getClass());
343 assertEquals(vector.getValueCount(), decoded.getValueCount());
344 for (int i = 0; i < 5; i++) {
345 assertTrue(Arrays.equals(vector.getObject(i), decoded.getObject(i)));
346 }
347 }
348 }
349 }
350}