]>
Commit | Line | Data |
---|---|---|
1d09f67e TL |
1 | /* |
2 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
3 | * contributor license agreements. See the NOTICE file distributed with | |
4 | * this work for additional information regarding copyright ownership. | |
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
6 | * (the "License"); you may not use this file except in compliance with | |
7 | * the License. You may obtain a copy of the License at | |
8 | * | |
9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
10 | * | |
11 | * Unless required by applicable law or agreed to in writing, software | |
12 | * distributed under the License is distributed on an "AS IS" BASIS, | |
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
14 | * See the License for the specific language governing permissions and | |
15 | * limitations under the License. | |
16 | */ | |
17 | ||
18 | package org.apache.arrow.algorithm.dictionary; | |
19 | ||
20 | import static junit.framework.TestCase.assertTrue; | |
21 | import static org.junit.jupiter.api.Assertions.assertArrayEquals; | |
22 | import static org.junit.jupiter.api.Assertions.assertEquals; | |
23 | import static org.junit.jupiter.api.Assertions.assertThrows; | |
24 | ||
25 | import java.nio.charset.StandardCharsets; | |
26 | import java.util.Arrays; | |
27 | import java.util.Random; | |
28 | ||
29 | import org.apache.arrow.memory.BufferAllocator; | |
30 | import org.apache.arrow.memory.RootAllocator; | |
31 | import org.apache.arrow.vector.IntVector; | |
32 | import org.apache.arrow.vector.VarBinaryVector; | |
33 | import org.apache.arrow.vector.VarCharVector; | |
34 | import org.apache.arrow.vector.dictionary.Dictionary; | |
35 | import org.apache.arrow.vector.dictionary.DictionaryEncoder; | |
36 | import org.apache.arrow.vector.types.pojo.DictionaryEncoding; | |
37 | import org.junit.After; | |
38 | import org.junit.Before; | |
39 | import org.junit.Test; | |
40 | ||
41 | /** | |
42 | * Test cases for {@link HashTableDictionaryEncoder}. | |
43 | */ | |
44 | public class TestHashTableDictionaryEncoder { | |
45 | ||
46 | private static final int VECTOR_LENGTH = 50; // number of values written to each randomly filled raw vector | |
47 | ||
48 | private static final int DICTIONARY_LENGTH = 10; // number of slots in the dictionaries built by the random tests | |
49 | // Assigned in prepare() and closed in shutdown(), the JUnit 4 @Before/@After fixtures of this class. | |
50 | private BufferAllocator allocator; | |
51 | // Fixed three-byte UTF-8 payloads shared by the string and binary tests. | |
52 | byte[] zero = "000".getBytes(StandardCharsets.UTF_8); | |
53 | byte[] one = "111".getBytes(StandardCharsets.UTF_8); | |
54 | byte[] two = "222".getBytes(StandardCharsets.UTF_8); | |
55 | // Payloads indexed by dictionary position; testEncodeLargeVector cycles through them with i % 3. | |
56 | byte[][] data = new byte[][]{zero, one, two}; | |
57 | ||
58 | @Before | |
59 | public void prepare() { /* JUnit 4 fixture: runs before each test method */ | |
60 | allocator = new RootAllocator(1024 * 1024); /* argument is 1,048,576; presumably the allocator's byte limit - confirm against RootAllocator */ | |
61 | } | |
62 | ||
63 | @After | |
64 | public void shutdown() { /* JUnit 4 fixture: runs after each test method */ | |
65 | allocator.close(); /* closes the allocator assigned in prepare(); presumably also reports buffers a test failed to release - confirm */ | |
66 | } | |
67 | ||
68 | @Test | |
69 | public void testEncodeAndDecode() { | |
70 | Random random = new Random(); | |
71 | try (VarCharVector rawVector = new VarCharVector("original vector", allocator); | |
72 | IntVector encodedVector = new IntVector("encoded vector", allocator); | |
73 | VarCharVector dictionary = new VarCharVector("dictionary", allocator)) { | |
74 | // Round trip: random digit strings are encoded against a "0".."9" dictionary, then decoded again. | |
75 | // set up dictionary: slot i holds the UTF-8 text of i, so an index printed as text equals its entry | |
76 | dictionary.allocateNew(); | |
77 | for (int i = 0; i < DICTIONARY_LENGTH; i++) { | |
78 | // encode "i" as i | |
79 | dictionary.setSafe(i, String.valueOf(i).getBytes(StandardCharsets.UTF_8)); | |
80 | } | |
81 | dictionary.setValueCount(DICTIONARY_LENGTH); | |
82 | ||
83 | // set up raw vector: VECTOR_LENGTH values, each drawn uniformly from the dictionary's entries | |
84 | rawVector.allocateNew(10 * VECTOR_LENGTH, VECTOR_LENGTH); | |
85 | for (int i = 0; i < VECTOR_LENGTH; i++) { | |
86 | int val = random.nextInt(DICTIONARY_LENGTH); | |
87 | rawVector.set(i, String.valueOf(val).getBytes(StandardCharsets.UTF_8)); | |
88 | } | |
89 | rawVector.setValueCount(VECTOR_LENGTH); | |
90 | // second constructor argument is false here and true in the null tests; presumably it toggles null encoding - confirm | |
91 | HashTableDictionaryEncoder<IntVector, VarCharVector> encoder = | |
92 | new HashTableDictionaryEncoder<>(dictionary, false); | |
93 | ||
94 | // perform encoding | |
95 | encodedVector.allocateNew(); | |
96 | encoder.encode(rawVector, encodedVector); | |
97 | ||
98 | // verify encoding results: each index, rendered as text, must equal the raw value it replaced | |
99 | assertEquals(rawVector.getValueCount(), encodedVector.getValueCount()); | |
100 | for (int i = 0; i < VECTOR_LENGTH; i++) { | |
101 | assertArrayEquals(rawVector.get(i), String.valueOf(encodedVector.get(i)).getBytes(StandardCharsets.UTF_8)); | |
102 | } | |
103 | ||
104 | // perform decoding | |
105 | Dictionary dict = new Dictionary(dictionary, new DictionaryEncoding(1L, false, null)); | |
106 | try (VarCharVector decodedVector = (VarCharVector) DictionaryEncoder.decode(encodedVector, dict)) { | |
107 | ||
108 | // verify decoding results: each decoded value must be the text of the index it came from | |
109 | assertEquals(encodedVector.getValueCount(), decodedVector.getValueCount()); | |
110 | for (int i = 0; i < VECTOR_LENGTH; i++) { | |
111 | assertArrayEquals(String.valueOf(encodedVector.get(i)).getBytes(StandardCharsets.UTF_8), decodedVector.get(i)); | |
112 | } | |
113 | } | |
114 | } | |
115 | } | |
116 | ||
117 | @Test | |
118 | public void testEncodeAndDecodeWithNull() { | |
119 | Random random = new Random(); | |
120 | try (VarCharVector rawVector = new VarCharVector("original vector", allocator); | |
121 | IntVector encodedVector = new IntVector("encoded vector", allocator); | |
122 | VarCharVector dictionary = new VarCharVector("dictionary", allocator)) { | |
123 | ||
124 | // set up dictionary | |
125 | dictionary.allocateNew(); | |
126 | dictionary.setNull(0); | |
127 | for (int i = 1; i < DICTIONARY_LENGTH; i++) { | |
128 | // encode "i" as i | |
129 | dictionary.setSafe(i, String.valueOf(i).getBytes()); | |
130 | } | |
131 | dictionary.setValueCount(DICTIONARY_LENGTH); | |
132 | ||
133 | // set up raw vector | |
134 | rawVector.allocateNew(10 * VECTOR_LENGTH, VECTOR_LENGTH); | |
135 | for (int i = 0; i < VECTOR_LENGTH; i++) { | |
136 | if (i % 10 == 0) { | |
137 | rawVector.setNull(i); | |
138 | } else { | |
139 | int val = (random.nextInt() & Integer.MAX_VALUE) % (DICTIONARY_LENGTH - 1) + 1; | |
140 | rawVector.set(i, String.valueOf(val).getBytes()); | |
141 | } | |
142 | } | |
143 | rawVector.setValueCount(VECTOR_LENGTH); | |
144 | ||
145 | HashTableDictionaryEncoder<IntVector, VarCharVector> encoder = | |
146 | new HashTableDictionaryEncoder<>(dictionary, true); | |
147 | ||
148 | // perform encoding | |
149 | encodedVector.allocateNew(); | |
150 | encoder.encode(rawVector, encodedVector); | |
151 | ||
152 | // verify encoding results | |
153 | assertEquals(rawVector.getValueCount(), encodedVector.getValueCount()); | |
154 | for (int i = 0; i < VECTOR_LENGTH; i++) { | |
155 | if (i % 10 == 0) { | |
156 | assertEquals(0, encodedVector.get(i)); | |
157 | } else { | |
158 | assertArrayEquals(rawVector.get(i), String.valueOf(encodedVector.get(i)).getBytes()); | |
159 | } | |
160 | } | |
161 | ||
162 | // perform decoding | |
163 | Dictionary dict = new Dictionary(dictionary, new DictionaryEncoding(1L, false, null)); | |
164 | try (VarCharVector decodedVector = (VarCharVector) DictionaryEncoder.decode(encodedVector, dict)) { | |
165 | // verify decoding results | |
166 | assertEquals(encodedVector.getValueCount(), decodedVector.getValueCount()); | |
167 | for (int i = 0; i < VECTOR_LENGTH; i++) { | |
168 | if (i % 10 == 0) { | |
169 | assertTrue(decodedVector.isNull(i)); | |
170 | } else { | |
171 | assertArrayEquals(String.valueOf(encodedVector.get(i)).getBytes(), decodedVector.get(i)); | |
172 | } | |
173 | } | |
174 | } | |
175 | } | |
176 | } | |
177 | ||
178 | @Test | |
179 | public void testEncodeNullWithoutNullInDictionary() { | |
180 | try (VarCharVector rawVector = new VarCharVector("original vector", allocator); | |
181 | IntVector encodedVector = new IntVector("encoded vector", allocator); | |
182 | VarCharVector dictionary = new VarCharVector("dictionary", allocator)) { | |
183 | // Negative case: encoding a null value against a dictionary without a null entry must fail. | |
184 | // set up dictionary, with no null in it: slot i holds the UTF-8 text of i. | |
185 | dictionary.allocateNew(); | |
186 | for (int i = 0; i < DICTIONARY_LENGTH; i++) { | |
187 | // encode "i" as i | |
188 | dictionary.setSafe(i, String.valueOf(i).getBytes(StandardCharsets.UTF_8)); | |
189 | } | |
190 | dictionary.setValueCount(DICTIONARY_LENGTH); | |
191 | ||
192 | // the vector to encode holds exactly one value, and that value is null. | |
193 | rawVector.allocateNew(1); | |
194 | rawVector.setNull(0); | |
195 | rawVector.setValueCount(1); | |
196 | ||
197 | encodedVector.allocateNew(); | |
198 | ||
199 | HashTableDictionaryEncoder<IntVector, VarCharVector> encoder = | |
200 | new HashTableDictionaryEncoder<>(dictionary, true); | |
201 | ||
202 | // the encoder is asked to encode null, but the dictionary has no null entry, | |
203 | // so encode() is expected to throw IllegalArgumentException. | |
204 | assertThrows(IllegalArgumentException.class, () -> { | |
205 | encoder.encode(rawVector, encodedVector); | |
206 | }); | |
207 | } | |
208 | } | |
209 | ||
210 | @Test | |
211 | public void testEncodeStrings() { | |
212 | // Encodes a fixed five-value VarChar vector against a three-entry dictionary, then decodes it again. | |
213 | try (final VarCharVector vector = new VarCharVector("foo", allocator); | |
214 | final IntVector encoded = new IntVector("encoded", allocator); | |
215 | final VarCharVector dictionaryVector = new VarCharVector("dict", allocator)) { | |
216 | ||
217 | vector.allocateNew(512, 5); | |
218 | encoded.allocateNew(); | |
219 | ||
220 | // set some values: zero, one, one, two, zero | |
221 | vector.setSafe(0, zero, 0, zero.length); | |
222 | vector.setSafe(1, one, 0, one.length); | |
223 | vector.setSafe(2, one, 0, one.length); | |
224 | vector.setSafe(3, two, 0, two.length); | |
225 | vector.setSafe(4, zero, 0, zero.length); | |
226 | vector.setValueCount(5); | |
227 | ||
228 | // set some dictionary values; each entry is written with its own array's length | |
229 | dictionaryVector.allocateNew(512, 3); | |
230 | dictionaryVector.setSafe(0, zero, 0, zero.length); | |
231 | dictionaryVector.setSafe(1, one, 0, one.length); | |
232 | dictionaryVector.setSafe(2, two, 0, two.length); | |
233 | dictionaryVector.setValueCount(3); | |
234 | ||
235 | HashTableDictionaryEncoder<IntVector, VarCharVector> encoder = | |
236 | new HashTableDictionaryEncoder<>(dictionaryVector); | |
237 | encoder.encode(vector, encoded); | |
238 | ||
239 | // verify indices: they must follow the dictionary positions of zero (0), one (1) and two (2) | |
240 | assertEquals(5, encoded.getValueCount()); | |
241 | assertEquals(0, encoded.get(0)); | |
242 | assertEquals(1, encoded.get(1)); | |
243 | assertEquals(1, encoded.get(2)); | |
244 | assertEquals(2, encoded.get(3)); | |
245 | assertEquals(0, encoded.get(4)); | |
246 | ||
247 | // now run through the decoder and verify we get the original back | |
248 | Dictionary dict = new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null)); | |
249 | try (VarCharVector decoded = (VarCharVector) DictionaryEncoder.decode(encoded, dict)) { | |
250 | ||
251 | assertEquals(vector.getValueCount(), decoded.getValueCount()); | |
252 | for (int i = 0; i < 5; i++) { | |
253 | assertEquals(vector.getObject(i), decoded.getObject(i)); | |
254 | } | |
255 | } | |
256 | } | |
257 | } | |
258 | ||
259 | @Test | |
260 | public void testEncodeLargeVector() { | |
261 | // Encodes 10000 values that cycle through the three payloads, then decodes them again. | |
262 | try (final VarCharVector vector = new VarCharVector("foo", allocator); | |
263 | final IntVector encoded = new IntVector("encoded", allocator); | |
264 | final VarCharVector dictionaryVector = new VarCharVector("dict", allocator)) { | |
265 | vector.allocateNew(); | |
266 | encoded.allocateNew(); | |
267 | ||
268 | int count = 10000; | |
269 | // value i is data[i % 3], so the expected index of value i is i % 3 | |
270 | for (int i = 0; i < count; ++i) { | |
271 | vector.setSafe(i, data[i % 3], 0, data[i % 3].length); | |
272 | } | |
273 | vector.setValueCount(count); | |
274 | // dictionary entries in the same order as data; each is written with its own array's length | |
275 | dictionaryVector.allocateNew(512, 3); | |
276 | dictionaryVector.setSafe(0, zero, 0, zero.length); | |
277 | dictionaryVector.setSafe(1, one, 0, one.length); | |
278 | dictionaryVector.setSafe(2, two, 0, two.length); | |
279 | dictionaryVector.setValueCount(3); | |
280 | ||
281 | HashTableDictionaryEncoder<IntVector, VarCharVector> encoder = | |
282 | new HashTableDictionaryEncoder<>(dictionaryVector); | |
283 | encoder.encode(vector, encoded); | |
284 | ||
285 | assertEquals(count, encoded.getValueCount()); | |
286 | for (int i = 0; i < count; ++i) { | |
287 | assertEquals(i % 3, encoded.get(i)); | |
288 | } | |
289 | ||
290 | // now run through the decoder and verify we get the original back | |
291 | Dictionary dict = new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null)); | |
292 | try (VarCharVector decoded = (VarCharVector) DictionaryEncoder.decode(encoded, dict)) { | |
293 | assertEquals(vector.getClass(), decoded.getClass()); | |
294 | assertEquals(vector.getValueCount(), decoded.getValueCount()); | |
295 | for (int i = 0; i < count; ++i) { | |
296 | assertEquals(vector.getObject(i), decoded.getObject(i)); | |
297 | } | |
298 | } | |
299 | } | |
300 | } | |
301 | ||
302 | @Test | |
303 | public void testEncodeBinaryVector() { | |
304 | // Same fixed five-value scenario as testEncodeStrings, but with VarBinary value and dictionary vectors. | |
305 | try (final VarBinaryVector vector = new VarBinaryVector("foo", allocator); | |
306 | final VarBinaryVector dictionaryVector = new VarBinaryVector("dict", allocator); | |
307 | final IntVector encoded = new IntVector("encoded", allocator)) { | |
308 | // allocate the value vector once, with the same sizes testEncodeStrings uses | |
309 | vector.allocateNew(512, 5); | |
310 | encoded.allocateNew(); | |
311 | ||
312 | // set some values: zero, one, one, two, zero | |
313 | vector.setSafe(0, zero, 0, zero.length); | |
314 | vector.setSafe(1, one, 0, one.length); | |
315 | vector.setSafe(2, one, 0, one.length); | |
316 | vector.setSafe(3, two, 0, two.length); | |
317 | vector.setSafe(4, zero, 0, zero.length); | |
318 | vector.setValueCount(5); | |
319 | ||
320 | // set some dictionary values; each entry is written with its own array's length | |
321 | dictionaryVector.allocateNew(512, 3); | |
322 | dictionaryVector.setSafe(0, zero, 0, zero.length); | |
323 | dictionaryVector.setSafe(1, one, 0, one.length); | |
324 | dictionaryVector.setSafe(2, two, 0, two.length); | |
325 | dictionaryVector.setValueCount(3); | |
326 | ||
327 | HashTableDictionaryEncoder<IntVector, VarBinaryVector> encoder = | |
328 | new HashTableDictionaryEncoder<>(dictionaryVector); | |
329 | encoder.encode(vector, encoded); | |
330 | // indices must follow the dictionary positions of zero (0), one (1) and two (2) | |
331 | assertEquals(5, encoded.getValueCount()); | |
332 | assertEquals(0, encoded.get(0)); | |
333 | assertEquals(1, encoded.get(1)); | |
334 | assertEquals(1, encoded.get(2)); | |
335 | assertEquals(2, encoded.get(3)); | |
336 | assertEquals(0, encoded.get(4)); | |
337 | ||
338 | // now run through the decoder and verify we get the original back | |
339 | Dictionary dict = new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null)); | |
340 | try (VarBinaryVector decoded = (VarBinaryVector) DictionaryEncoder.decode(encoded, dict)) { | |
341 | ||
342 | assertEquals(vector.getClass(), decoded.getClass()); | |
343 | assertEquals(vector.getValueCount(), decoded.getValueCount()); | |
344 | for (int i = 0; i < 5; i++) { | |
345 | assertTrue(Arrays.equals(vector.getObject(i), decoded.getObject(i))); | |
346 | } | |
347 | } | |
348 | } | |
349 | } | |
350 | } |