]>
Commit | Line | Data |
---|---|---|
1d09f67e TL |
1 | /* |
2 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
3 | * contributor license agreements. See the NOTICE file distributed with | |
4 | * this work for additional information regarding copyright ownership. | |
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
6 | * (the "License"); you may not use this file except in compliance with | |
7 | * the License. You may obtain a copy of the License at | |
8 | * | |
9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
10 | * | |
11 | * Unless required by applicable law or agreed to in writing, software | |
12 | * distributed under the License is distributed on an "AS IS" BASIS, | |
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
14 | * See the License for the specific language governing permissions and | |
15 | * limitations under the License. | |
16 | */ | |
17 | ||
18 | package org.apache.arrow.vector.dictionary; | |
19 | ||
20 | import org.apache.arrow.memory.util.hash.ArrowBufHasher; | |
21 | import org.apache.arrow.memory.util.hash.SimpleHasher; | |
22 | import org.apache.arrow.vector.ValueVector; | |
23 | import org.apache.arrow.vector.compare.Range; | |
24 | import org.apache.arrow.vector.compare.RangeEqualsVisitor; | |
25 | ||
26 | /** | |
27 | * HashTable used for Dictionary encoding. It holds two vectors (the vector to encode and dictionary vector) | |
28 | * It stores the index in dictionary vector and for a given index in encode vector, | |
29 | * it could return dictionary index. | |
30 | */ | |
31 | public class DictionaryHashTable { | |
32 | ||
33 | /** | |
34 | * Represents a null value in map. | |
35 | */ | |
36 | static final int NULL_VALUE = -1; | |
37 | ||
38 | /** | |
39 | * The default initial capacity - MUST be a power of two. | |
40 | */ | |
41 | static final int DEFAULT_INITIAL_CAPACITY = 1 << 4; | |
42 | ||
43 | /** | |
44 | * The maximum capacity, used if a higher value is implicitly specified | |
45 | * by either of the constructors with arguments. | |
46 | */ | |
47 | static final int MAXIMUM_CAPACITY = 1 << 30; | |
48 | ||
49 | /** | |
50 | * The load factor used when none specified in constructor. | |
51 | */ | |
52 | static final float DEFAULT_LOAD_FACTOR = 0.75f; | |
53 | ||
54 | static final DictionaryHashTable.Entry[] EMPTY_TABLE = {}; | |
55 | ||
56 | /** | |
57 | * The table, initialized on first use, and resized as | |
58 | * necessary. When allocated, length is always a power of two. | |
59 | */ | |
60 | transient DictionaryHashTable.Entry[] table = EMPTY_TABLE; | |
61 | ||
62 | /** | |
63 | * The number of key-value mappings contained in this map. | |
64 | */ | |
65 | transient int size; | |
66 | ||
67 | /** | |
68 | * The next size value at which to resize (capacity * load factor). | |
69 | */ | |
70 | int threshold; | |
71 | ||
72 | /** | |
73 | * The load factor for the hash table. | |
74 | */ | |
75 | final float loadFactor; | |
76 | ||
77 | private final ValueVector dictionary; | |
78 | ||
79 | private final ArrowBufHasher hasher; | |
80 | ||
81 | /** | |
82 | * Constructs an empty map with the specified initial capacity and load factor. | |
83 | */ | |
84 | public DictionaryHashTable(int initialCapacity, ValueVector dictionary, ArrowBufHasher hasher) { | |
85 | if (initialCapacity < 0) { | |
86 | throw new IllegalArgumentException("Illegal initial capacity: " + | |
87 | initialCapacity); | |
88 | } | |
89 | if (initialCapacity > MAXIMUM_CAPACITY) { | |
90 | initialCapacity = MAXIMUM_CAPACITY; | |
91 | } | |
92 | this.loadFactor = DEFAULT_LOAD_FACTOR; | |
93 | this.threshold = initialCapacity; | |
94 | ||
95 | this.dictionary = dictionary; | |
96 | ||
97 | this.hasher = hasher; | |
98 | ||
99 | // build hash table | |
100 | for (int i = 0; i < this.dictionary.getValueCount(); i++) { | |
101 | put(i); | |
102 | } | |
103 | } | |
104 | ||
105 | public DictionaryHashTable(ValueVector dictionary, ArrowBufHasher hasher) { | |
106 | this(DEFAULT_INITIAL_CAPACITY, dictionary, hasher); | |
107 | } | |
108 | ||
109 | public DictionaryHashTable(ValueVector dictionary) { | |
110 | this(dictionary, SimpleHasher.INSTANCE); | |
111 | } | |
112 | ||
113 | /** | |
114 | * Compute the capacity with given threshold and create init table. | |
115 | */ | |
116 | private void inflateTable(int threshold) { | |
117 | int capacity = roundUpToPowerOf2(threshold); | |
118 | this.threshold = (int) Math.min(capacity * loadFactor, MAXIMUM_CAPACITY + 1); | |
119 | table = new DictionaryHashTable.Entry[capacity]; | |
120 | } | |
121 | ||
122 | /** | |
123 | * Computes the storage location in an array for the given hashCode. | |
124 | */ | |
125 | static int indexFor(int h, int length) { | |
126 | return h & (length - 1); | |
127 | } | |
128 | ||
129 | /** | |
130 | * Returns a power of two size for the given size. | |
131 | */ | |
132 | static final int roundUpToPowerOf2(int size) { | |
133 | int n = size - 1; | |
134 | n |= n >>> 1; | |
135 | n |= n >>> 2; | |
136 | n |= n >>> 4; | |
137 | n |= n >>> 8; | |
138 | n |= n >>> 16; | |
139 | return (n < 0) ? 1 : (n >= MAXIMUM_CAPACITY) ? MAXIMUM_CAPACITY : n + 1; | |
140 | } | |
141 | ||
142 | /** | |
143 | * get the corresponding dictionary index with the given index in vector which to encode. | |
144 | * @param indexInArray index in vector. | |
145 | * @return dictionary vector index or -1 if no value equals. | |
146 | */ | |
147 | public int getIndex(int indexInArray, ValueVector toEncode) { | |
148 | int hash = toEncode.hashCode(indexInArray, this.hasher); | |
149 | int index = indexFor(hash, table.length); | |
150 | ||
151 | RangeEqualsVisitor equalVisitor = new RangeEqualsVisitor(dictionary, toEncode, null); | |
152 | Range range = new Range(0, 0, 1); | |
153 | ||
154 | for (DictionaryHashTable.Entry e = table[index]; e != null ; e = e.next) { | |
155 | if (e.hash == hash) { | |
156 | int dictIndex = e.index; | |
157 | ||
158 | range = range.setRightStart(indexInArray) | |
159 | .setLeftStart(dictIndex); | |
160 | if (equalVisitor.rangeEquals(range)) { | |
161 | return dictIndex; | |
162 | } | |
163 | } | |
164 | } | |
165 | return NULL_VALUE; | |
166 | } | |
167 | ||
168 | /** | |
169 | * put the index of dictionary vector to build hash table. | |
170 | */ | |
171 | private void put(int indexInDictionary) { | |
172 | if (table == EMPTY_TABLE) { | |
173 | inflateTable(threshold); | |
174 | } | |
175 | ||
176 | int hash = dictionary.hashCode(indexInDictionary, this.hasher); | |
177 | int i = indexFor(hash, table.length); | |
178 | for (DictionaryHashTable.Entry e = table[i]; e != null; e = e.next) { | |
179 | if (e.hash == hash && e.index == indexInDictionary) { | |
180 | //already has this index, return | |
181 | return; | |
182 | } | |
183 | } | |
184 | ||
185 | addEntry(hash, indexInDictionary, i); | |
186 | } | |
187 | ||
188 | /** | |
189 | * Create a new Entry at the specific position of table. | |
190 | */ | |
191 | void createEntry(int hash, int index, int bucketIndex) { | |
192 | DictionaryHashTable.Entry e = table[bucketIndex]; | |
193 | table[bucketIndex] = new DictionaryHashTable.Entry(hash, index, e); | |
194 | size++; | |
195 | } | |
196 | ||
197 | /** | |
198 | * Add Entry at the specified location of the table. | |
199 | */ | |
200 | void addEntry(int hash, int index, int bucketIndex) { | |
201 | if ((size >= threshold) && (null != table[bucketIndex])) { | |
202 | resize(2 * table.length); | |
203 | bucketIndex = indexFor(hash, table.length); | |
204 | } | |
205 | ||
206 | createEntry(hash, index, bucketIndex); | |
207 | } | |
208 | ||
209 | /** | |
210 | * Resize table with given new capacity. | |
211 | */ | |
212 | void resize(int newCapacity) { | |
213 | DictionaryHashTable.Entry[] oldTable = table; | |
214 | int oldCapacity = oldTable.length; | |
215 | if (oldCapacity == MAXIMUM_CAPACITY) { | |
216 | threshold = Integer.MAX_VALUE; | |
217 | return; | |
218 | } | |
219 | ||
220 | DictionaryHashTable.Entry[] newTable = new DictionaryHashTable.Entry[newCapacity]; | |
221 | transfer(newTable); | |
222 | table = newTable; | |
223 | threshold = (int) Math.min(newCapacity * loadFactor, MAXIMUM_CAPACITY + 1); | |
224 | } | |
225 | ||
226 | /** | |
227 | * Transfer entries into new table from old table. | |
228 | * @param newTable new table | |
229 | */ | |
230 | void transfer(DictionaryHashTable.Entry[] newTable) { | |
231 | int newCapacity = newTable.length; | |
232 | for (DictionaryHashTable.Entry e : table) { | |
233 | while (null != e) { | |
234 | DictionaryHashTable.Entry next = e.next; | |
235 | int i = indexFor(e.hash, newCapacity); | |
236 | e.next = newTable[i]; | |
237 | newTable[i] = e; | |
238 | e = next; | |
239 | } | |
240 | } | |
241 | } | |
242 | ||
243 | /** | |
244 | * Returns the number of mappings in this Map. | |
245 | */ | |
246 | public int size() { | |
247 | return size; | |
248 | } | |
249 | ||
250 | /** | |
251 | * Removes all elements from this map, leaving it empty. | |
252 | */ | |
253 | public void clear() { | |
254 | size = 0; | |
255 | for (int i = 0; i < table.length; i++) { | |
256 | table[i] = null; | |
257 | } | |
258 | } | |
259 | ||
260 | /** | |
261 | * Class to keep dictionary index data within hash table. | |
262 | */ | |
263 | static class Entry { | |
264 | //dictionary index | |
265 | int index; | |
266 | DictionaryHashTable.Entry next; | |
267 | int hash; | |
268 | ||
269 | Entry(int hash, int index, DictionaryHashTable.Entry next) { | |
270 | this.index = index; | |
271 | this.hash = hash; | |
272 | this.next = next; | |
273 | } | |
274 | ||
275 | public final int getIndex() { | |
276 | return this.index; | |
277 | } | |
278 | ||
279 | @Override | |
280 | public int hashCode() { | |
281 | return hash; | |
282 | } | |
283 | ||
284 | public final boolean equals(Object o) { | |
285 | if (!(o instanceof DictionaryHashTable.Entry)) { | |
286 | return false; | |
287 | } | |
288 | DictionaryHashTable.Entry e = (DictionaryHashTable.Entry) o; | |
289 | if (index == e.getIndex()) { | |
290 | return true; | |
291 | } | |
292 | return false; | |
293 | } | |
294 | } | |
295 | } |