]> git.proxmox.com Git - ceph.git/blob - ceph/src/arrow/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryHashTable.java
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / java / vector / src / main / java / org / apache / arrow / vector / dictionary / DictionaryHashTable.java
1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.arrow.vector.dictionary;
19
20 import org.apache.arrow.memory.util.hash.ArrowBufHasher;
21 import org.apache.arrow.memory.util.hash.SimpleHasher;
22 import org.apache.arrow.vector.ValueVector;
23 import org.apache.arrow.vector.compare.Range;
24 import org.apache.arrow.vector.compare.RangeEqualsVisitor;
25
26 /**
27 * HashTable used for Dictionary encoding. It holds two vectors (the vector to encode and dictionary vector)
28 * It stores the index in dictionary vector and for a given index in encode vector,
29 * it could return dictionary index.
30 */
31 public class DictionaryHashTable {
32
33 /**
34 * Represents a null value in map.
35 */
36 static final int NULL_VALUE = -1;
37
38 /**
39 * The default initial capacity - MUST be a power of two.
40 */
41 static final int DEFAULT_INITIAL_CAPACITY = 1 << 4;
42
43 /**
44 * The maximum capacity, used if a higher value is implicitly specified
45 * by either of the constructors with arguments.
46 */
47 static final int MAXIMUM_CAPACITY = 1 << 30;
48
49 /**
50 * The load factor used when none specified in constructor.
51 */
52 static final float DEFAULT_LOAD_FACTOR = 0.75f;
53
54 static final DictionaryHashTable.Entry[] EMPTY_TABLE = {};
55
56 /**
57 * The table, initialized on first use, and resized as
58 * necessary. When allocated, length is always a power of two.
59 */
60 transient DictionaryHashTable.Entry[] table = EMPTY_TABLE;
61
62 /**
63 * The number of key-value mappings contained in this map.
64 */
65 transient int size;
66
67 /**
68 * The next size value at which to resize (capacity * load factor).
69 */
70 int threshold;
71
72 /**
73 * The load factor for the hash table.
74 */
75 final float loadFactor;
76
77 private final ValueVector dictionary;
78
79 private final ArrowBufHasher hasher;
80
81 /**
82 * Constructs an empty map with the specified initial capacity and load factor.
83 */
84 public DictionaryHashTable(int initialCapacity, ValueVector dictionary, ArrowBufHasher hasher) {
85 if (initialCapacity < 0) {
86 throw new IllegalArgumentException("Illegal initial capacity: " +
87 initialCapacity);
88 }
89 if (initialCapacity > MAXIMUM_CAPACITY) {
90 initialCapacity = MAXIMUM_CAPACITY;
91 }
92 this.loadFactor = DEFAULT_LOAD_FACTOR;
93 this.threshold = initialCapacity;
94
95 this.dictionary = dictionary;
96
97 this.hasher = hasher;
98
99 // build hash table
100 for (int i = 0; i < this.dictionary.getValueCount(); i++) {
101 put(i);
102 }
103 }
104
105 public DictionaryHashTable(ValueVector dictionary, ArrowBufHasher hasher) {
106 this(DEFAULT_INITIAL_CAPACITY, dictionary, hasher);
107 }
108
109 public DictionaryHashTable(ValueVector dictionary) {
110 this(dictionary, SimpleHasher.INSTANCE);
111 }
112
113 /**
114 * Compute the capacity with given threshold and create init table.
115 */
116 private void inflateTable(int threshold) {
117 int capacity = roundUpToPowerOf2(threshold);
118 this.threshold = (int) Math.min(capacity * loadFactor, MAXIMUM_CAPACITY + 1);
119 table = new DictionaryHashTable.Entry[capacity];
120 }
121
122 /**
123 * Computes the storage location in an array for the given hashCode.
124 */
125 static int indexFor(int h, int length) {
126 return h & (length - 1);
127 }
128
129 /**
130 * Returns a power of two size for the given size.
131 */
132 static final int roundUpToPowerOf2(int size) {
133 int n = size - 1;
134 n |= n >>> 1;
135 n |= n >>> 2;
136 n |= n >>> 4;
137 n |= n >>> 8;
138 n |= n >>> 16;
139 return (n < 0) ? 1 : (n >= MAXIMUM_CAPACITY) ? MAXIMUM_CAPACITY : n + 1;
140 }
141
142 /**
143 * get the corresponding dictionary index with the given index in vector which to encode.
144 * @param indexInArray index in vector.
145 * @return dictionary vector index or -1 if no value equals.
146 */
147 public int getIndex(int indexInArray, ValueVector toEncode) {
148 int hash = toEncode.hashCode(indexInArray, this.hasher);
149 int index = indexFor(hash, table.length);
150
151 RangeEqualsVisitor equalVisitor = new RangeEqualsVisitor(dictionary, toEncode, null);
152 Range range = new Range(0, 0, 1);
153
154 for (DictionaryHashTable.Entry e = table[index]; e != null ; e = e.next) {
155 if (e.hash == hash) {
156 int dictIndex = e.index;
157
158 range = range.setRightStart(indexInArray)
159 .setLeftStart(dictIndex);
160 if (equalVisitor.rangeEquals(range)) {
161 return dictIndex;
162 }
163 }
164 }
165 return NULL_VALUE;
166 }
167
168 /**
169 * put the index of dictionary vector to build hash table.
170 */
171 private void put(int indexInDictionary) {
172 if (table == EMPTY_TABLE) {
173 inflateTable(threshold);
174 }
175
176 int hash = dictionary.hashCode(indexInDictionary, this.hasher);
177 int i = indexFor(hash, table.length);
178 for (DictionaryHashTable.Entry e = table[i]; e != null; e = e.next) {
179 if (e.hash == hash && e.index == indexInDictionary) {
180 //already has this index, return
181 return;
182 }
183 }
184
185 addEntry(hash, indexInDictionary, i);
186 }
187
188 /**
189 * Create a new Entry at the specific position of table.
190 */
191 void createEntry(int hash, int index, int bucketIndex) {
192 DictionaryHashTable.Entry e = table[bucketIndex];
193 table[bucketIndex] = new DictionaryHashTable.Entry(hash, index, e);
194 size++;
195 }
196
197 /**
198 * Add Entry at the specified location of the table.
199 */
200 void addEntry(int hash, int index, int bucketIndex) {
201 if ((size >= threshold) && (null != table[bucketIndex])) {
202 resize(2 * table.length);
203 bucketIndex = indexFor(hash, table.length);
204 }
205
206 createEntry(hash, index, bucketIndex);
207 }
208
209 /**
210 * Resize table with given new capacity.
211 */
212 void resize(int newCapacity) {
213 DictionaryHashTable.Entry[] oldTable = table;
214 int oldCapacity = oldTable.length;
215 if (oldCapacity == MAXIMUM_CAPACITY) {
216 threshold = Integer.MAX_VALUE;
217 return;
218 }
219
220 DictionaryHashTable.Entry[] newTable = new DictionaryHashTable.Entry[newCapacity];
221 transfer(newTable);
222 table = newTable;
223 threshold = (int) Math.min(newCapacity * loadFactor, MAXIMUM_CAPACITY + 1);
224 }
225
226 /**
227 * Transfer entries into new table from old table.
228 * @param newTable new table
229 */
230 void transfer(DictionaryHashTable.Entry[] newTable) {
231 int newCapacity = newTable.length;
232 for (DictionaryHashTable.Entry e : table) {
233 while (null != e) {
234 DictionaryHashTable.Entry next = e.next;
235 int i = indexFor(e.hash, newCapacity);
236 e.next = newTable[i];
237 newTable[i] = e;
238 e = next;
239 }
240 }
241 }
242
243 /**
244 * Returns the number of mappings in this Map.
245 */
246 public int size() {
247 return size;
248 }
249
250 /**
251 * Removes all elements from this map, leaving it empty.
252 */
253 public void clear() {
254 size = 0;
255 for (int i = 0; i < table.length; i++) {
256 table[i] = null;
257 }
258 }
259
260 /**
261 * Class to keep dictionary index data within hash table.
262 */
263 static class Entry {
264 //dictionary index
265 int index;
266 DictionaryHashTable.Entry next;
267 int hash;
268
269 Entry(int hash, int index, DictionaryHashTable.Entry next) {
270 this.index = index;
271 this.hash = hash;
272 this.next = next;
273 }
274
275 public final int getIndex() {
276 return this.index;
277 }
278
279 @Override
280 public int hashCode() {
281 return hash;
282 }
283
284 public final boolean equals(Object o) {
285 if (!(o instanceof DictionaryHashTable.Entry)) {
286 return false;
287 }
288 DictionaryHashTable.Entry e = (DictionaryHashTable.Entry) o;
289 if (index == e.getIndex()) {
290 return true;
291 }
292 return false;
293 }
294 }
295 }