2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
18 package org
.apache
.arrow
.vector
.dictionary
;
20 import org
.apache
.arrow
.memory
.util
.hash
.ArrowBufHasher
;
21 import org
.apache
.arrow
.memory
.util
.hash
.SimpleHasher
;
22 import org
.apache
.arrow
.vector
.ValueVector
;
23 import org
.apache
.arrow
.vector
.compare
.Range
;
24 import org
.apache
.arrow
.vector
.compare
.RangeEqualsVisitor
;
27 * HashTable used for Dictionary encoding. It holds two vectors (the vector to encode and dictionary vector)
28 * It stores the index in dictionary vector and for a given index in encode vector,
29 * it could return dictionary index.
31 public class DictionaryHashTable
{
34 * Represents a null value in map.
36 static final int NULL_VALUE
= -1;
39 * The default initial capacity - MUST be a power of two.
41 static final int DEFAULT_INITIAL_CAPACITY
= 1 << 4;
44 * The maximum capacity, used if a higher value is implicitly specified
45 * by either of the constructors with arguments.
47 static final int MAXIMUM_CAPACITY
= 1 << 30;
50 * The load factor used when none specified in constructor.
52 static final float DEFAULT_LOAD_FACTOR
= 0.75f
;
54 static final DictionaryHashTable
.Entry
[] EMPTY_TABLE
= {};
57 * The table, initialized on first use, and resized as
58 * necessary. When allocated, length is always a power of two.
60 transient DictionaryHashTable
.Entry
[] table
= EMPTY_TABLE
;
63 * The number of key-value mappings contained in this map.
68 * The next size value at which to resize (capacity * load factor).
73 * The load factor for the hash table.
75 final float loadFactor
;
77 private final ValueVector dictionary
;
79 private final ArrowBufHasher hasher
;
82 * Constructs an empty map with the specified initial capacity and load factor.
84 public DictionaryHashTable(int initialCapacity
, ValueVector dictionary
, ArrowBufHasher hasher
) {
85 if (initialCapacity
< 0) {
86 throw new IllegalArgumentException("Illegal initial capacity: " +
89 if (initialCapacity
> MAXIMUM_CAPACITY
) {
90 initialCapacity
= MAXIMUM_CAPACITY
;
92 this.loadFactor
= DEFAULT_LOAD_FACTOR
;
93 this.threshold
= initialCapacity
;
95 this.dictionary
= dictionary
;
100 for (int i
= 0; i
< this.dictionary
.getValueCount(); i
++) {
105 public DictionaryHashTable(ValueVector dictionary
, ArrowBufHasher hasher
) {
106 this(DEFAULT_INITIAL_CAPACITY
, dictionary
, hasher
);
109 public DictionaryHashTable(ValueVector dictionary
) {
110 this(dictionary
, SimpleHasher
.INSTANCE
);
114 * Compute the capacity with given threshold and create init table.
116 private void inflateTable(int threshold
) {
117 int capacity
= roundUpToPowerOf2(threshold
);
118 this.threshold
= (int) Math
.min(capacity
* loadFactor
, MAXIMUM_CAPACITY
+ 1);
119 table
= new DictionaryHashTable
.Entry
[capacity
];
123 * Computes the storage location in an array for the given hashCode.
125 static int indexFor(int h
, int length
) {
126 return h
& (length
- 1);
130 * Returns a power of two size for the given size.
132 static final int roundUpToPowerOf2(int size
) {
139 return (n
< 0) ?
1 : (n
>= MAXIMUM_CAPACITY
) ? MAXIMUM_CAPACITY
: n
+ 1;
143 * get the corresponding dictionary index with the given index in vector which to encode.
144 * @param indexInArray index in vector.
145 * @return dictionary vector index or -1 if no value equals.
147 public int getIndex(int indexInArray
, ValueVector toEncode
) {
148 int hash
= toEncode
.hashCode(indexInArray
, this.hasher
);
149 int index
= indexFor(hash
, table
.length
);
151 RangeEqualsVisitor equalVisitor
= new RangeEqualsVisitor(dictionary
, toEncode
, null);
152 Range range
= new Range(0, 0, 1);
154 for (DictionaryHashTable
.Entry e
= table
[index
]; e
!= null ; e
= e
.next
) {
155 if (e
.hash
== hash
) {
156 int dictIndex
= e
.index
;
158 range
= range
.setRightStart(indexInArray
)
159 .setLeftStart(dictIndex
);
160 if (equalVisitor
.rangeEquals(range
)) {
169 * put the index of dictionary vector to build hash table.
171 private void put(int indexInDictionary
) {
172 if (table
== EMPTY_TABLE
) {
173 inflateTable(threshold
);
176 int hash
= dictionary
.hashCode(indexInDictionary
, this.hasher
);
177 int i
= indexFor(hash
, table
.length
);
178 for (DictionaryHashTable
.Entry e
= table
[i
]; e
!= null; e
= e
.next
) {
179 if (e
.hash
== hash
&& e
.index
== indexInDictionary
) {
180 //already has this index, return
185 addEntry(hash
, indexInDictionary
, i
);
189 * Create a new Entry at the specific position of table.
191 void createEntry(int hash
, int index
, int bucketIndex
) {
192 DictionaryHashTable
.Entry e
= table
[bucketIndex
];
193 table
[bucketIndex
] = new DictionaryHashTable
.Entry(hash
, index
, e
);
198 * Add Entry at the specified location of the table.
200 void addEntry(int hash
, int index
, int bucketIndex
) {
201 if ((size
>= threshold
) && (null != table
[bucketIndex
])) {
202 resize(2 * table
.length
);
203 bucketIndex
= indexFor(hash
, table
.length
);
206 createEntry(hash
, index
, bucketIndex
);
210 * Resize table with given new capacity.
212 void resize(int newCapacity
) {
213 DictionaryHashTable
.Entry
[] oldTable
= table
;
214 int oldCapacity
= oldTable
.length
;
215 if (oldCapacity
== MAXIMUM_CAPACITY
) {
216 threshold
= Integer
.MAX_VALUE
;
220 DictionaryHashTable
.Entry
[] newTable
= new DictionaryHashTable
.Entry
[newCapacity
];
223 threshold
= (int) Math
.min(newCapacity
* loadFactor
, MAXIMUM_CAPACITY
+ 1);
227 * Transfer entries into new table from old table.
228 * @param newTable new table
230 void transfer(DictionaryHashTable
.Entry
[] newTable
) {
231 int newCapacity
= newTable
.length
;
232 for (DictionaryHashTable
.Entry e
: table
) {
234 DictionaryHashTable
.Entry next
= e
.next
;
235 int i
= indexFor(e
.hash
, newCapacity
);
236 e
.next
= newTable
[i
];
244 * Returns the number of mappings in this Map.
251 * Removes all elements from this map, leaving it empty.
253 public void clear() {
255 for (int i
= 0; i
< table
.length
; i
++) {
261 * Class to keep dictionary index data within hash table.
266 DictionaryHashTable
.Entry next
;
269 Entry(int hash
, int index
, DictionaryHashTable
.Entry next
) {
275 public final int getIndex() {
280 public int hashCode() {
284 public final boolean equals(Object o
) {
285 if (!(o
instanceof DictionaryHashTable
.Entry
)) {
288 DictionaryHashTable
.Entry e
= (DictionaryHashTable
.Entry
) o
;
289 if (index
== e
.getIndex()) {