1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
9 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
24 #include "arrow/util/bit_util.h"
25 #include "arrow/util/logging.h"
26 #include "parquet/hasher.h"
27 #include "parquet/platform.h"
28 #include "parquet/types.h"
32 // A Bloom filter is a compact structure to indicate whether an item is not in a set or
33 // probably in a set. The Bloom filter usually consists of a bit set that represents a
34 // set of elements, a hash strategy and a Bloom filter algorithm.
35 class PARQUET_EXPORT BloomFilter
{
37 // Maximum Bloom filter size, it sets to HDFS default block size 128MB
38 // This value will be reconsidered when implementing Bloom filter producer.
39 static constexpr uint32_t kMaximumBloomFilterBytes
= 128 * 1024 * 1024;
41 /// Determine whether an element exist in set or not.
43 /// @param hash the element to contain.
44 /// @return false if value is definitely not in set, and true means PROBABLY
46 virtual bool FindHash(uint64_t hash
) const = 0;
48 /// Insert element to set represented by Bloom filter bitset.
49 /// @param hash the hash of value to insert into Bloom filter.
50 virtual void InsertHash(uint64_t hash
) = 0;
52 /// Write this Bloom filter to an output stream. A Bloom filter structure should
53 /// include bitset length, hash strategy, algorithm, and bitset.
55 /// @param sink the output stream to write
56 virtual void WriteTo(ArrowOutputStream
* sink
) const = 0;
58 /// Get the number of bytes of bitset
59 virtual uint32_t GetBitsetSize() const = 0;
61 /// Compute hash for 32 bits value by using its plain encoding result.
63 /// @param value the value to hash.
64 /// @return hash result.
65 virtual uint64_t Hash(int32_t value
) const = 0;
67 /// Compute hash for 64 bits value by using its plain encoding result.
69 /// @param value the value to hash.
70 /// @return hash result.
71 virtual uint64_t Hash(int64_t value
) const = 0;
73 /// Compute hash for float value by using its plain encoding result.
75 /// @param value the value to hash.
76 /// @return hash result.
77 virtual uint64_t Hash(float value
) const = 0;
79 /// Compute hash for double value by using its plain encoding result.
81 /// @param value the value to hash.
82 /// @return hash result.
83 virtual uint64_t Hash(double value
) const = 0;
85 /// Compute hash for Int96 value by using its plain encoding result.
87 /// @param value the value to hash.
88 /// @return hash result.
89 virtual uint64_t Hash(const Int96
* value
) const = 0;
91 /// Compute hash for ByteArray value by using its plain encoding result.
93 /// @param value the value to hash.
94 /// @return hash result.
95 virtual uint64_t Hash(const ByteArray
* value
) const = 0;
97 /// Compute hash for fixed byte array value by using its plain encoding result.
99 /// @param value the value address.
100 /// @param len the value length.
101 /// @return hash result.
102 virtual uint64_t Hash(const FLBA
* value
, uint32_t len
) const = 0;
104 virtual ~BloomFilter() {}
107 // Hash strategy available for Bloom filter.
108 enum class HashStrategy
: uint32_t { MURMUR3_X64_128
= 0 };
110 // Bloom filter algorithm.
111 enum class Algorithm
: uint32_t { BLOCK
= 0 };
114 // The BlockSplitBloomFilter is implemented using block-based Bloom filters from
115 // Putze et al.'s "Cache-,Hash- and Space-Efficient Bloom filters". The basic idea is to
116 // hash the item to a tiny Bloom filter which size fit a single cache line or smaller.
118 // This implementation sets 8 bits in each tiny Bloom filter. Each tiny Bloom
119 // filter is 32 bytes to take advantage of 32-byte SIMD instructions.
120 class PARQUET_EXPORT BlockSplitBloomFilter
: public BloomFilter
{
122 /// The constructor of BlockSplitBloomFilter. It uses murmur3_x64_128 as hash function.
123 BlockSplitBloomFilter();
125 /// Initialize the BlockSplitBloomFilter. The range of num_bytes should be within
126 /// [kMinimumBloomFilterBytes, kMaximumBloomFilterBytes], it will be
127 /// rounded up/down to lower/upper bound if num_bytes is out of range and also
128 /// will be rounded up to a power of 2.
130 /// @param num_bytes The number of bytes to store Bloom filter bitset.
131 void Init(uint32_t num_bytes
);
133 /// Initialize the BlockSplitBloomFilter. It copies the bitset as underlying
134 /// bitset because the given bitset may not satisfy the 32-byte alignment requirement
135 /// which may lead to segfault when performing SIMD instructions. It is the caller's
136 /// responsibility to free the bitset passed in. This is used when reconstructing
137 /// a Bloom filter from a parquet file.
139 /// @param bitset The given bitset to initialize the Bloom filter.
140 /// @param num_bytes The number of bytes of given bitset.
141 void Init(const uint8_t* bitset
, uint32_t num_bytes
);
143 // Minimum Bloom filter size, it sets to 32 bytes to fit a tiny Bloom filter.
144 static constexpr uint32_t kMinimumBloomFilterBytes
= 32;
146 /// Calculate optimal size according to the number of distinct values and false
147 /// positive probability.
149 /// @param ndv The number of distinct values.
150 /// @param fpp The false positive probability.
151 /// @return it always return a value between kMinimumBloomFilterBytes and
152 /// kMaximumBloomFilterBytes, and the return value is always a power of 2
153 static uint32_t OptimalNumOfBits(uint32_t ndv
, double fpp
) {
154 DCHECK(fpp
> 0.0 && fpp
< 1.0);
155 const double m
= -8.0 * ndv
/ log(1 - pow(fpp
, 1.0 / 8));
159 if (m
< 0 || m
> kMaximumBloomFilterBytes
<< 3) {
160 num_bits
= static_cast<uint32_t>(kMaximumBloomFilterBytes
<< 3);
162 num_bits
= static_cast<uint32_t>(m
);
165 // Round up to lower bound
166 if (num_bits
< kMinimumBloomFilterBytes
<< 3) {
167 num_bits
= kMinimumBloomFilterBytes
<< 3;
170 // Get next power of 2 if bits is not power of 2.
171 if ((num_bits
& (num_bits
- 1)) != 0) {
172 num_bits
= static_cast<uint32_t>(::arrow::BitUtil::NextPower2(num_bits
));
175 // Round down to upper bound
176 if (num_bits
> kMaximumBloomFilterBytes
<< 3) {
177 num_bits
= kMaximumBloomFilterBytes
<< 3;
183 bool FindHash(uint64_t hash
) const override
;
184 void InsertHash(uint64_t hash
) override
;
185 void WriteTo(ArrowOutputStream
* sink
) const override
;
186 uint32_t GetBitsetSize() const override
{ return num_bytes_
; }
188 uint64_t Hash(int64_t value
) const override
{ return hasher_
->Hash(value
); }
189 uint64_t Hash(float value
) const override
{ return hasher_
->Hash(value
); }
190 uint64_t Hash(double value
) const override
{ return hasher_
->Hash(value
); }
191 uint64_t Hash(const Int96
* value
) const override
{ return hasher_
->Hash(value
); }
192 uint64_t Hash(const ByteArray
* value
) const override
{ return hasher_
->Hash(value
); }
193 uint64_t Hash(int32_t value
) const override
{ return hasher_
->Hash(value
); }
194 uint64_t Hash(const FLBA
* value
, uint32_t len
) const override
{
195 return hasher_
->Hash(value
, len
);
198 /// Deserialize the Bloom filter from an input stream. It is used when reconstructing
199 /// a Bloom filter from a parquet filter.
201 /// @param input_stream The input stream from which to construct the Bloom filter
202 /// @return The BlockSplitBloomFilter.
203 static BlockSplitBloomFilter
Deserialize(ArrowInputStream
* input_stream
);
206 // Bytes in a tiny Bloom filter block.
207 static constexpr int kBytesPerFilterBlock
= 32;
209 // The number of bits to be set in each tiny Bloom filter
210 static constexpr int kBitsSetPerBlock
= 8;
212 // A mask structure used to set bits in each tiny Bloom filter.
214 uint32_t item
[kBitsSetPerBlock
];
217 // The block-based algorithm needs eight odd SALT values to calculate eight indexes
218 // of bit to set, one bit in each 32-bit word.
219 static constexpr uint32_t SALT
[kBitsSetPerBlock
] = {
220 0x47b6137bU
, 0x44974d91U
, 0x8824ad5bU
, 0xa2b7289dU
,
221 0x705495c7U
, 0x2df1424bU
, 0x9efc4947U
, 0x5c6bfb31U
};
223 /// Set bits in mask array according to input key.
224 /// @param key the value to calculate mask values.
225 /// @param mask the mask array is used to set inside a block
226 void SetMask(uint32_t key
, BlockMask
& mask
) const;
228 // Memory pool to allocate aligned buffer for bitset
229 ::arrow::MemoryPool
* pool_
;
231 // The underlying buffer of bitset.
232 std::shared_ptr
<Buffer
> data_
;
234 // The number of bytes of Bloom filter bitset.
237 // Hash strategy used in this Bloom filter.
238 HashStrategy hash_strategy_
;
240 // Algorithm used in this Bloom filter.
241 Algorithm algorithm_
;
243 // The hash pointer points to actual hash class used.
244 std::unique_ptr
<Hasher
> hasher_
;
247 } // namespace parquet