]> git.proxmox.com Git - ceph.git/blob - ceph/src/arrow/cpp/src/parquet/bloom_filter.h
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / cpp / src / parquet / bloom_filter.h
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17
18 #pragma once
19
20 #include <cmath>
21 #include <cstdint>
22 #include <memory>
23
24 #include "arrow/util/bit_util.h"
25 #include "arrow/util/logging.h"
26 #include "parquet/hasher.h"
27 #include "parquet/platform.h"
28 #include "parquet/types.h"
29
30 namespace parquet {
31
32 // A Bloom filter is a compact structure to indicate whether an item is not in a set or
33 // probably in a set. The Bloom filter usually consists of a bit set that represents a
34 // set of elements, a hash strategy and a Bloom filter algorithm.
35 class PARQUET_EXPORT BloomFilter {
36 public:
37 // Maximum Bloom filter size, it sets to HDFS default block size 128MB
38 // This value will be reconsidered when implementing Bloom filter producer.
39 static constexpr uint32_t kMaximumBloomFilterBytes = 128 * 1024 * 1024;
40
41 /// Determine whether an element exist in set or not.
42 ///
43 /// @param hash the element to contain.
44 /// @return false if value is definitely not in set, and true means PROBABLY
45 /// in set.
46 virtual bool FindHash(uint64_t hash) const = 0;
47
48 /// Insert element to set represented by Bloom filter bitset.
49 /// @param hash the hash of value to insert into Bloom filter.
50 virtual void InsertHash(uint64_t hash) = 0;
51
52 /// Write this Bloom filter to an output stream. A Bloom filter structure should
53 /// include bitset length, hash strategy, algorithm, and bitset.
54 ///
55 /// @param sink the output stream to write
56 virtual void WriteTo(ArrowOutputStream* sink) const = 0;
57
58 /// Get the number of bytes of bitset
59 virtual uint32_t GetBitsetSize() const = 0;
60
61 /// Compute hash for 32 bits value by using its plain encoding result.
62 ///
63 /// @param value the value to hash.
64 /// @return hash result.
65 virtual uint64_t Hash(int32_t value) const = 0;
66
67 /// Compute hash for 64 bits value by using its plain encoding result.
68 ///
69 /// @param value the value to hash.
70 /// @return hash result.
71 virtual uint64_t Hash(int64_t value) const = 0;
72
73 /// Compute hash for float value by using its plain encoding result.
74 ///
75 /// @param value the value to hash.
76 /// @return hash result.
77 virtual uint64_t Hash(float value) const = 0;
78
79 /// Compute hash for double value by using its plain encoding result.
80 ///
81 /// @param value the value to hash.
82 /// @return hash result.
83 virtual uint64_t Hash(double value) const = 0;
84
85 /// Compute hash for Int96 value by using its plain encoding result.
86 ///
87 /// @param value the value to hash.
88 /// @return hash result.
89 virtual uint64_t Hash(const Int96* value) const = 0;
90
91 /// Compute hash for ByteArray value by using its plain encoding result.
92 ///
93 /// @param value the value to hash.
94 /// @return hash result.
95 virtual uint64_t Hash(const ByteArray* value) const = 0;
96
97 /// Compute hash for fixed byte array value by using its plain encoding result.
98 ///
99 /// @param value the value address.
100 /// @param len the value length.
101 /// @return hash result.
102 virtual uint64_t Hash(const FLBA* value, uint32_t len) const = 0;
103
104 virtual ~BloomFilter() {}
105
106 protected:
107 // Hash strategy available for Bloom filter.
108 enum class HashStrategy : uint32_t { MURMUR3_X64_128 = 0 };
109
110 // Bloom filter algorithm.
111 enum class Algorithm : uint32_t { BLOCK = 0 };
112 };
113
114 // The BlockSplitBloomFilter is implemented using block-based Bloom filters from
115 // Putze et al.'s "Cache-,Hash- and Space-Efficient Bloom filters". The basic idea is to
116 // hash the item to a tiny Bloom filter which size fit a single cache line or smaller.
117 //
118 // This implementation sets 8 bits in each tiny Bloom filter. Each tiny Bloom
119 // filter is 32 bytes to take advantage of 32-byte SIMD instructions.
120 class PARQUET_EXPORT BlockSplitBloomFilter : public BloomFilter {
121 public:
122 /// The constructor of BlockSplitBloomFilter. It uses murmur3_x64_128 as hash function.
123 BlockSplitBloomFilter();
124
125 /// Initialize the BlockSplitBloomFilter. The range of num_bytes should be within
126 /// [kMinimumBloomFilterBytes, kMaximumBloomFilterBytes], it will be
127 /// rounded up/down to lower/upper bound if num_bytes is out of range and also
128 /// will be rounded up to a power of 2.
129 ///
130 /// @param num_bytes The number of bytes to store Bloom filter bitset.
131 void Init(uint32_t num_bytes);
132
133 /// Initialize the BlockSplitBloomFilter. It copies the bitset as underlying
134 /// bitset because the given bitset may not satisfy the 32-byte alignment requirement
135 /// which may lead to segfault when performing SIMD instructions. It is the caller's
136 /// responsibility to free the bitset passed in. This is used when reconstructing
137 /// a Bloom filter from a parquet file.
138 ///
139 /// @param bitset The given bitset to initialize the Bloom filter.
140 /// @param num_bytes The number of bytes of given bitset.
141 void Init(const uint8_t* bitset, uint32_t num_bytes);
142
143 // Minimum Bloom filter size, it sets to 32 bytes to fit a tiny Bloom filter.
144 static constexpr uint32_t kMinimumBloomFilterBytes = 32;
145
146 /// Calculate optimal size according to the number of distinct values and false
147 /// positive probability.
148 ///
149 /// @param ndv The number of distinct values.
150 /// @param fpp The false positive probability.
151 /// @return it always return a value between kMinimumBloomFilterBytes and
152 /// kMaximumBloomFilterBytes, and the return value is always a power of 2
153 static uint32_t OptimalNumOfBits(uint32_t ndv, double fpp) {
154 DCHECK(fpp > 0.0 && fpp < 1.0);
155 const double m = -8.0 * ndv / log(1 - pow(fpp, 1.0 / 8));
156 uint32_t num_bits;
157
158 // Handle overflow.
159 if (m < 0 || m > kMaximumBloomFilterBytes << 3) {
160 num_bits = static_cast<uint32_t>(kMaximumBloomFilterBytes << 3);
161 } else {
162 num_bits = static_cast<uint32_t>(m);
163 }
164
165 // Round up to lower bound
166 if (num_bits < kMinimumBloomFilterBytes << 3) {
167 num_bits = kMinimumBloomFilterBytes << 3;
168 }
169
170 // Get next power of 2 if bits is not power of 2.
171 if ((num_bits & (num_bits - 1)) != 0) {
172 num_bits = static_cast<uint32_t>(::arrow::BitUtil::NextPower2(num_bits));
173 }
174
175 // Round down to upper bound
176 if (num_bits > kMaximumBloomFilterBytes << 3) {
177 num_bits = kMaximumBloomFilterBytes << 3;
178 }
179
180 return num_bits;
181 }
182
183 bool FindHash(uint64_t hash) const override;
184 void InsertHash(uint64_t hash) override;
185 void WriteTo(ArrowOutputStream* sink) const override;
186 uint32_t GetBitsetSize() const override { return num_bytes_; }
187
188 uint64_t Hash(int64_t value) const override { return hasher_->Hash(value); }
189 uint64_t Hash(float value) const override { return hasher_->Hash(value); }
190 uint64_t Hash(double value) const override { return hasher_->Hash(value); }
191 uint64_t Hash(const Int96* value) const override { return hasher_->Hash(value); }
192 uint64_t Hash(const ByteArray* value) const override { return hasher_->Hash(value); }
193 uint64_t Hash(int32_t value) const override { return hasher_->Hash(value); }
194 uint64_t Hash(const FLBA* value, uint32_t len) const override {
195 return hasher_->Hash(value, len);
196 }
197
198 /// Deserialize the Bloom filter from an input stream. It is used when reconstructing
199 /// a Bloom filter from a parquet filter.
200 ///
201 /// @param input_stream The input stream from which to construct the Bloom filter
202 /// @return The BlockSplitBloomFilter.
203 static BlockSplitBloomFilter Deserialize(ArrowInputStream* input_stream);
204
205 private:
206 // Bytes in a tiny Bloom filter block.
207 static constexpr int kBytesPerFilterBlock = 32;
208
209 // The number of bits to be set in each tiny Bloom filter
210 static constexpr int kBitsSetPerBlock = 8;
211
212 // A mask structure used to set bits in each tiny Bloom filter.
213 struct BlockMask {
214 uint32_t item[kBitsSetPerBlock];
215 };
216
217 // The block-based algorithm needs eight odd SALT values to calculate eight indexes
218 // of bit to set, one bit in each 32-bit word.
219 static constexpr uint32_t SALT[kBitsSetPerBlock] = {
220 0x47b6137bU, 0x44974d91U, 0x8824ad5bU, 0xa2b7289dU,
221 0x705495c7U, 0x2df1424bU, 0x9efc4947U, 0x5c6bfb31U};
222
223 /// Set bits in mask array according to input key.
224 /// @param key the value to calculate mask values.
225 /// @param mask the mask array is used to set inside a block
226 void SetMask(uint32_t key, BlockMask& mask) const;
227
228 // Memory pool to allocate aligned buffer for bitset
229 ::arrow::MemoryPool* pool_;
230
231 // The underlying buffer of bitset.
232 std::shared_ptr<Buffer> data_;
233
234 // The number of bytes of Bloom filter bitset.
235 uint32_t num_bytes_;
236
237 // Hash strategy used in this Bloom filter.
238 HashStrategy hash_strategy_;
239
240 // Algorithm used in this Bloom filter.
241 Algorithm algorithm_;
242
243 // The hash pointer points to actual hash class used.
244 std::unique_ptr<Hasher> hasher_;
245 };
246
247 } // namespace parquet