ceph/src/arrow/cpp/src/parquet/bloom_filter.h

   1 // Licensed to the Apache Software Foundation (ASF) under one
   2 // or more contributor license agreements.  See the NOTICE file
   3 // distributed with this work for additional information
   4 // regarding copyright ownership.  The ASF licenses this file
   5 // to you under the Apache License, Version 2.0 (the
   6 // "License"); you may not use this file except in compliance
   7 // with the License.  You may obtain a copy of the License at
   8 //
   9 //   http://www.apache.org/licenses/LICENSE-2.0
  10 //
  11 // Unless required by applicable law or agreed to in writing,
  12 // software distributed under the License is distributed on an
  13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  14 // KIND, either express or implied.  See the License for the
  15 // specific language governing permissions and limitations
  16 // under the License.
  17
  18 #pragma once
  19
  20 #include <cmath>
  21 #include <cstdint>
  22 #include <memory>
  23
  24 #include "arrow/util/bit_util.h"
  25 #include "arrow/util/logging.h"
  26 #include "parquet/hasher.h"
  27 #include "parquet/platform.h"
  28 #include "parquet/types.h"
  29
  30 namespace parquet {
  31
  32 // A Bloom filter is a compact structure to indicate whether an item is not in a set or
  33 // probably in a set. The Bloom filter usually consists of a bit set that represents a
  34 // set of elements, a hash strategy and a Bloom filter algorithm.
  35 class PARQUET_EXPORT BloomFilter {
  36  public:
  37   // Maximum Bloom filter size, it sets to HDFS default block size 128MB
  38   // This value will be reconsidered when implementing Bloom filter producer.
  39   static constexpr uint32_t kMaximumBloomFilterBytes = 128 * 1024 * 1024;
  40
  41   /// Determine whether an element exist in set or not.
  42   ///
  43   /// @param hash the element to contain.
  44   /// @return false if value is definitely not in set, and true means PROBABLY
  45   /// in set.
  46   virtual bool FindHash(uint64_t hash) const = 0;
  47
  48   /// Insert element to set represented by Bloom filter bitset.
  49   /// @param hash the hash of value to insert into Bloom filter.
  50   virtual void InsertHash(uint64_t hash) = 0;
  51
  52   /// Write this Bloom filter to an output stream. A Bloom filter structure should
  53   /// include bitset length, hash strategy, algorithm, and bitset.
  54   ///
  55   /// @param sink the output stream to write
  56   virtual void WriteTo(ArrowOutputStream* sink) const = 0;
  57
  58   /// Get the number of bytes of bitset
  59   virtual uint32_t GetBitsetSize() const = 0;
  60
  61   /// Compute hash for 32 bits value by using its plain encoding result.
  62   ///
  63   /// @param value the value to hash.
  64   /// @return hash result.
  65   virtual uint64_t Hash(int32_t value) const = 0;
  66
  67   /// Compute hash for 64 bits value by using its plain encoding result.
  68   ///
  69   /// @param value the value to hash.
  70   /// @return hash result.
  71   virtual uint64_t Hash(int64_t value) const = 0;
  72
  73   /// Compute hash for float value by using its plain encoding result.
  74   ///
  75   /// @param value the value to hash.
  76   /// @return hash result.
  77   virtual uint64_t Hash(float value) const = 0;
  78
  79   /// Compute hash for double value by using its plain encoding result.
  80   ///
  81   /// @param value the value to hash.
  82   /// @return hash result.
  83   virtual uint64_t Hash(double value) const = 0;
  84
  85   /// Compute hash for Int96 value by using its plain encoding result.
  86   ///
  87   /// @param value the value to hash.
  88   /// @return hash result.
  89   virtual uint64_t Hash(const Int96* value) const = 0;
  90
  91   /// Compute hash for ByteArray value by using its plain encoding result.
  92   ///
  93   /// @param value the value to hash.
  94   /// @return hash result.
  95   virtual uint64_t Hash(const ByteArray* value) const = 0;
  96
  97   /// Compute hash for fixed byte array value by using its plain encoding result.
  98   ///
  99   /// @param value the value address.
 100   /// @param len the value length.
 101   /// @return hash result.
 102   virtual uint64_t Hash(const FLBA* value, uint32_t len) const = 0;
 103
 104   virtual ~BloomFilter() {}
 105
 106  protected:
 107   // Hash strategy available for Bloom filter.
 108   enum class HashStrategy : uint32_t { MURMUR3_X64_128 = 0 };
 109
 110   // Bloom filter algorithm.
 111   enum class Algorithm : uint32_t { BLOCK = 0 };
 112 };
 113
 114 // The BlockSplitBloomFilter is implemented using block-based Bloom filters from
 115 // Putze et al.'s "Cache-,Hash- and Space-Efficient Bloom filters". The basic idea is to
 116 // hash the item to a tiny Bloom filter which size fit a single cache line or smaller.
 117 //
 118 // This implementation sets 8 bits in each tiny Bloom filter. Each tiny Bloom
 119 // filter is 32 bytes to take advantage of 32-byte SIMD instructions.
 120 class PARQUET_EXPORT BlockSplitBloomFilter : public BloomFilter {
 121  public:
 122   /// The constructor of BlockSplitBloomFilter. It uses murmur3_x64_128 as hash function.
 123   BlockSplitBloomFilter();
 124
 125   /// Initialize the BlockSplitBloomFilter. The range of num_bytes should be within
 126   /// [kMinimumBloomFilterBytes, kMaximumBloomFilterBytes], it will be
 127   /// rounded up/down to lower/upper bound if num_bytes is out of range and also
 128   /// will be rounded up to a power of 2.
 129   ///
 130   /// @param num_bytes The number of bytes to store Bloom filter bitset.
 131   void Init(uint32_t num_bytes);
 132
 133   /// Initialize the BlockSplitBloomFilter. It copies the bitset as underlying
 134   /// bitset because the given bitset may not satisfy the 32-byte alignment requirement
 135   /// which may lead to segfault when performing SIMD instructions. It is the caller's
 136   /// responsibility to free the bitset passed in. This is used when reconstructing
 137   /// a Bloom filter from a parquet file.
 138   ///
 139   /// @param bitset The given bitset to initialize the Bloom filter.
 140   /// @param num_bytes  The number of bytes of given bitset.
 141   void Init(const uint8_t* bitset, uint32_t num_bytes);
 142
 143   // Minimum Bloom filter size, it sets to 32 bytes to fit a tiny Bloom filter.
 144   static constexpr uint32_t kMinimumBloomFilterBytes = 32;
 145
 146   /// Calculate optimal size according to the number of distinct values and false
 147   /// positive probability.
 148   ///
 149   /// @param ndv The number of distinct values.
 150   /// @param fpp The false positive probability.
 151   /// @return it always return a value between kMinimumBloomFilterBytes and
 152   /// kMaximumBloomFilterBytes, and the return value is always a power of 2
 153   static uint32_t OptimalNumOfBits(uint32_t ndv, double fpp) {
 154     DCHECK(fpp > 0.0 && fpp < 1.0);
 155     const double m = -8.0 * ndv / log(1 - pow(fpp, 1.0 / 8));
 156     uint32_t num_bits;
 157
 158     // Handle overflow.
 159     if (m < 0 || m > kMaximumBloomFilterBytes << 3) {
 160       num_bits = static_cast<uint32_t>(kMaximumBloomFilterBytes << 3);
 161     } else {
 162       num_bits = static_cast<uint32_t>(m);
 163     }
 164
 165     // Round up to lower bound
 166     if (num_bits < kMinimumBloomFilterBytes << 3) {
 167       num_bits = kMinimumBloomFilterBytes << 3;
 168     }
 169
 170     // Get next power of 2 if bits is not power of 2.
 171     if ((num_bits & (num_bits - 1)) != 0) {
 172       num_bits = static_cast<uint32_t>(::arrow::BitUtil::NextPower2(num_bits));
 173     }
 174
 175     // Round down to upper bound
 176     if (num_bits > kMaximumBloomFilterBytes << 3) {
 177       num_bits = kMaximumBloomFilterBytes << 3;
 178     }
 179
 180     return num_bits;
 181   }
 182
 183   bool FindHash(uint64_t hash) const override;
 184   void InsertHash(uint64_t hash) override;
 185   void WriteTo(ArrowOutputStream* sink) const override;
 186   uint32_t GetBitsetSize() const override { return num_bytes_; }
 187
 188   uint64_t Hash(int64_t value) const override { return hasher_->Hash(value); }
 189   uint64_t Hash(float value) const override { return hasher_->Hash(value); }
 190   uint64_t Hash(double value) const override { return hasher_->Hash(value); }
 191   uint64_t Hash(const Int96* value) const override { return hasher_->Hash(value); }
 192   uint64_t Hash(const ByteArray* value) const override { return hasher_->Hash(value); }
 193   uint64_t Hash(int32_t value) const override { return hasher_->Hash(value); }
 194   uint64_t Hash(const FLBA* value, uint32_t len) const override {
 195     return hasher_->Hash(value, len);
 196   }
 197
 198   /// Deserialize the Bloom filter from an input stream. It is used when reconstructing
 199   /// a Bloom filter from a parquet filter.
 200   ///
 201   /// @param input_stream The input stream from which to construct the Bloom filter
 202   /// @return The BlockSplitBloomFilter.
 203   static BlockSplitBloomFilter Deserialize(ArrowInputStream* input_stream);
 204
 205  private:
 206   // Bytes in a tiny Bloom filter block.
 207   static constexpr int kBytesPerFilterBlock = 32;
 208
 209   // The number of bits to be set in each tiny Bloom filter
 210   static constexpr int kBitsSetPerBlock = 8;
 211
 212   // A mask structure used to set bits in each tiny Bloom filter.
 213   struct BlockMask {
 214     uint32_t item[kBitsSetPerBlock];
 215   };
 216
 217   // The block-based algorithm needs eight odd SALT values to calculate eight indexes
 218   // of bit to set, one bit in each 32-bit word.
 219   static constexpr uint32_t SALT[kBitsSetPerBlock] = {
 220       0x47b6137bU, 0x44974d91U, 0x8824ad5bU, 0xa2b7289dU,
 221       0x705495c7U, 0x2df1424bU, 0x9efc4947U, 0x5c6bfb31U};
 222
 223   /// Set bits in mask array according to input key.
 224   /// @param key the value to calculate mask values.
 225   /// @param mask the mask array is used to set inside a block
 226   void SetMask(uint32_t key, BlockMask& mask) const;
 227
 228   // Memory pool to allocate aligned buffer for bitset
 229   ::arrow::MemoryPool* pool_;
 230
 231   // The underlying buffer of bitset.
 232   std::shared_ptr<Buffer> data_;
 233
 234   // The number of bytes of Bloom filter bitset.
 235   uint32_t num_bytes_;
 236
 237   // Hash strategy used in this Bloom filter.
 238   HashStrategy hash_strategy_;
 239
 240   // Algorithm used in this Bloom filter.
 241   Algorithm algorithm_;
 242
 243   // The hash pointer points to actual hash class used.
 244   std::unique_ptr<Hasher> hasher_;
 245 };
 246
 247 }  // namespace parquet