ceph/src/rocksdb/table/data_block_hash_index.h

   1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
   2 //  This source code is licensed under both the GPLv2 (found in the
   3 //  COPYING file in the root directory) and Apache 2.0 License
   4 //  (found in the LICENSE.Apache file in the root directory).
   5
   6 #pragma once
   7
   8 #include <string>
   9 #include <vector>
  10
  11 #include "rocksdb/slice.h"
  12
  13 namespace rocksdb {
  14 // This is an experimental feature aiming to reduce the CPU utilization of
  15 // point-lookup within a data-block. It is only used in data blocks, and not
  16 // in meta-data blocks or per-table index blocks.
  17 //
  18 // It only used to support BlockBasedTable::Get().
  19 //
  20 // A serialized hash index is appended to the data-block. The new block data
  21 // format is as follows:
  22 //
  23 // DATA_BLOCK: [RI RI RI ... RI RI_IDX HASH_IDX FOOTER]
  24 //
  25 // RI:       Restart Interval (the same as the default data-block format)
  26 // RI_IDX:   Restart Interval index (the same as the default data-block format)
  27 // HASH_IDX: The new data-block hash index feature.
  28 // FOOTER:   A 32bit block footer, which is the NUM_RESTARTS with the MSB as
  29 //           the flag indicating if this hash index is in use. Note that
  30 //           given a data block < 32KB, the MSB is never used. So we can
  31 //           borrow the MSB as the hash index flag. Therefore, this format is
  32 //           compatible with the legacy data-blocks with num_restarts < 32768,
  33 //           as the MSB is 0.
  34 //
  35 // The format of the data-block hash index is as follows:
  36 //
  37 // HASH_IDX: [B B B ... B NUM_BUCK]
  38 //
  39 // B:         bucket, an array of restart index. Each buckets is uint8_t.
  40 // NUM_BUCK:  Number of buckets, which is the length of the bucket array.
  41 //
  42 // We reserve two special flag:
  43 //    kNoEntry=255,
  44 //    kCollision=254.
  45 //
  46 // Therefore, the max number of restarts this hash index can supoport is 253.
  47 //
  48 // Buckets are initialized to be kNoEntry.
  49 //
  50 // When storing a key in the hash index, the key is first hashed to a bucket.
  51 // If there the bucket is empty (kNoEntry), the restart index is stored in
  52 // the bucket. If there is already a restart index there, we will update the
  53 // existing restart index to a collision marker (kCollision). If the
  54 // the bucket is already marked as collision, we do not store the restart
  55 // index either.
  56 //
  57 // During query process, a key is first hashed to a bucket. Then we examine if
  58 // the buckets store nothing (kNoEntry) or the bucket had a collision
  59 // (kCollision). If either of those happens, we get the restart index of
  60 // the key and will directly go to the restart interval to search the key.
  61 //
  62 // Note that we only support blocks with #restart_interval < 254. If a block
  63 // has more restart interval than that, hash index will not be create for it.
  64
  65 const uint8_t kNoEntry = 255;
  66 const uint8_t kCollision = 254;
  67 const uint8_t kMaxRestartSupportedByHashIndex = 253;
  68
  69 // Because we use uint16_t address, we only support block no more than 64KB
  70 const size_t kMaxBlockSizeSupportedByHashIndex = 1u << 16;
  71 const double kDefaultUtilRatio = 0.75;
  72
  73 class DataBlockHashIndexBuilder {
  74  public:
  75   DataBlockHashIndexBuilder()
  76       : bucket_per_key_(-1 /*uninitialized marker*/),
  77         estimated_num_buckets_(0),
  78         valid_(false) {}
  79
  80   void Initialize(double util_ratio) {
  81     if (util_ratio <= 0) {
  82       util_ratio = kDefaultUtilRatio;  // sanity check
  83     }
  84     bucket_per_key_ = 1 / util_ratio;
  85     valid_ = true;
  86   }
  87
  88   inline bool Valid() const { return valid_ && bucket_per_key_ > 0; }
  89   void Add(const Slice& key, const size_t restart_index);
  90   void Finish(std::string& buffer);
  91   void Reset();
  92   inline size_t EstimateSize() const {
  93     uint16_t estimated_num_buckets =
  94         static_cast<uint16_t>(estimated_num_buckets_);
  95
  96     // Maching the num_buckets number in DataBlockHashIndexBuilder::Finish.
  97     estimated_num_buckets |= 1;
  98
  99     return sizeof(uint16_t) +
 100            static_cast<size_t>(estimated_num_buckets * sizeof(uint8_t));
 101   }
 102
 103  private:
 104   double bucket_per_key_;  // is the multiplicative inverse of util_ratio_
 105   double estimated_num_buckets_;
 106
 107   // Now the only usage for `valid_` is to mark false when the inserted
 108   // restart_index is larger than supported. In this case HashIndex is not
 109   // appended to the block content.
 110   bool valid_;
 111
 112   std::vector<std::pair<uint32_t, uint8_t>> hash_and_restart_pairs_;
 113   friend class DataBlockHashIndex_DataBlockHashTestSmall_Test;
 114 };
 115
 116 class DataBlockHashIndex {
 117  public:
 118   DataBlockHashIndex() : num_buckets_(0) {}
 119
 120   void Initialize(const char* data, uint16_t size, uint16_t* map_offset);
 121
 122   uint8_t Lookup(const char* data, uint32_t map_offset, const Slice& key) const;
 123
 124   inline bool Valid() { return num_buckets_ != 0; }
 125
 126  private:
 127   // To make the serialized hash index compact and to save the space overhead,
 128   // here all the data fields persisted in the block are in uint16 format.
 129   // We find that a uint16 is large enough to index every offset of a 64KiB
 130   // block.
 131   // So in other words, DataBlockHashIndex does not support block size equal
 132   // or greater then 64KiB.
 133   uint16_t num_buckets_;
 134 };
 135
 136 }  // namespace rocksdb