ceph/src/rocksdb/table/plain/plain_table_index.h

   1 //  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
   2 //  This source code is licensed under both the GPLv2 (found in the
   3 //  COPYING file in the root directory) and Apache 2.0 License
   4 //  (found in the LICENSE.Apache file in the root directory).
   5
   6 #pragma once
   7
   8 #ifndef ROCKSDB_LITE
   9
  10 #include <string>
  11 #include <vector>
  12
  13 #include "memory/arena.h"
  14 #include "monitoring/histogram.h"
  15 #include "options/cf_options.h"
  16 #include "rocksdb/options.h"
  17
  18 namespace ROCKSDB_NAMESPACE {
  19
  20 // The file contains two classes PlainTableIndex and PlainTableIndexBuilder
  21 // The two classes implement the index format of PlainTable.
  22 // For description of PlainTable format, see comments of class
  23 // PlainTableFactory
  24 //
  25 //
  26 // PlainTableIndex contains buckets size of index_size_, each is a
  27 // 32-bit integer. The lower 31 bits contain an offset value (explained below)
  28 // and the first bit of the integer indicates type of the offset.
  29 //
  30 // +--------------+------------------------------------------------------+
  31 // | Flag (1 bit) | Offset to binary search buffer or file (31 bits)     +
  32 // +--------------+------------------------------------------------------+
  33 //
  34 // Explanation for the "flag bit":
  35 //
  36 // 0 indicates that the bucket contains only one prefix (no conflict when
  37 //   hashing this prefix), whose first row starts from this offset of the
  38 // file.
  39 // 1 indicates that the bucket contains more than one prefixes, or there
  40 //   are too many rows for one prefix so we need a binary search for it. In
  41 //   this case, the offset indicates the offset of sub_index_ holding the
  42 //   binary search indexes of keys for those rows. Those binary search indexes
  43 //   are organized in this way:
  44 //
  45 // The first 4 bytes, indicate how many indexes (N) are stored after it. After
  46 // it, there are N 32-bit integers, each points of an offset of the file,
  47 // which
  48 // points to starting of a row. Those offsets need to be guaranteed to be in
  49 // ascending order so the keys they are pointing to are also in ascending
  50 // order
  51 // to make sure we can use them to do binary searches. Below is visual
  52 // presentation of a bucket.
  53 //
  54 // <begin>
  55 //   number_of_records:  varint32
  56 //   record 1 file offset:  fixedint32
  57 //   record 2 file offset:  fixedint32
  58 //    ....
  59 //   record N file offset:  fixedint32
  60 // <end>
  61
  62 // The class loads the index block from a PlainTable SST file, and executes
  63 // the index lookup.
  64 // The class is used by PlainTableReader class.
  65 class PlainTableIndex {
  66  public:
  67   enum IndexSearchResult {
  68     kNoPrefixForBucket = 0,
  69     kDirectToFile = 1,
  70     kSubindex = 2
  71   };
  72
  73   explicit PlainTableIndex(Slice data) { InitFromRawData(data); }
  74
  75   PlainTableIndex()
  76       : index_size_(0),
  77         sub_index_size_(0),
  78         num_prefixes_(0),
  79         index_(nullptr),
  80         sub_index_(nullptr) {}
  81
  82   // The function that executes the lookup the hash table.
  83   // The hash key is `prefix_hash`. The function fills the hash bucket
  84   // content in `bucket_value`, which is up to the caller to interpret.
  85   IndexSearchResult GetOffset(uint32_t prefix_hash,
  86                               uint32_t* bucket_value) const;
  87
  88   // Initialize data from `index_data`, which points to raw data for
  89   // index stored in the SST file.
  90   Status InitFromRawData(Slice index_data);
  91
  92   // Decode the sub index for specific hash bucket.
  93   // The `offset` is the value returned as `bucket_value` by GetOffset()
  94   // and is only valid when the return value is `kSubindex`.
  95   // The return value is the pointer to the starting address of the
  96   // sub-index. `upper_bound` is filled with the value indicating how many
  97   // entries the sub-index has.
  98   const char* GetSubIndexBasePtrAndUpperBound(uint32_t offset,
  99                                               uint32_t* upper_bound) const {
 100     const char* index_ptr = &sub_index_[offset];
 101     return GetVarint32Ptr(index_ptr, index_ptr + 4, upper_bound);
 102   }
 103
 104   uint32_t GetIndexSize() const { return index_size_; }
 105
 106   uint32_t GetSubIndexSize() const { return sub_index_size_; }
 107
 108   uint32_t GetNumPrefixes() const { return num_prefixes_; }
 109
 110   static const uint64_t kMaxFileSize = (1u << 31) - 1;
 111   static const uint32_t kSubIndexMask = 0x80000000;
 112   static const size_t kOffsetLen = sizeof(uint32_t);
 113
 114  private:
 115   uint32_t index_size_;
 116   uint32_t sub_index_size_;
 117   uint32_t num_prefixes_;
 118
 119   uint32_t* index_;
 120   char* sub_index_;
 121 };
 122
 123 // PlainTableIndexBuilder is used to create plain table index.
 124 // After calling Finish(), it returns Slice, which is usually
 125 // used either to initialize PlainTableIndex or
 126 // to save index to sst file.
 127 // For more details about the index, please refer to:
 128 // https://github.com/facebook/rocksdb/wiki/PlainTable-Format
 129 // #wiki-in-memory-index-format
 130 // The class is used by PlainTableBuilder class.
 131 class PlainTableIndexBuilder {
 132  public:
 133   PlainTableIndexBuilder(Arena* arena, const ImmutableOptions& ioptions,
 134                          const SliceTransform* prefix_extractor,
 135                          size_t index_sparseness, double hash_table_ratio,
 136                          size_t huge_page_tlb_size)
 137       : arena_(arena),
 138         ioptions_(ioptions),
 139         record_list_(kRecordsPerGroup),
 140         is_first_record_(true),
 141         due_index_(false),
 142         num_prefixes_(0),
 143         num_keys_per_prefix_(0),
 144         prev_key_prefix_hash_(0),
 145         index_sparseness_(index_sparseness),
 146         index_size_(0),
 147         sub_index_size_(0),
 148         prefix_extractor_(prefix_extractor),
 149         hash_table_ratio_(hash_table_ratio),
 150         huge_page_tlb_size_(huge_page_tlb_size) {}
 151
 152   void AddKeyPrefix(Slice key_prefix_slice, uint32_t key_offset);
 153
 154   Slice Finish();
 155
 156   uint32_t GetTotalSize() const {
 157     return VarintLength(index_size_) + VarintLength(num_prefixes_) +
 158            PlainTableIndex::kOffsetLen * index_size_ + sub_index_size_;
 159   }
 160
 161   static const std::string kPlainTableIndexBlock;
 162
 163  private:
 164   struct IndexRecord {
 165     uint32_t hash;    // hash of the prefix
 166     uint32_t offset;  // offset of a row
 167     IndexRecord* next;
 168   };
 169
 170   // Helper class to track all the index records
 171   class IndexRecordList {
 172    public:
 173     explicit IndexRecordList(size_t num_records_per_group)
 174         : kNumRecordsPerGroup(num_records_per_group),
 175           current_group_(nullptr),
 176           num_records_in_current_group_(num_records_per_group) {}
 177
 178     ~IndexRecordList() {
 179       for (size_t i = 0; i < groups_.size(); i++) {
 180         delete[] groups_[i];
 181       }
 182     }
 183
 184     void AddRecord(uint32_t hash, uint32_t offset);
 185
 186     size_t GetNumRecords() const {
 187       return (groups_.size() - 1) * kNumRecordsPerGroup +
 188              num_records_in_current_group_;
 189     }
 190     IndexRecord* At(size_t index) {
 191       return &(
 192           groups_[index / kNumRecordsPerGroup][index % kNumRecordsPerGroup]);
 193     }
 194
 195    private:
 196     IndexRecord* AllocateNewGroup() {
 197       IndexRecord* result = new IndexRecord[kNumRecordsPerGroup];
 198       groups_.push_back(result);
 199       return result;
 200     }
 201
 202     // Each group in `groups_` contains fix-sized records (determined by
 203     // kNumRecordsPerGroup). Which can help us minimize the cost if resizing
 204     // occurs.
 205     const size_t kNumRecordsPerGroup;
 206     IndexRecord* current_group_;
 207     // List of arrays allocated
 208     std::vector<IndexRecord*> groups_;
 209     size_t num_records_in_current_group_;
 210   };
 211
 212   void AllocateIndex();
 213
 214   // Internal helper function to bucket index record list to hash buckets.
 215   void BucketizeIndexes(std::vector<IndexRecord*>* hash_to_offsets,
 216                         std::vector<uint32_t>* entries_per_bucket);
 217
 218   // Internal helper class to fill the indexes and bloom filters to internal
 219   // data structures.
 220   Slice FillIndexes(const std::vector<IndexRecord*>& hash_to_offsets,
 221                     const std::vector<uint32_t>& entries_per_bucket);
 222
 223   Arena* arena_;
 224   const ImmutableOptions ioptions_;
 225   HistogramImpl keys_per_prefix_hist_;
 226   IndexRecordList record_list_;
 227   bool is_first_record_;
 228   bool due_index_;
 229   uint32_t num_prefixes_;
 230   uint32_t num_keys_per_prefix_;
 231
 232   uint32_t prev_key_prefix_hash_;
 233   size_t index_sparseness_;
 234   uint32_t index_size_;
 235   uint32_t sub_index_size_;
 236
 237   const SliceTransform* prefix_extractor_;
 238   double hash_table_ratio_;
 239   size_t huge_page_tlb_size_;
 240
 241   std::string prev_key_prefix_;
 242
 243   static const size_t kRecordsPerGroup = 256;
 244 };
 245
 246 }  // namespace ROCKSDB_NAMESPACE
 247
 248 #endif  // ROCKSDB_LITE