// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.

#pragma once

#ifndef ROCKSDB_LITE

#include <string>
#include <vector>

#include "db/dbformat.h"
#include "monitoring/histogram.h"
#include "options/cf_options.h"
#include "rocksdb/options.h"
#include "util/arena.h"
#include "util/hash.h"
#include "util/murmurhash.h"

namespace rocksdb {

// PlainTableIndex contains index_size_ buckets, each a 32-bit integer. The
// lower 31 bits hold an offset value (explained below) and the top bit
// indicates the type of that offset.
//
// +--------------+------------------------------------------------------+
// | Flag (1 bit) | Offset to binary search buffer or file (31 bits)     |
// +--------------+------------------------------------------------------+
//
// Explanation of the "flag bit":
//
// 0 indicates that the bucket contains only one prefix (no conflict when
//   hashing this prefix), whose first row starts at this offset of the
//   file.
// 1 indicates that the bucket contains more than one prefix, or that there
//   are too many rows for one prefix so a binary search is needed. In this
//   case, the offset is an offset into sub_index_, which holds the binary
//   search indexes of the keys for those rows. Those binary search indexes
//   are organized as follows:
//
// The first bytes form a varint32 indicating how many indexes (N) are stored
// after it. They are followed by N 32-bit integers, each a file offset
// pointing to the start of a row. Those offsets are guaranteed to be in
// ascending order, so the keys they point to are also in ascending order and
// can be used for binary search. Below is a visual representation of such a
// bucket (an illustrative lookup sketch follows the class definition).
//
// <begin>
//   number_of_records:  varint32
//   record 1 file offset:  fixedint32
//   record 2 file offset:  fixedint32
//   ....
//   record N file offset:  fixedint32
// <end>
class PlainTableIndex {
 public:
  enum IndexSearchResult {
    kNoPrefixForBucket = 0,
    kDirectToFile = 1,
    kSubindex = 2
  };

  explicit PlainTableIndex(Slice data) { InitFromRawData(data); }

  PlainTableIndex()
      : index_size_(0),
        sub_index_size_(0),
        num_prefixes_(0),
        index_(nullptr),
        sub_index_(nullptr) {}

  IndexSearchResult GetOffset(uint32_t prefix_hash,
                              uint32_t* bucket_value) const;

  Status InitFromRawData(Slice data);

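  // Reads the varint32 stored at `offset` in sub_index_ (the number of fixed
  // 32-bit file offsets that follow) into *upper_bound, and returns a pointer
  // to the first of those fixed-width entries.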
  const char* GetSubIndexBasePtrAndUpperBound(uint32_t offset,
                                              uint32_t* upper_bound) const {
    const char* index_ptr = &sub_index_[offset];
    return GetVarint32Ptr(index_ptr, index_ptr + 4, upper_bound);
  }

  uint32_t GetIndexSize() const { return index_size_; }

  uint32_t GetSubIndexSize() const { return sub_index_size_; }

  uint32_t GetNumPrefixes() const { return num_prefixes_; }

  static const uint64_t kMaxFileSize = (1u << 31) - 1;
  static const uint32_t kSubIndexMask = 0x80000000;
  static const size_t kOffsetLen = sizeof(uint32_t);

 private:
  uint32_t index_size_;
  uint32_t sub_index_size_;
  uint32_t num_prefixes_;

  uint32_t* index_;
  char* sub_index_;
};

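// Illustrative lookup sketch (a hypothetical helper, not part of the upstream
// header): one way a reader might combine GetOffset() and
// GetSubIndexBasePtrAndUpperBound() to find the first row offset for a prefix
// hash. DecodeFixed32() from util/coding.h is assumed to be visible through
// the includes above.
inline bool ExampleFindFirstRowOffset(const PlainTableIndex& index,
                                      uint32_t prefix_hash,
                                      uint32_t* first_row_offset) {
  uint32_t bucket_value;
  switch (index.GetOffset(prefix_hash, &bucket_value)) {
    case PlainTableIndex::kNoPrefixForBucket:
      // Nothing was hashed into this bucket.
      return false;
    case PlainTableIndex::kDirectToFile:
      // The bucket points straight at the only row for this prefix.
      *first_row_offset = bucket_value;
      return true;
    case PlainTableIndex::kSubindex: {
      // The bucket points into sub_index_: a varint32 record count followed
      // by that many fixed 32-bit file offsets in ascending order.
      uint32_t num_records;
      const char* base =
          index.GetSubIndexBasePtrAndUpperBound(bucket_value, &num_records);
      if (num_records == 0) {
        return false;
      }
      // A real reader would binary-search these entries; the first one is the
      // smallest file offset for this bucket.
      *first_row_offset = DecodeFixed32(base);
      return true;
    }
  }
  return false;
}
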
// PlainTableIndexBuilder is used to create a plain table index.
// After calling Finish(), it returns a Slice, which is usually
// used either to initialize a PlainTableIndex or
// to save the index to an SST file.
// For more details about the index, please refer to:
// https://github.com/facebook/rocksdb/wiki/PlainTable-Format
// #wiki-in-memory-index-format
class PlainTableIndexBuilder {
 public:
  PlainTableIndexBuilder(Arena* arena, const ImmutableCFOptions& ioptions,
                         size_t index_sparseness, double hash_table_ratio,
                         size_t huge_page_tlb_size)
      : arena_(arena),
        ioptions_(ioptions),
        record_list_(kRecordsPerGroup),
        is_first_record_(true),
        due_index_(false),
        num_prefixes_(0),
        num_keys_per_prefix_(0),
        prev_key_prefix_hash_(0),
        index_sparseness_(index_sparseness),
        prefix_extractor_(ioptions.prefix_extractor),
        hash_table_ratio_(hash_table_ratio),
        huge_page_tlb_size_(huge_page_tlb_size) {}

  void AddKeyPrefix(Slice key_prefix_slice, uint32_t key_offset);

  Slice Finish();

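  // Total number of bytes needed to store the index: the two varint32 header
  // fields (index_size_ and num_prefixes_), the bucket array of index_size_
  // 32-bit entries, and the sub-index.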
  uint32_t GetTotalSize() const {
    return VarintLength(index_size_) + VarintLength(num_prefixes_) +
           PlainTableIndex::kOffsetLen * index_size_ + sub_index_size_;
  }

  static const std::string kPlainTableIndexBlock;

 private:
  struct IndexRecord {
    uint32_t hash;    // hash of the prefix
    uint32_t offset;  // offset of a row
    IndexRecord* next;
  };

  // Helper class to track all the index records
  class IndexRecordList {
   public:
    explicit IndexRecordList(size_t num_records_per_group)
        : kNumRecordsPerGroup(num_records_per_group),
          current_group_(nullptr),
          num_records_in_current_group_(num_records_per_group) {}

    ~IndexRecordList() {
      for (size_t i = 0; i < groups_.size(); i++) {
        delete[] groups_[i];
      }
    }

    void AddRecord(uint32_t hash, uint32_t offset);

    size_t GetNumRecords() const {
      return (groups_.size() - 1) * kNumRecordsPerGroup +
             num_records_in_current_group_;
    }
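    // For example (illustration only): with kNumRecordsPerGroup == 256,
    // record index 300 is stored at groups_[300 / 256][300 % 256], i.e.
    // groups_[1][44].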
    IndexRecord* At(size_t index) {
      return &(groups_[index / kNumRecordsPerGroup]
                      [index % kNumRecordsPerGroup]);
    }

   private:
    IndexRecord* AllocateNewGroup() {
      IndexRecord* result = new IndexRecord[kNumRecordsPerGroup];
      groups_.push_back(result);
      return result;
    }

    // Each group in `groups_` contains a fixed number of records (determined
    // by kNumRecordsPerGroup), which helps minimize the cost when resizing
    // occurs.
    const size_t kNumRecordsPerGroup;
    IndexRecord* current_group_;
    // List of arrays allocated
    std::vector<IndexRecord*> groups_;
    size_t num_records_in_current_group_;
  };

  void AllocateIndex();

  // Internal helper function to distribute the index record list into hash
  // buckets.
  void BucketizeIndexes(std::vector<IndexRecord*>* hash_to_offsets,
                        std::vector<uint32_t>* entries_per_bucket);

  // Internal helper function to fill the indexes and bloom filters into the
  // internal data structures.
  Slice FillIndexes(const std::vector<IndexRecord*>& hash_to_offsets,
                    const std::vector<uint32_t>& entries_per_bucket);

  Arena* arena_;
  const ImmutableCFOptions ioptions_;
  HistogramImpl keys_per_prefix_hist_;
  IndexRecordList record_list_;
  bool is_first_record_;
  bool due_index_;
  uint32_t num_prefixes_;
  uint32_t num_keys_per_prefix_;

  uint32_t prev_key_prefix_hash_;
  size_t index_sparseness_;
  uint32_t index_size_;
  uint32_t sub_index_size_;

  const SliceTransform* prefix_extractor_;
  double hash_table_ratio_;
  size_t huge_page_tlb_size_;

  std::string prev_key_prefix_;

  static const size_t kRecordsPerGroup = 256;
};
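
// Illustrative usage sketch (hypothetical call site, not upstream code): the
// builder is typically fed one prefix per key, in file order, and Finish()
// then yields the serialized index contents, backed by `arena`. The parameter
// values below are examples only.
//
//   PlainTableIndexBuilder builder(&arena, ioptions, /*index_sparseness=*/16,
//                                  /*hash_table_ratio=*/0.75,
//                                  /*huge_page_tlb_size=*/0);
//   for (/* each key, in the order it is written to the file */) {
//     builder.AddKeyPrefix(prefix_extractor->Transform(user_key), key_offset);
//   }
//   Slice index_contents = builder.Finish();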

}  // namespace rocksdb

#endif  // ROCKSDB_LITE