]> git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/table/data_block_hash_index.h
bump version to 15.2.11-pve1
[ceph.git] / ceph / src / rocksdb / table / data_block_hash_index.h
1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5
6 #pragma once
7
8 #include <string>
9 #include <vector>
10
11 #include "rocksdb/slice.h"
12
13 namespace rocksdb {
14 // This is an experimental feature aiming to reduce the CPU utilization of
15 // point-lookup within a data-block. It is only used in data blocks, and not
16 // in meta-data blocks or per-table index blocks.
17 //
18 // It only used to support BlockBasedTable::Get().
19 //
20 // A serialized hash index is appended to the data-block. The new block data
21 // format is as follows:
22 //
23 // DATA_BLOCK: [RI RI RI ... RI RI_IDX HASH_IDX FOOTER]
24 //
25 // RI: Restart Interval (the same as the default data-block format)
26 // RI_IDX: Restart Interval index (the same as the default data-block format)
27 // HASH_IDX: The new data-block hash index feature.
28 // FOOTER: A 32bit block footer, which is the NUM_RESTARTS with the MSB as
29 // the flag indicating if this hash index is in use. Note that
30 // given a data block < 32KB, the MSB is never used. So we can
31 // borrow the MSB as the hash index flag. Therefore, this format is
32 // compatible with the legacy data-blocks with num_restarts < 32768,
33 // as the MSB is 0.
34 //
35 // The format of the data-block hash index is as follows:
36 //
37 // HASH_IDX: [B B B ... B NUM_BUCK]
38 //
39 // B: bucket, an array of restart index. Each buckets is uint8_t.
40 // NUM_BUCK: Number of buckets, which is the length of the bucket array.
41 //
42 // We reserve two special flag:
43 // kNoEntry=255,
44 // kCollision=254.
45 //
46 // Therefore, the max number of restarts this hash index can supoport is 253.
47 //
48 // Buckets are initialized to be kNoEntry.
49 //
50 // When storing a key in the hash index, the key is first hashed to a bucket.
51 // If there the bucket is empty (kNoEntry), the restart index is stored in
52 // the bucket. If there is already a restart index there, we will update the
53 // existing restart index to a collision marker (kCollision). If the
54 // the bucket is already marked as collision, we do not store the restart
55 // index either.
56 //
57 // During query process, a key is first hashed to a bucket. Then we examine if
58 // the buckets store nothing (kNoEntry) or the bucket had a collision
59 // (kCollision). If either of those happens, we get the restart index of
60 // the key and will directly go to the restart interval to search the key.
61 //
62 // Note that we only support blocks with #restart_interval < 254. If a block
63 // has more restart interval than that, hash index will not be create for it.
64
65 const uint8_t kNoEntry = 255;
66 const uint8_t kCollision = 254;
67 const uint8_t kMaxRestartSupportedByHashIndex = 253;
68
69 // Because we use uint16_t address, we only support block no more than 64KB
70 const size_t kMaxBlockSizeSupportedByHashIndex = 1u << 16;
71 const double kDefaultUtilRatio = 0.75;
72
73 class DataBlockHashIndexBuilder {
74 public:
75 DataBlockHashIndexBuilder()
76 : bucket_per_key_(-1 /*uninitialized marker*/),
77 estimated_num_buckets_(0),
78 valid_(false) {}
79
80 void Initialize(double util_ratio) {
81 if (util_ratio <= 0) {
82 util_ratio = kDefaultUtilRatio; // sanity check
83 }
84 bucket_per_key_ = 1 / util_ratio;
85 valid_ = true;
86 }
87
88 inline bool Valid() const { return valid_ && bucket_per_key_ > 0; }
89 void Add(const Slice& key, const size_t restart_index);
90 void Finish(std::string& buffer);
91 void Reset();
92 inline size_t EstimateSize() const {
93 uint16_t estimated_num_buckets =
94 static_cast<uint16_t>(estimated_num_buckets_);
95
96 // Maching the num_buckets number in DataBlockHashIndexBuilder::Finish.
97 estimated_num_buckets |= 1;
98
99 return sizeof(uint16_t) +
100 static_cast<size_t>(estimated_num_buckets * sizeof(uint8_t));
101 }
102
103 private:
104 double bucket_per_key_; // is the multiplicative inverse of util_ratio_
105 double estimated_num_buckets_;
106
107 // Now the only usage for `valid_` is to mark false when the inserted
108 // restart_index is larger than supported. In this case HashIndex is not
109 // appended to the block content.
110 bool valid_;
111
112 std::vector<std::pair<uint32_t, uint8_t>> hash_and_restart_pairs_;
113 friend class DataBlockHashIndex_DataBlockHashTestSmall_Test;
114 };
115
116 class DataBlockHashIndex {
117 public:
118 DataBlockHashIndex() : num_buckets_(0) {}
119
120 void Initialize(const char* data, uint16_t size, uint16_t* map_offset);
121
122 uint8_t Lookup(const char* data, uint32_t map_offset, const Slice& key) const;
123
124 inline bool Valid() { return num_buckets_ != 0; }
125
126 private:
127 // To make the serialized hash index compact and to save the space overhead,
128 // here all the data fields persisted in the block are in uint16 format.
129 // We find that a uint16 is large enough to index every offset of a 64KiB
130 // block.
131 // So in other words, DataBlockHashIndex does not support block size equal
132 // or greater then 64KiB.
133 uint16_t num_buckets_;
134 };
135
136 } // namespace rocksdb