// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#pragma once

#include <cstdint>
#include <string>
#include <vector>

#include "rocksdb/slice.h"

namespace ROCKSDB_NAMESPACE {
// This is an experimental feature aiming to reduce the CPU utilization of
// point-lookups within a data-block. It is only used in data blocks, and not
// in meta-data blocks or per-table index blocks.
//
// It is only used to support BlockBasedTable::Get().
//
// A serialized hash index is appended to the data-block. The new block data
// format is as follows:
//
// DATA_BLOCK: [RI RI RI ... RI RI_IDX HASH_IDX FOOTER]
//
// RI:       Restart Interval (the same as the default data-block format)
// RI_IDX:   Restart Interval index (the same as the default data-block format)
// HASH_IDX: The new data-block hash index feature.
// FOOTER:   A 32bit block footer, which is the NUM_RESTARTS with the MSB as
//           the flag indicating if this hash index is in use. Note that
//           given a data block < 32KB, the MSB is never used. So we can
//           borrow the MSB as the hash index flag. Therefore, this format is
//           compatible with the legacy data-blocks with num_restarts < 32768,
//           as the MSB is 0.
//
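// As an illustration only (not the exact decoding code), the footer described
// above could be unpacked roughly like this, assuming `footer` is the
// little-endian uint32_t read from the end of the block:
//
//   const uint32_t kHashIndexFlag = 1u << 31;             // the borrowed MSB
//   bool uses_hash_index = (footer & kHashIndexFlag) != 0;
//   uint32_t num_restarts = footer & ~kHashIndexFlag;     // remaining bits
//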
// The format of the data-block hash index is as follows:
//
// HASH_IDX: [B B B ... B NUM_BUCK]
//
// B:        a bucket storing a restart index. Each bucket is a uint8_t.
// NUM_BUCK: Number of buckets, which is the length of the bucket array.
//
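// Conceptually, the index is serialized by appending one byte per bucket
// followed by the 16-bit bucket count (a sketch only, not the actual Finish()
// code; `buckets` and `buffer` are assumed locals):
//
//   for (uint8_t bucket : buckets) {
//     buffer.push_back(static_cast<char>(bucket));
//   }
//   uint16_t num_buck = static_cast<uint16_t>(buckets.size());
//   buffer.push_back(static_cast<char>(num_buck & 0xff));         // little-
//   buffer.push_back(static_cast<char>((num_buck >> 8) & 0xff));  // endian
//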
// We reserve two special flags:
//    kNoEntry=255,
//    kCollision=254.
//
// Therefore, the max number of restarts this hash index can support is 253.
//
// Buckets are initialized to kNoEntry.
//
// When storing a key in the hash index, the key is first hashed to a bucket.
// If the bucket is empty (kNoEntry), the restart index of the key is stored
// in the bucket. If there is already a restart index there, we update the
// existing entry to a collision marker (kCollision). If the bucket is already
// marked as a collision, we do not store the restart index either.
//
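// A minimal sketch of that insertion rule (illustrative only, not the actual
// builder code; `buckets`, `hash_value` and `restart_index` are assumed
// locals):
//
//   uint8_t& bucket = buckets[hash_value % num_buckets];
//   if (bucket == kNoEntry) {
//     bucket = static_cast<uint8_t>(restart_index);
//   } else {
//     bucket = kCollision;  // the bucket was already occupied
//   }
//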
// During a query, the key is first hashed to a bucket. Then we examine
// whether the bucket stores nothing (kNoEntry) or is marked as a collision
// (kCollision). If neither of those is the case, the bucket stores the
// restart index of the key, and we go directly to that restart interval to
// search for the key.
//
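// A minimal sketch of that lookup path (illustrative only, not the actual
// reader code; `buckets` and `hash_value` are assumed locals, and the caller
// is assumed to handle the two special cases):
//
//   uint8_t entry = buckets[hash_value % num_buckets];
//   if (entry == kNoEntry || entry == kCollision) {
//     // The hash index cannot answer; e.g. fall back to searching the
//     // restart intervals the regular way.
//   } else {
//     // `entry` is the restart index; scan only that restart interval.
//   }
//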
// Note that we only support blocks with #restart_intervals < 254. If a block
// has more restart intervals than that, the hash index will not be created
// for it.

const uint8_t kNoEntry = 255;
const uint8_t kCollision = 254;
const uint8_t kMaxRestartSupportedByHashIndex = 253;

// Because we use uint16_t addresses, we only support blocks of no more than
// 64KB.
const size_t kMaxBlockSizeSupportedByHashIndex = 1u << 16;
const double kDefaultUtilRatio = 0.75;

class DataBlockHashIndexBuilder {
 public:
  DataBlockHashIndexBuilder()
      : bucket_per_key_(-1 /*uninitialized marker*/),
        estimated_num_buckets_(0),
        valid_(false) {}

  void Initialize(double util_ratio) {
    if (util_ratio <= 0) {
      util_ratio = kDefaultUtilRatio;  // sanity check
    }
    bucket_per_key_ = 1 / util_ratio;
    valid_ = true;
  }

  inline bool Valid() const { return valid_ && bucket_per_key_ > 0; }
  void Add(const Slice& key, const size_t restart_index);
  void Finish(std::string& buffer);
  void Reset();
  inline size_t EstimateSize() const {
    uint16_t estimated_num_buckets =
        static_cast<uint16_t>(estimated_num_buckets_);

    // Matching the num_buckets number in DataBlockHashIndexBuilder::Finish.
    estimated_num_buckets |= 1;

    return sizeof(uint16_t) +
           static_cast<size_t>(estimated_num_buckets * sizeof(uint8_t));
  }

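  // As a hypothetical worked example of the estimate above (the numbers are
  // made up): with the default util_ratio of 0.75 (bucket_per_key_ ~= 1.333)
  // and roughly 90 keys added, estimated_num_buckets_ would be about 120, so
  //
  //   estimated_num_buckets = 120 | 1 = 121
  //   EstimateSize() = sizeof(uint16_t) + 121 * sizeof(uint8_t) = 123 bytes
  //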
 private:
  double bucket_per_key_;  // the multiplicative inverse of util_ratio_
  double estimated_num_buckets_;

  // Currently the only use of `valid_` is to be set to false when an inserted
  // restart_index is larger than the supported maximum. In that case the hash
  // index is not appended to the block content.
  bool valid_;

  std::vector<std::pair<uint32_t, uint8_t>> hash_and_restart_pairs_;
  friend class DataBlockHashIndex_DataBlockHashTestSmall_Test;
};
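// A hypothetical usage sketch of the builder (illustrative only; in RocksDB
// the caller is the block builder, and names such as `keys_in_interval` are
// made up here):
//
//   DataBlockHashIndexBuilder builder;
//   builder.Initialize(0.75 /* util_ratio */);
//   for (size_t ri = 0; ri < keys_in_interval.size(); ri++) {
//     for (const Slice& user_key : keys_in_interval[ri]) {
//       builder.Add(user_key, ri);
//     }
//   }
//   std::string block_contents;  // already holds RI ... RI and RI_IDX
//   if (builder.Valid()) {
//     builder.Finish(block_contents);  // appends HASH_IDX
//   }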

class DataBlockHashIndex {
 public:
  DataBlockHashIndex() : num_buckets_(0) {}

  void Initialize(const char* data, uint16_t size, uint16_t* map_offset);

  uint8_t Lookup(const char* data, uint32_t map_offset, const Slice& key) const;

  inline bool Valid() { return num_buckets_ != 0; }

 private:
  // To keep the serialized hash index compact and to save space overhead, all
  // the data fields persisted in the block are in uint16 format. A uint16 is
  // large enough to index every offset within a 64KiB block. In other words,
  // DataBlockHashIndex does not support blocks of size equal to or greater
  // than 64KiB.
  uint16_t num_buckets_;
};
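// A hypothetical sketch of how a reader might drive this class (illustrative
// only; in RocksDB the caller is the data-block iterator, and `block_data`,
// `block_size` and `user_key` are assumptions here):
//
//   DataBlockHashIndex index;
//   uint16_t map_offset = 0;
//   index.Initialize(block_data, static_cast<uint16_t>(block_size),
//                    &map_offset);
//   uint8_t entry = index.Lookup(block_data, map_offset, user_key);
//   if (entry == kCollision) {
//     // fall back to the regular restart-array search
//   } else if (entry == kNoEntry) {
//     // the key is not in this block
//   } else {
//     // entry is the restart index of the interval that may contain the key
//   }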

}  // namespace ROCKSDB_NAMESPACE