// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#pragma once

#include <cstdint>
#include <string>
#include <vector>

#include "rocksdb/slice.h"

namespace ROCKSDB_NAMESPACE {
// This is an experimental feature aiming to reduce the CPU utilization of
// point-lookups within a data-block. It is only used in data blocks, and not
// in meta-data blocks or per-table index blocks.
//
// It is only used to support BlockBasedTable::Get().
//
// A serialized hash index is appended to the data-block. The new block data
// format is as follows:
//
// DATA_BLOCK: [RI RI RI ... RI RI_IDX HASH_IDX FOOTER]
//
// RI:       Restart Interval (the same as the default data-block format)
// RI_IDX:   Restart Interval index (the same as the default data-block format)
// HASH_IDX: The new data-block hash index feature.
// FOOTER:   A 32bit block footer, which is the NUM_RESTARTS with the MSB as
//           the flag indicating if this hash index is in use. Note that
//           given a data block < 32KB, the MSB is never used. So we can
//           borrow the MSB as the hash index flag. Therefore, this format is
//           compatible with the legacy data-blocks with num_restarts < 32768,
//           as the MSB is 0.
//
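// As an illustration only (not the exact decoding code), the footer described
// above could be unpacked roughly like this, assuming `footer` is the
// little-endian uint32_t read from the end of the block:
//
//   const uint32_t kHashIndexFlag = 1u << 31;             // the borrowed MSB
//   bool uses_hash_index = (footer & kHashIndexFlag) != 0;
//   uint32_t num_restarts = footer & ~kHashIndexFlag;     // remaining bits
//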
// The format of the data-block hash index is as follows:
//
// HASH_IDX: [B B B ... B NUM_BUCK]
//
// B:        a bucket storing a restart index. Each bucket is a uint8_t.
// NUM_BUCK: Number of buckets, which is the length of the bucket array.
//
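// Conceptually, the index is serialized by appending one byte per bucket
// followed by the 16-bit bucket count (a sketch only, not the actual Finish()
// code; `buckets` and `buffer` are assumed locals):
//
//   for (uint8_t bucket : buckets) {
//     buffer.push_back(static_cast<char>(bucket));
//   }
//   uint16_t num_buck = static_cast<uint16_t>(buckets.size());
//   buffer.push_back(static_cast<char>(num_buck & 0xff));         // little-
//   buffer.push_back(static_cast<char>((num_buck >> 8) & 0xff));  // endian
//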
// We reserve two special flags:
//    kNoEntry=255,
//    kCollision=254.
//
// Therefore, the max number of restarts this hash index can support is 253.
//
// Buckets are initialized to kNoEntry.
//
// When storing a key in the hash index, the key is first hashed to a bucket.
// If the bucket is empty (kNoEntry), the restart index of the key is stored
// in the bucket. If there is already a restart index there, we update the
// existing entry to a collision marker (kCollision). If the bucket is already
// marked as a collision, we do not store the restart index either.
//
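// A minimal sketch of that insertion rule (illustrative only, not the actual
// builder code; `buckets`, `hash_value` and `restart_index` are assumed
// locals):
//
//   uint8_t& bucket = buckets[hash_value % num_buckets];
//   if (bucket == kNoEntry) {
//     bucket = static_cast<uint8_t>(restart_index);
//   } else {
//     bucket = kCollision;  // the bucket was already occupied
//   }
//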
// During a query, the key is first hashed to a bucket. Then we examine
// whether the bucket stores nothing (kNoEntry) or is marked as a collision
// (kCollision). If neither of those is the case, the bucket stores the
// restart index of the key, and we go directly to that restart interval to
// search for the key.
//
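// A minimal sketch of that lookup path (illustrative only, not the actual
// reader code; `buckets` and `hash_value` are assumed locals, and the caller
// is assumed to handle the two special cases):
//
//   uint8_t entry = buckets[hash_value % num_buckets];
//   if (entry == kNoEntry || entry == kCollision) {
//     // The hash index cannot answer; e.g. fall back to searching the
//     // restart intervals the regular way.
//   } else {
//     // `entry` is the restart index; scan only that restart interval.
//   }
//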
// Note that we only support blocks with #restart_intervals < 254. If a block
// has more restart intervals than that, the hash index will not be created
// for it.

const uint8_t kNoEntry = 255;
const uint8_t kCollision = 254;
const uint8_t kMaxRestartSupportedByHashIndex = 253;

// Because we use uint16_t addresses, we only support blocks of no more than
// 64KB.
const size_t kMaxBlockSizeSupportedByHashIndex = 1u << 16;
const double kDefaultUtilRatio = 0.75;

class DataBlockHashIndexBuilder {
 public:
  DataBlockHashIndexBuilder()
      : bucket_per_key_(-1 /*uninitialized marker*/),
        estimated_num_buckets_(0),
        valid_(false) {}

  void Initialize(double util_ratio) {
    if (util_ratio <= 0) {
      util_ratio = kDefaultUtilRatio;  // sanity check
    }
    bucket_per_key_ = 1 / util_ratio;
    valid_ = true;
  }

  inline bool Valid() const { return valid_ && bucket_per_key_ > 0; }
  void Add(const Slice& key, const size_t restart_index);
  void Finish(std::string& buffer);
  void Reset();
  inline size_t EstimateSize() const {
    uint16_t estimated_num_buckets =
        static_cast<uint16_t>(estimated_num_buckets_);

    // Matching the num_buckets number in DataBlockHashIndexBuilder::Finish.
    estimated_num_buckets |= 1;

    return sizeof(uint16_t) +
           static_cast<size_t>(estimated_num_buckets * sizeof(uint8_t));
  }

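  // As a hypothetical worked example of the estimate above (the numbers are
  // made up): with the default util_ratio of 0.75 (bucket_per_key_ ~= 1.333)
  // and roughly 90 keys added, estimated_num_buckets_ would be about 120, so
  //
  //   estimated_num_buckets = 120 | 1 = 121
  //   EstimateSize() = sizeof(uint16_t) + 121 * sizeof(uint8_t) = 123 bytes
  //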
 private:
  double bucket_per_key_;  // the multiplicative inverse of util_ratio_
  double estimated_num_buckets_;

  // Currently the only use of `valid_` is to be set to false when an inserted
  // restart_index is larger than the supported maximum. In that case the hash
  // index is not appended to the block content.
  bool valid_;

  std::vector<std::pair<uint32_t, uint8_t>> hash_and_restart_pairs_;
  friend class DataBlockHashIndex_DataBlockHashTestSmall_Test;
};
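// A hypothetical usage sketch of the builder (illustrative only; in RocksDB
// the caller is the block builder, and names such as `keys_in_interval` are
// made up here):
//
//   DataBlockHashIndexBuilder builder;
//   builder.Initialize(0.75 /* util_ratio */);
//   for (size_t ri = 0; ri < keys_in_interval.size(); ri++) {
//     for (const Slice& user_key : keys_in_interval[ri]) {
//       builder.Add(user_key, ri);
//     }
//   }
//   std::string block_contents;  // already holds RI ... RI and RI_IDX
//   if (builder.Valid()) {
//     builder.Finish(block_contents);  // appends HASH_IDX
//   }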

class DataBlockHashIndex {
 public:
  DataBlockHashIndex() : num_buckets_(0) {}

  void Initialize(const char* data, uint16_t size, uint16_t* map_offset);

  uint8_t Lookup(const char* data, uint32_t map_offset, const Slice& key) const;

  inline bool Valid() { return num_buckets_ != 0; }

 private:
  // To keep the serialized hash index compact and to save space overhead, all
  // the data fields persisted in the block are in uint16 format. A uint16 is
  // large enough to index every offset within a 64KiB block. In other words,
  // DataBlockHashIndex does not support blocks of size equal to or greater
  // than 64KiB.
  uint16_t num_buckets_;
};
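// A hypothetical sketch of how a reader might drive this class (illustrative
// only; in RocksDB the caller is the data-block iterator, and `block_data`,
// `block_size` and `user_key` are assumptions here):
//
//   DataBlockHashIndex index;
//   uint16_t map_offset = 0;
//   index.Initialize(block_data, static_cast<uint16_t>(block_size),
//                    &map_offset);
//   uint8_t entry = index.Lookup(block_data, map_offset, user_key);
//   if (entry == kCollision) {
//     // fall back to the regular restart-array search
//   } else if (entry == kNoEntry) {
//     // the key is not in this block
//   } else {
//     // entry is the restart index of the interval that may contain the key
//   }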

}  // namespace ROCKSDB_NAMESPACE