]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/table/format.h
import 14.2.4 nautilus point release
[ceph.git] / ceph / src / rocksdb / table / format.h
CommitLineData
7c673cae 1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5//
6// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7// Use of this source code is governed by a BSD-style license that can be
8// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10#pragma once
7c673cae 11#include <stdint.h>
11fdf7f2
TL
12#include <string>
13#ifdef ROCKSDB_MALLOC_USABLE_SIZE
14#ifdef OS_FREEBSD
15#include <malloc_np.h>
16#else
17#include <malloc.h>
18#endif
19#endif
20#include "rocksdb/options.h"
7c673cae
FG
21#include "rocksdb/slice.h"
22#include "rocksdb/status.h"
7c673cae
FG
23#include "rocksdb/table.h"
24
25#include "options/cf_options.h"
26#include "port/port.h" // noexcept
27#include "table/persistent_cache_options.h"
11fdf7f2 28#include "util/file_reader_writer.h"
494da23a 29#include "util/memory_allocator.h"
7c673cae
FG
30
31namespace rocksdb {
32
7c673cae
FG
33class RandomAccessFile;
34struct ReadOptions;
35
36extern bool ShouldReportDetailedTime(Env* env, Statistics* stats);
37
// The length of the table magic number, in bytes.
const int kMagicNumberLengthByte = 8;
40
41// BlockHandle is a pointer to the extent of a file that stores a data
42// block or a meta block.
43class BlockHandle {
44 public:
45 BlockHandle();
46 BlockHandle(uint64_t offset, uint64_t size);
47
48 // The offset of the block in the file.
49 uint64_t offset() const { return offset_; }
50 void set_offset(uint64_t _offset) { offset_ = _offset; }
51
52 // The size of the stored block
53 uint64_t size() const { return size_; }
54 void set_size(uint64_t _size) { size_ = _size; }
55
56 void EncodeTo(std::string* dst) const;
57 Status DecodeFrom(Slice* input);
11fdf7f2 58 Status DecodeSizeFrom(uint64_t offset, Slice* input);
7c673cae
FG
59
60 // Return a string that contains the copy of handle.
61 std::string ToString(bool hex = true) const;
62
63 // if the block handle's offset and size are both "0", we will view it
64 // as a null block handle that points to no where.
11fdf7f2 65 bool IsNull() const { return offset_ == 0 && size_ == 0; }
7c673cae 66
11fdf7f2 67 static const BlockHandle& NullBlockHandle() { return kNullBlockHandle; }
7c673cae
FG
68
69 // Maximum encoding length of a BlockHandle
70 enum { kMaxEncodedLength = 10 + 10 };
71
72 private:
73 uint64_t offset_;
74 uint64_t size_;
75
76 static const BlockHandle kNullBlockHandle;
77};
78
494da23a
TL
79inline uint32_t GetCompressFormatForVersion(CompressionType compression_type,
80 uint32_t version) {
11fdf7f2
TL
81#ifdef NDEBUG
82 (void)compression_type;
83#endif
7c673cae
FG
84 // snappy is not versioned
85 assert(compression_type != kSnappyCompression &&
86 compression_type != kXpressCompression &&
87 compression_type != kNoCompression);
88 // As of version 2, we encode compressed block with
89 // compress_format_version == 2. Before that, the version is 1.
90 // DO NOT CHANGE THIS FUNCTION, it affects disk format
91 return version >= 2 ? 2 : 1;
92}
93
// Returns true if `version` is a block-based table format version accepted
// by this code, i.e. any value from 0 through 4 inclusive.
inline bool BlockBasedTableSupportedVersion(uint32_t version) {
  return version <= 4;
}
97
98// Footer encapsulates the fixed information stored at the tail
99// end of every table file.
100class Footer {
101 public:
102 // Constructs a footer without specifying its table magic number.
103 // In such case, the table magic number of such footer should be
104 // initialized via @ReadFooterFromFile().
105 // Use this when you plan to load Footer with DecodeFrom(). Never use this
106 // when you plan to EncodeTo.
107 Footer() : Footer(kInvalidTableMagicNumber, 0) {}
108
109 // Use this constructor when you plan to write out the footer using
110 // EncodeTo(). Never use this constructor with DecodeFrom().
111 Footer(uint64_t table_magic_number, uint32_t version);
112
113 // The version of the footer in this file
114 uint32_t version() const { return version_; }
115
116 // The checksum type used in this file
117 ChecksumType checksum() const { return checksum_; }
118 void set_checksum(const ChecksumType c) { checksum_ = c; }
119
120 // The block handle for the metaindex block of the table
121 const BlockHandle& metaindex_handle() const { return metaindex_handle_; }
122 void set_metaindex_handle(const BlockHandle& h) { metaindex_handle_ = h; }
123
124 // The block handle for the index block of the table
125 const BlockHandle& index_handle() const { return index_handle_; }
126
127 void set_index_handle(const BlockHandle& h) { index_handle_ = h; }
128
129 uint64_t table_magic_number() const { return table_magic_number_; }
130
131 void EncodeTo(std::string* dst) const;
132
133 // Set the current footer based on the input slice.
134 //
135 // REQUIRES: table_magic_number_ is not set (i.e.,
136 // HasInitializedTableMagicNumber() is true). The function will initialize the
137 // magic number
138 Status DecodeFrom(Slice* input);
139
140 // Encoded length of a Footer. Note that the serialization of a Footer will
141 // always occupy at least kMinEncodedLength bytes. If fields are changed
142 // the version number should be incremented and kMaxEncodedLength should be
143 // increased accordingly.
144 enum {
145 // Footer version 0 (legacy) will always occupy exactly this many bytes.
146 // It consists of two block handles, padding, and a magic number.
147 kVersion0EncodedLength = 2 * BlockHandle::kMaxEncodedLength + 8,
148 // Footer of versions 1 and higher will always occupy exactly this many
149 // bytes. It consists of the checksum type, two block handles, padding,
150 // a version number (bigger than 1), and a magic number
151 kNewVersionsEncodedLength = 1 + 2 * BlockHandle::kMaxEncodedLength + 4 + 8,
152 kMinEncodedLength = kVersion0EncodedLength,
153 kMaxEncodedLength = kNewVersionsEncodedLength,
154 };
155
156 static const uint64_t kInvalidTableMagicNumber = 0;
157
158 // convert this object to a human readable form
159 std::string ToString() const;
160
161 private:
162 // REQUIRES: magic number wasn't initialized.
163 void set_table_magic_number(uint64_t magic_number) {
164 assert(!HasInitializedTableMagicNumber());
165 table_magic_number_ = magic_number;
166 }
167
168 // return true if @table_magic_number_ is set to a value different
169 // from @kInvalidTableMagicNumber.
170 bool HasInitializedTableMagicNumber() const {
171 return (table_magic_number_ != kInvalidTableMagicNumber);
172 }
173
174 uint32_t version_;
175 ChecksumType checksum_;
176 BlockHandle metaindex_handle_;
177 BlockHandle index_handle_;
178 uint64_t table_magic_number_ = 0;
179};
180
181// Read the footer from file
182// If enforce_table_magic_number != 0, ReadFooterFromFile() will return
183// corruption if table_magic number is not equal to enforce_table_magic_number
11fdf7f2
TL
184Status ReadFooterFromFile(RandomAccessFileReader* file,
185 FilePrefetchBuffer* prefetch_buffer,
186 uint64_t file_size, Footer* footer,
7c673cae
FG
187 uint64_t enforce_table_magic_number = 0);
188
// Size in bytes of the trailer stored with each block:
// 1-byte type + 32-bit crc.
static const size_t kBlockTrailerSize = 5;
191
494da23a
TL
192inline CompressionType get_block_compression_type(const char* block_data,
193 size_t block_size) {
194 return static_cast<CompressionType>(block_data[block_size]);
195}
196
7c673cae 197struct BlockContents {
494da23a
TL
198 Slice data; // Actual contents of data
199 CacheAllocationPtr allocation;
200
201#ifndef NDEBUG
202 // Whether the block is a raw block, which contains compression type
203 // byte. It is only used for assertion.
204 bool is_raw_block = false;
205#endif // NDEBUG
7c673cae 206
494da23a 207 BlockContents() {}
7c673cae 208
494da23a
TL
209 BlockContents(const Slice& _data) : data(_data) {}
210
211 BlockContents(CacheAllocationPtr&& _data, size_t _size)
212 : data(_data.get(), _size), allocation(std::move(_data)) {}
213
214 BlockContents(std::unique_ptr<char[]>&& _data, size_t _size)
215 : data(_data.get(), _size) {
216 allocation.reset(_data.release());
217 }
7c673cae 218
494da23a
TL
219 bool own_bytes() const { return allocation.get() != nullptr; }
220
221 // It's the caller's responsibility to make sure that this is
222 // for raw block contents, which contains the compression
223 // byte in the end.
224 CompressionType get_compression_type() const {
225 assert(is_raw_block);
226 return get_block_compression_type(data.data(), data.size());
227 }
7c673cae 228
11fdf7f2
TL
229 // The additional memory space taken by the block data.
230 size_t usable_size() const {
231 if (allocation.get() != nullptr) {
494da23a
TL
232 auto allocator = allocation.get_deleter().allocator;
233 if (allocator) {
234 return allocator->UsableSize(allocation.get(), data.size());
235 }
11fdf7f2
TL
236#ifdef ROCKSDB_MALLOC_USABLE_SIZE
237 return malloc_usable_size(allocation.get());
238#else
239 return data.size();
240#endif // ROCKSDB_MALLOC_USABLE_SIZE
241 } else {
242 return 0; // no extra memory is occupied by the data
243 }
244 }
245
494da23a
TL
246 size_t ApproximateMemoryUsage() const {
247 return usable_size() + sizeof(*this);
248 }
249
11fdf7f2
TL
250 BlockContents(BlockContents&& other) ROCKSDB_NOEXCEPT {
251 *this = std::move(other);
252 }
7c673cae
FG
253
254 BlockContents& operator=(BlockContents&& other) {
255 data = std::move(other.data);
7c673cae 256 allocation = std::move(other.allocation);
494da23a
TL
257#ifndef NDEBUG
258 is_raw_block = other.is_raw_block;
259#endif // NDEBUG
7c673cae
FG
260 return *this;
261 }
262};
263
264// Read the block identified by "handle" from "file". On failure
265// return non-OK. On success fill *result and return OK.
266extern Status ReadBlockContents(
11fdf7f2
TL
267 RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer,
268 const Footer& footer, const ReadOptions& options, const BlockHandle& handle,
269 BlockContents* contents, const ImmutableCFOptions& ioptions,
7c673cae
FG
270 bool do_uncompress = true, const Slice& compression_dict = Slice(),
271 const PersistentCacheOptions& cache_options = PersistentCacheOptions());
272
273// The 'data' points to the raw block contents read in from file.
274// This method allocates a new heap buffer and the raw block
275// contents are uncompresed into this buffer. This buffer is
276// returned via 'result' and it is upto the caller to
277// free this buffer.
278// For description of compress_format_version and possible values, see
279// util/compression.h
494da23a
TL
280extern Status UncompressBlockContents(const UncompressionInfo& info,
281 const char* data, size_t n,
282 BlockContents* contents,
283 uint32_t compress_format_version,
284 const ImmutableCFOptions& ioptions,
285 MemoryAllocator* allocator = nullptr);
7c673cae
FG
286
287// This is an extension to UncompressBlockContents that accepts
288// a specific compression type. This is used by un-wrapped blocks
289// with no compression header.
290extern Status UncompressBlockContentsForCompressionType(
494da23a 291 const UncompressionInfo& info, const char* data, size_t n,
11fdf7f2 292 BlockContents* contents, uint32_t compress_format_version,
494da23a 293 const ImmutableCFOptions& ioptions, MemoryAllocator* allocator = nullptr);
7c673cae
FG
294
// Implementation details follow. Clients should ignore.
296
297// TODO(andrewkr): we should prefer one way of representing a null/uninitialized
298// BlockHandle. Currently we use zeros for null and use negation-of-zeros for
299// uninitialized.
300inline BlockHandle::BlockHandle()
11fdf7f2 301 : BlockHandle(~static_cast<uint64_t>(0), ~static_cast<uint64_t>(0)) {}
7c673cae
FG
302
303inline BlockHandle::BlockHandle(uint64_t _offset, uint64_t _size)
304 : offset_(_offset), size_(_size) {}
305
306} // namespace rocksdb