1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
13 #include "file/file_prefetch_buffer.h"
14 #include "file/random_access_file_reader.h"
16 #include "rocksdb/options.h"
17 #include "rocksdb/slice.h"
18 #include "rocksdb/status.h"
19 #include "rocksdb/table.h"
21 #include "memory/memory_allocator.h"
22 #include "options/cf_options.h"
23 #include "port/malloc.h"
24 #include "port/port.h" // noexcept
25 #include "table/persistent_cache_options.h"
27 namespace ROCKSDB_NAMESPACE
{
29 class RandomAccessFile
;
32 extern bool ShouldReportDetailedTime(Env
* env
, Statistics
* stats
);
// Length of the magic number, in bytes.
constexpr int kMagicNumberLengthByte = 8;
37 // BlockHandle is a pointer to the extent of a file that stores a data
38 // block or a meta block.
41 // Creates a block handle with special values indicating "uninitialized,"
42 // distinct from the "null" block handle.
44 BlockHandle(uint64_t offset
, uint64_t size
);
46 // The offset of the block in the file.
47 uint64_t offset() const { return offset_
; }
48 void set_offset(uint64_t _offset
) { offset_
= _offset
; }
50 // The size of the stored block
51 uint64_t size() const { return size_
; }
52 void set_size(uint64_t _size
) { size_
= _size
; }
54 void EncodeTo(std::string
* dst
) const;
55 Status
DecodeFrom(Slice
* input
);
56 Status
DecodeSizeFrom(uint64_t offset
, Slice
* input
);
58 // Return a string that contains the copy of handle.
59 std::string
ToString(bool hex
= true) const;
61 // if the block handle's offset and size are both "0", we will view it
62 // as a null block handle that points to no where.
63 bool IsNull() const { return offset_
== 0 && size_
== 0; }
65 static const BlockHandle
& NullBlockHandle() { return kNullBlockHandle
; }
// Maximum encoding length of a BlockHandle.
// NOTE(review): originally spelled 10 + 10, presumably 10 bytes for each of
// the two encoded fields (offset and size) -- confirm against the encoder.
enum { kMaxEncodedLength = 2 * 10 };
70 inline bool operator==(const BlockHandle
& rhs
) const {
71 return offset_
== rhs
.offset_
&& size_
== rhs
.size_
;
73 inline bool operator!=(const BlockHandle
& rhs
) const {
74 return !(*this == rhs
);
81 static const BlockHandle kNullBlockHandle
;
84 // Value in block-based table file index.
86 // The index entry for block n is: y -> h, [x],
87 // where: y is some key between the last key of block n (inclusive) and the
88 // first key of block n+1 (exclusive); h is BlockHandle pointing to block n;
89 // x, if present, is the first key of block n (unshortened).
90 // This struct represents the "h, [x]" part.
93 // Empty means unknown.
94 Slice first_internal_key
;
96 IndexValue() = default;
97 IndexValue(BlockHandle _handle
, Slice _first_internal_key
)
98 : handle(_handle
), first_internal_key(_first_internal_key
) {}
100 // have_first_key indicates whether the `first_internal_key` is used.
101 // If previous_handle is not null, delta encoding is used;
102 // in this case, the two handles must point to consecutive blocks:
103 // handle.offset() ==
104 // previous_handle->offset() + previous_handle->size() + kBlockTrailerSize
105 void EncodeTo(std::string
* dst
, bool have_first_key
,
106 const BlockHandle
* previous_handle
) const;
107 Status
DecodeFrom(Slice
* input
, bool have_first_key
,
108 const BlockHandle
* previous_handle
);
110 std::string
ToString(bool hex
, bool have_first_key
) const;
// Maps a table format_version to the compress_format_version used when
// encoding compressed blocks.
inline uint32_t GetCompressFormatForVersion(uint32_t format_version) {
  // As of format_version 2, we encode compressed block with
  // compress_format_version == 2. Before that, the version is 1.
  // DO NOT CHANGE THIS FUNCTION, it affects disk format
  if (format_version < 2) {
    return 1;
  }
  return 2;
}
120 inline bool BlockBasedTableSupportedVersion(uint32_t version
) {
124 // Footer encapsulates the fixed information stored at the tail
125 // end of every table file.
128 // Constructs a footer without specifying its table magic number.
129 // In such case, the table magic number of such footer should be
130 // initialized via @ReadFooterFromFile().
131 // Use this when you plan to load Footer with DecodeFrom(). Never use this
132 // when you plan to EncodeTo.
133 Footer() : Footer(kInvalidTableMagicNumber
, 0) {}
135 // Use this constructor when you plan to write out the footer using
136 // EncodeTo(). Never use this constructor with DecodeFrom().
137 Footer(uint64_t table_magic_number
, uint32_t version
);
139 // The version of the footer in this file
140 uint32_t version() const { return version_
; }
142 // The checksum type used in this file
143 ChecksumType
checksum() const { return checksum_
; }
144 void set_checksum(const ChecksumType c
) { checksum_
= c
; }
146 // The block handle for the metaindex block of the table
147 const BlockHandle
& metaindex_handle() const { return metaindex_handle_
; }
148 void set_metaindex_handle(const BlockHandle
& h
) { metaindex_handle_
= h
; }
150 // The block handle for the index block of the table
151 const BlockHandle
& index_handle() const { return index_handle_
; }
153 void set_index_handle(const BlockHandle
& h
) { index_handle_
= h
; }
155 uint64_t table_magic_number() const { return table_magic_number_
; }
157 void EncodeTo(std::string
* dst
) const;
159 // Set the current footer based on the input slice.
161 // REQUIRES: table_magic_number_ is not set (i.e.,
162 // HasInitializedTableMagicNumber() is false). The function will initialize the
163 // table magic number from the input.
164 Status
DecodeFrom(Slice
* input
);
166 // Encoded length of a Footer. Note that the serialization of a Footer will
167 // always occupy at least kMinEncodedLength bytes. If fields are changed
168 // the version number should be incremented and kMaxEncodedLength should be
169 // increased accordingly.
171 // Footer version 0 (legacy) will always occupy exactly this many bytes.
172 // It consists of two block handles, padding, and a magic number.
173 kVersion0EncodedLength
= 2 * BlockHandle::kMaxEncodedLength
+ 8,
174 // Footer of versions 1 and higher will always occupy exactly this many
175 // bytes. It consists of the checksum type, two block handles, padding,
176 // a version number (1 or higher), and a magic number
177 kNewVersionsEncodedLength
= 1 + 2 * BlockHandle::kMaxEncodedLength
+ 4 + 8,
178 kMinEncodedLength
= kVersion0EncodedLength
,
179 kMaxEncodedLength
= kNewVersionsEncodedLength
,
182 static const uint64_t kInvalidTableMagicNumber
= 0;
184 // convert this object to a human readable form
185 std::string
ToString() const;
188 // REQUIRES: magic number wasn't initialized.
189 void set_table_magic_number(uint64_t magic_number
) {
190 assert(!HasInitializedTableMagicNumber());
191 table_magic_number_
= magic_number
;
194 // return true if @table_magic_number_ is set to a value different
195 // from @kInvalidTableMagicNumber.
196 bool HasInitializedTableMagicNumber() const {
197 return (table_magic_number_
!= kInvalidTableMagicNumber
);
201 ChecksumType checksum_
;
202 BlockHandle metaindex_handle_
;
203 BlockHandle index_handle_
;
204 uint64_t table_magic_number_
= 0;
207 // Read the footer from file
208 // If enforce_table_magic_number != 0, ReadFooterFromFile() will return
209 // corruption if table_magic number is not equal to enforce_table_magic_number
210 Status
ReadFooterFromFile(const IOOptions
& opts
, RandomAccessFileReader
* file
,
211 FilePrefetchBuffer
* prefetch_buffer
,
212 uint64_t file_size
, Footer
* footer
,
213 uint64_t enforce_table_magic_number
= 0);
// Trailer size in bytes: 1-byte compression type + 32-bit checksum.
static constexpr size_t kBlockTrailerSize = 5;
218 // Make block size calculation for IO less error prone
219 inline uint64_t block_size(const BlockHandle
& handle
) {
220 return handle
.size() + kBlockTrailerSize
;
223 inline CompressionType
get_block_compression_type(const char* block_data
,
225 return static_cast<CompressionType
>(block_data
[block_size
]);
228 // Represents the contents of a block read from an SST file. Depending on how
229 // it's created, it may or may not own the actual block bytes. As an example,
230 // BlockContents objects representing data read from mmapped files only point
231 // into the mmapped region.
232 struct BlockContents
{
233 Slice data
; // Actual contents of data
234 CacheAllocationPtr allocation
;
237 // Whether the block is a raw block, which contains compression type
238 // byte. It is only used for assertion.
239 bool is_raw_block
= false;
244 // Does not take ownership of the underlying data bytes.
245 BlockContents(const Slice
& _data
) : data(_data
) {}
247 // Takes ownership of the underlying data bytes.
248 BlockContents(CacheAllocationPtr
&& _data
, size_t _size
)
249 : data(_data
.get(), _size
), allocation(std::move(_data
)) {}
251 // Takes ownership of the underlying data bytes.
252 BlockContents(std::unique_ptr
<char[]>&& _data
, size_t _size
)
253 : data(_data
.get(), _size
) {
254 allocation
.reset(_data
.release());
257 // Returns whether the object has ownership of the underlying data bytes.
258 bool own_bytes() const { return allocation
.get() != nullptr; }
260 // It's the caller's responsibility to make sure that this is
261 // for raw block contents, which contains the compression
263 CompressionType
get_compression_type() const {
264 assert(is_raw_block
);
265 return get_block_compression_type(data
.data(), data
.size());
268 // The additional memory space taken by the block data.
269 size_t usable_size() const {
270 if (allocation
.get() != nullptr) {
271 auto allocator
= allocation
.get_deleter().allocator
;
273 return allocator
->UsableSize(allocation
.get(), data
.size());
275 #ifdef ROCKSDB_MALLOC_USABLE_SIZE
276 return malloc_usable_size(allocation
.get());
279 #endif // ROCKSDB_MALLOC_USABLE_SIZE
281 return 0; // no extra memory is occupied by the data
285 size_t ApproximateMemoryUsage() const {
286 return usable_size() + sizeof(*this);
289 BlockContents(BlockContents
&& other
) ROCKSDB_NOEXCEPT
{
290 *this = std::move(other
);
293 BlockContents
& operator=(BlockContents
&& other
) {
294 data
= std::move(other
.data
);
295 allocation
= std::move(other
.allocation
);
297 is_raw_block
= other
.is_raw_block
;
303 // Read the block identified by "handle" from "file". On failure
304 // return non-OK. On success fill *contents and return OK.
305 extern Status
ReadBlockContents(
306 RandomAccessFileReader
* file
, FilePrefetchBuffer
* prefetch_buffer
,
307 const Footer
& footer
, const ReadOptions
& options
, const BlockHandle
& handle
,
308 BlockContents
* contents
, const ImmutableCFOptions
& ioptions
,
309 bool do_uncompress
= true, const Slice
& compression_dict
= Slice(),
310 const PersistentCacheOptions
& cache_options
= PersistentCacheOptions());
312 // The 'data' points to the raw block contents read in from file.
313 // This method allocates a new heap buffer and the raw block
314 // contents are uncompressed into this buffer. This buffer is
315 // returned via 'contents' and it is up to the caller to
316 // free this buffer.
317 // For description of compress_format_version and possible values, see
318 // util/compression.h
319 extern Status
UncompressBlockContents(const UncompressionInfo
& info
,
320 const char* data
, size_t n
,
321 BlockContents
* contents
,
322 uint32_t compress_format_version
,
323 const ImmutableCFOptions
& ioptions
,
324 MemoryAllocator
* allocator
= nullptr);
326 // This is an extension to UncompressBlockContents that accepts
327 // a specific compression type. This is used by un-wrapped blocks
328 // with no compression header.
329 extern Status
UncompressBlockContentsForCompressionType(
330 const UncompressionInfo
& info
, const char* data
, size_t n
,
331 BlockContents
* contents
, uint32_t compress_format_version
,
332 const ImmutableCFOptions
& ioptions
, MemoryAllocator
* allocator
= nullptr);
334 // Replace db_host_id contents with the real hostname if necessary
335 extern Status
ReifyDbHostIdProperty(Env
* env
, std::string
* db_host_id
);
337 // Implementation details follow. Clients should ignore.
339 // TODO(andrewkr): we should prefer one way of representing a null/uninitialized
340 // BlockHandle. Currently we use zeros for null and use negation-of-zeros for
342 inline BlockHandle::BlockHandle()
343 : BlockHandle(~static_cast<uint64_t>(0), ~static_cast<uint64_t>(0)) {}
345 inline BlockHandle::BlockHandle(uint64_t _offset
, uint64_t _size
)
346 : offset_(_offset
), size_(_size
) {}
348 } // namespace ROCKSDB_NAMESPACE