1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under the BSD-style license found in the
3 // LICENSE file in the root directory of this source tree. An additional grant
4 // of patent rights can be found in the PATENTS file in the same directory.
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
10 #include "table/format.h"
15 #include "monitoring/perf_context_imp.h"
16 #include "monitoring/statistics.h"
17 #include "rocksdb/env.h"
18 #include "table/block.h"
19 #include "table/block_based_table_reader.h"
20 #include "table/persistent_cache_helper.h"
21 #include "util/coding.h"
22 #include "util/compression.h"
23 #include "util/crc32c.h"
24 #include "util/file_reader_writer.h"
25 #include "util/logging.h"
26 #include "util/stop_watch.h"
27 #include "util/string_util.h"
28 #include "util/xxhash.h"
// Magic numbers that identify table formats. These two are only declared
// here (extern, no initializer); their definitions are not in this excerpt.
32 extern const uint64_t kLegacyBlockBasedTableMagicNumber
;
33 extern const uint64_t kBlockBasedTableMagicNumber
;
// Plain-table magic numbers: declared extern first, then given the value 0
// below for the ROCKSDB_LITE case.
// NOTE(review): the preprocessor conditionals that presumably choose between
// the extern declarations and the zero definitions are not visible here.
36 extern const uint64_t kLegacyPlainTableMagicNumber
;
37 extern const uint64_t kPlainTableMagicNumber
;
39 // ROCKSDB_LITE doesn't have plain table
40 const uint64_t kLegacyPlainTableMagicNumber
= 0;
41 const uint64_t kPlainTableMagicNumber
= 0;
// Size of the on-stack scratch buffer declared in ReadBlockContents; it is
// also the threshold compared against n + kBlockTrailerSize there when
// deciding between the stack buffer and a heap allocation.
43 const uint32_t DefaultStackBufferSize
= 5000;
45 bool ShouldReportDetailedTime(Env
* env
, Statistics
* stats
) {
46 return env
!= nullptr && stats
!= nullptr &&
47 stats
->stats_level_
> kExceptDetailedTimers
;
50 void BlockHandle::EncodeTo(std::string
* dst
) const {
51 // Sanity check that all fields have been set
52 assert(offset_
!= ~static_cast<uint64_t>(0));
53 assert(size_
!= ~static_cast<uint64_t>(0));
54 PutVarint64Varint64(dst
, offset_
, size_
);
// Decodes a block handle from *input: offset_ is read first, then size_, each
// with GetVarint64.
// The failure path returns Status::Corruption("bad block handle").
// NOTE(review): the success return and the statements that reset the fields
// after a partial decode are not visible in this excerpt.
57 Status
BlockHandle::DecodeFrom(Slice
* input
) {
58 if (GetVarint64(input
, &offset_
) &&
59 GetVarint64(input
, &size_
)) {
62 // reset in case failure after partially decoding
65 return Status::Corruption("bad block handle");
// Encodes this handle with EncodeTo into a temporary string and returns a
// string form of it; the visible return converts the encoded bytes with
// Slice::ToString(true).
// NOTE(review): how the `hex` parameter selects the output is not visible in
// this excerpt; presumably the return below is the hex == true case.
69 // Return a string that contains the copy of handle.
70 std::string
BlockHandle::ToString(bool hex
) const {
71 std::string handle_str
;
72 EncodeTo(&handle_str
);
74 return Slice(handle_str
).ToString(true);
// Definition of the static member kNullBlockHandle, constructed with the
// arguments (0, 0) -- presumably offset 0 and size 0; confirm against the
// BlockHandle constructor.
80 const BlockHandle
BlockHandle::kNullBlockHandle(0, 0);
83 inline bool IsLegacyFooterFormat(uint64_t magic_number
) {
84 return magic_number
== kLegacyBlockBasedTableMagicNumber
||
85 magic_number
== kLegacyPlainTableMagicNumber
;
// Maps a legacy magic number to its current counterpart:
//   kLegacyBlockBasedTableMagicNumber -> kBlockBasedTableMagicNumber
//   kLegacyPlainTableMagicNumber      -> kPlainTableMagicNumber
// NOTE(review): the behavior for any other input is not visible in this
// excerpt; callers presumably check IsLegacyFooterFormat() first.
87 inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number
) {
88 if (magic_number
== kLegacyBlockBasedTableMagicNumber
) {
89 return kBlockBasedTableMagicNumber
;
91 if (magic_number
== kLegacyPlainTableMagicNumber
) {
92 return kPlainTableMagicNumber
;
99 // legacy footer format:
100 // metaindex handle (varint64 offset, varint64 size)
101 // index handle (varint64 offset, varint64 size)
102 // <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength
103 // table_magic_number (8 bytes)
104 // new footer format:
105 // checksum (char, 1 byte)
106 // metaindex handle (varint64 offset, varint64 size)
107 // index handle (varint64 offset, varint64 size)
108 // <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength + 1
109 // footer version (4 bytes)
110 // table_magic_number (8 bytes)
111 void Footer::EncodeTo(std::string
* dst
) const {
112 assert(HasInitializedTableMagicNumber());
113 if (IsLegacyFooterFormat(table_magic_number())) {
114 // has to be default checksum with legacy footer
115 assert(checksum_
== kCRC32c
);
116 const size_t original_size
= dst
->size();
117 metaindex_handle_
.EncodeTo(dst
);
118 index_handle_
.EncodeTo(dst
);
119 dst
->resize(original_size
+ 2 * BlockHandle::kMaxEncodedLength
); // Padding
120 PutFixed32(dst
, static_cast<uint32_t>(table_magic_number() & 0xffffffffu
));
121 PutFixed32(dst
, static_cast<uint32_t>(table_magic_number() >> 32));
122 assert(dst
->size() == original_size
+ kVersion0EncodedLength
);
124 const size_t original_size
= dst
->size();
125 dst
->push_back(static_cast<char>(checksum_
));
126 metaindex_handle_
.EncodeTo(dst
);
127 index_handle_
.EncodeTo(dst
);
128 dst
->resize(original_size
+ kNewVersionsEncodedLength
- 12); // Padding
129 PutFixed32(dst
, version());
130 PutFixed32(dst
, static_cast<uint32_t>(table_magic_number() & 0xffffffffu
));
131 PutFixed32(dst
, static_cast<uint32_t>(table_magic_number() >> 32));
132 assert(dst
->size() == original_size
+ kNewVersionsEncodedLength
);
// Constructs a footer with the given table magic number and footer version.
// A legacy magic number is only valid together with version 0; this is
// asserted, not enforced at runtime in release builds.
// NOTE(review): part of the member-initializer list is not visible in this
// excerpt.
136 Footer::Footer(uint64_t _table_magic_number
, uint32_t _version
)
137 : version_(_version
),
139 table_magic_number_(_table_magic_number
) {
140 // This should be guaranteed by constructor callers
141 assert(!IsLegacyFooterFormat(_table_magic_number
) || version_
== 0);
// Decodes a footer from the tail of *input.
//
// Preconditions (asserted): the table magic number of this footer has not
// been initialized yet, input is non-null, and input holds at least
// kMinEncodedLength bytes.
//
// Visible failure results:
//   Status::Corruption("input is too short to be an sstable")
//   Status::Corruption("bad checksum type")
// NOTE(review): several lines of this function (branch headers, closing
// braces, the declaration of `chksum`, checks on `result`, the final return)
// are not visible in this excerpt; the comments below describe only the
// statements that are shown.
144 Status
Footer::DecodeFrom(Slice
* input
) {
145 assert(!HasInitializedTableMagicNumber());
146 assert(input
!= nullptr);
147 assert(input
->size() >= kMinEncodedLength
);
// The magic number occupies the last kMagicNumberLengthByte bytes of the
// input and is stored as two fixed32 words: low word first, high word at +4.
149 const char *magic_ptr
=
150 input
->data() + input
->size() - kMagicNumberLengthByte
;
151 const uint32_t magic_lo
= DecodeFixed32(magic_ptr
);
152 const uint32_t magic_hi
= DecodeFixed32(magic_ptr
+ 4);
153 uint64_t magic
= ((static_cast<uint64_t>(magic_hi
) << 32) |
154 (static_cast<uint64_t>(magic_lo
)));
156 // We check for legacy formats here and silently upconvert them
157 bool legacy
= IsLegacyFooterFormat(magic
);
159 magic
= UpconvertLegacyFooterFormat(magic
);
161 set_table_magic_number(magic
);
// Legacy layout: keep only the trailing kVersion0EncodedLength bytes and
// record version 0.
164 // The size is already asserted to be at least kMinEncodedLength
165 // at the beginning of the function
166 input
->remove_prefix(input
->size() - kVersion0EncodedLength
);
167 version_
= 0 /* legacy */;
// Newer layout: the 4 bytes immediately before the magic number hold the
// footer version.
170 version_
= DecodeFixed32(magic_ptr
- 4);
171 // Footer version 1 and higher will always occupy exactly this many bytes.
172 // It consists of the checksum type, two block handles, padding,
173 // a version number, and a magic number
174 if (input
->size() < kNewVersionsEncodedLength
) {
175 return Status::Corruption("input is too short to be an sstable");
177 input
->remove_prefix(input
->size() - kNewVersionsEncodedLength
);
// The checksum type is read with GetVarint32 and cast to ChecksumType.
180 if (!GetVarint32(input
, &chksum
)) {
181 return Status::Corruption("bad checksum type");
183 checksum_
= static_cast<ChecksumType
>(chksum
);
// The two block handles follow: metaindex first, then index.
186 Status result
= metaindex_handle_
.DecodeFrom(input
);
188 result
= index_handle_
.DecodeFrom(input
);
191 // We skip over any leftover data (just padding for now) in "input"
192 const char* end
= magic_ptr
+ kMagicNumberLengthByte
;
193 *input
= Slice(end
, input
->data() + input
->size() - end
);
// Builds a human-readable, multi-line description of this footer.
// Two groups of appends are visible: one with the metaindex handle, index
// handle and table magic number, and one that additionally includes the
// checksum and footer version.
// NOTE(review): the branch on `legacy` that presumably selects between the
// two groups, and the final return, are not visible in this excerpt. No use
// of the local `handle_` is visible either.
198 std::string
Footer::ToString() const {
199 std::string result
, handle_
;
200 result
.reserve(1024);
202 bool legacy
= IsLegacyFooterFormat(table_magic_number_
);
204 result
.append("metaindex handle: " + metaindex_handle_
.ToString() + "\n ");
205 result
.append("index handle: " + index_handle_
.ToString() + "\n ");
206 result
.append("table_magic_number: " +
207 rocksdb::ToString(table_magic_number_
) + "\n ");
209 result
.append("checksum: " + rocksdb::ToString(checksum_
) + "\n ");
210 result
.append("metaindex handle: " + metaindex_handle_
.ToString() + "\n ");
211 result
.append("index handle: " + index_handle_
.ToString() + "\n ");
212 result
.append("footer version: " + rocksdb::ToString(version_
) + "\n ");
213 result
.append("table_magic_number: " +
214 rocksdb::ToString(table_magic_number_
) + "\n ");
// Reads the footer from the end of `file` and decodes it into *footer.
//
// file_size is the size used to locate the footer. If
// enforce_table_magic_number is non-zero, the decoded magic number must equal
// it.
//
// Visible failure results:
//   Status::Corruption("file is too short to be an sstable") when file_size,
//     or the number of bytes actually read, is below
//     Footer::kMinEncodedLength
//   the status returned by file->Read when it is not ok
//   Status::Corruption("Bad table magic number") on a magic-number mismatch
// NOTE(review): the declarations of footer_input and read_offset, part of the
// Read call, the handling of the DecodeFrom result and the final return are
// not visible in this excerpt.
219 Status
ReadFooterFromFile(RandomAccessFileReader
* file
, uint64_t file_size
,
220 Footer
* footer
, uint64_t enforce_table_magic_number
) {
221 if (file_size
< Footer::kMinEncodedLength
) {
222 return Status::Corruption("file is too short to be an sstable");
225 char footer_space
[Footer::kMaxEncodedLength
];
// Start reading Footer::kMaxEncodedLength bytes before the end of the file
// when the file is larger than that.
228 (file_size
> Footer::kMaxEncodedLength
)
229 ? static_cast<size_t>(file_size
- Footer::kMaxEncodedLength
)
231 Status s
= file
->Read(read_offset
, Footer::kMaxEncodedLength
, &footer_input
,
233 if (!s
.ok()) return s
;
235 // Check that we actually read the whole footer from the file. It may be
236 // that size isn't correct.
237 if (footer_input
.size() < Footer::kMinEncodedLength
) {
238 return Status::Corruption("file is too short to be an sstable");
241 s
= footer
->DecodeFrom(&footer_input
);
// A value of 0 for enforce_table_magic_number disables this check.
245 if (enforce_table_magic_number
!= 0 &&
246 enforce_table_magic_number
!= footer
->table_magic_number()) {
247 return Status::Corruption("Bad table magic number");
252 // Without anonymous namespace here, we fail the warning -Wmissing-prototypes
// Reads handle.size() + kBlockTrailerSize bytes at handle.offset() from
// `file` into *contents (using `buf` as the caller-provided buffer) and, when
// options.verify_checksums is set, verifies the stored checksum.
//
// The checksum is computed over the first n + 1 bytes (n = handle.size()) and
// compared with the fixed32 value stored at offset n + 1. The algorithm is
// chosen by footer.checksum().
//
// Visible failure results:
//   Status::Corruption("truncated block read") when the number of bytes read
//     differs from n + kBlockTrailerSize
//   Status::Corruption("unknown checksum type")
//   Status::Corruption("block checksum mismatch")
// NOTE(review): the declarations of `s` and `actual`, the case labels of the
// switch, the handling of a failed Read and the final return are not visible
// in this excerpt.
255 // Read a block and check its CRC
256 // contents is the result of reading.
257 // According to the implementation of file->Read, contents may not point to buf
258 Status
ReadBlock(RandomAccessFileReader
* file
, const Footer
& footer
,
259 const ReadOptions
& options
, const BlockHandle
& handle
,
260 Slice
* contents
, /* result of reading */ char* buf
) {
261 size_t n
= static_cast<size_t>(handle
.size());
265 PERF_TIMER_GUARD(block_read_time
);
266 s
= file
->Read(handle
.offset(), n
+ kBlockTrailerSize
, contents
, buf
);
269 PERF_COUNTER_ADD(block_read_count
, 1);
270 PERF_COUNTER_ADD(block_read_byte
, n
+ kBlockTrailerSize
);
275 if (contents
->size() != n
+ kBlockTrailerSize
) {
276 return Status::Corruption("truncated block read");
279 // Check the crc of the type and the block contents
280 const char* data
= contents
->data(); // Pointer to where Read put the data
281 if (options
.verify_checksums
) {
282 PERF_TIMER_GUARD(block_checksum_time
);
// Stored checksum: a fixed32 located after the n data bytes and one more
// byte.
283 uint32_t value
= DecodeFixed32(data
+ n
+ 1);
285 switch (footer
.checksum()) {
// crc32c: the stored value is unmasked before comparison.
287 value
= crc32c::Unmask(value
);
288 actual
= crc32c::Value(data
, n
+ 1);
// xxHash: XXH32 over the same n + 1 bytes with seed 0.
291 actual
= XXH32(data
, static_cast<int>(n
) + 1, 0);
294 s
= Status::Corruption("unknown checksum type");
296 if (s
.ok() && actual
!= value
) {
297 s
= Status::Corruption("block checksum mismatch");
// Produces the contents of the block identified by `handle` in *contents,
// consulting the persistent cache in cache_options (if any) before reading
// from `file`, and decompressing when decompression_requested is true and the
// block is stored compressed.
//
// Outline of the visible steps (n = handle.size()):
//   1. With a persistent cache that is not in compressed mode, look up the
//      uncompressed page first.
//   2. With a persistent cache in compressed mode, look up the raw page
//      (n + kBlockTrailerSize bytes) into heap_buf; otherwise the lookup
//      status is set to NotFound.
//   3. On a miss, read the block with ReadBlock into either the stack buffer
//      or a freshly allocated heap buffer, and optionally insert the raw
//      bytes into a compressed-mode persistent cache.
//   4. Take the compression type from the byte at index n, then either
//      decompress or hand the bytes over to *contents.
//   5. Optionally insert the result into an uncompressed-mode persistent
//      cache.
// NOTE(review): many lines (declarations of `status` and `slice`, several
// branch headers, early returns, closing braces and the final return) are not
// visible in this excerpt; the comments below describe only what is shown.
308 Status
ReadBlockContents(RandomAccessFileReader
* file
, const Footer
& footer
,
309 const ReadOptions
& read_options
,
310 const BlockHandle
& handle
, BlockContents
* contents
,
311 const ImmutableCFOptions
&ioptions
,
312 bool decompression_requested
,
313 const Slice
& compression_dict
,
314 const PersistentCacheOptions
& cache_options
) {
317 size_t n
= static_cast<size_t>(handle
.size());
318 std::unique_ptr
<char[]> heap_buf
;
319 char stack_buf
[DefaultStackBufferSize
];
320 char* used_buf
= nullptr;
321 rocksdb::CompressionType compression_type
;
// Step 1: uncompressed-mode persistent cache lookup.
323 if (cache_options
.persistent_cache
&&
324 !cache_options
.persistent_cache
->IsCompressed()) {
325 status
= PersistentCacheHelper::LookupUncompressedPage(cache_options
,
328 // uncompressed page is found for the block handle
331 // uncompressed page is not found
// Any lookup status other than NotFound is logged when an info log exists.
332 if (ioptions
.info_log
&& !status
.IsNotFound()) {
333 assert(!status
.ok());
334 ROCKS_LOG_INFO(ioptions
.info_log
,
335 "Error reading from persistent cache. %s",
336 status
.ToString().c_str());
// Step 2: raw-page lookup when the persistent cache is in compressed mode.
// NOTE(review): the existing comment below says "uncompressed cache mode",
// but this branch is taken when IsCompressed() is true -- confirm the
// wording.
341 if (cache_options
.persistent_cache
&&
342 cache_options
.persistent_cache
->IsCompressed()) {
343 // lookup uncompressed cache mode p-cache
344 status
= PersistentCacheHelper::LookupRawPage(
345 cache_options
, handle
, &heap_buf
, n
+ kBlockTrailerSize
);
347 status
= Status::NotFound();
// Raw-page hit: the data lives in heap_buf; the slice covers the first n
// bytes.
352 used_buf
= heap_buf
.get();
353 slice
= Slice(heap_buf
.get(), n
);
355 if (ioptions
.info_log
&& !status
.IsNotFound()) {
356 assert(!status
.ok());
357 ROCKS_LOG_INFO(ioptions
.info_log
,
358 "Error reading from persistent cache. %s",
359 status
.ToString().c_str());
// Step 3: read from the file.
361 // cache miss read from device
362 if (decompression_requested
&&
363 n
+ kBlockTrailerSize
< DefaultStackBufferSize
) {
364 // If we've got a small enough hunk of data, read it in to the
365 // trivially allocated stack buffer instead of needing a full malloc()
366 used_buf
= &stack_buf
[0];
368 heap_buf
= std::unique_ptr
<char[]>(new char[n
+ kBlockTrailerSize
]);
369 used_buf
= heap_buf
.get();
372 status
= ReadBlock(file
, footer
, read_options
, handle
, &slice
, used_buf
);
373 if (status
.ok() && read_options
.fill_cache
&&
374 cache_options
.persistent_cache
&&
375 cache_options
.persistent_cache
->IsCompressed()) {
376 // insert to raw cache
377 PersistentCacheHelper::InsertRawPage(cache_options
, handle
, used_buf
,
378 n
+ kBlockTrailerSize
);
386 PERF_TIMER_GUARD(block_decompress_time
);
// Step 4: the byte at index n of the slice data is read as the compression
// type.
388 compression_type
= static_cast<rocksdb::CompressionType
>(slice
.data()[n
]);
390 if (decompression_requested
&& compression_type
!= kNoCompression
) {
391 // compressed page, uncompress, update cache
392 status
= UncompressBlockContents(slice
.data(), n
, contents
,
393 footer
.version(), compression_dict
,
395 } else if (slice
.data() != used_buf
) {
396 // the slice content is not the buffer provided
397 *contents
= BlockContents(Slice(slice
.data(), n
), false, compression_type
);
399 // page is uncompressed, the buffer either stack or heap provided
// stack_buf is a local array, so its first n bytes are copied into a new
// heap buffer before heap_buf is moved into *contents.
400 if (used_buf
== &stack_buf
[0]) {
401 heap_buf
= std::unique_ptr
<char[]>(new char[n
]);
402 memcpy(heap_buf
.get(), stack_buf
, n
);
404 *contents
= BlockContents(std::move(heap_buf
), n
, true, compression_type
);
// Step 5: optional insert into an uncompressed-mode persistent cache.
407 if (status
.ok() && read_options
.fill_cache
&&
408 cache_options
.persistent_cache
&&
409 !cache_options
.persistent_cache
->IsCompressed()) {
410 // insert to uncompressed cache
411 PersistentCacheHelper::InsertUncompressedPage(cache_options
, handle
,
// Decompresses the n bytes at `data` with the codec selected by
// compression_type and stores the result in *contents.
//
// compression_type must not be kNoCompression (asserted). Each visible codec
// branch places the decompressed bytes in the heap buffer `ubuf` and moves
// that buffer into a BlockContents tagged kNoCompression. format_version is
// passed to GetCompressFormatForVersion for the Zlib, BZip2, LZ4 and LZ4HC
// branches; compression_dict is passed to the ZSTD call.
//
// Visible failure results: Status::Corruption with a codec-specific "... not
// supported or corrupted ..." message, or Status::Corruption("bad block
// type").
//
// After the switch, timing and size statistics are recorded when
// ShouldReportDetailedTime(ioptions.env, ioptions.statistics) is true.
// NOTE(review): several lines (the declaration of `ulength`, remaining call
// arguments, the failure checks, the `*contents =` left-hand sides, break
// statements, the default label and the final return) are not visible in
// this excerpt; the comments below describe only what is shown.
418 Status
UncompressBlockContentsForCompressionType(
419 const char* data
, size_t n
, BlockContents
* contents
,
420 uint32_t format_version
, const Slice
& compression_dict
,
421 CompressionType compression_type
, const ImmutableCFOptions
&ioptions
) {
422 std::unique_ptr
<char[]> ubuf
;
424 assert(compression_type
!= kNoCompression
&& "Invalid compression type");
// The timer's second constructor argument is the same predicate that gates
// the statistics reporting at the end of this function.
426 StopWatchNano
timer(ioptions
.env
,
427 ShouldReportDetailedTime(ioptions
.env
, ioptions
.statistics
));
428 int decompress_size
= 0;
429 switch (compression_type
) {
430 case kSnappyCompression
: {
432 static char snappy_corrupt_msg
[] =
433 "Snappy not supported or corrupted Snappy compressed block contents";
// Snappy: query the uncompressed length first, then decompress into a buffer
// of exactly that size.
434 if (!Snappy_GetUncompressedLength(data
, n
, &ulength
)) {
435 return Status::Corruption(snappy_corrupt_msg
);
437 ubuf
.reset(new char[ulength
]);
438 if (!Snappy_Uncompress(data
, n
, ubuf
.get())) {
439 return Status::Corruption(snappy_corrupt_msg
);
441 *contents
= BlockContents(std::move(ubuf
), ulength
, true, kNoCompression
);
// The remaining codecs return a buffer from their *_Uncompress helper and
// report its size through decompress_size.
444 case kZlibCompression
:
445 ubuf
.reset(Zlib_Uncompress(
446 data
, n
, &decompress_size
,
447 GetCompressFormatForVersion(kZlibCompression
, format_version
),
450 static char zlib_corrupt_msg
[] =
451 "Zlib not supported or corrupted Zlib compressed block contents";
452 return Status::Corruption(zlib_corrupt_msg
);
455 BlockContents(std::move(ubuf
), decompress_size
, true, kNoCompression
);
457 case kBZip2Compression
:
458 ubuf
.reset(BZip2_Uncompress(
459 data
, n
, &decompress_size
,
460 GetCompressFormatForVersion(kBZip2Compression
, format_version
)));
462 static char bzip2_corrupt_msg
[] =
463 "Bzip2 not supported or corrupted Bzip2 compressed block contents";
464 return Status::Corruption(bzip2_corrupt_msg
);
467 BlockContents(std::move(ubuf
), decompress_size
, true, kNoCompression
);
469 case kLZ4Compression
:
470 ubuf
.reset(LZ4_Uncompress(
471 data
, n
, &decompress_size
,
472 GetCompressFormatForVersion(kLZ4Compression
, format_version
),
475 static char lz4_corrupt_msg
[] =
476 "LZ4 not supported or corrupted LZ4 compressed block contents";
477 return Status::Corruption(lz4_corrupt_msg
);
480 BlockContents(std::move(ubuf
), decompress_size
, true, kNoCompression
);
// LZ4HC blocks are decompressed with the same LZ4_Uncompress helper.
482 case kLZ4HCCompression
:
483 ubuf
.reset(LZ4_Uncompress(
484 data
, n
, &decompress_size
,
485 GetCompressFormatForVersion(kLZ4HCCompression
, format_version
),
488 static char lz4hc_corrupt_msg
[] =
489 "LZ4HC not supported or corrupted LZ4HC compressed block contents";
490 return Status::Corruption(lz4hc_corrupt_msg
);
493 BlockContents(std::move(ubuf
), decompress_size
, true, kNoCompression
);
495 case kXpressCompression
:
496 ubuf
.reset(XPRESS_Uncompress(data
, n
, &decompress_size
));
498 static char xpress_corrupt_msg
[] =
499 "XPRESS not supported or corrupted XPRESS compressed block contents";
500 return Status::Corruption(xpress_corrupt_msg
);
503 BlockContents(std::move(ubuf
), decompress_size
, true, kNoCompression
);
506 case kZSTDNotFinalCompression
:
507 ubuf
.reset(ZSTD_Uncompress(data
, n
, &decompress_size
, compression_dict
));
509 static char zstd_corrupt_msg
[] =
510 "ZSTD not supported or corrupted ZSTD compressed block contents";
511 return Status::Corruption(zstd_corrupt_msg
);
514 BlockContents(std::move(ubuf
), decompress_size
, true, kNoCompression
);
517 return Status::Corruption("bad block type");
// Statistics: elapsed decompression time, decompressed byte count, and a
// tick for the number of blocks decompressed.
// NOTE(review): whether RecordTick sits inside the conditional is not
// visible here.
520 if(ShouldReportDetailedTime(ioptions
.env
, ioptions
.statistics
)){
521 MeasureTime(ioptions
.statistics
, DECOMPRESSION_TIMES_NANOS
,
522 timer
.ElapsedNanos());
523 MeasureTime(ioptions
.statistics
, BYTES_DECOMPRESSED
, contents
->data
.size());
524 RecordTick(ioptions
.statistics
, NUMBER_BLOCK_DECOMPRESSED
);
531 // The 'data' points to the raw block contents that was read in from file.
532 // This method allocates a new heap buffer and the raw block
533 // contents are uncompresed into this buffer. This
534 // buffer is returned via 'result' and it is upto the caller to
536 // format_version is the block format as defined in include/rocksdb/table.h
537 Status
UncompressBlockContents(const char* data
, size_t n
,
538 BlockContents
* contents
, uint32_t format_version
,
539 const Slice
& compression_dict
,
540 const ImmutableCFOptions
&ioptions
) {
541 assert(data
[n
] != kNoCompression
);
542 return UncompressBlockContentsForCompressionType(
543 data
, n
, contents
, format_version
, compression_dict
,
544 (CompressionType
)data
[n
], ioptions
);
547 } // namespace rocksdb