1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
10 #include "table/format.h"
15 #include "monitoring/perf_context_imp.h"
16 #include "monitoring/statistics.h"
17 #include "rocksdb/env.h"
18 #include "table/block.h"
19 #include "table/block_based_table_reader.h"
20 #include "table/block_fetcher.h"
21 #include "table/persistent_cache_helper.h"
22 #include "util/coding.h"
23 #include "util/compression.h"
24 #include "util/crc32c.h"
25 #include "util/file_reader_writer.h"
26 #include "util/logging.h"
27 #include "util/memory_allocator.h"
28 #include "util/stop_watch.h"
29 #include "util/string_util.h"
30 #include "util/xxhash.h"
// Magic numbers that identify the on-disk table format. The two block-based
// table values are only declared here (extern); they are defined elsewhere.
34 extern const uint64_t kLegacyBlockBasedTableMagicNumber
;
35 extern const uint64_t kBlockBasedTableMagicNumber
;
// Plain-table magic numbers, likewise declared extern.
38 extern const uint64_t kLegacyPlainTableMagicNumber
;
39 extern const uint64_t kPlainTableMagicNumber
;
// NOTE(review): the two plain-table names are declared extern above and
// defined as 0 below. Presumably a ROCKSDB_LITE preprocessor guard that
// selects one of the two groups is not visible in this view -- confirm
// against the full file.
41 // ROCKSDB_LITE doesn't have plain table
42 const uint64_t kLegacyPlainTableMagicNumber
= 0;
43 const uint64_t kPlainTableMagicNumber
= 0;
46 bool ShouldReportDetailedTime(Env
* env
, Statistics
* stats
) {
47 return env
!= nullptr && stats
!= nullptr &&
48 stats
->get_stats_level() > kExceptDetailedTimers
;
51 void BlockHandle::EncodeTo(std::string
* dst
) const {
52 // Sanity check that all fields have been set
53 assert(offset_
!= ~static_cast<uint64_t>(0));
54 assert(size_
!= ~static_cast<uint64_t>(0));
55 PutVarint64Varint64(dst
, offset_
, size_
);
// Decodes this handle from the front of *input: a varint64 offset followed by
// a varint64 size, both read with GetVarint64 into offset_ and size_.
// The failure path returns Status::Corruption("bad block handle").
// NOTE(review): the success return and the field reset that the comment below
// refers to are not visible in this view -- confirm against the full file.
58 Status
BlockHandle::DecodeFrom(Slice
* input
) {
// The second GetVarint64 is only attempted when the first one succeeded
// (short-circuit &&), so a failure can occur after offset_ was presumably
// already written.
59 if (GetVarint64(input
, &offset_
) && GetVarint64(input
, &size_
)) {
62 // reset in case failure after partially decoding
65 return Status::Corruption("bad block handle");
// Decodes only the size of this handle from the front of *input (one
// varint64 read into size_); the offset is supplied by the caller as _offset.
// The failure path returns Status::Corruption("bad block handle").
// NOTE(review): the statements that consume _offset, the success return and
// the field reset mentioned below are not visible in this view -- confirm
// against the full file.
69 Status
BlockHandle::DecodeSizeFrom(uint64_t _offset
, Slice
* input
) {
70 if (GetVarint64(input
, &size_
)) {
74 // reset in case failure after partially decoding
77 return Status::Corruption("bad block handle");
81 // Return a string that contains the copy of handle.
// The handle is first serialized with EncodeTo() into a local string; the
// visible return converts those bytes with Slice::ToString(true).
// NOTE(review): the `hex` parameter is not used in the visible lines;
// presumably a branch on it (and a non-hex return) is elided from this view
// -- confirm against the full file.
82 std::string
BlockHandle::ToString(bool hex
) const {
83 std::string handle_str
;
84 EncodeTo(&handle_str
);
86 return Slice(handle_str
).ToString(true);
// Definition of the static "null" handle, constructed with the arguments
// (0, 0).
92 const BlockHandle
BlockHandle::kNullBlockHandle(0, 0);
95 inline bool IsLegacyFooterFormat(uint64_t magic_number
) {
96 return magic_number
== kLegacyBlockBasedTableMagicNumber
||
97 magic_number
== kLegacyPlainTableMagicNumber
;
// Maps a legacy magic number to its current counterpart:
//   kLegacyBlockBasedTableMagicNumber -> kBlockBasedTableMagicNumber
//   kLegacyPlainTableMagicNumber      -> kPlainTableMagicNumber
// NOTE(review): the handling of a magic number that matches neither legacy
// value is not visible in this view -- confirm against the full file.
99 inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number
) {
100 if (magic_number
== kLegacyBlockBasedTableMagicNumber
) {
101 return kBlockBasedTableMagicNumber
;
103 if (magic_number
== kLegacyPlainTableMagicNumber
) {
104 return kPlainTableMagicNumber
;
111 // legacy footer format:
112 // metaindex handle (varint64 offset, varint64 size)
113 // index handle (varint64 offset, varint64 size)
114 // <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength
115 // table_magic_number (8 bytes)
116 // new footer format:
117 // checksum type (char, 1 byte)
118 // metaindex handle (varint64 offset, varint64 size)
119 // index handle (varint64 offset, varint64 size)
120 // <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength + 1
121 // footer version (4 bytes)
122 // table_magic_number (8 bytes)
123 void Footer::EncodeTo(std::string
* dst
) const {
124 assert(HasInitializedTableMagicNumber());
125 if (IsLegacyFooterFormat(table_magic_number())) {
126 // has to be default checksum with legacy footer
127 assert(checksum_
== kCRC32c
);
128 const size_t original_size
= dst
->size();
129 metaindex_handle_
.EncodeTo(dst
);
130 index_handle_
.EncodeTo(dst
);
131 dst
->resize(original_size
+ 2 * BlockHandle::kMaxEncodedLength
); // Padding
132 PutFixed32(dst
, static_cast<uint32_t>(table_magic_number() & 0xffffffffu
));
133 PutFixed32(dst
, static_cast<uint32_t>(table_magic_number() >> 32));
134 assert(dst
->size() == original_size
+ kVersion0EncodedLength
);
136 const size_t original_size
= dst
->size();
137 dst
->push_back(static_cast<char>(checksum_
));
138 metaindex_handle_
.EncodeTo(dst
);
139 index_handle_
.EncodeTo(dst
);
140 dst
->resize(original_size
+ kNewVersionsEncodedLength
- 12); // Padding
141 PutFixed32(dst
, version());
142 PutFixed32(dst
, static_cast<uint32_t>(table_magic_number() & 0xffffffffu
));
143 PutFixed32(dst
, static_cast<uint32_t>(table_magic_number() >> 32));
144 assert(dst
->size() == original_size
+ kNewVersionsEncodedLength
);
// Constructs a footer with the given table magic number and footer version.
// Callers must pass version 0 whenever the magic number is a legacy one; this
// is only checked by the assert below.
// NOTE(review): part of the member initializer list appears to be elided from
// this view -- confirm against the full file.
148 Footer::Footer(uint64_t _table_magic_number
, uint32_t _version
)
149 : version_(_version
),
151 table_magic_number_(_table_magic_number
) {
152 // This should be guaranteed by constructor callers
153 assert(!IsLegacyFooterFormat(_table_magic_number
) || version_
== 0);
// Parses a footer from *input and fills in this object's table magic number,
// version, checksum type and the metaindex/index block handles.
//
// Preconditions (assert-only): the magic number has not been initialized yet,
// input is non-null, and input holds at least kMinEncodedLength bytes.
// Visible error returns: Status::Corruption when the input is shorter than
// kNewVersionsEncodedLength for a non-legacy footer, or when the checksum
// type cannot be read.
// NOTE(review): several control-flow lines (the legacy/non-legacy branch, the
// checks on `result`, the final return, the declaration of `chksum`) are not
// visible in this view -- confirm against the full file.
156 Status
Footer::DecodeFrom(Slice
* input
) {
157 assert(!HasInitializedTableMagicNumber());
158 assert(input
!= nullptr);
159 assert(input
->size() >= kMinEncodedLength
);
// The magic number occupies the last kMagicNumberLengthByte bytes of the
// input and is read as two fixed32 halves: low word first, then high word.
161 const char* magic_ptr
=
162 input
->data() + input
->size() - kMagicNumberLengthByte
;
163 const uint32_t magic_lo
= DecodeFixed32(magic_ptr
);
164 const uint32_t magic_hi
= DecodeFixed32(magic_ptr
+ 4);
165 uint64_t magic
= ((static_cast<uint64_t>(magic_hi
) << 32) |
166 (static_cast<uint64_t>(magic_lo
)));
168 // We check for legacy formats here and silently upconvert them
169 bool legacy
= IsLegacyFooterFormat(magic
);
// NOTE(review): presumably the upconversion below only runs when `legacy` is
// true; the guarding line is not visible here.
171 magic
= UpconvertLegacyFooterFormat(magic
);
173 set_table_magic_number(magic
);
// Legacy path: keep only the trailing kVersion0EncodedLength bytes and record
// version 0.
176 // The size is already asserted to be at least kMinEncodedLength
177 // at the beginning of the function
178 input
->remove_prefix(input
->size() - kVersion0EncodedLength
);
179 version_
= 0 /* legacy */;
// Non-legacy path: the fixed32 version sits in the 4 bytes immediately before
// the magic number.
182 version_
= DecodeFixed32(magic_ptr
- 4);
183 // Footer version 1 and higher will always occupy exactly this many bytes.
184 // It consists of the checksum type, two block handles, padding,
185 // a version number, and a magic number
186 if (input
->size() < kNewVersionsEncodedLength
) {
187 return Status::Corruption("input is too short to be an sstable");
189 input
->remove_prefix(input
->size() - kNewVersionsEncodedLength
);
// The checksum type is read with GetVarint32 and then cast to ChecksumType.
192 if (!GetVarint32(input
, &chksum
)) {
193 return Status::Corruption("bad checksum type");
195 checksum_
= static_cast<ChecksumType
>(chksum
);
// Both handles are decoded from the same slice, metaindex first, then index.
198 Status result
= metaindex_handle_
.DecodeFrom(input
);
200 result
= index_handle_
.DecodeFrom(input
);
203 // We skip over any leftover data (just padding for now) in "input"
// *input is re-pointed at the byte just past the magic number, with whatever
// length remains after it.
204 const char* end
= magic_ptr
+ kMagicNumberLengthByte
;
205 *input
= Slice(end
, input
->data() + input
->size() - end
);
// Builds a human-readable, multi-line description of this footer.
// Two groups of appends are visible: a shorter one (metaindex handle, index
// handle, table_magic_number) and a longer one that also includes the
// checksum and the footer version.
// NOTE(review): presumably the `legacy` flag selects between the two groups;
// the branch, the declaration of `result` and the final return are not
// visible in this view -- confirm against the full file.
210 std::string
Footer::ToString() const {
// Reserve up front so the appends below do not repeatedly grow the string.
212 result
.reserve(1024);
214 bool legacy
= IsLegacyFooterFormat(table_magic_number_
);
216 result
.append("metaindex handle: " + metaindex_handle_
.ToString() + "\n ");
217 result
.append("index handle: " + index_handle_
.ToString() + "\n ");
218 result
.append("table_magic_number: " +
219 rocksdb::ToString(table_magic_number_
) + "\n ");
221 result
.append("checksum: " + rocksdb::ToString(checksum_
) + "\n ");
222 result
.append("metaindex handle: " + metaindex_handle_
.ToString() + "\n ");
223 result
.append("index handle: " + index_handle_
.ToString() + "\n ");
224 result
.append("footer version: " + rocksdb::ToString(version_
) + "\n ");
225 result
.append("table_magic_number: " +
226 rocksdb::ToString(table_magic_number_
) + "\n ");
// Reads the footer from the tail of `file` and decodes it into *footer.
//
//   file                        source of the bytes when the prefetch buffer
//                               cannot serve the read
//   prefetch_buffer             optional; when non-null it is tried first via
//                               TryReadFromCache
//   file_size                   size used to locate the footer
//   footer                      receives the decoded footer
//   enforce_table_magic_number  when non-zero, the decoded magic number must
//                               equal it
//
// Visible error returns: Status::Corruption when file_size or the bytes
// actually read are smaller than Footer::kMinEncodedLength, the status of a
// failed file read, and Status::Corruption on a magic-number mismatch.
// NOTE(review): several lines (declarations of `s`, `footer_input` and
// `read_offset`, tails of multi-line calls, the check on the DecodeFrom
// status and the final return) are not visible in this view -- confirm
// against the full file.
231 Status
ReadFooterFromFile(RandomAccessFileReader
* file
,
232 FilePrefetchBuffer
* prefetch_buffer
,
233 uint64_t file_size
, Footer
* footer
,
234 uint64_t enforce_table_magic_number
) {
235 if (file_size
< Footer::kMinEncodedLength
) {
236 return Status::Corruption("file is too short (" + ToString(file_size
) +
// Scratch space sized for the largest possible footer.
242 char footer_space
[Footer::kMaxEncodedLength
];
// The read starts Footer::kMaxEncodedLength bytes before the end of the file
// when the file is larger than that.
245 (file_size
> Footer::kMaxEncodedLength
)
246 ? static_cast<size_t>(file_size
- Footer::kMaxEncodedLength
)
// Fall back to reading from the file when there is no prefetch buffer or it
// cannot satisfy the request.
249 if (prefetch_buffer
== nullptr ||
250 !prefetch_buffer
->TryReadFromCache(read_offset
, Footer::kMaxEncodedLength
,
252 s
= file
->Read(read_offset
, Footer::kMaxEncodedLength
, &footer_input
,
254 if (!s
.ok()) return s
;
257 // Check that we actually read the whole footer from the file. It may be
258 // that size isn't correct.
259 if (footer_input
.size() < Footer::kMinEncodedLength
) {
260 return Status::Corruption("file is too short (" + ToString(file_size
) +
266 s
= footer
->DecodeFrom(&footer_input
);
// A value of 0 for enforce_table_magic_number disables this check.
270 if (enforce_table_magic_number
!= 0 &&
271 enforce_table_magic_number
!= footer
->table_magic_number()) {
272 return Status::Corruption(
273 "Bad table magic number: expected " +
274 ToString(enforce_table_magic_number
) + ", found " +
275 ToString(footer
->table_magic_number()) + " in " + file
->file_name());
// Uncompresses the n bytes at `data` with the codec selected by
// uncompression_info.type() and stores the result in *contents.
//
// Each visible case calls the matching *_Uncompress helper and wraps the
// resulting buffer in a BlockContents. The visible failure returns are
// Status::Corruption with a codec-specific "not supported or corrupted"
// message, plus Status::Corruption("bad block type").
// After the switch, the elapsed time is recorded under
// DECOMPRESSION_TIMES_NANOS when ShouldReportDetailedTime() holds;
// BYTES_DECOMPRESSED and NUMBER_BLOCK_DECOMPRESSED are recorded as well.
// The compression type must not be kNoCompression (assert-only check).
// NOTE(review): many lines are not visible in this view (the checks that
// guard each Corruption return, `break` statements, tails of multi-line
// calls, the declaration of `ulength`, closing braces and the final return)
// -- confirm against the full file before relying on exact control flow.
280 Status
UncompressBlockContentsForCompressionType(
281 const UncompressionInfo
& uncompression_info
, const char* data
, size_t n
,
282 BlockContents
* contents
, uint32_t format_version
,
283 const ImmutableCFOptions
& ioptions
, MemoryAllocator
* allocator
) {
284 CacheAllocationPtr ubuf
;
286 assert(uncompression_info
.type() != kNoCompression
&&
287 "Invalid compression type");
// The timer's second argument is the result of ShouldReportDetailedTime(),
// the same predicate that gates the histogram report after the switch.
289 StopWatchNano
timer(ioptions
.env
, ShouldReportDetailedTime(
290 ioptions
.env
, ioptions
.statistics
));
291 int decompress_size
= 0;
292 switch (uncompression_info
.type()) {
// Snappy: the uncompressed length is queried first, a block of that size is
// obtained from AllocateBlock, and the data is then uncompressed into it.
293 case kSnappyCompression
: {
295 static char snappy_corrupt_msg
[] =
296 "Snappy not supported or corrupted Snappy compressed block contents";
297 if (!Snappy_GetUncompressedLength(data
, n
, &ulength
)) {
298 return Status::Corruption(snappy_corrupt_msg
);
300 ubuf
= AllocateBlock(ulength
, allocator
);
301 if (!Snappy_Uncompress(data
, n
, ubuf
.get())) {
302 return Status::Corruption(snappy_corrupt_msg
);
304 *contents
= BlockContents(std::move(ubuf
), ulength
);
// The remaining codecs return their buffer from the helper and report the
// uncompressed size through decompress_size.
307 case kZlibCompression
:
308 ubuf
= Zlib_Uncompress(
309 uncompression_info
, data
, n
, &decompress_size
,
310 GetCompressFormatForVersion(kZlibCompression
, format_version
),
313 static char zlib_corrupt_msg
[] =
314 "Zlib not supported or corrupted Zlib compressed block contents";
315 return Status::Corruption(zlib_corrupt_msg
);
317 *contents
= BlockContents(std::move(ubuf
), decompress_size
);
319 case kBZip2Compression
:
320 ubuf
= BZip2_Uncompress(
321 data
, n
, &decompress_size
,
322 GetCompressFormatForVersion(kBZip2Compression
, format_version
),
325 static char bzip2_corrupt_msg
[] =
326 "Bzip2 not supported or corrupted Bzip2 compressed block contents";
327 return Status::Corruption(bzip2_corrupt_msg
);
329 *contents
= BlockContents(std::move(ubuf
), decompress_size
);
331 case kLZ4Compression
:
332 ubuf
= LZ4_Uncompress(
333 uncompression_info
, data
, n
, &decompress_size
,
334 GetCompressFormatForVersion(kLZ4Compression
, format_version
),
337 static char lz4_corrupt_msg
[] =
338 "LZ4 not supported or corrupted LZ4 compressed block contents";
339 return Status::Corruption(lz4_corrupt_msg
);
341 *contents
= BlockContents(std::move(ubuf
), decompress_size
);
// LZ4HC blocks go through the same LZ4_Uncompress helper as plain LZ4; only
// the format lookup and the error message differ.
343 case kLZ4HCCompression
:
344 ubuf
= LZ4_Uncompress(
345 uncompression_info
, data
, n
, &decompress_size
,
346 GetCompressFormatForVersion(kLZ4HCCompression
, format_version
),
349 static char lz4hc_corrupt_msg
[] =
350 "LZ4HC not supported or corrupted LZ4HC compressed block contents";
351 return Status::Corruption(lz4hc_corrupt_msg
);
353 *contents
= BlockContents(std::move(ubuf
), decompress_size
);
355 case kXpressCompression
:
356 // XPRESS allocates memory internally, thus no support for custom
358 ubuf
.reset(XPRESS_Uncompress(data
, n
, &decompress_size
));
360 static char xpress_corrupt_msg
[] =
361 "XPRESS not supported or corrupted XPRESS compressed block "
363 return Status::Corruption(xpress_corrupt_msg
);
365 *contents
= BlockContents(std::move(ubuf
), decompress_size
);
368 case kZSTDNotFinalCompression
:
369 ubuf
= ZSTD_Uncompress(uncompression_info
, data
, n
, &decompress_size
,
372 static char zstd_corrupt_msg
[] =
373 "ZSTD not supported or corrupted ZSTD compressed block contents";
374 return Status::Corruption(zstd_corrupt_msg
);
376 *contents
= BlockContents(std::move(ubuf
), decompress_size
);
// NOTE(review): presumably the default case of the switch.
379 return Status::Corruption("bad block type");
// Statistics: the timing histogram is gated on ShouldReportDetailedTime().
382 if (ShouldReportDetailedTime(ioptions
.env
, ioptions
.statistics
)) {
383 RecordTimeToHistogram(ioptions
.statistics
, DECOMPRESSION_TIMES_NANOS
,
384 timer
.ElapsedNanos());
386 RecordTimeToHistogram(ioptions
.statistics
, BYTES_DECOMPRESSED
,
387 contents
->data
.size());
388 RecordTick(ioptions
.statistics
, NUMBER_BLOCK_DECOMPRESSED
);
394 // The 'data' points to the raw block contents that was read in from file.
395 // This method allocates a new heap buffer and the raw block
396 // contents are uncompresed into this buffer. This
397 // buffer is returned via 'result' and it is upto the caller to
399 // format_version is the block format as defined in include/rocksdb/table.h
400 Status
UncompressBlockContents(const UncompressionInfo
& uncompression_info
,
401 const char* data
, size_t n
,
402 BlockContents
* contents
, uint32_t format_version
,
403 const ImmutableCFOptions
& ioptions
,
404 MemoryAllocator
* allocator
) {
405 assert(data
[n
] != kNoCompression
);
406 assert(data
[n
] == uncompression_info
.type());
407 return UncompressBlockContentsForCompressionType(uncompression_info
, data
, n
,
408 contents
, format_version
,
409 ioptions
, allocator
);
412 } // namespace rocksdb