1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
10 #include "table/format.h"
15 #include "monitoring/perf_context_imp.h"
16 #include "monitoring/statistics.h"
17 #include "rocksdb/env.h"
18 #include "table/block.h"
19 #include "table/block_based_table_reader.h"
20 #include "table/block_fetcher.h"
21 #include "table/persistent_cache_helper.h"
22 #include "util/coding.h"
23 #include "util/compression.h"
24 #include "util/crc32c.h"
25 #include "util/file_reader_writer.h"
26 #include "util/logging.h"
27 #include "util/stop_watch.h"
28 #include "util/string_util.h"
29 #include "util/xxhash.h"
// Table magic numbers used by the footer helpers in this file. All four are
// declared with external linkage; the two plain-table ones are additionally
// defined as 0 further down.
// NOTE(review): a declaration group and a definition group for the same
// plain-table names presumably sit in alternative preprocessor branches
// (see the ROCKSDB_LITE remark); the conditionals themselves are not visible
// in this listing -- confirm against the full source.
33 extern const uint64_t kLegacyBlockBasedTableMagicNumber
;
34 extern const uint64_t kBlockBasedTableMagicNumber
;
37 extern const uint64_t kLegacyPlainTableMagicNumber
;
38 extern const uint64_t kPlainTableMagicNumber
;
40 // ROCKSDB_LITE doesn't have plain table
// With no plain table available, both plain-table magic numbers are 0.
41 const uint64_t kLegacyPlainTableMagicNumber
= 0;
42 const uint64_t kPlainTableMagicNumber
= 0;
45 bool ShouldReportDetailedTime(Env
* env
, Statistics
* stats
) {
46 return env
!= nullptr && stats
!= nullptr &&
47 stats
->stats_level_
> kExceptDetailedTimers
;
50 void BlockHandle::EncodeTo(std::string
* dst
) const {
51 // Sanity check that all fields have been set
52 assert(offset_
!= ~static_cast<uint64_t>(0));
53 assert(size_
!= ~static_cast<uint64_t>(0));
54 PutVarint64Varint64(dst
, offset_
, size_
);
// Decodes a block handle from *input by reading two varint64 values, first
// into offset_ and then into size_. The visible failure path returns
// Status::Corruption("bad block handle").
// NOTE(review): the statements of the success branch and the field reset
// announced by the comment below are not visible in this listing -- confirm
// against the full source.
57 Status
BlockHandle::DecodeFrom(Slice
* input
) {
// Both reads must succeed; the second is attempted only if the first did.
58 if (GetVarint64(input
, &offset_
) &&
59 GetVarint64(input
, &size_
)) {
62 // reset in case failure after partially decoding
65 return Status::Corruption("bad block handle");
// Decodes only the size part of a block handle: a single varint64 is read
// from *input into size_. The visible failure path returns
// Status::Corruption("bad block handle").
// NOTE(review): how `_offset` is used and what the success branch returns is
// not visible in this listing; presumably offset_ is set from `_offset` on
// success -- confirm against the full source.
69 Status
BlockHandle::DecodeSizeFrom(uint64_t _offset
, Slice
* input
) {
70 if (GetVarint64(input
, &size_
)) {
74 // reset in case failure after partially decoding
77 return Status::Corruption("bad block handle");
81 // Return a string that contains the copy of handle.
// The handle is first serialized with EncodeTo() into a local string; the
// visible return converts that encoding through Slice::ToString(true).
// NOTE(review): no use of the `hex` parameter is visible in this listing;
// presumably it selects between the converted and the raw encoding -- confirm
// against the full source.
82 std::string
BlockHandle::ToString(bool hex
) const {
83 std::string handle_str
;
84 EncodeTo(&handle_str
);
86 return Slice(handle_str
).ToString(true);
// Definition of the static null handle, constructed with the arguments (0, 0).
92 const BlockHandle
BlockHandle::kNullBlockHandle(0, 0);
95 inline bool IsLegacyFooterFormat(uint64_t magic_number
) {
96 return magic_number
== kLegacyBlockBasedTableMagicNumber
||
97 magic_number
== kLegacyPlainTableMagicNumber
;
// Maps a legacy magic number to its current counterpart: the legacy
// block-based value yields kBlockBasedTableMagicNumber and the legacy plain
// table value yields kPlainTableMagicNumber.
// NOTE(review): the behaviour for any other input (the tail of the function)
// is not visible in this listing -- confirm against the full source.
99 inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number
) {
100 if (magic_number
== kLegacyBlockBasedTableMagicNumber
) {
101 return kBlockBasedTableMagicNumber
;
103 if (magic_number
== kLegacyPlainTableMagicNumber
) {
104 return kPlainTableMagicNumber
;
111 // legacy footer format:
112 // metaindex handle (varint64 offset, varint64 size)
113 // index handle (varint64 offset, varint64 size)
114 // <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength
115 // table_magic_number (8 bytes)
116 // new footer format:
117 // checksum type (char, 1 byte)
118 // metaindex handle (varint64 offset, varint64 size)
119 // index handle (varint64 offset, varint64 size)
120 // <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength + 1
121 // footer version (4 bytes)
122 // table_magic_number (8 bytes)
123 void Footer::EncodeTo(std::string
* dst
) const {
124 assert(HasInitializedTableMagicNumber());
125 if (IsLegacyFooterFormat(table_magic_number())) {
126 // has to be default checksum with legacy footer
127 assert(checksum_
== kCRC32c
);
128 const size_t original_size
= dst
->size();
129 metaindex_handle_
.EncodeTo(dst
);
130 index_handle_
.EncodeTo(dst
);
131 dst
->resize(original_size
+ 2 * BlockHandle::kMaxEncodedLength
); // Padding
132 PutFixed32(dst
, static_cast<uint32_t>(table_magic_number() & 0xffffffffu
));
133 PutFixed32(dst
, static_cast<uint32_t>(table_magic_number() >> 32));
134 assert(dst
->size() == original_size
+ kVersion0EncodedLength
);
136 const size_t original_size
= dst
->size();
137 dst
->push_back(static_cast<char>(checksum_
));
138 metaindex_handle_
.EncodeTo(dst
);
139 index_handle_
.EncodeTo(dst
);
140 dst
->resize(original_size
+ kNewVersionsEncodedLength
- 12); // Padding
141 PutFixed32(dst
, version());
142 PutFixed32(dst
, static_cast<uint32_t>(table_magic_number() & 0xffffffffu
));
143 PutFixed32(dst
, static_cast<uint32_t>(table_magic_number() >> 32));
144 assert(dst
->size() == original_size
+ kNewVersionsEncodedLength
);
// Constructs a footer from a table magic number and a footer version.
// A legacy magic number is only valid together with version 0; this is
// checked with an assert.
// NOTE(review): an initializer between version_ and table_magic_number_
// appears to be missing from this listing (possibly checksum_) -- confirm
// against the full source.
148 Footer::Footer(uint64_t _table_magic_number
, uint32_t _version
)
149 : version_(_version
),
151 table_magic_number_(_table_magic_number
) {
152 // This should be guaranteed by constructor callers
153 assert(!IsLegacyFooterFormat(_table_magic_number
) || version_
== 0);
// Decodes a footer from the tail of *input.
// Preconditions (asserted): the table magic number is not yet initialized,
// `input` is non-null and holds at least kMinEncodedLength bytes.
// The magic number is read from the last kMagicNumberLengthByte bytes, legacy
// magic numbers are upconverted, then the checksum type and the metaindex and
// index handles are decoded. Finally *input is repositioned at the end of the
// magic number.
// Visible error returns: Status::Corruption when the input is shorter than
// kNewVersionsEncodedLength or when the checksum type cannot be read.
// NOTE(review): the legacy/new branch structure, the declaration of `chksum`
// and the handling of `result` are not visible in this listing -- confirm
// against the full source.
156 Status
Footer::DecodeFrom(Slice
* input
) {
157 assert(!HasInitializedTableMagicNumber());
158 assert(input
!= nullptr);
159 assert(input
->size() >= kMinEncodedLength
);
// The magic number occupies the final kMagicNumberLengthByte bytes and is
// stored as two fixed32 values: low half first, high half at offset 4.
161 const char *magic_ptr
=
162 input
->data() + input
->size() - kMagicNumberLengthByte
;
163 const uint32_t magic_lo
= DecodeFixed32(magic_ptr
);
164 const uint32_t magic_hi
= DecodeFixed32(magic_ptr
+ 4);
165 uint64_t magic
= ((static_cast<uint64_t>(magic_hi
) << 32) |
166 (static_cast<uint64_t>(magic_lo
)));
168 // We check for legacy formats here and silently upconvert them
169 bool legacy
= IsLegacyFooterFormat(magic
);
171 magic
= UpconvertLegacyFooterFormat(magic
);
173 set_table_magic_number(magic
);
176 // The size is already asserted to be at least kMinEncodedLength
177 // at the beginning of the function
// Keep only the last kVersion0EncodedLength bytes of the input.
178 input
->remove_prefix(input
->size() - kVersion0EncodedLength
);
179 version_
= 0 /* legacy */;
// The footer version is the fixed32 stored immediately before the magic
// number.
182 version_
= DecodeFixed32(magic_ptr
- 4);
183 // Footer version 1 and higher will always occupy exactly this many bytes.
184 // It consists of the checksum type, two block handles, padding,
185 // a version number, and a magic number
186 if (input
->size() < kNewVersionsEncodedLength
) {
187 return Status::Corruption("input is too short to be an sstable");
// Keep only the last kNewVersionsEncodedLength bytes of the input.
189 input
->remove_prefix(input
->size() - kNewVersionsEncodedLength
);
// The checksum type is read as a varint32 and cast to ChecksumType.
192 if (!GetVarint32(input
, &chksum
)) {
193 return Status::Corruption("bad checksum type");
195 checksum_
= static_cast<ChecksumType
>(chksum
);
198 Status result
= metaindex_handle_
.DecodeFrom(input
);
200 result
= index_handle_
.DecodeFrom(input
);
203 // We skip over any leftover data (just padding for now) in "input"
204 const char* end
= magic_ptr
+ kMagicNumberLengthByte
;
205 *input
= Slice(end
, input
->data() + input
->size() - end
);
// Builds a multi-line textual description of the footer in `result`.
// Two groups of appends are visible: one with the two handles and the magic
// number, and one that additionally lists the checksum and footer version.
// NOTE(review): the declaration of `result`, the use of `legacy` to choose
// between the two groups, and the return statement are not visible in this
// listing -- confirm against the full source.
210 std::string
Footer::ToString() const {
212 result
.reserve(1024);
214 bool legacy
= IsLegacyFooterFormat(table_magic_number_
);
// First group: handles and magic number only.
216 result
.append("metaindex handle: " + metaindex_handle_
.ToString() + "\n ");
217 result
.append("index handle: " + index_handle_
.ToString() + "\n ");
218 result
.append("table_magic_number: " +
219 rocksdb::ToString(table_magic_number_
) + "\n ");
// Second group: checksum, handles, footer version and magic number.
221 result
.append("checksum: " + rocksdb::ToString(checksum_
) + "\n ");
222 result
.append("metaindex handle: " + metaindex_handle_
.ToString() + "\n ");
223 result
.append("index handle: " + index_handle_
.ToString() + "\n ");
224 result
.append("footer version: " + rocksdb::ToString(version_
) + "\n ");
225 result
.append("table_magic_number: " +
226 rocksdb::ToString(table_magic_number_
) + "\n ");
// Reads the footer of `file` (whose length is `file_size`) into *footer.
// Up to Footer::kMaxEncodedLength bytes are taken from the prefetch buffer
// when one is supplied and it can serve the request, otherwise from the file
// itself. The bytes are then decoded with Footer::DecodeFrom.
// If `enforce_table_magic_number` is non-zero, the decoded magic number must
// equal it.
// Visible error returns: Status::Corruption when the file or the bytes
// actually read are shorter than Footer::kMinEncodedLength, or on a magic
// number mismatch; a failed file read returns that read's status.
// NOTE(review): the declarations of `footer_input`, `read_offset` and `s`,
// the handling of the DecodeFrom status and the final return are not visible
// in this listing -- confirm against the full source.
231 Status
ReadFooterFromFile(RandomAccessFileReader
* file
,
232 FilePrefetchBuffer
* prefetch_buffer
,
233 uint64_t file_size
, Footer
* footer
,
234 uint64_t enforce_table_magic_number
) {
235 if (file_size
< Footer::kMinEncodedLength
) {
236 return Status::Corruption(
237 "file is too short (" + ToString(file_size
) + " bytes) to be an "
238 "sstable: " + file
->file_name());
// Stack buffer sized for the largest possible footer.
241 char footer_space
[Footer::kMaxEncodedLength
];
// When the file is longer than the largest footer, the expression below gives
// the distance from the start of the file to its last kMaxEncodedLength bytes.
244 (file_size
> Footer::kMaxEncodedLength
)
245 ? static_cast<size_t>(file_size
- Footer::kMaxEncodedLength
)
// Fall back to reading the file when there is no prefetch buffer or the
// buffer cannot satisfy the read.
248 if (prefetch_buffer
== nullptr ||
249 !prefetch_buffer
->TryReadFromCache(read_offset
, Footer::kMaxEncodedLength
,
251 s
= file
->Read(read_offset
, Footer::kMaxEncodedLength
, &footer_input
,
253 if (!s
.ok()) return s
;
256 // Check that we actually read the whole footer from the file. It may be
257 // that size isn't correct.
// NOTE(review): unlike the first "too short" message above, this one has no
// ": " between "sstable" and the file name, so the two run together.
258 if (footer_input
.size() < Footer::kMinEncodedLength
) {
259 return Status::Corruption(
260 "file is too short (" + ToString(file_size
) + " bytes) to be an "
261 "sstable" + file
->file_name());
264 s
= footer
->DecodeFrom(&footer_input
);
// A value of 0 for enforce_table_magic_number disables this check.
268 if (enforce_table_magic_number
!= 0 &&
269 enforce_table_magic_number
!= footer
->table_magic_number()) {
270 return Status::Corruption(
271 "Bad table magic number: expected "
272 + ToString(enforce_table_magic_number
) + ", found "
273 + ToString(footer
->table_magic_number())
274 + " in " + file
->file_name());
// Uncompresses the `n` bytes at `data` using the codec reported by
// uncompression_ctx.type(), which must not be kNoCompression (asserted).
// Each codec case fills the heap buffer `ubuf` and builds a BlockContents
// from it that is tagged kNoCompression. Each case also has a
// Status::Corruption return with a codec-specific message, and an
// unrecognized type returns Status::Corruption("bad block type").
// After the switch the function records decompression statistics in
// ioptions.statistics.
// NOTE(review): for every codec except Snappy, the failure guards, the
// assignment into *contents and the case terminators are not visible in this
// listing; the declaration of `ulength` and the final return are likewise
// missing -- confirm against the full source.
279 Status
UncompressBlockContentsForCompressionType(
280 const UncompressionContext
& uncompression_ctx
, const char* data
, size_t n
,
281 BlockContents
* contents
, uint32_t format_version
,
282 const ImmutableCFOptions
& ioptions
) {
283 std::unique_ptr
<char[]> ubuf
;
285 assert(uncompression_ctx
.type() != kNoCompression
&&
286 "Invalid compression type");
// The second constructor argument is the ShouldReportDetailedTime() result.
288 StopWatchNano
timer(ioptions
.env
,
289 ShouldReportDetailedTime(ioptions
.env
, ioptions
.statistics
));
290 int decompress_size
= 0;
291 switch (uncompression_ctx
.type()) {
// Snappy: query the uncompressed length, allocate exactly that much, then
// uncompress into the new buffer. Either step failing is reported with the
// same corruption message.
292 case kSnappyCompression
: {
294 static char snappy_corrupt_msg
[] =
295 "Snappy not supported or corrupted Snappy compressed block contents";
296 if (!Snappy_GetUncompressedLength(data
, n
, &ulength
)) {
297 return Status::Corruption(snappy_corrupt_msg
);
299 ubuf
.reset(new char[ulength
]);
300 if (!Snappy_Uncompress(data
, n
, ubuf
.get())) {
301 return Status::Corruption(snappy_corrupt_msg
);
303 *contents
= BlockContents(std::move(ubuf
), ulength
, true, kNoCompression
);
// Zlib: the helper's result is adopted by ubuf and the output length is
// written to decompress_size.
306 case kZlibCompression
:
307 ubuf
.reset(Zlib_Uncompress(
308 uncompression_ctx
, data
, n
, &decompress_size
,
309 GetCompressFormatForVersion(kZlibCompression
, format_version
)));
311 static char zlib_corrupt_msg
[] =
312 "Zlib not supported or corrupted Zlib compressed block contents";
313 return Status::Corruption(zlib_corrupt_msg
);
316 BlockContents(std::move(ubuf
), decompress_size
, true, kNoCompression
);
// BZip2: same shape as Zlib, but the helper takes no uncompression context.
318 case kBZip2Compression
:
319 ubuf
.reset(BZip2_Uncompress(
320 data
, n
, &decompress_size
,
321 GetCompressFormatForVersion(kBZip2Compression
, format_version
)));
323 static char bzip2_corrupt_msg
[] =
324 "Bzip2 not supported or corrupted Bzip2 compressed block contents";
325 return Status::Corruption(bzip2_corrupt_msg
);
328 BlockContents(std::move(ubuf
), decompress_size
, true, kNoCompression
);
// LZ4 and LZ4HC both call LZ4_Uncompress; they differ in the compression type
// passed to GetCompressFormatForVersion and in the error message.
330 case kLZ4Compression
:
331 ubuf
.reset(LZ4_Uncompress(
332 uncompression_ctx
, data
, n
, &decompress_size
,
333 GetCompressFormatForVersion(kLZ4Compression
, format_version
)));
335 static char lz4_corrupt_msg
[] =
336 "LZ4 not supported or corrupted LZ4 compressed block contents";
337 return Status::Corruption(lz4_corrupt_msg
);
340 BlockContents(std::move(ubuf
), decompress_size
, true, kNoCompression
);
342 case kLZ4HCCompression
:
343 ubuf
.reset(LZ4_Uncompress(
344 uncompression_ctx
, data
, n
, &decompress_size
,
345 GetCompressFormatForVersion(kLZ4HCCompression
, format_version
)));
347 static char lz4hc_corrupt_msg
[] =
348 "LZ4HC not supported or corrupted LZ4HC compressed block contents";
349 return Status::Corruption(lz4hc_corrupt_msg
);
352 BlockContents(std::move(ubuf
), decompress_size
, true, kNoCompression
);
// XPRESS: the helper takes neither an uncompression context nor a format
// version.
354 case kXpressCompression
:
355 ubuf
.reset(XPRESS_Uncompress(data
, n
, &decompress_size
));
357 static char xpress_corrupt_msg
[] =
358 "XPRESS not supported or corrupted XPRESS compressed block contents";
359 return Status::Corruption(xpress_corrupt_msg
);
362 BlockContents(std::move(ubuf
), decompress_size
, true, kNoCompression
);
// ZSTD: the helper takes the uncompression context but no format version.
365 case kZSTDNotFinalCompression
:
366 ubuf
.reset(ZSTD_Uncompress(uncompression_ctx
, data
, n
, &decompress_size
));
368 static char zstd_corrupt_msg
[] =
369 "ZSTD not supported or corrupted ZSTD compressed block contents";
370 return Status::Corruption(zstd_corrupt_msg
);
373 BlockContents(std::move(ubuf
), decompress_size
, true, kNoCompression
);
376 return Status::Corruption("bad block type");
// Statistics: the elapsed time is recorded only when detailed timing is
// enabled; the calls recording the decompressed byte count and the
// decompressed-block tick follow it.
379 if(ShouldReportDetailedTime(ioptions
.env
, ioptions
.statistics
)){
380 MeasureTime(ioptions
.statistics
, DECOMPRESSION_TIMES_NANOS
,
381 timer
.ElapsedNanos());
383 MeasureTime(ioptions
.statistics
, BYTES_DECOMPRESSED
, contents
->data
.size());
384 RecordTick(ioptions
.statistics
, NUMBER_BLOCK_DECOMPRESSED
);
390 // The 'data' points to the raw block contents that was read in from file.
391 // This method allocates a new heap buffer and the raw block
392 // contents are uncompresed into this buffer. This
393 // buffer is returned via 'result' and it is upto the caller to
395 // format_version is the block format as defined in include/rocksdb/table.h
396 Status
UncompressBlockContents(const UncompressionContext
& uncompression_ctx
,
397 const char* data
, size_t n
,
398 BlockContents
* contents
, uint32_t format_version
,
399 const ImmutableCFOptions
& ioptions
) {
400 assert(data
[n
] != kNoCompression
);
401 assert(data
[n
] == uncompression_ctx
.type());
402 return UncompressBlockContentsForCompressionType(
403 uncompression_ctx
, data
, n
, contents
, format_version
, ioptions
);
406 } // namespace rocksdb