]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/table/format.cc
update source to Ceph Pacific 16.2.2
[ceph.git] / ceph / src / rocksdb / table / format.cc
CommitLineData
7c673cae 1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5//
6// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7// Use of this source code is governed by a BSD-style license that can be
8// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10#include "table/format.h"
11
f67539c2 12#include <cinttypes>
494da23a 13#include <string>
7c673cae 14
f67539c2
TL
15#include "block_fetcher.h"
16#include "file/random_access_file_reader.h"
17#include "logging/logging.h"
18#include "memory/memory_allocator.h"
7c673cae
FG
19#include "monitoring/perf_context_imp.h"
20#include "monitoring/statistics.h"
21#include "rocksdb/env.h"
f67539c2
TL
22#include "table/block_based/block.h"
23#include "table/block_based/block_based_table_reader.h"
7c673cae
FG
24#include "table/persistent_cache_helper.h"
25#include "util/coding.h"
26#include "util/compression.h"
27#include "util/crc32c.h"
7c673cae
FG
28#include "util/stop_watch.h"
29#include "util/string_util.h"
7c673cae 30
f67539c2 31namespace ROCKSDB_NAMESPACE {
7c673cae
FG
32
// Magic numbers identifying block-based table files; they are only declared
// here and defined in another translation unit.
extern const uint64_t kLegacyBlockBasedTableMagicNumber;
extern const uint64_t kBlockBasedTableMagicNumber;

#ifndef ROCKSDB_LITE
// Plain table magic numbers, likewise defined elsewhere.
extern const uint64_t kLegacyPlainTableMagicNumber;
extern const uint64_t kPlainTableMagicNumber;
#else
// ROCKSDB_LITE has no plain table, so both plain table magic numbers are
// defined here as zero.
const uint64_t kLegacyPlainTableMagicNumber = 0;
const uint64_t kPlainTableMagicNumber = 0;
#endif
7c673cae
FG
44
45bool ShouldReportDetailedTime(Env* env, Statistics* stats) {
46 return env != nullptr && stats != nullptr &&
494da23a 47 stats->get_stats_level() > kExceptDetailedTimers;
7c673cae
FG
48}
49
50void BlockHandle::EncodeTo(std::string* dst) const {
51 // Sanity check that all fields have been set
52 assert(offset_ != ~static_cast<uint64_t>(0));
53 assert(size_ != ~static_cast<uint64_t>(0));
54 PutVarint64Varint64(dst, offset_, size_);
55}
56
57Status BlockHandle::DecodeFrom(Slice* input) {
494da23a 58 if (GetVarint64(input, &offset_) && GetVarint64(input, &size_)) {
7c673cae
FG
59 return Status::OK();
60 } else {
61 // reset in case failure after partially decoding
62 offset_ = 0;
63 size_ = 0;
64 return Status::Corruption("bad block handle");
65 }
66}
67
11fdf7f2
TL
68Status BlockHandle::DecodeSizeFrom(uint64_t _offset, Slice* input) {
69 if (GetVarint64(input, &size_)) {
70 offset_ = _offset;
71 return Status::OK();
72 } else {
73 // reset in case failure after partially decoding
74 offset_ = 0;
75 size_ = 0;
76 return Status::Corruption("bad block handle");
77 }
78}
79
7c673cae
FG
80// Return a string that contains the copy of handle.
81std::string BlockHandle::ToString(bool hex) const {
82 std::string handle_str;
83 EncodeTo(&handle_str);
84 if (hex) {
85 return Slice(handle_str).ToString(true);
86 } else {
87 return handle_str;
88 }
89}
90
91const BlockHandle BlockHandle::kNullBlockHandle(0, 0);
92
f67539c2
TL
93void IndexValue::EncodeTo(std::string* dst, bool have_first_key,
94 const BlockHandle* previous_handle) const {
95 if (previous_handle) {
96 assert(handle.offset() == previous_handle->offset() +
97 previous_handle->size() + kBlockTrailerSize);
98 PutVarsignedint64(dst, handle.size() - previous_handle->size());
99 } else {
100 handle.EncodeTo(dst);
101 }
102 assert(dst->size() != 0);
103
104 if (have_first_key) {
105 PutLengthPrefixedSlice(dst, first_internal_key);
106 }
107}
108
109Status IndexValue::DecodeFrom(Slice* input, bool have_first_key,
110 const BlockHandle* previous_handle) {
111 if (previous_handle) {
112 int64_t delta;
113 if (!GetVarsignedint64(input, &delta)) {
114 return Status::Corruption("bad delta-encoded index value");
115 }
116 handle = BlockHandle(
117 previous_handle->offset() + previous_handle->size() + kBlockTrailerSize,
118 previous_handle->size() + delta);
119 } else {
120 Status s = handle.DecodeFrom(input);
121 if (!s.ok()) {
122 return s;
123 }
124 }
125
126 if (!have_first_key) {
127 first_internal_key = Slice();
128 } else if (!GetLengthPrefixedSlice(input, &first_internal_key)) {
129 return Status::Corruption("bad first key in block info");
130 }
131
132 return Status::OK();
133}
134
135std::string IndexValue::ToString(bool hex, bool have_first_key) const {
136 std::string s;
137 EncodeTo(&s, have_first_key, nullptr);
138 if (hex) {
139 return Slice(s).ToString(true);
140 } else {
141 return s;
142 }
143}
144
7c673cae
FG
145namespace {
146inline bool IsLegacyFooterFormat(uint64_t magic_number) {
147 return magic_number == kLegacyBlockBasedTableMagicNumber ||
148 magic_number == kLegacyPlainTableMagicNumber;
149}
150inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) {
151 if (magic_number == kLegacyBlockBasedTableMagicNumber) {
152 return kBlockBasedTableMagicNumber;
153 }
154 if (magic_number == kLegacyPlainTableMagicNumber) {
155 return kPlainTableMagicNumber;
156 }
157 assert(false);
158 return 0;
159}
160} // namespace
161
162// legacy footer format:
163// metaindex handle (varint64 offset, varint64 size)
164// index handle (varint64 offset, varint64 size)
165// <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength
166// table_magic_number (8 bytes)
167// new footer format:
11fdf7f2 168// checksum type (char, 1 byte)
7c673cae
FG
169// metaindex handle (varint64 offset, varint64 size)
170// index handle (varint64 offset, varint64 size)
171// <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength + 1
172// footer version (4 bytes)
173// table_magic_number (8 bytes)
174void Footer::EncodeTo(std::string* dst) const {
175 assert(HasInitializedTableMagicNumber());
176 if (IsLegacyFooterFormat(table_magic_number())) {
177 // has to be default checksum with legacy footer
178 assert(checksum_ == kCRC32c);
179 const size_t original_size = dst->size();
180 metaindex_handle_.EncodeTo(dst);
181 index_handle_.EncodeTo(dst);
182 dst->resize(original_size + 2 * BlockHandle::kMaxEncodedLength); // Padding
183 PutFixed32(dst, static_cast<uint32_t>(table_magic_number() & 0xffffffffu));
184 PutFixed32(dst, static_cast<uint32_t>(table_magic_number() >> 32));
185 assert(dst->size() == original_size + kVersion0EncodedLength);
186 } else {
187 const size_t original_size = dst->size();
188 dst->push_back(static_cast<char>(checksum_));
189 metaindex_handle_.EncodeTo(dst);
190 index_handle_.EncodeTo(dst);
191 dst->resize(original_size + kNewVersionsEncodedLength - 12); // Padding
192 PutFixed32(dst, version());
193 PutFixed32(dst, static_cast<uint32_t>(table_magic_number() & 0xffffffffu));
194 PutFixed32(dst, static_cast<uint32_t>(table_magic_number() >> 32));
195 assert(dst->size() == original_size + kNewVersionsEncodedLength);
196 }
197}
198
199Footer::Footer(uint64_t _table_magic_number, uint32_t _version)
200 : version_(_version),
201 checksum_(kCRC32c),
202 table_magic_number_(_table_magic_number) {
203 // This should be guaranteed by constructor callers
204 assert(!IsLegacyFooterFormat(_table_magic_number) || version_ == 0);
205}
206
207Status Footer::DecodeFrom(Slice* input) {
208 assert(!HasInitializedTableMagicNumber());
209 assert(input != nullptr);
210 assert(input->size() >= kMinEncodedLength);
211
494da23a 212 const char* magic_ptr =
7c673cae
FG
213 input->data() + input->size() - kMagicNumberLengthByte;
214 const uint32_t magic_lo = DecodeFixed32(magic_ptr);
215 const uint32_t magic_hi = DecodeFixed32(magic_ptr + 4);
216 uint64_t magic = ((static_cast<uint64_t>(magic_hi) << 32) |
217 (static_cast<uint64_t>(magic_lo)));
218
219 // We check for legacy formats here and silently upconvert them
220 bool legacy = IsLegacyFooterFormat(magic);
221 if (legacy) {
222 magic = UpconvertLegacyFooterFormat(magic);
223 }
224 set_table_magic_number(magic);
225
226 if (legacy) {
227 // The size is already asserted to be at least kMinEncodedLength
228 // at the beginning of the function
229 input->remove_prefix(input->size() - kVersion0EncodedLength);
230 version_ = 0 /* legacy */;
231 checksum_ = kCRC32c;
232 } else {
233 version_ = DecodeFixed32(magic_ptr - 4);
234 // Footer version 1 and higher will always occupy exactly this many bytes.
235 // It consists of the checksum type, two block handles, padding,
236 // a version number, and a magic number
237 if (input->size() < kNewVersionsEncodedLength) {
238 return Status::Corruption("input is too short to be an sstable");
239 } else {
240 input->remove_prefix(input->size() - kNewVersionsEncodedLength);
241 }
242 uint32_t chksum;
243 if (!GetVarint32(input, &chksum)) {
244 return Status::Corruption("bad checksum type");
245 }
246 checksum_ = static_cast<ChecksumType>(chksum);
247 }
248
249 Status result = metaindex_handle_.DecodeFrom(input);
250 if (result.ok()) {
251 result = index_handle_.DecodeFrom(input);
252 }
253 if (result.ok()) {
254 // We skip over any leftover data (just padding for now) in "input"
255 const char* end = magic_ptr + kMagicNumberLengthByte;
256 *input = Slice(end, input->data() + input->size() - end);
257 }
258 return result;
259}
260
261std::string Footer::ToString() const {
11fdf7f2 262 std::string result;
7c673cae
FG
263 result.reserve(1024);
264
265 bool legacy = IsLegacyFooterFormat(table_magic_number_);
266 if (legacy) {
267 result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n ");
268 result.append("index handle: " + index_handle_.ToString() + "\n ");
269 result.append("table_magic_number: " +
f67539c2 270 ROCKSDB_NAMESPACE::ToString(table_magic_number_) + "\n ");
7c673cae 271 } else {
f67539c2
TL
272 result.append("checksum: " + ROCKSDB_NAMESPACE::ToString(checksum_) +
273 "\n ");
7c673cae
FG
274 result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n ");
275 result.append("index handle: " + index_handle_.ToString() + "\n ");
f67539c2
TL
276 result.append("footer version: " + ROCKSDB_NAMESPACE::ToString(version_) +
277 "\n ");
7c673cae 278 result.append("table_magic_number: " +
f67539c2 279 ROCKSDB_NAMESPACE::ToString(table_magic_number_) + "\n ");
7c673cae
FG
280 }
281 return result;
282}
283
11fdf7f2
TL
284Status ReadFooterFromFile(RandomAccessFileReader* file,
285 FilePrefetchBuffer* prefetch_buffer,
286 uint64_t file_size, Footer* footer,
287 uint64_t enforce_table_magic_number) {
7c673cae 288 if (file_size < Footer::kMinEncodedLength) {
494da23a
TL
289 return Status::Corruption("file is too short (" + ToString(file_size) +
290 " bytes) to be an "
291 "sstable: " +
292 file->file_name());
7c673cae
FG
293 }
294
295 char footer_space[Footer::kMaxEncodedLength];
296 Slice footer_input;
297 size_t read_offset =
298 (file_size > Footer::kMaxEncodedLength)
299 ? static_cast<size_t>(file_size - Footer::kMaxEncodedLength)
300 : 0;
11fdf7f2
TL
301 Status s;
302 if (prefetch_buffer == nullptr ||
303 !prefetch_buffer->TryReadFromCache(read_offset, Footer::kMaxEncodedLength,
304 &footer_input)) {
305 s = file->Read(read_offset, Footer::kMaxEncodedLength, &footer_input,
306 footer_space);
307 if (!s.ok()) return s;
308 }
7c673cae
FG
309
310 // Check that we actually read the whole footer from the file. It may be
311 // that size isn't correct.
312 if (footer_input.size() < Footer::kMinEncodedLength) {
494da23a
TL
313 return Status::Corruption("file is too short (" + ToString(file_size) +
314 " bytes) to be an "
315 "sstable" +
316 file->file_name());
7c673cae
FG
317 }
318
319 s = footer->DecodeFrom(&footer_input);
320 if (!s.ok()) {
321 return s;
322 }
323 if (enforce_table_magic_number != 0 &&
324 enforce_table_magic_number != footer->table_magic_number()) {
11fdf7f2 325 return Status::Corruption(
494da23a
TL
326 "Bad table magic number: expected " +
327 ToString(enforce_table_magic_number) + ", found " +
328 ToString(footer->table_magic_number()) + " in " + file->file_name());
7c673cae
FG
329 }
330 return Status::OK();
331}
332
7c673cae 333Status UncompressBlockContentsForCompressionType(
494da23a 334 const UncompressionInfo& uncompression_info, const char* data, size_t n,
11fdf7f2 335 BlockContents* contents, uint32_t format_version,
494da23a
TL
336 const ImmutableCFOptions& ioptions, MemoryAllocator* allocator) {
337 CacheAllocationPtr ubuf;
7c673cae 338
494da23a 339 assert(uncompression_info.type() != kNoCompression &&
11fdf7f2 340 "Invalid compression type");
7c673cae 341
494da23a
TL
342 StopWatchNano timer(ioptions.env, ShouldReportDetailedTime(
343 ioptions.env, ioptions.statistics));
7c673cae 344 int decompress_size = 0;
494da23a 345 switch (uncompression_info.type()) {
7c673cae
FG
346 case kSnappyCompression: {
347 size_t ulength = 0;
348 static char snappy_corrupt_msg[] =
494da23a 349 "Snappy not supported or corrupted Snappy compressed block contents";
7c673cae
FG
350 if (!Snappy_GetUncompressedLength(data, n, &ulength)) {
351 return Status::Corruption(snappy_corrupt_msg);
352 }
494da23a 353 ubuf = AllocateBlock(ulength, allocator);
7c673cae
FG
354 if (!Snappy_Uncompress(data, n, ubuf.get())) {
355 return Status::Corruption(snappy_corrupt_msg);
356 }
494da23a 357 *contents = BlockContents(std::move(ubuf), ulength);
7c673cae
FG
358 break;
359 }
360 case kZlibCompression:
494da23a
TL
361 ubuf = Zlib_Uncompress(
362 uncompression_info, data, n, &decompress_size,
363 GetCompressFormatForVersion(kZlibCompression, format_version),
364 allocator);
7c673cae
FG
365 if (!ubuf) {
366 static char zlib_corrupt_msg[] =
494da23a 367 "Zlib not supported or corrupted Zlib compressed block contents";
7c673cae
FG
368 return Status::Corruption(zlib_corrupt_msg);
369 }
494da23a 370 *contents = BlockContents(std::move(ubuf), decompress_size);
7c673cae
FG
371 break;
372 case kBZip2Compression:
494da23a 373 ubuf = BZip2_Uncompress(
7c673cae 374 data, n, &decompress_size,
494da23a
TL
375 GetCompressFormatForVersion(kBZip2Compression, format_version),
376 allocator);
7c673cae
FG
377 if (!ubuf) {
378 static char bzip2_corrupt_msg[] =
494da23a 379 "Bzip2 not supported or corrupted Bzip2 compressed block contents";
7c673cae
FG
380 return Status::Corruption(bzip2_corrupt_msg);
381 }
494da23a 382 *contents = BlockContents(std::move(ubuf), decompress_size);
7c673cae
FG
383 break;
384 case kLZ4Compression:
494da23a
TL
385 ubuf = LZ4_Uncompress(
386 uncompression_info, data, n, &decompress_size,
387 GetCompressFormatForVersion(kLZ4Compression, format_version),
388 allocator);
7c673cae
FG
389 if (!ubuf) {
390 static char lz4_corrupt_msg[] =
494da23a 391 "LZ4 not supported or corrupted LZ4 compressed block contents";
7c673cae
FG
392 return Status::Corruption(lz4_corrupt_msg);
393 }
494da23a 394 *contents = BlockContents(std::move(ubuf), decompress_size);
7c673cae
FG
395 break;
396 case kLZ4HCCompression:
494da23a
TL
397 ubuf = LZ4_Uncompress(
398 uncompression_info, data, n, &decompress_size,
399 GetCompressFormatForVersion(kLZ4HCCompression, format_version),
400 allocator);
7c673cae
FG
401 if (!ubuf) {
402 static char lz4hc_corrupt_msg[] =
494da23a 403 "LZ4HC not supported or corrupted LZ4HC compressed block contents";
7c673cae
FG
404 return Status::Corruption(lz4hc_corrupt_msg);
405 }
494da23a 406 *contents = BlockContents(std::move(ubuf), decompress_size);
7c673cae
FG
407 break;
408 case kXpressCompression:
494da23a
TL
409 // XPRESS allocates memory internally, thus no support for custom
410 // allocator.
7c673cae
FG
411 ubuf.reset(XPRESS_Uncompress(data, n, &decompress_size));
412 if (!ubuf) {
413 static char xpress_corrupt_msg[] =
494da23a
TL
414 "XPRESS not supported or corrupted XPRESS compressed block "
415 "contents";
7c673cae
FG
416 return Status::Corruption(xpress_corrupt_msg);
417 }
494da23a 418 *contents = BlockContents(std::move(ubuf), decompress_size);
7c673cae
FG
419 break;
420 case kZSTD:
421 case kZSTDNotFinalCompression:
494da23a
TL
422 ubuf = ZSTD_Uncompress(uncompression_info, data, n, &decompress_size,
423 allocator);
7c673cae
FG
424 if (!ubuf) {
425 static char zstd_corrupt_msg[] =
426 "ZSTD not supported or corrupted ZSTD compressed block contents";
427 return Status::Corruption(zstd_corrupt_msg);
428 }
494da23a 429 *contents = BlockContents(std::move(ubuf), decompress_size);
7c673cae
FG
430 break;
431 default:
432 return Status::Corruption("bad block type");
433 }
434
494da23a
TL
435 if (ShouldReportDetailedTime(ioptions.env, ioptions.statistics)) {
436 RecordTimeToHistogram(ioptions.statistics, DECOMPRESSION_TIMES_NANOS,
437 timer.ElapsedNanos());
7c673cae 438 }
494da23a
TL
439 RecordTimeToHistogram(ioptions.statistics, BYTES_DECOMPRESSED,
440 contents->data.size());
11fdf7f2 441 RecordTick(ioptions.statistics, NUMBER_BLOCK_DECOMPRESSED);
7c673cae
FG
442
443 return Status::OK();
444}
445
446//
447// The 'data' points to the raw block contents that was read in from file.
448// This method allocates a new heap buffer and the raw block
449// contents are uncompresed into this buffer. This
450// buffer is returned via 'result' and it is upto the caller to
451// free this buffer.
452// format_version is the block format as defined in include/rocksdb/table.h
494da23a 453Status UncompressBlockContents(const UncompressionInfo& uncompression_info,
11fdf7f2 454 const char* data, size_t n,
7c673cae 455 BlockContents* contents, uint32_t format_version,
494da23a
TL
456 const ImmutableCFOptions& ioptions,
457 MemoryAllocator* allocator) {
7c673cae 458 assert(data[n] != kNoCompression);
494da23a
TL
459 assert(data[n] == uncompression_info.type());
460 return UncompressBlockContentsForCompressionType(uncompression_info, data, n,
461 contents, format_version,
462 ioptions, allocator);
7c673cae
FG
463}
464
f67539c2 465} // namespace ROCKSDB_NAMESPACE