]> git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/table/format.cc
add subtree-ish sources for 12.0.3
[ceph.git] / ceph / src / rocksdb / table / format.cc
1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under the BSD-style license found in the
3 // LICENSE file in the root directory of this source tree. An additional grant
4 // of patent rights can be found in the PATENTS file in the same directory.
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10 #include "table/format.h"
11
12 #include <string>
13 #include <inttypes.h>
14
15 #include "monitoring/perf_context_imp.h"
16 #include "monitoring/statistics.h"
17 #include "rocksdb/env.h"
18 #include "table/block.h"
19 #include "table/block_based_table_reader.h"
20 #include "table/persistent_cache_helper.h"
21 #include "util/coding.h"
22 #include "util/compression.h"
23 #include "util/crc32c.h"
24 #include "util/file_reader_writer.h"
25 #include "util/logging.h"
26 #include "util/stop_watch.h"
27 #include "util/string_util.h"
28 #include "util/xxhash.h"
29
30 namespace rocksdb {
31
// Magic numbers identifying block-based tables; defined in another
// translation unit.
extern const uint64_t kLegacyBlockBasedTableMagicNumber;
extern const uint64_t kBlockBasedTableMagicNumber;

#ifdef ROCKSDB_LITE
// ROCKSDB_LITE doesn't have plain table, so the plain-table magic numbers
// are defined locally as zero.
const uint64_t kLegacyPlainTableMagicNumber = 0;
const uint64_t kPlainTableMagicNumber = 0;
#else
extern const uint64_t kLegacyPlainTableMagicNumber;
extern const uint64_t kPlainTableMagicNumber;
#endif

// Size, in bytes, of the on-stack scratch buffer used by ReadBlockContents().
const uint32_t DefaultStackBufferSize = 5000;
44
45 bool ShouldReportDetailedTime(Env* env, Statistics* stats) {
46 return env != nullptr && stats != nullptr &&
47 stats->stats_level_ > kExceptDetailedTimers;
48 }
49
50 void BlockHandle::EncodeTo(std::string* dst) const {
51 // Sanity check that all fields have been set
52 assert(offset_ != ~static_cast<uint64_t>(0));
53 assert(size_ != ~static_cast<uint64_t>(0));
54 PutVarint64Varint64(dst, offset_, size_);
55 }
56
57 Status BlockHandle::DecodeFrom(Slice* input) {
58 if (GetVarint64(input, &offset_) &&
59 GetVarint64(input, &size_)) {
60 return Status::OK();
61 } else {
62 // reset in case failure after partially decoding
63 offset_ = 0;
64 size_ = 0;
65 return Status::Corruption("bad block handle");
66 }
67 }
68
69 // Return a string that contains the copy of handle.
70 std::string BlockHandle::ToString(bool hex) const {
71 std::string handle_str;
72 EncodeTo(&handle_str);
73 if (hex) {
74 return Slice(handle_str).ToString(true);
75 } else {
76 return handle_str;
77 }
78 }
79
80 const BlockHandle BlockHandle::kNullBlockHandle(0, 0);
81
82 namespace {
83 inline bool IsLegacyFooterFormat(uint64_t magic_number) {
84 return magic_number == kLegacyBlockBasedTableMagicNumber ||
85 magic_number == kLegacyPlainTableMagicNumber;
86 }
87 inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) {
88 if (magic_number == kLegacyBlockBasedTableMagicNumber) {
89 return kBlockBasedTableMagicNumber;
90 }
91 if (magic_number == kLegacyPlainTableMagicNumber) {
92 return kPlainTableMagicNumber;
93 }
94 assert(false);
95 return 0;
96 }
97 } // namespace
98
99 // legacy footer format:
100 // metaindex handle (varint64 offset, varint64 size)
101 // index handle (varint64 offset, varint64 size)
102 // <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength
103 // table_magic_number (8 bytes)
104 // new footer format:
105 // checksum (char, 1 byte)
106 // metaindex handle (varint64 offset, varint64 size)
107 // index handle (varint64 offset, varint64 size)
108 // <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength + 1
109 // footer version (4 bytes)
110 // table_magic_number (8 bytes)
111 void Footer::EncodeTo(std::string* dst) const {
112 assert(HasInitializedTableMagicNumber());
113 if (IsLegacyFooterFormat(table_magic_number())) {
114 // has to be default checksum with legacy footer
115 assert(checksum_ == kCRC32c);
116 const size_t original_size = dst->size();
117 metaindex_handle_.EncodeTo(dst);
118 index_handle_.EncodeTo(dst);
119 dst->resize(original_size + 2 * BlockHandle::kMaxEncodedLength); // Padding
120 PutFixed32(dst, static_cast<uint32_t>(table_magic_number() & 0xffffffffu));
121 PutFixed32(dst, static_cast<uint32_t>(table_magic_number() >> 32));
122 assert(dst->size() == original_size + kVersion0EncodedLength);
123 } else {
124 const size_t original_size = dst->size();
125 dst->push_back(static_cast<char>(checksum_));
126 metaindex_handle_.EncodeTo(dst);
127 index_handle_.EncodeTo(dst);
128 dst->resize(original_size + kNewVersionsEncodedLength - 12); // Padding
129 PutFixed32(dst, version());
130 PutFixed32(dst, static_cast<uint32_t>(table_magic_number() & 0xffffffffu));
131 PutFixed32(dst, static_cast<uint32_t>(table_magic_number() >> 32));
132 assert(dst->size() == original_size + kNewVersionsEncodedLength);
133 }
134 }
135
136 Footer::Footer(uint64_t _table_magic_number, uint32_t _version)
137 : version_(_version),
138 checksum_(kCRC32c),
139 table_magic_number_(_table_magic_number) {
140 // This should be guaranteed by constructor callers
141 assert(!IsLegacyFooterFormat(_table_magic_number) || version_ == 0);
142 }
143
144 Status Footer::DecodeFrom(Slice* input) {
145 assert(!HasInitializedTableMagicNumber());
146 assert(input != nullptr);
147 assert(input->size() >= kMinEncodedLength);
148
149 const char *magic_ptr =
150 input->data() + input->size() - kMagicNumberLengthByte;
151 const uint32_t magic_lo = DecodeFixed32(magic_ptr);
152 const uint32_t magic_hi = DecodeFixed32(magic_ptr + 4);
153 uint64_t magic = ((static_cast<uint64_t>(magic_hi) << 32) |
154 (static_cast<uint64_t>(magic_lo)));
155
156 // We check for legacy formats here and silently upconvert them
157 bool legacy = IsLegacyFooterFormat(magic);
158 if (legacy) {
159 magic = UpconvertLegacyFooterFormat(magic);
160 }
161 set_table_magic_number(magic);
162
163 if (legacy) {
164 // The size is already asserted to be at least kMinEncodedLength
165 // at the beginning of the function
166 input->remove_prefix(input->size() - kVersion0EncodedLength);
167 version_ = 0 /* legacy */;
168 checksum_ = kCRC32c;
169 } else {
170 version_ = DecodeFixed32(magic_ptr - 4);
171 // Footer version 1 and higher will always occupy exactly this many bytes.
172 // It consists of the checksum type, two block handles, padding,
173 // a version number, and a magic number
174 if (input->size() < kNewVersionsEncodedLength) {
175 return Status::Corruption("input is too short to be an sstable");
176 } else {
177 input->remove_prefix(input->size() - kNewVersionsEncodedLength);
178 }
179 uint32_t chksum;
180 if (!GetVarint32(input, &chksum)) {
181 return Status::Corruption("bad checksum type");
182 }
183 checksum_ = static_cast<ChecksumType>(chksum);
184 }
185
186 Status result = metaindex_handle_.DecodeFrom(input);
187 if (result.ok()) {
188 result = index_handle_.DecodeFrom(input);
189 }
190 if (result.ok()) {
191 // We skip over any leftover data (just padding for now) in "input"
192 const char* end = magic_ptr + kMagicNumberLengthByte;
193 *input = Slice(end, input->data() + input->size() - end);
194 }
195 return result;
196 }
197
198 std::string Footer::ToString() const {
199 std::string result, handle_;
200 result.reserve(1024);
201
202 bool legacy = IsLegacyFooterFormat(table_magic_number_);
203 if (legacy) {
204 result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n ");
205 result.append("index handle: " + index_handle_.ToString() + "\n ");
206 result.append("table_magic_number: " +
207 rocksdb::ToString(table_magic_number_) + "\n ");
208 } else {
209 result.append("checksum: " + rocksdb::ToString(checksum_) + "\n ");
210 result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n ");
211 result.append("index handle: " + index_handle_.ToString() + "\n ");
212 result.append("footer version: " + rocksdb::ToString(version_) + "\n ");
213 result.append("table_magic_number: " +
214 rocksdb::ToString(table_magic_number_) + "\n ");
215 }
216 return result;
217 }
218
219 Status ReadFooterFromFile(RandomAccessFileReader* file, uint64_t file_size,
220 Footer* footer, uint64_t enforce_table_magic_number) {
221 if (file_size < Footer::kMinEncodedLength) {
222 return Status::Corruption("file is too short to be an sstable");
223 }
224
225 char footer_space[Footer::kMaxEncodedLength];
226 Slice footer_input;
227 size_t read_offset =
228 (file_size > Footer::kMaxEncodedLength)
229 ? static_cast<size_t>(file_size - Footer::kMaxEncodedLength)
230 : 0;
231 Status s = file->Read(read_offset, Footer::kMaxEncodedLength, &footer_input,
232 footer_space);
233 if (!s.ok()) return s;
234
235 // Check that we actually read the whole footer from the file. It may be
236 // that size isn't correct.
237 if (footer_input.size() < Footer::kMinEncodedLength) {
238 return Status::Corruption("file is too short to be an sstable");
239 }
240
241 s = footer->DecodeFrom(&footer_input);
242 if (!s.ok()) {
243 return s;
244 }
245 if (enforce_table_magic_number != 0 &&
246 enforce_table_magic_number != footer->table_magic_number()) {
247 return Status::Corruption("Bad table magic number");
248 }
249 return Status::OK();
250 }
251
252 // Without anonymous namespace here, we fail the warning -Wmissing-prototypes
253 namespace {
254
255 // Read a block and check its CRC
256 // contents is the result of reading.
257 // According to the implementation of file->Read, contents may not point to buf
258 Status ReadBlock(RandomAccessFileReader* file, const Footer& footer,
259 const ReadOptions& options, const BlockHandle& handle,
260 Slice* contents, /* result of reading */ char* buf) {
261 size_t n = static_cast<size_t>(handle.size());
262 Status s;
263
264 {
265 PERF_TIMER_GUARD(block_read_time);
266 s = file->Read(handle.offset(), n + kBlockTrailerSize, contents, buf);
267 }
268
269 PERF_COUNTER_ADD(block_read_count, 1);
270 PERF_COUNTER_ADD(block_read_byte, n + kBlockTrailerSize);
271
272 if (!s.ok()) {
273 return s;
274 }
275 if (contents->size() != n + kBlockTrailerSize) {
276 return Status::Corruption("truncated block read");
277 }
278
279 // Check the crc of the type and the block contents
280 const char* data = contents->data(); // Pointer to where Read put the data
281 if (options.verify_checksums) {
282 PERF_TIMER_GUARD(block_checksum_time);
283 uint32_t value = DecodeFixed32(data + n + 1);
284 uint32_t actual = 0;
285 switch (footer.checksum()) {
286 case kCRC32c:
287 value = crc32c::Unmask(value);
288 actual = crc32c::Value(data, n + 1);
289 break;
290 case kxxHash:
291 actual = XXH32(data, static_cast<int>(n) + 1, 0);
292 break;
293 default:
294 s = Status::Corruption("unknown checksum type");
295 }
296 if (s.ok() && actual != value) {
297 s = Status::Corruption("block checksum mismatch");
298 }
299 if (!s.ok()) {
300 return s;
301 }
302 }
303 return s;
304 }
305
306 } // namespace
307
308 Status ReadBlockContents(RandomAccessFileReader* file, const Footer& footer,
309 const ReadOptions& read_options,
310 const BlockHandle& handle, BlockContents* contents,
311 const ImmutableCFOptions &ioptions,
312 bool decompression_requested,
313 const Slice& compression_dict,
314 const PersistentCacheOptions& cache_options) {
315 Status status;
316 Slice slice;
317 size_t n = static_cast<size_t>(handle.size());
318 std::unique_ptr<char[]> heap_buf;
319 char stack_buf[DefaultStackBufferSize];
320 char* used_buf = nullptr;
321 rocksdb::CompressionType compression_type;
322
323 if (cache_options.persistent_cache &&
324 !cache_options.persistent_cache->IsCompressed()) {
325 status = PersistentCacheHelper::LookupUncompressedPage(cache_options,
326 handle, contents);
327 if (status.ok()) {
328 // uncompressed page is found for the block handle
329 return status;
330 } else {
331 // uncompressed page is not found
332 if (ioptions.info_log && !status.IsNotFound()) {
333 assert(!status.ok());
334 ROCKS_LOG_INFO(ioptions.info_log,
335 "Error reading from persistent cache. %s",
336 status.ToString().c_str());
337 }
338 }
339 }
340
341 if (cache_options.persistent_cache &&
342 cache_options.persistent_cache->IsCompressed()) {
343 // lookup uncompressed cache mode p-cache
344 status = PersistentCacheHelper::LookupRawPage(
345 cache_options, handle, &heap_buf, n + kBlockTrailerSize);
346 } else {
347 status = Status::NotFound();
348 }
349
350 if (status.ok()) {
351 // cache hit
352 used_buf = heap_buf.get();
353 slice = Slice(heap_buf.get(), n);
354 } else {
355 if (ioptions.info_log && !status.IsNotFound()) {
356 assert(!status.ok());
357 ROCKS_LOG_INFO(ioptions.info_log,
358 "Error reading from persistent cache. %s",
359 status.ToString().c_str());
360 }
361 // cache miss read from device
362 if (decompression_requested &&
363 n + kBlockTrailerSize < DefaultStackBufferSize) {
364 // If we've got a small enough hunk of data, read it in to the
365 // trivially allocated stack buffer instead of needing a full malloc()
366 used_buf = &stack_buf[0];
367 } else {
368 heap_buf = std::unique_ptr<char[]>(new char[n + kBlockTrailerSize]);
369 used_buf = heap_buf.get();
370 }
371
372 status = ReadBlock(file, footer, read_options, handle, &slice, used_buf);
373 if (status.ok() && read_options.fill_cache &&
374 cache_options.persistent_cache &&
375 cache_options.persistent_cache->IsCompressed()) {
376 // insert to raw cache
377 PersistentCacheHelper::InsertRawPage(cache_options, handle, used_buf,
378 n + kBlockTrailerSize);
379 }
380 }
381
382 if (!status.ok()) {
383 return status;
384 }
385
386 PERF_TIMER_GUARD(block_decompress_time);
387
388 compression_type = static_cast<rocksdb::CompressionType>(slice.data()[n]);
389
390 if (decompression_requested && compression_type != kNoCompression) {
391 // compressed page, uncompress, update cache
392 status = UncompressBlockContents(slice.data(), n, contents,
393 footer.version(), compression_dict,
394 ioptions);
395 } else if (slice.data() != used_buf) {
396 // the slice content is not the buffer provided
397 *contents = BlockContents(Slice(slice.data(), n), false, compression_type);
398 } else {
399 // page is uncompressed, the buffer either stack or heap provided
400 if (used_buf == &stack_buf[0]) {
401 heap_buf = std::unique_ptr<char[]>(new char[n]);
402 memcpy(heap_buf.get(), stack_buf, n);
403 }
404 *contents = BlockContents(std::move(heap_buf), n, true, compression_type);
405 }
406
407 if (status.ok() && read_options.fill_cache &&
408 cache_options.persistent_cache &&
409 !cache_options.persistent_cache->IsCompressed()) {
410 // insert to uncompressed cache
411 PersistentCacheHelper::InsertUncompressedPage(cache_options, handle,
412 *contents);
413 }
414
415 return status;
416 }
417
418 Status UncompressBlockContentsForCompressionType(
419 const char* data, size_t n, BlockContents* contents,
420 uint32_t format_version, const Slice& compression_dict,
421 CompressionType compression_type, const ImmutableCFOptions &ioptions) {
422 std::unique_ptr<char[]> ubuf;
423
424 assert(compression_type != kNoCompression && "Invalid compression type");
425
426 StopWatchNano timer(ioptions.env,
427 ShouldReportDetailedTime(ioptions.env, ioptions.statistics));
428 int decompress_size = 0;
429 switch (compression_type) {
430 case kSnappyCompression: {
431 size_t ulength = 0;
432 static char snappy_corrupt_msg[] =
433 "Snappy not supported or corrupted Snappy compressed block contents";
434 if (!Snappy_GetUncompressedLength(data, n, &ulength)) {
435 return Status::Corruption(snappy_corrupt_msg);
436 }
437 ubuf.reset(new char[ulength]);
438 if (!Snappy_Uncompress(data, n, ubuf.get())) {
439 return Status::Corruption(snappy_corrupt_msg);
440 }
441 *contents = BlockContents(std::move(ubuf), ulength, true, kNoCompression);
442 break;
443 }
444 case kZlibCompression:
445 ubuf.reset(Zlib_Uncompress(
446 data, n, &decompress_size,
447 GetCompressFormatForVersion(kZlibCompression, format_version),
448 compression_dict));
449 if (!ubuf) {
450 static char zlib_corrupt_msg[] =
451 "Zlib not supported or corrupted Zlib compressed block contents";
452 return Status::Corruption(zlib_corrupt_msg);
453 }
454 *contents =
455 BlockContents(std::move(ubuf), decompress_size, true, kNoCompression);
456 break;
457 case kBZip2Compression:
458 ubuf.reset(BZip2_Uncompress(
459 data, n, &decompress_size,
460 GetCompressFormatForVersion(kBZip2Compression, format_version)));
461 if (!ubuf) {
462 static char bzip2_corrupt_msg[] =
463 "Bzip2 not supported or corrupted Bzip2 compressed block contents";
464 return Status::Corruption(bzip2_corrupt_msg);
465 }
466 *contents =
467 BlockContents(std::move(ubuf), decompress_size, true, kNoCompression);
468 break;
469 case kLZ4Compression:
470 ubuf.reset(LZ4_Uncompress(
471 data, n, &decompress_size,
472 GetCompressFormatForVersion(kLZ4Compression, format_version),
473 compression_dict));
474 if (!ubuf) {
475 static char lz4_corrupt_msg[] =
476 "LZ4 not supported or corrupted LZ4 compressed block contents";
477 return Status::Corruption(lz4_corrupt_msg);
478 }
479 *contents =
480 BlockContents(std::move(ubuf), decompress_size, true, kNoCompression);
481 break;
482 case kLZ4HCCompression:
483 ubuf.reset(LZ4_Uncompress(
484 data, n, &decompress_size,
485 GetCompressFormatForVersion(kLZ4HCCompression, format_version),
486 compression_dict));
487 if (!ubuf) {
488 static char lz4hc_corrupt_msg[] =
489 "LZ4HC not supported or corrupted LZ4HC compressed block contents";
490 return Status::Corruption(lz4hc_corrupt_msg);
491 }
492 *contents =
493 BlockContents(std::move(ubuf), decompress_size, true, kNoCompression);
494 break;
495 case kXpressCompression:
496 ubuf.reset(XPRESS_Uncompress(data, n, &decompress_size));
497 if (!ubuf) {
498 static char xpress_corrupt_msg[] =
499 "XPRESS not supported or corrupted XPRESS compressed block contents";
500 return Status::Corruption(xpress_corrupt_msg);
501 }
502 *contents =
503 BlockContents(std::move(ubuf), decompress_size, true, kNoCompression);
504 break;
505 case kZSTD:
506 case kZSTDNotFinalCompression:
507 ubuf.reset(ZSTD_Uncompress(data, n, &decompress_size, compression_dict));
508 if (!ubuf) {
509 static char zstd_corrupt_msg[] =
510 "ZSTD not supported or corrupted ZSTD compressed block contents";
511 return Status::Corruption(zstd_corrupt_msg);
512 }
513 *contents =
514 BlockContents(std::move(ubuf), decompress_size, true, kNoCompression);
515 break;
516 default:
517 return Status::Corruption("bad block type");
518 }
519
520 if(ShouldReportDetailedTime(ioptions.env, ioptions.statistics)){
521 MeasureTime(ioptions.statistics, DECOMPRESSION_TIMES_NANOS,
522 timer.ElapsedNanos());
523 MeasureTime(ioptions.statistics, BYTES_DECOMPRESSED, contents->data.size());
524 RecordTick(ioptions.statistics, NUMBER_BLOCK_DECOMPRESSED);
525 }
526
527 return Status::OK();
528 }
529
530 //
531 // The 'data' points to the raw block contents that was read in from file.
532 // This method allocates a new heap buffer and the raw block
533 // contents are uncompresed into this buffer. This
534 // buffer is returned via 'result' and it is upto the caller to
535 // free this buffer.
536 // format_version is the block format as defined in include/rocksdb/table.h
537 Status UncompressBlockContents(const char* data, size_t n,
538 BlockContents* contents, uint32_t format_version,
539 const Slice& compression_dict,
540 const ImmutableCFOptions &ioptions) {
541 assert(data[n] != kNoCompression);
542 return UncompressBlockContentsForCompressionType(
543 data, n, contents, format_version, compression_dict,
544 (CompressionType)data[n], ioptions);
545 }
546
547 } // namespace rocksdb