]> git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/table/format.cc
import 14.2.4 nautilus point release
[ceph.git] / ceph / src / rocksdb / table / format.cc
1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10 #include "table/format.h"
11
12 #include <inttypes.h>
13 #include <string>
14
15 #include "monitoring/perf_context_imp.h"
16 #include "monitoring/statistics.h"
17 #include "rocksdb/env.h"
18 #include "table/block.h"
19 #include "table/block_based_table_reader.h"
20 #include "table/block_fetcher.h"
21 #include "table/persistent_cache_helper.h"
22 #include "util/coding.h"
23 #include "util/compression.h"
24 #include "util/crc32c.h"
25 #include "util/file_reader_writer.h"
26 #include "util/logging.h"
27 #include "util/memory_allocator.h"
28 #include "util/stop_watch.h"
29 #include "util/string_util.h"
30 #include "util/xxhash.h"
31
32 namespace rocksdb {
33
// Magic numbers of the block-based table formats. They are only declared
// here; the definitions are provided outside this file.
extern const uint64_t kLegacyBlockBasedTableMagicNumber;
extern const uint64_t kBlockBasedTableMagicNumber;

#ifdef ROCKSDB_LITE
// The LITE build has no plain table, so both of its magic numbers are
// defined right here with the value 0.
const uint64_t kLegacyPlainTableMagicNumber = 0;
const uint64_t kPlainTableMagicNumber = 0;
#else
extern const uint64_t kLegacyPlainTableMagicNumber;
extern const uint64_t kPlainTableMagicNumber;
#endif  // ROCKSDB_LITE
45
46 bool ShouldReportDetailedTime(Env* env, Statistics* stats) {
47 return env != nullptr && stats != nullptr &&
48 stats->get_stats_level() > kExceptDetailedTimers;
49 }
50
51 void BlockHandle::EncodeTo(std::string* dst) const {
52 // Sanity check that all fields have been set
53 assert(offset_ != ~static_cast<uint64_t>(0));
54 assert(size_ != ~static_cast<uint64_t>(0));
55 PutVarint64Varint64(dst, offset_, size_);
56 }
57
58 Status BlockHandle::DecodeFrom(Slice* input) {
59 if (GetVarint64(input, &offset_) && GetVarint64(input, &size_)) {
60 return Status::OK();
61 } else {
62 // reset in case failure after partially decoding
63 offset_ = 0;
64 size_ = 0;
65 return Status::Corruption("bad block handle");
66 }
67 }
68
69 Status BlockHandle::DecodeSizeFrom(uint64_t _offset, Slice* input) {
70 if (GetVarint64(input, &size_)) {
71 offset_ = _offset;
72 return Status::OK();
73 } else {
74 // reset in case failure after partially decoding
75 offset_ = 0;
76 size_ = 0;
77 return Status::Corruption("bad block handle");
78 }
79 }
80
81 // Return a string that contains the copy of handle.
82 std::string BlockHandle::ToString(bool hex) const {
83 std::string handle_str;
84 EncodeTo(&handle_str);
85 if (hex) {
86 return Slice(handle_str).ToString(true);
87 } else {
88 return handle_str;
89 }
90 }
91
92 const BlockHandle BlockHandle::kNullBlockHandle(0, 0);
93
94 namespace {
95 inline bool IsLegacyFooterFormat(uint64_t magic_number) {
96 return magic_number == kLegacyBlockBasedTableMagicNumber ||
97 magic_number == kLegacyPlainTableMagicNumber;
98 }
99 inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) {
100 if (magic_number == kLegacyBlockBasedTableMagicNumber) {
101 return kBlockBasedTableMagicNumber;
102 }
103 if (magic_number == kLegacyPlainTableMagicNumber) {
104 return kPlainTableMagicNumber;
105 }
106 assert(false);
107 return 0;
108 }
109 } // namespace
110
111 // legacy footer format:
112 // metaindex handle (varint64 offset, varint64 size)
113 // index handle (varint64 offset, varint64 size)
114 // <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength
115 // table_magic_number (8 bytes)
116 // new footer format:
117 // checksum type (char, 1 byte)
118 // metaindex handle (varint64 offset, varint64 size)
119 // index handle (varint64 offset, varint64 size)
120 // <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength + 1
121 // footer version (4 bytes)
122 // table_magic_number (8 bytes)
123 void Footer::EncodeTo(std::string* dst) const {
124 assert(HasInitializedTableMagicNumber());
125 if (IsLegacyFooterFormat(table_magic_number())) {
126 // has to be default checksum with legacy footer
127 assert(checksum_ == kCRC32c);
128 const size_t original_size = dst->size();
129 metaindex_handle_.EncodeTo(dst);
130 index_handle_.EncodeTo(dst);
131 dst->resize(original_size + 2 * BlockHandle::kMaxEncodedLength); // Padding
132 PutFixed32(dst, static_cast<uint32_t>(table_magic_number() & 0xffffffffu));
133 PutFixed32(dst, static_cast<uint32_t>(table_magic_number() >> 32));
134 assert(dst->size() == original_size + kVersion0EncodedLength);
135 } else {
136 const size_t original_size = dst->size();
137 dst->push_back(static_cast<char>(checksum_));
138 metaindex_handle_.EncodeTo(dst);
139 index_handle_.EncodeTo(dst);
140 dst->resize(original_size + kNewVersionsEncodedLength - 12); // Padding
141 PutFixed32(dst, version());
142 PutFixed32(dst, static_cast<uint32_t>(table_magic_number() & 0xffffffffu));
143 PutFixed32(dst, static_cast<uint32_t>(table_magic_number() >> 32));
144 assert(dst->size() == original_size + kNewVersionsEncodedLength);
145 }
146 }
147
148 Footer::Footer(uint64_t _table_magic_number, uint32_t _version)
149 : version_(_version),
150 checksum_(kCRC32c),
151 table_magic_number_(_table_magic_number) {
152 // This should be guaranteed by constructor callers
153 assert(!IsLegacyFooterFormat(_table_magic_number) || version_ == 0);
154 }
155
156 Status Footer::DecodeFrom(Slice* input) {
157 assert(!HasInitializedTableMagicNumber());
158 assert(input != nullptr);
159 assert(input->size() >= kMinEncodedLength);
160
161 const char* magic_ptr =
162 input->data() + input->size() - kMagicNumberLengthByte;
163 const uint32_t magic_lo = DecodeFixed32(magic_ptr);
164 const uint32_t magic_hi = DecodeFixed32(magic_ptr + 4);
165 uint64_t magic = ((static_cast<uint64_t>(magic_hi) << 32) |
166 (static_cast<uint64_t>(magic_lo)));
167
168 // We check for legacy formats here and silently upconvert them
169 bool legacy = IsLegacyFooterFormat(magic);
170 if (legacy) {
171 magic = UpconvertLegacyFooterFormat(magic);
172 }
173 set_table_magic_number(magic);
174
175 if (legacy) {
176 // The size is already asserted to be at least kMinEncodedLength
177 // at the beginning of the function
178 input->remove_prefix(input->size() - kVersion0EncodedLength);
179 version_ = 0 /* legacy */;
180 checksum_ = kCRC32c;
181 } else {
182 version_ = DecodeFixed32(magic_ptr - 4);
183 // Footer version 1 and higher will always occupy exactly this many bytes.
184 // It consists of the checksum type, two block handles, padding,
185 // a version number, and a magic number
186 if (input->size() < kNewVersionsEncodedLength) {
187 return Status::Corruption("input is too short to be an sstable");
188 } else {
189 input->remove_prefix(input->size() - kNewVersionsEncodedLength);
190 }
191 uint32_t chksum;
192 if (!GetVarint32(input, &chksum)) {
193 return Status::Corruption("bad checksum type");
194 }
195 checksum_ = static_cast<ChecksumType>(chksum);
196 }
197
198 Status result = metaindex_handle_.DecodeFrom(input);
199 if (result.ok()) {
200 result = index_handle_.DecodeFrom(input);
201 }
202 if (result.ok()) {
203 // We skip over any leftover data (just padding for now) in "input"
204 const char* end = magic_ptr + kMagicNumberLengthByte;
205 *input = Slice(end, input->data() + input->size() - end);
206 }
207 return result;
208 }
209
210 std::string Footer::ToString() const {
211 std::string result;
212 result.reserve(1024);
213
214 bool legacy = IsLegacyFooterFormat(table_magic_number_);
215 if (legacy) {
216 result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n ");
217 result.append("index handle: " + index_handle_.ToString() + "\n ");
218 result.append("table_magic_number: " +
219 rocksdb::ToString(table_magic_number_) + "\n ");
220 } else {
221 result.append("checksum: " + rocksdb::ToString(checksum_) + "\n ");
222 result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n ");
223 result.append("index handle: " + index_handle_.ToString() + "\n ");
224 result.append("footer version: " + rocksdb::ToString(version_) + "\n ");
225 result.append("table_magic_number: " +
226 rocksdb::ToString(table_magic_number_) + "\n ");
227 }
228 return result;
229 }
230
231 Status ReadFooterFromFile(RandomAccessFileReader* file,
232 FilePrefetchBuffer* prefetch_buffer,
233 uint64_t file_size, Footer* footer,
234 uint64_t enforce_table_magic_number) {
235 if (file_size < Footer::kMinEncodedLength) {
236 return Status::Corruption("file is too short (" + ToString(file_size) +
237 " bytes) to be an "
238 "sstable: " +
239 file->file_name());
240 }
241
242 char footer_space[Footer::kMaxEncodedLength];
243 Slice footer_input;
244 size_t read_offset =
245 (file_size > Footer::kMaxEncodedLength)
246 ? static_cast<size_t>(file_size - Footer::kMaxEncodedLength)
247 : 0;
248 Status s;
249 if (prefetch_buffer == nullptr ||
250 !prefetch_buffer->TryReadFromCache(read_offset, Footer::kMaxEncodedLength,
251 &footer_input)) {
252 s = file->Read(read_offset, Footer::kMaxEncodedLength, &footer_input,
253 footer_space);
254 if (!s.ok()) return s;
255 }
256
257 // Check that we actually read the whole footer from the file. It may be
258 // that size isn't correct.
259 if (footer_input.size() < Footer::kMinEncodedLength) {
260 return Status::Corruption("file is too short (" + ToString(file_size) +
261 " bytes) to be an "
262 "sstable" +
263 file->file_name());
264 }
265
266 s = footer->DecodeFrom(&footer_input);
267 if (!s.ok()) {
268 return s;
269 }
270 if (enforce_table_magic_number != 0 &&
271 enforce_table_magic_number != footer->table_magic_number()) {
272 return Status::Corruption(
273 "Bad table magic number: expected " +
274 ToString(enforce_table_magic_number) + ", found " +
275 ToString(footer->table_magic_number()) + " in " + file->file_name());
276 }
277 return Status::OK();
278 }
279
280 Status UncompressBlockContentsForCompressionType(
281 const UncompressionInfo& uncompression_info, const char* data, size_t n,
282 BlockContents* contents, uint32_t format_version,
283 const ImmutableCFOptions& ioptions, MemoryAllocator* allocator) {
284 CacheAllocationPtr ubuf;
285
286 assert(uncompression_info.type() != kNoCompression &&
287 "Invalid compression type");
288
289 StopWatchNano timer(ioptions.env, ShouldReportDetailedTime(
290 ioptions.env, ioptions.statistics));
291 int decompress_size = 0;
292 switch (uncompression_info.type()) {
293 case kSnappyCompression: {
294 size_t ulength = 0;
295 static char snappy_corrupt_msg[] =
296 "Snappy not supported or corrupted Snappy compressed block contents";
297 if (!Snappy_GetUncompressedLength(data, n, &ulength)) {
298 return Status::Corruption(snappy_corrupt_msg);
299 }
300 ubuf = AllocateBlock(ulength, allocator);
301 if (!Snappy_Uncompress(data, n, ubuf.get())) {
302 return Status::Corruption(snappy_corrupt_msg);
303 }
304 *contents = BlockContents(std::move(ubuf), ulength);
305 break;
306 }
307 case kZlibCompression:
308 ubuf = Zlib_Uncompress(
309 uncompression_info, data, n, &decompress_size,
310 GetCompressFormatForVersion(kZlibCompression, format_version),
311 allocator);
312 if (!ubuf) {
313 static char zlib_corrupt_msg[] =
314 "Zlib not supported or corrupted Zlib compressed block contents";
315 return Status::Corruption(zlib_corrupt_msg);
316 }
317 *contents = BlockContents(std::move(ubuf), decompress_size);
318 break;
319 case kBZip2Compression:
320 ubuf = BZip2_Uncompress(
321 data, n, &decompress_size,
322 GetCompressFormatForVersion(kBZip2Compression, format_version),
323 allocator);
324 if (!ubuf) {
325 static char bzip2_corrupt_msg[] =
326 "Bzip2 not supported or corrupted Bzip2 compressed block contents";
327 return Status::Corruption(bzip2_corrupt_msg);
328 }
329 *contents = BlockContents(std::move(ubuf), decompress_size);
330 break;
331 case kLZ4Compression:
332 ubuf = LZ4_Uncompress(
333 uncompression_info, data, n, &decompress_size,
334 GetCompressFormatForVersion(kLZ4Compression, format_version),
335 allocator);
336 if (!ubuf) {
337 static char lz4_corrupt_msg[] =
338 "LZ4 not supported or corrupted LZ4 compressed block contents";
339 return Status::Corruption(lz4_corrupt_msg);
340 }
341 *contents = BlockContents(std::move(ubuf), decompress_size);
342 break;
343 case kLZ4HCCompression:
344 ubuf = LZ4_Uncompress(
345 uncompression_info, data, n, &decompress_size,
346 GetCompressFormatForVersion(kLZ4HCCompression, format_version),
347 allocator);
348 if (!ubuf) {
349 static char lz4hc_corrupt_msg[] =
350 "LZ4HC not supported or corrupted LZ4HC compressed block contents";
351 return Status::Corruption(lz4hc_corrupt_msg);
352 }
353 *contents = BlockContents(std::move(ubuf), decompress_size);
354 break;
355 case kXpressCompression:
356 // XPRESS allocates memory internally, thus no support for custom
357 // allocator.
358 ubuf.reset(XPRESS_Uncompress(data, n, &decompress_size));
359 if (!ubuf) {
360 static char xpress_corrupt_msg[] =
361 "XPRESS not supported or corrupted XPRESS compressed block "
362 "contents";
363 return Status::Corruption(xpress_corrupt_msg);
364 }
365 *contents = BlockContents(std::move(ubuf), decompress_size);
366 break;
367 case kZSTD:
368 case kZSTDNotFinalCompression:
369 ubuf = ZSTD_Uncompress(uncompression_info, data, n, &decompress_size,
370 allocator);
371 if (!ubuf) {
372 static char zstd_corrupt_msg[] =
373 "ZSTD not supported or corrupted ZSTD compressed block contents";
374 return Status::Corruption(zstd_corrupt_msg);
375 }
376 *contents = BlockContents(std::move(ubuf), decompress_size);
377 break;
378 default:
379 return Status::Corruption("bad block type");
380 }
381
382 if (ShouldReportDetailedTime(ioptions.env, ioptions.statistics)) {
383 RecordTimeToHistogram(ioptions.statistics, DECOMPRESSION_TIMES_NANOS,
384 timer.ElapsedNanos());
385 }
386 RecordTimeToHistogram(ioptions.statistics, BYTES_DECOMPRESSED,
387 contents->data.size());
388 RecordTick(ioptions.statistics, NUMBER_BLOCK_DECOMPRESSED);
389
390 return Status::OK();
391 }
392
393 //
394 // The 'data' points to the raw block contents that was read in from file.
395 // This method allocates a new heap buffer and the raw block
396 // contents are uncompresed into this buffer. This
397 // buffer is returned via 'result' and it is upto the caller to
398 // free this buffer.
399 // format_version is the block format as defined in include/rocksdb/table.h
400 Status UncompressBlockContents(const UncompressionInfo& uncompression_info,
401 const char* data, size_t n,
402 BlockContents* contents, uint32_t format_version,
403 const ImmutableCFOptions& ioptions,
404 MemoryAllocator* allocator) {
405 assert(data[n] != kNoCompression);
406 assert(data[n] == uncompression_info.type());
407 return UncompressBlockContentsForCompressionType(uncompression_info, data, n,
408 contents, format_version,
409 ioptions, allocator);
410 }
411
412 } // namespace rocksdb