]> git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/table/format.cc
update sources to ceph Nautilus 14.2.1
[ceph.git] / ceph / src / rocksdb / table / format.cc
1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10 #include "table/format.h"
11
12 #include <string>
13 #include <inttypes.h>
14
15 #include "monitoring/perf_context_imp.h"
16 #include "monitoring/statistics.h"
17 #include "rocksdb/env.h"
18 #include "table/block.h"
19 #include "table/block_based_table_reader.h"
20 #include "table/block_fetcher.h"
21 #include "table/persistent_cache_helper.h"
22 #include "util/coding.h"
23 #include "util/compression.h"
24 #include "util/crc32c.h"
25 #include "util/file_reader_writer.h"
26 #include "util/logging.h"
27 #include "util/stop_watch.h"
28 #include "util/string_util.h"
29 #include "util/xxhash.h"
30
31 namespace rocksdb {
32
// Magic numbers of the block-based table format; only declared here.
extern const uint64_t kLegacyBlockBasedTableMagicNumber;
extern const uint64_t kBlockBasedTableMagicNumber;

#ifdef ROCKSDB_LITE
// Plain tables are not available in ROCKSDB_LITE builds, so their magic
// numbers are defined locally as 0.
const uint64_t kLegacyPlainTableMagicNumber = 0;
const uint64_t kPlainTableMagicNumber = 0;
#else
// Magic numbers of the plain table format; only declared here.
extern const uint64_t kLegacyPlainTableMagicNumber;
extern const uint64_t kPlainTableMagicNumber;
#endif
44
45 bool ShouldReportDetailedTime(Env* env, Statistics* stats) {
46 return env != nullptr && stats != nullptr &&
47 stats->stats_level_ > kExceptDetailedTimers;
48 }
49
50 void BlockHandle::EncodeTo(std::string* dst) const {
51 // Sanity check that all fields have been set
52 assert(offset_ != ~static_cast<uint64_t>(0));
53 assert(size_ != ~static_cast<uint64_t>(0));
54 PutVarint64Varint64(dst, offset_, size_);
55 }
56
57 Status BlockHandle::DecodeFrom(Slice* input) {
58 if (GetVarint64(input, &offset_) &&
59 GetVarint64(input, &size_)) {
60 return Status::OK();
61 } else {
62 // reset in case failure after partially decoding
63 offset_ = 0;
64 size_ = 0;
65 return Status::Corruption("bad block handle");
66 }
67 }
68
69 Status BlockHandle::DecodeSizeFrom(uint64_t _offset, Slice* input) {
70 if (GetVarint64(input, &size_)) {
71 offset_ = _offset;
72 return Status::OK();
73 } else {
74 // reset in case failure after partially decoding
75 offset_ = 0;
76 size_ = 0;
77 return Status::Corruption("bad block handle");
78 }
79 }
80
81 // Return a string that contains the copy of handle.
82 std::string BlockHandle::ToString(bool hex) const {
83 std::string handle_str;
84 EncodeTo(&handle_str);
85 if (hex) {
86 return Slice(handle_str).ToString(true);
87 } else {
88 return handle_str;
89 }
90 }
91
92 const BlockHandle BlockHandle::kNullBlockHandle(0, 0);
93
94 namespace {
95 inline bool IsLegacyFooterFormat(uint64_t magic_number) {
96 return magic_number == kLegacyBlockBasedTableMagicNumber ||
97 magic_number == kLegacyPlainTableMagicNumber;
98 }
99 inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) {
100 if (magic_number == kLegacyBlockBasedTableMagicNumber) {
101 return kBlockBasedTableMagicNumber;
102 }
103 if (magic_number == kLegacyPlainTableMagicNumber) {
104 return kPlainTableMagicNumber;
105 }
106 assert(false);
107 return 0;
108 }
109 } // namespace
110
111 // legacy footer format:
112 // metaindex handle (varint64 offset, varint64 size)
113 // index handle (varint64 offset, varint64 size)
114 // <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength
115 // table_magic_number (8 bytes)
116 // new footer format:
117 // checksum type (char, 1 byte)
118 // metaindex handle (varint64 offset, varint64 size)
119 // index handle (varint64 offset, varint64 size)
120 // <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength + 1
121 // footer version (4 bytes)
122 // table_magic_number (8 bytes)
123 void Footer::EncodeTo(std::string* dst) const {
124 assert(HasInitializedTableMagicNumber());
125 if (IsLegacyFooterFormat(table_magic_number())) {
126 // has to be default checksum with legacy footer
127 assert(checksum_ == kCRC32c);
128 const size_t original_size = dst->size();
129 metaindex_handle_.EncodeTo(dst);
130 index_handle_.EncodeTo(dst);
131 dst->resize(original_size + 2 * BlockHandle::kMaxEncodedLength); // Padding
132 PutFixed32(dst, static_cast<uint32_t>(table_magic_number() & 0xffffffffu));
133 PutFixed32(dst, static_cast<uint32_t>(table_magic_number() >> 32));
134 assert(dst->size() == original_size + kVersion0EncodedLength);
135 } else {
136 const size_t original_size = dst->size();
137 dst->push_back(static_cast<char>(checksum_));
138 metaindex_handle_.EncodeTo(dst);
139 index_handle_.EncodeTo(dst);
140 dst->resize(original_size + kNewVersionsEncodedLength - 12); // Padding
141 PutFixed32(dst, version());
142 PutFixed32(dst, static_cast<uint32_t>(table_magic_number() & 0xffffffffu));
143 PutFixed32(dst, static_cast<uint32_t>(table_magic_number() >> 32));
144 assert(dst->size() == original_size + kNewVersionsEncodedLength);
145 }
146 }
147
148 Footer::Footer(uint64_t _table_magic_number, uint32_t _version)
149 : version_(_version),
150 checksum_(kCRC32c),
151 table_magic_number_(_table_magic_number) {
152 // This should be guaranteed by constructor callers
153 assert(!IsLegacyFooterFormat(_table_magic_number) || version_ == 0);
154 }
155
156 Status Footer::DecodeFrom(Slice* input) {
157 assert(!HasInitializedTableMagicNumber());
158 assert(input != nullptr);
159 assert(input->size() >= kMinEncodedLength);
160
161 const char *magic_ptr =
162 input->data() + input->size() - kMagicNumberLengthByte;
163 const uint32_t magic_lo = DecodeFixed32(magic_ptr);
164 const uint32_t magic_hi = DecodeFixed32(magic_ptr + 4);
165 uint64_t magic = ((static_cast<uint64_t>(magic_hi) << 32) |
166 (static_cast<uint64_t>(magic_lo)));
167
168 // We check for legacy formats here and silently upconvert them
169 bool legacy = IsLegacyFooterFormat(magic);
170 if (legacy) {
171 magic = UpconvertLegacyFooterFormat(magic);
172 }
173 set_table_magic_number(magic);
174
175 if (legacy) {
176 // The size is already asserted to be at least kMinEncodedLength
177 // at the beginning of the function
178 input->remove_prefix(input->size() - kVersion0EncodedLength);
179 version_ = 0 /* legacy */;
180 checksum_ = kCRC32c;
181 } else {
182 version_ = DecodeFixed32(magic_ptr - 4);
183 // Footer version 1 and higher will always occupy exactly this many bytes.
184 // It consists of the checksum type, two block handles, padding,
185 // a version number, and a magic number
186 if (input->size() < kNewVersionsEncodedLength) {
187 return Status::Corruption("input is too short to be an sstable");
188 } else {
189 input->remove_prefix(input->size() - kNewVersionsEncodedLength);
190 }
191 uint32_t chksum;
192 if (!GetVarint32(input, &chksum)) {
193 return Status::Corruption("bad checksum type");
194 }
195 checksum_ = static_cast<ChecksumType>(chksum);
196 }
197
198 Status result = metaindex_handle_.DecodeFrom(input);
199 if (result.ok()) {
200 result = index_handle_.DecodeFrom(input);
201 }
202 if (result.ok()) {
203 // We skip over any leftover data (just padding for now) in "input"
204 const char* end = magic_ptr + kMagicNumberLengthByte;
205 *input = Slice(end, input->data() + input->size() - end);
206 }
207 return result;
208 }
209
210 std::string Footer::ToString() const {
211 std::string result;
212 result.reserve(1024);
213
214 bool legacy = IsLegacyFooterFormat(table_magic_number_);
215 if (legacy) {
216 result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n ");
217 result.append("index handle: " + index_handle_.ToString() + "\n ");
218 result.append("table_magic_number: " +
219 rocksdb::ToString(table_magic_number_) + "\n ");
220 } else {
221 result.append("checksum: " + rocksdb::ToString(checksum_) + "\n ");
222 result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n ");
223 result.append("index handle: " + index_handle_.ToString() + "\n ");
224 result.append("footer version: " + rocksdb::ToString(version_) + "\n ");
225 result.append("table_magic_number: " +
226 rocksdb::ToString(table_magic_number_) + "\n ");
227 }
228 return result;
229 }
230
231 Status ReadFooterFromFile(RandomAccessFileReader* file,
232 FilePrefetchBuffer* prefetch_buffer,
233 uint64_t file_size, Footer* footer,
234 uint64_t enforce_table_magic_number) {
235 if (file_size < Footer::kMinEncodedLength) {
236 return Status::Corruption(
237 "file is too short (" + ToString(file_size) + " bytes) to be an "
238 "sstable: " + file->file_name());
239 }
240
241 char footer_space[Footer::kMaxEncodedLength];
242 Slice footer_input;
243 size_t read_offset =
244 (file_size > Footer::kMaxEncodedLength)
245 ? static_cast<size_t>(file_size - Footer::kMaxEncodedLength)
246 : 0;
247 Status s;
248 if (prefetch_buffer == nullptr ||
249 !prefetch_buffer->TryReadFromCache(read_offset, Footer::kMaxEncodedLength,
250 &footer_input)) {
251 s = file->Read(read_offset, Footer::kMaxEncodedLength, &footer_input,
252 footer_space);
253 if (!s.ok()) return s;
254 }
255
256 // Check that we actually read the whole footer from the file. It may be
257 // that size isn't correct.
258 if (footer_input.size() < Footer::kMinEncodedLength) {
259 return Status::Corruption(
260 "file is too short (" + ToString(file_size) + " bytes) to be an "
261 "sstable" + file->file_name());
262 }
263
264 s = footer->DecodeFrom(&footer_input);
265 if (!s.ok()) {
266 return s;
267 }
268 if (enforce_table_magic_number != 0 &&
269 enforce_table_magic_number != footer->table_magic_number()) {
270 return Status::Corruption(
271 "Bad table magic number: expected "
272 + ToString(enforce_table_magic_number) + ", found "
273 + ToString(footer->table_magic_number())
274 + " in " + file->file_name());
275 }
276 return Status::OK();
277 }
278
279 Status UncompressBlockContentsForCompressionType(
280 const UncompressionContext& uncompression_ctx, const char* data, size_t n,
281 BlockContents* contents, uint32_t format_version,
282 const ImmutableCFOptions& ioptions) {
283 std::unique_ptr<char[]> ubuf;
284
285 assert(uncompression_ctx.type() != kNoCompression &&
286 "Invalid compression type");
287
288 StopWatchNano timer(ioptions.env,
289 ShouldReportDetailedTime(ioptions.env, ioptions.statistics));
290 int decompress_size = 0;
291 switch (uncompression_ctx.type()) {
292 case kSnappyCompression: {
293 size_t ulength = 0;
294 static char snappy_corrupt_msg[] =
295 "Snappy not supported or corrupted Snappy compressed block contents";
296 if (!Snappy_GetUncompressedLength(data, n, &ulength)) {
297 return Status::Corruption(snappy_corrupt_msg);
298 }
299 ubuf.reset(new char[ulength]);
300 if (!Snappy_Uncompress(data, n, ubuf.get())) {
301 return Status::Corruption(snappy_corrupt_msg);
302 }
303 *contents = BlockContents(std::move(ubuf), ulength, true, kNoCompression);
304 break;
305 }
306 case kZlibCompression:
307 ubuf.reset(Zlib_Uncompress(
308 uncompression_ctx, data, n, &decompress_size,
309 GetCompressFormatForVersion(kZlibCompression, format_version)));
310 if (!ubuf) {
311 static char zlib_corrupt_msg[] =
312 "Zlib not supported or corrupted Zlib compressed block contents";
313 return Status::Corruption(zlib_corrupt_msg);
314 }
315 *contents =
316 BlockContents(std::move(ubuf), decompress_size, true, kNoCompression);
317 break;
318 case kBZip2Compression:
319 ubuf.reset(BZip2_Uncompress(
320 data, n, &decompress_size,
321 GetCompressFormatForVersion(kBZip2Compression, format_version)));
322 if (!ubuf) {
323 static char bzip2_corrupt_msg[] =
324 "Bzip2 not supported or corrupted Bzip2 compressed block contents";
325 return Status::Corruption(bzip2_corrupt_msg);
326 }
327 *contents =
328 BlockContents(std::move(ubuf), decompress_size, true, kNoCompression);
329 break;
330 case kLZ4Compression:
331 ubuf.reset(LZ4_Uncompress(
332 uncompression_ctx, data, n, &decompress_size,
333 GetCompressFormatForVersion(kLZ4Compression, format_version)));
334 if (!ubuf) {
335 static char lz4_corrupt_msg[] =
336 "LZ4 not supported or corrupted LZ4 compressed block contents";
337 return Status::Corruption(lz4_corrupt_msg);
338 }
339 *contents =
340 BlockContents(std::move(ubuf), decompress_size, true, kNoCompression);
341 break;
342 case kLZ4HCCompression:
343 ubuf.reset(LZ4_Uncompress(
344 uncompression_ctx, data, n, &decompress_size,
345 GetCompressFormatForVersion(kLZ4HCCompression, format_version)));
346 if (!ubuf) {
347 static char lz4hc_corrupt_msg[] =
348 "LZ4HC not supported or corrupted LZ4HC compressed block contents";
349 return Status::Corruption(lz4hc_corrupt_msg);
350 }
351 *contents =
352 BlockContents(std::move(ubuf), decompress_size, true, kNoCompression);
353 break;
354 case kXpressCompression:
355 ubuf.reset(XPRESS_Uncompress(data, n, &decompress_size));
356 if (!ubuf) {
357 static char xpress_corrupt_msg[] =
358 "XPRESS not supported or corrupted XPRESS compressed block contents";
359 return Status::Corruption(xpress_corrupt_msg);
360 }
361 *contents =
362 BlockContents(std::move(ubuf), decompress_size, true, kNoCompression);
363 break;
364 case kZSTD:
365 case kZSTDNotFinalCompression:
366 ubuf.reset(ZSTD_Uncompress(uncompression_ctx, data, n, &decompress_size));
367 if (!ubuf) {
368 static char zstd_corrupt_msg[] =
369 "ZSTD not supported or corrupted ZSTD compressed block contents";
370 return Status::Corruption(zstd_corrupt_msg);
371 }
372 *contents =
373 BlockContents(std::move(ubuf), decompress_size, true, kNoCompression);
374 break;
375 default:
376 return Status::Corruption("bad block type");
377 }
378
379 if(ShouldReportDetailedTime(ioptions.env, ioptions.statistics)){
380 MeasureTime(ioptions.statistics, DECOMPRESSION_TIMES_NANOS,
381 timer.ElapsedNanos());
382 }
383 MeasureTime(ioptions.statistics, BYTES_DECOMPRESSED, contents->data.size());
384 RecordTick(ioptions.statistics, NUMBER_BLOCK_DECOMPRESSED);
385
386 return Status::OK();
387 }
388
389 //
390 // The 'data' points to the raw block contents that was read in from file.
391 // This method allocates a new heap buffer and the raw block
392 // contents are uncompresed into this buffer. This
393 // buffer is returned via 'result' and it is upto the caller to
394 // free this buffer.
395 // format_version is the block format as defined in include/rocksdb/table.h
396 Status UncompressBlockContents(const UncompressionContext& uncompression_ctx,
397 const char* data, size_t n,
398 BlockContents* contents, uint32_t format_version,
399 const ImmutableCFOptions& ioptions) {
400 assert(data[n] != kNoCompression);
401 assert(data[n] == uncompression_ctx.type());
402 return UncompressBlockContentsForCompressionType(
403 uncompression_ctx, data, n, contents, format_version, ioptions);
404 }
405
406 } // namespace rocksdb