]>
Commit | Line | Data |
---|---|---|
7c673cae | 1 | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
11fdf7f2 TL |
2 | // This source code is licensed under both the GPLv2 (found in the |
3 | // COPYING file in the root directory) and Apache 2.0 License | |
4 | // (found in the LICENSE.Apache file in the root directory). | |
7c673cae FG |
5 | // |
6 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |
7 | // Use of this source code is governed by a BSD-style license that can be | |
8 | // found in the LICENSE file. See the AUTHORS file for names of contributors. | |
9 | ||
10 | #include "table/format.h" | |
11 | ||
f67539c2 | 12 | #include <cinttypes> |
494da23a | 13 | #include <string> |
7c673cae | 14 | |
f67539c2 TL |
15 | #include "block_fetcher.h" |
16 | #include "file/random_access_file_reader.h" | |
17 | #include "logging/logging.h" | |
18 | #include "memory/memory_allocator.h" | |
7c673cae FG |
19 | #include "monitoring/perf_context_imp.h" |
20 | #include "monitoring/statistics.h" | |
21 | #include "rocksdb/env.h" | |
f67539c2 TL |
22 | #include "table/block_based/block.h" |
23 | #include "table/block_based/block_based_table_reader.h" | |
7c673cae FG |
24 | #include "table/persistent_cache_helper.h" |
25 | #include "util/coding.h" | |
26 | #include "util/compression.h" | |
27 | #include "util/crc32c.h" | |
7c673cae FG |
28 | #include "util/stop_watch.h" |
29 | #include "util/string_util.h" | |
7c673cae | 30 | |
f67539c2 | 31 | namespace ROCKSDB_NAMESPACE { |
7c673cae FG |
32 | |
// Magic numbers identifying block-based table files. They are only declared
// here; the definitions live in another translation unit.
extern const uint64_t kLegacyBlockBasedTableMagicNumber;
extern const uint64_t kBlockBasedTableMagicNumber;

#ifndef ROCKSDB_LITE
// Magic numbers identifying plain table files (declared here, defined
// elsewhere).
extern const uint64_t kLegacyPlainTableMagicNumber;
extern const uint64_t kPlainTableMagicNumber;
#else
// ROCKSDB_LITE doesn't have plain table
// Both plain-table magic numbers are therefore defined here as 0.
const uint64_t kLegacyPlainTableMagicNumber = 0;
const uint64_t kPlainTableMagicNumber = 0;
#endif
7c673cae FG |
44 | |
45 | bool ShouldReportDetailedTime(Env* env, Statistics* stats) { | |
46 | return env != nullptr && stats != nullptr && | |
494da23a | 47 | stats->get_stats_level() > kExceptDetailedTimers; |
7c673cae FG |
48 | } |
49 | ||
50 | void BlockHandle::EncodeTo(std::string* dst) const { | |
51 | // Sanity check that all fields have been set | |
52 | assert(offset_ != ~static_cast<uint64_t>(0)); | |
53 | assert(size_ != ~static_cast<uint64_t>(0)); | |
54 | PutVarint64Varint64(dst, offset_, size_); | |
55 | } | |
56 | ||
57 | Status BlockHandle::DecodeFrom(Slice* input) { | |
494da23a | 58 | if (GetVarint64(input, &offset_) && GetVarint64(input, &size_)) { |
7c673cae FG |
59 | return Status::OK(); |
60 | } else { | |
61 | // reset in case failure after partially decoding | |
62 | offset_ = 0; | |
63 | size_ = 0; | |
64 | return Status::Corruption("bad block handle"); | |
65 | } | |
66 | } | |
67 | ||
11fdf7f2 TL |
68 | Status BlockHandle::DecodeSizeFrom(uint64_t _offset, Slice* input) { |
69 | if (GetVarint64(input, &size_)) { | |
70 | offset_ = _offset; | |
71 | return Status::OK(); | |
72 | } else { | |
73 | // reset in case failure after partially decoding | |
74 | offset_ = 0; | |
75 | size_ = 0; | |
76 | return Status::Corruption("bad block handle"); | |
77 | } | |
78 | } | |
79 | ||
7c673cae FG |
80 | // Return a string that contains the copy of handle. |
81 | std::string BlockHandle::ToString(bool hex) const { | |
82 | std::string handle_str; | |
83 | EncodeTo(&handle_str); | |
84 | if (hex) { | |
85 | return Slice(handle_str).ToString(true); | |
86 | } else { | |
87 | return handle_str; | |
88 | } | |
89 | } | |
90 | ||
// Shared "null" handle, constructed with offset 0 and size 0.
const BlockHandle BlockHandle::kNullBlockHandle(0, 0);
92 | ||
f67539c2 TL |
93 | void IndexValue::EncodeTo(std::string* dst, bool have_first_key, |
94 | const BlockHandle* previous_handle) const { | |
95 | if (previous_handle) { | |
96 | assert(handle.offset() == previous_handle->offset() + | |
97 | previous_handle->size() + kBlockTrailerSize); | |
98 | PutVarsignedint64(dst, handle.size() - previous_handle->size()); | |
99 | } else { | |
100 | handle.EncodeTo(dst); | |
101 | } | |
102 | assert(dst->size() != 0); | |
103 | ||
104 | if (have_first_key) { | |
105 | PutLengthPrefixedSlice(dst, first_internal_key); | |
106 | } | |
107 | } | |
108 | ||
109 | Status IndexValue::DecodeFrom(Slice* input, bool have_first_key, | |
110 | const BlockHandle* previous_handle) { | |
111 | if (previous_handle) { | |
112 | int64_t delta; | |
113 | if (!GetVarsignedint64(input, &delta)) { | |
114 | return Status::Corruption("bad delta-encoded index value"); | |
115 | } | |
116 | handle = BlockHandle( | |
117 | previous_handle->offset() + previous_handle->size() + kBlockTrailerSize, | |
118 | previous_handle->size() + delta); | |
119 | } else { | |
120 | Status s = handle.DecodeFrom(input); | |
121 | if (!s.ok()) { | |
122 | return s; | |
123 | } | |
124 | } | |
125 | ||
126 | if (!have_first_key) { | |
127 | first_internal_key = Slice(); | |
128 | } else if (!GetLengthPrefixedSlice(input, &first_internal_key)) { | |
129 | return Status::Corruption("bad first key in block info"); | |
130 | } | |
131 | ||
132 | return Status::OK(); | |
133 | } | |
134 | ||
135 | std::string IndexValue::ToString(bool hex, bool have_first_key) const { | |
136 | std::string s; | |
137 | EncodeTo(&s, have_first_key, nullptr); | |
138 | if (hex) { | |
139 | return Slice(s).ToString(true); | |
140 | } else { | |
141 | return s; | |
142 | } | |
143 | } | |
144 | ||
7c673cae FG |
145 | namespace { |
146 | inline bool IsLegacyFooterFormat(uint64_t magic_number) { | |
147 | return magic_number == kLegacyBlockBasedTableMagicNumber || | |
148 | magic_number == kLegacyPlainTableMagicNumber; | |
149 | } | |
150 | inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) { | |
151 | if (magic_number == kLegacyBlockBasedTableMagicNumber) { | |
152 | return kBlockBasedTableMagicNumber; | |
153 | } | |
154 | if (magic_number == kLegacyPlainTableMagicNumber) { | |
155 | return kPlainTableMagicNumber; | |
156 | } | |
157 | assert(false); | |
158 | return 0; | |
159 | } | |
160 | } // namespace | |
161 | ||
162 | // legacy footer format: | |
163 | // metaindex handle (varint64 offset, varint64 size) | |
164 | // index handle (varint64 offset, varint64 size) | |
165 | // <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength | |
166 | // table_magic_number (8 bytes) | |
167 | // new footer format: | |
11fdf7f2 | 168 | // checksum type (char, 1 byte) |
7c673cae FG |
169 | // metaindex handle (varint64 offset, varint64 size) |
170 | // index handle (varint64 offset, varint64 size) | |
171 | // <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength + 1 | |
172 | // footer version (4 bytes) | |
173 | // table_magic_number (8 bytes) | |
174 | void Footer::EncodeTo(std::string* dst) const { | |
175 | assert(HasInitializedTableMagicNumber()); | |
176 | if (IsLegacyFooterFormat(table_magic_number())) { | |
177 | // has to be default checksum with legacy footer | |
178 | assert(checksum_ == kCRC32c); | |
179 | const size_t original_size = dst->size(); | |
180 | metaindex_handle_.EncodeTo(dst); | |
181 | index_handle_.EncodeTo(dst); | |
182 | dst->resize(original_size + 2 * BlockHandle::kMaxEncodedLength); // Padding | |
183 | PutFixed32(dst, static_cast<uint32_t>(table_magic_number() & 0xffffffffu)); | |
184 | PutFixed32(dst, static_cast<uint32_t>(table_magic_number() >> 32)); | |
185 | assert(dst->size() == original_size + kVersion0EncodedLength); | |
186 | } else { | |
187 | const size_t original_size = dst->size(); | |
188 | dst->push_back(static_cast<char>(checksum_)); | |
189 | metaindex_handle_.EncodeTo(dst); | |
190 | index_handle_.EncodeTo(dst); | |
191 | dst->resize(original_size + kNewVersionsEncodedLength - 12); // Padding | |
192 | PutFixed32(dst, version()); | |
193 | PutFixed32(dst, static_cast<uint32_t>(table_magic_number() & 0xffffffffu)); | |
194 | PutFixed32(dst, static_cast<uint32_t>(table_magic_number() >> 32)); | |
195 | assert(dst->size() == original_size + kNewVersionsEncodedLength); | |
196 | } | |
197 | } | |
198 | ||
199 | Footer::Footer(uint64_t _table_magic_number, uint32_t _version) | |
200 | : version_(_version), | |
201 | checksum_(kCRC32c), | |
202 | table_magic_number_(_table_magic_number) { | |
203 | // This should be guaranteed by constructor callers | |
204 | assert(!IsLegacyFooterFormat(_table_magic_number) || version_ == 0); | |
205 | } | |
206 | ||
207 | Status Footer::DecodeFrom(Slice* input) { | |
208 | assert(!HasInitializedTableMagicNumber()); | |
209 | assert(input != nullptr); | |
210 | assert(input->size() >= kMinEncodedLength); | |
211 | ||
494da23a | 212 | const char* magic_ptr = |
7c673cae FG |
213 | input->data() + input->size() - kMagicNumberLengthByte; |
214 | const uint32_t magic_lo = DecodeFixed32(magic_ptr); | |
215 | const uint32_t magic_hi = DecodeFixed32(magic_ptr + 4); | |
216 | uint64_t magic = ((static_cast<uint64_t>(magic_hi) << 32) | | |
217 | (static_cast<uint64_t>(magic_lo))); | |
218 | ||
219 | // We check for legacy formats here and silently upconvert them | |
220 | bool legacy = IsLegacyFooterFormat(magic); | |
221 | if (legacy) { | |
222 | magic = UpconvertLegacyFooterFormat(magic); | |
223 | } | |
224 | set_table_magic_number(magic); | |
225 | ||
226 | if (legacy) { | |
227 | // The size is already asserted to be at least kMinEncodedLength | |
228 | // at the beginning of the function | |
229 | input->remove_prefix(input->size() - kVersion0EncodedLength); | |
230 | version_ = 0 /* legacy */; | |
231 | checksum_ = kCRC32c; | |
232 | } else { | |
233 | version_ = DecodeFixed32(magic_ptr - 4); | |
234 | // Footer version 1 and higher will always occupy exactly this many bytes. | |
235 | // It consists of the checksum type, two block handles, padding, | |
236 | // a version number, and a magic number | |
237 | if (input->size() < kNewVersionsEncodedLength) { | |
238 | return Status::Corruption("input is too short to be an sstable"); | |
239 | } else { | |
240 | input->remove_prefix(input->size() - kNewVersionsEncodedLength); | |
241 | } | |
242 | uint32_t chksum; | |
243 | if (!GetVarint32(input, &chksum)) { | |
244 | return Status::Corruption("bad checksum type"); | |
245 | } | |
246 | checksum_ = static_cast<ChecksumType>(chksum); | |
247 | } | |
248 | ||
249 | Status result = metaindex_handle_.DecodeFrom(input); | |
250 | if (result.ok()) { | |
251 | result = index_handle_.DecodeFrom(input); | |
252 | } | |
253 | if (result.ok()) { | |
254 | // We skip over any leftover data (just padding for now) in "input" | |
255 | const char* end = magic_ptr + kMagicNumberLengthByte; | |
256 | *input = Slice(end, input->data() + input->size() - end); | |
257 | } | |
258 | return result; | |
259 | } | |
260 | ||
261 | std::string Footer::ToString() const { | |
11fdf7f2 | 262 | std::string result; |
7c673cae FG |
263 | result.reserve(1024); |
264 | ||
265 | bool legacy = IsLegacyFooterFormat(table_magic_number_); | |
266 | if (legacy) { | |
267 | result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n "); | |
268 | result.append("index handle: " + index_handle_.ToString() + "\n "); | |
269 | result.append("table_magic_number: " + | |
f67539c2 | 270 | ROCKSDB_NAMESPACE::ToString(table_magic_number_) + "\n "); |
7c673cae | 271 | } else { |
f67539c2 TL |
272 | result.append("checksum: " + ROCKSDB_NAMESPACE::ToString(checksum_) + |
273 | "\n "); | |
7c673cae FG |
274 | result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n "); |
275 | result.append("index handle: " + index_handle_.ToString() + "\n "); | |
f67539c2 TL |
276 | result.append("footer version: " + ROCKSDB_NAMESPACE::ToString(version_) + |
277 | "\n "); | |
7c673cae | 278 | result.append("table_magic_number: " + |
f67539c2 | 279 | ROCKSDB_NAMESPACE::ToString(table_magic_number_) + "\n "); |
7c673cae FG |
280 | } |
281 | return result; | |
282 | } | |
283 | ||
11fdf7f2 TL |
284 | Status ReadFooterFromFile(RandomAccessFileReader* file, |
285 | FilePrefetchBuffer* prefetch_buffer, | |
286 | uint64_t file_size, Footer* footer, | |
287 | uint64_t enforce_table_magic_number) { | |
7c673cae | 288 | if (file_size < Footer::kMinEncodedLength) { |
494da23a TL |
289 | return Status::Corruption("file is too short (" + ToString(file_size) + |
290 | " bytes) to be an " | |
291 | "sstable: " + | |
292 | file->file_name()); | |
7c673cae FG |
293 | } |
294 | ||
295 | char footer_space[Footer::kMaxEncodedLength]; | |
296 | Slice footer_input; | |
297 | size_t read_offset = | |
298 | (file_size > Footer::kMaxEncodedLength) | |
299 | ? static_cast<size_t>(file_size - Footer::kMaxEncodedLength) | |
300 | : 0; | |
11fdf7f2 TL |
301 | Status s; |
302 | if (prefetch_buffer == nullptr || | |
303 | !prefetch_buffer->TryReadFromCache(read_offset, Footer::kMaxEncodedLength, | |
304 | &footer_input)) { | |
305 | s = file->Read(read_offset, Footer::kMaxEncodedLength, &footer_input, | |
306 | footer_space); | |
307 | if (!s.ok()) return s; | |
308 | } | |
7c673cae FG |
309 | |
310 | // Check that we actually read the whole footer from the file. It may be | |
311 | // that size isn't correct. | |
312 | if (footer_input.size() < Footer::kMinEncodedLength) { | |
494da23a TL |
313 | return Status::Corruption("file is too short (" + ToString(file_size) + |
314 | " bytes) to be an " | |
315 | "sstable" + | |
316 | file->file_name()); | |
7c673cae FG |
317 | } |
318 | ||
319 | s = footer->DecodeFrom(&footer_input); | |
320 | if (!s.ok()) { | |
321 | return s; | |
322 | } | |
323 | if (enforce_table_magic_number != 0 && | |
324 | enforce_table_magic_number != footer->table_magic_number()) { | |
11fdf7f2 | 325 | return Status::Corruption( |
494da23a TL |
326 | "Bad table magic number: expected " + |
327 | ToString(enforce_table_magic_number) + ", found " + | |
328 | ToString(footer->table_magic_number()) + " in " + file->file_name()); | |
7c673cae FG |
329 | } |
330 | return Status::OK(); | |
331 | } | |
332 | ||
7c673cae | 333 | Status UncompressBlockContentsForCompressionType( |
494da23a | 334 | const UncompressionInfo& uncompression_info, const char* data, size_t n, |
11fdf7f2 | 335 | BlockContents* contents, uint32_t format_version, |
494da23a TL |
336 | const ImmutableCFOptions& ioptions, MemoryAllocator* allocator) { |
337 | CacheAllocationPtr ubuf; | |
7c673cae | 338 | |
494da23a | 339 | assert(uncompression_info.type() != kNoCompression && |
11fdf7f2 | 340 | "Invalid compression type"); |
7c673cae | 341 | |
494da23a TL |
342 | StopWatchNano timer(ioptions.env, ShouldReportDetailedTime( |
343 | ioptions.env, ioptions.statistics)); | |
7c673cae | 344 | int decompress_size = 0; |
494da23a | 345 | switch (uncompression_info.type()) { |
7c673cae FG |
346 | case kSnappyCompression: { |
347 | size_t ulength = 0; | |
348 | static char snappy_corrupt_msg[] = | |
494da23a | 349 | "Snappy not supported or corrupted Snappy compressed block contents"; |
7c673cae FG |
350 | if (!Snappy_GetUncompressedLength(data, n, &ulength)) { |
351 | return Status::Corruption(snappy_corrupt_msg); | |
352 | } | |
494da23a | 353 | ubuf = AllocateBlock(ulength, allocator); |
7c673cae FG |
354 | if (!Snappy_Uncompress(data, n, ubuf.get())) { |
355 | return Status::Corruption(snappy_corrupt_msg); | |
356 | } | |
494da23a | 357 | *contents = BlockContents(std::move(ubuf), ulength); |
7c673cae FG |
358 | break; |
359 | } | |
360 | case kZlibCompression: | |
494da23a TL |
361 | ubuf = Zlib_Uncompress( |
362 | uncompression_info, data, n, &decompress_size, | |
363 | GetCompressFormatForVersion(kZlibCompression, format_version), | |
364 | allocator); | |
7c673cae FG |
365 | if (!ubuf) { |
366 | static char zlib_corrupt_msg[] = | |
494da23a | 367 | "Zlib not supported or corrupted Zlib compressed block contents"; |
7c673cae FG |
368 | return Status::Corruption(zlib_corrupt_msg); |
369 | } | |
494da23a | 370 | *contents = BlockContents(std::move(ubuf), decompress_size); |
7c673cae FG |
371 | break; |
372 | case kBZip2Compression: | |
494da23a | 373 | ubuf = BZip2_Uncompress( |
7c673cae | 374 | data, n, &decompress_size, |
494da23a TL |
375 | GetCompressFormatForVersion(kBZip2Compression, format_version), |
376 | allocator); | |
7c673cae FG |
377 | if (!ubuf) { |
378 | static char bzip2_corrupt_msg[] = | |
494da23a | 379 | "Bzip2 not supported or corrupted Bzip2 compressed block contents"; |
7c673cae FG |
380 | return Status::Corruption(bzip2_corrupt_msg); |
381 | } | |
494da23a | 382 | *contents = BlockContents(std::move(ubuf), decompress_size); |
7c673cae FG |
383 | break; |
384 | case kLZ4Compression: | |
494da23a TL |
385 | ubuf = LZ4_Uncompress( |
386 | uncompression_info, data, n, &decompress_size, | |
387 | GetCompressFormatForVersion(kLZ4Compression, format_version), | |
388 | allocator); | |
7c673cae FG |
389 | if (!ubuf) { |
390 | static char lz4_corrupt_msg[] = | |
494da23a | 391 | "LZ4 not supported or corrupted LZ4 compressed block contents"; |
7c673cae FG |
392 | return Status::Corruption(lz4_corrupt_msg); |
393 | } | |
494da23a | 394 | *contents = BlockContents(std::move(ubuf), decompress_size); |
7c673cae FG |
395 | break; |
396 | case kLZ4HCCompression: | |
494da23a TL |
397 | ubuf = LZ4_Uncompress( |
398 | uncompression_info, data, n, &decompress_size, | |
399 | GetCompressFormatForVersion(kLZ4HCCompression, format_version), | |
400 | allocator); | |
7c673cae FG |
401 | if (!ubuf) { |
402 | static char lz4hc_corrupt_msg[] = | |
494da23a | 403 | "LZ4HC not supported or corrupted LZ4HC compressed block contents"; |
7c673cae FG |
404 | return Status::Corruption(lz4hc_corrupt_msg); |
405 | } | |
494da23a | 406 | *contents = BlockContents(std::move(ubuf), decompress_size); |
7c673cae FG |
407 | break; |
408 | case kXpressCompression: | |
494da23a TL |
409 | // XPRESS allocates memory internally, thus no support for custom |
410 | // allocator. | |
7c673cae FG |
411 | ubuf.reset(XPRESS_Uncompress(data, n, &decompress_size)); |
412 | if (!ubuf) { | |
413 | static char xpress_corrupt_msg[] = | |
494da23a TL |
414 | "XPRESS not supported or corrupted XPRESS compressed block " |
415 | "contents"; | |
7c673cae FG |
416 | return Status::Corruption(xpress_corrupt_msg); |
417 | } | |
494da23a | 418 | *contents = BlockContents(std::move(ubuf), decompress_size); |
7c673cae FG |
419 | break; |
420 | case kZSTD: | |
421 | case kZSTDNotFinalCompression: | |
494da23a TL |
422 | ubuf = ZSTD_Uncompress(uncompression_info, data, n, &decompress_size, |
423 | allocator); | |
7c673cae FG |
424 | if (!ubuf) { |
425 | static char zstd_corrupt_msg[] = | |
426 | "ZSTD not supported or corrupted ZSTD compressed block contents"; | |
427 | return Status::Corruption(zstd_corrupt_msg); | |
428 | } | |
494da23a | 429 | *contents = BlockContents(std::move(ubuf), decompress_size); |
7c673cae FG |
430 | break; |
431 | default: | |
432 | return Status::Corruption("bad block type"); | |
433 | } | |
434 | ||
494da23a TL |
435 | if (ShouldReportDetailedTime(ioptions.env, ioptions.statistics)) { |
436 | RecordTimeToHistogram(ioptions.statistics, DECOMPRESSION_TIMES_NANOS, | |
437 | timer.ElapsedNanos()); | |
7c673cae | 438 | } |
494da23a TL |
439 | RecordTimeToHistogram(ioptions.statistics, BYTES_DECOMPRESSED, |
440 | contents->data.size()); | |
11fdf7f2 | 441 | RecordTick(ioptions.statistics, NUMBER_BLOCK_DECOMPRESSED); |
7c673cae FG |
442 | |
443 | return Status::OK(); | |
444 | } | |
445 | ||
446 | // | |
447 | // The 'data' points to the raw block contents that was read in from file. | |
448 | // This method allocates a new heap buffer and the raw block | |
449 | // contents are uncompresed into this buffer. This | |
450 | // buffer is returned via 'result' and it is upto the caller to | |
451 | // free this buffer. | |
452 | // format_version is the block format as defined in include/rocksdb/table.h | |
494da23a | 453 | Status UncompressBlockContents(const UncompressionInfo& uncompression_info, |
11fdf7f2 | 454 | const char* data, size_t n, |
7c673cae | 455 | BlockContents* contents, uint32_t format_version, |
494da23a TL |
456 | const ImmutableCFOptions& ioptions, |
457 | MemoryAllocator* allocator) { | |
7c673cae | 458 | assert(data[n] != kNoCompression); |
494da23a TL |
459 | assert(data[n] == uncompression_info.type()); |
460 | return UncompressBlockContentsForCompressionType(uncompression_info, data, n, | |
461 | contents, format_version, | |
462 | ioptions, allocator); | |
7c673cae FG |
463 | } |
464 | ||
f67539c2 | 465 | } // namespace ROCKSDB_NAMESPACE |