]>
Commit | Line | Data |
---|---|---|
1 | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. | |
2 | // This source code is licensed under both the GPLv2 (found in the | |
3 | // COPYING file in the root directory) and Apache 2.0 License | |
4 | // (found in the LICENSE.Apache file in the root directory). | |
5 | // | |
6 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |
7 | // Use of this source code is governed by a BSD-style license that can be | |
8 | // found in the LICENSE file. See the AUTHORS file for names of contributors. | |
9 | ||
10 | #include "table/format.h" | |
11 | ||
12 | #include <string> | |
13 | #include <inttypes.h> | |
14 | ||
15 | #include "monitoring/perf_context_imp.h" | |
16 | #include "monitoring/statistics.h" | |
17 | #include "rocksdb/env.h" | |
18 | #include "table/block.h" | |
19 | #include "table/block_based_table_reader.h" | |
20 | #include "table/block_fetcher.h" | |
21 | #include "table/persistent_cache_helper.h" | |
22 | #include "util/coding.h" | |
23 | #include "util/compression.h" | |
24 | #include "util/crc32c.h" | |
25 | #include "util/file_reader_writer.h" | |
26 | #include "util/logging.h" | |
27 | #include "util/stop_watch.h" | |
28 | #include "util/string_util.h" | |
29 | #include "util/xxhash.h" | |
30 | ||
31 | namespace rocksdb { | |
32 | ||
// Magic numbers of the block-based table formats. Only declared here; the
// definitions are not part of this file.
extern const uint64_t kLegacyBlockBasedTableMagicNumber;
extern const uint64_t kBlockBasedTableMagicNumber;

#ifdef ROCKSDB_LITE
// ROCKSDB_LITE doesn't have plain table, so both plain-table magic numbers
// are defined locally as zero.
const uint64_t kLegacyPlainTableMagicNumber = 0;
const uint64_t kPlainTableMagicNumber = 0;
#else
// Plain-table magic numbers; only declared here, like the ones above.
extern const uint64_t kLegacyPlainTableMagicNumber;
extern const uint64_t kPlainTableMagicNumber;
#endif  // ROCKSDB_LITE
44 | ||
45 | bool ShouldReportDetailedTime(Env* env, Statistics* stats) { | |
46 | return env != nullptr && stats != nullptr && | |
47 | stats->stats_level_ > kExceptDetailedTimers; | |
48 | } | |
49 | ||
50 | void BlockHandle::EncodeTo(std::string* dst) const { | |
51 | // Sanity check that all fields have been set | |
52 | assert(offset_ != ~static_cast<uint64_t>(0)); | |
53 | assert(size_ != ~static_cast<uint64_t>(0)); | |
54 | PutVarint64Varint64(dst, offset_, size_); | |
55 | } | |
56 | ||
57 | Status BlockHandle::DecodeFrom(Slice* input) { | |
58 | if (GetVarint64(input, &offset_) && | |
59 | GetVarint64(input, &size_)) { | |
60 | return Status::OK(); | |
61 | } else { | |
62 | // reset in case failure after partially decoding | |
63 | offset_ = 0; | |
64 | size_ = 0; | |
65 | return Status::Corruption("bad block handle"); | |
66 | } | |
67 | } | |
68 | ||
69 | Status BlockHandle::DecodeSizeFrom(uint64_t _offset, Slice* input) { | |
70 | if (GetVarint64(input, &size_)) { | |
71 | offset_ = _offset; | |
72 | return Status::OK(); | |
73 | } else { | |
74 | // reset in case failure after partially decoding | |
75 | offset_ = 0; | |
76 | size_ = 0; | |
77 | return Status::Corruption("bad block handle"); | |
78 | } | |
79 | } | |
80 | ||
81 | // Return a string that contains the copy of handle. | |
82 | std::string BlockHandle::ToString(bool hex) const { | |
83 | std::string handle_str; | |
84 | EncodeTo(&handle_str); | |
85 | if (hex) { | |
86 | return Slice(handle_str).ToString(true); | |
87 | } else { | |
88 | return handle_str; | |
89 | } | |
90 | } | |
91 | ||
// Shared "null" handle, constructed from (0, 0) -- presumably offset 0 and
// size 0; confirm against the BlockHandle constructor.
const BlockHandle BlockHandle::kNullBlockHandle(0, 0);
93 | ||
94 | namespace { | |
95 | inline bool IsLegacyFooterFormat(uint64_t magic_number) { | |
96 | return magic_number == kLegacyBlockBasedTableMagicNumber || | |
97 | magic_number == kLegacyPlainTableMagicNumber; | |
98 | } | |
99 | inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) { | |
100 | if (magic_number == kLegacyBlockBasedTableMagicNumber) { | |
101 | return kBlockBasedTableMagicNumber; | |
102 | } | |
103 | if (magic_number == kLegacyPlainTableMagicNumber) { | |
104 | return kPlainTableMagicNumber; | |
105 | } | |
106 | assert(false); | |
107 | return 0; | |
108 | } | |
109 | } // namespace | |
110 | ||
111 | // legacy footer format: | |
112 | // metaindex handle (varint64 offset, varint64 size) | |
113 | // index handle (varint64 offset, varint64 size) | |
114 | // <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength | |
115 | // table_magic_number (8 bytes) | |
116 | // new footer format: | |
117 | // checksum type (char, 1 byte) | |
118 | // metaindex handle (varint64 offset, varint64 size) | |
119 | // index handle (varint64 offset, varint64 size) | |
120 | // <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength + 1 | |
121 | // footer version (4 bytes) | |
122 | // table_magic_number (8 bytes) | |
123 | void Footer::EncodeTo(std::string* dst) const { | |
124 | assert(HasInitializedTableMagicNumber()); | |
125 | if (IsLegacyFooterFormat(table_magic_number())) { | |
126 | // has to be default checksum with legacy footer | |
127 | assert(checksum_ == kCRC32c); | |
128 | const size_t original_size = dst->size(); | |
129 | metaindex_handle_.EncodeTo(dst); | |
130 | index_handle_.EncodeTo(dst); | |
131 | dst->resize(original_size + 2 * BlockHandle::kMaxEncodedLength); // Padding | |
132 | PutFixed32(dst, static_cast<uint32_t>(table_magic_number() & 0xffffffffu)); | |
133 | PutFixed32(dst, static_cast<uint32_t>(table_magic_number() >> 32)); | |
134 | assert(dst->size() == original_size + kVersion0EncodedLength); | |
135 | } else { | |
136 | const size_t original_size = dst->size(); | |
137 | dst->push_back(static_cast<char>(checksum_)); | |
138 | metaindex_handle_.EncodeTo(dst); | |
139 | index_handle_.EncodeTo(dst); | |
140 | dst->resize(original_size + kNewVersionsEncodedLength - 12); // Padding | |
141 | PutFixed32(dst, version()); | |
142 | PutFixed32(dst, static_cast<uint32_t>(table_magic_number() & 0xffffffffu)); | |
143 | PutFixed32(dst, static_cast<uint32_t>(table_magic_number() >> 32)); | |
144 | assert(dst->size() == original_size + kNewVersionsEncodedLength); | |
145 | } | |
146 | } | |
147 | ||
// Builds a footer for the given table magic number and footer version. The
// checksum type starts out as kCRC32c. A legacy magic number is only valid
// together with version 0, which is asserted below.
Footer::Footer(uint64_t _table_magic_number, uint32_t _version)
    : version_(_version),
      checksum_(kCRC32c),
      table_magic_number_(_table_magic_number) {
  // This should be guaranteed by constructor callers
  assert(!IsLegacyFooterFormat(_table_magic_number) || version_ == 0);
}
155 | ||
156 | Status Footer::DecodeFrom(Slice* input) { | |
157 | assert(!HasInitializedTableMagicNumber()); | |
158 | assert(input != nullptr); | |
159 | assert(input->size() >= kMinEncodedLength); | |
160 | ||
161 | const char *magic_ptr = | |
162 | input->data() + input->size() - kMagicNumberLengthByte; | |
163 | const uint32_t magic_lo = DecodeFixed32(magic_ptr); | |
164 | const uint32_t magic_hi = DecodeFixed32(magic_ptr + 4); | |
165 | uint64_t magic = ((static_cast<uint64_t>(magic_hi) << 32) | | |
166 | (static_cast<uint64_t>(magic_lo))); | |
167 | ||
168 | // We check for legacy formats here and silently upconvert them | |
169 | bool legacy = IsLegacyFooterFormat(magic); | |
170 | if (legacy) { | |
171 | magic = UpconvertLegacyFooterFormat(magic); | |
172 | } | |
173 | set_table_magic_number(magic); | |
174 | ||
175 | if (legacy) { | |
176 | // The size is already asserted to be at least kMinEncodedLength | |
177 | // at the beginning of the function | |
178 | input->remove_prefix(input->size() - kVersion0EncodedLength); | |
179 | version_ = 0 /* legacy */; | |
180 | checksum_ = kCRC32c; | |
181 | } else { | |
182 | version_ = DecodeFixed32(magic_ptr - 4); | |
183 | // Footer version 1 and higher will always occupy exactly this many bytes. | |
184 | // It consists of the checksum type, two block handles, padding, | |
185 | // a version number, and a magic number | |
186 | if (input->size() < kNewVersionsEncodedLength) { | |
187 | return Status::Corruption("input is too short to be an sstable"); | |
188 | } else { | |
189 | input->remove_prefix(input->size() - kNewVersionsEncodedLength); | |
190 | } | |
191 | uint32_t chksum; | |
192 | if (!GetVarint32(input, &chksum)) { | |
193 | return Status::Corruption("bad checksum type"); | |
194 | } | |
195 | checksum_ = static_cast<ChecksumType>(chksum); | |
196 | } | |
197 | ||
198 | Status result = metaindex_handle_.DecodeFrom(input); | |
199 | if (result.ok()) { | |
200 | result = index_handle_.DecodeFrom(input); | |
201 | } | |
202 | if (result.ok()) { | |
203 | // We skip over any leftover data (just padding for now) in "input" | |
204 | const char* end = magic_ptr + kMagicNumberLengthByte; | |
205 | *input = Slice(end, input->data() + input->size() - end); | |
206 | } | |
207 | return result; | |
208 | } | |
209 | ||
210 | std::string Footer::ToString() const { | |
211 | std::string result; | |
212 | result.reserve(1024); | |
213 | ||
214 | bool legacy = IsLegacyFooterFormat(table_magic_number_); | |
215 | if (legacy) { | |
216 | result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n "); | |
217 | result.append("index handle: " + index_handle_.ToString() + "\n "); | |
218 | result.append("table_magic_number: " + | |
219 | rocksdb::ToString(table_magic_number_) + "\n "); | |
220 | } else { | |
221 | result.append("checksum: " + rocksdb::ToString(checksum_) + "\n "); | |
222 | result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n "); | |
223 | result.append("index handle: " + index_handle_.ToString() + "\n "); | |
224 | result.append("footer version: " + rocksdb::ToString(version_) + "\n "); | |
225 | result.append("table_magic_number: " + | |
226 | rocksdb::ToString(table_magic_number_) + "\n "); | |
227 | } | |
228 | return result; | |
229 | } | |
230 | ||
231 | Status ReadFooterFromFile(RandomAccessFileReader* file, | |
232 | FilePrefetchBuffer* prefetch_buffer, | |
233 | uint64_t file_size, Footer* footer, | |
234 | uint64_t enforce_table_magic_number) { | |
235 | if (file_size < Footer::kMinEncodedLength) { | |
236 | return Status::Corruption( | |
237 | "file is too short (" + ToString(file_size) + " bytes) to be an " | |
238 | "sstable: " + file->file_name()); | |
239 | } | |
240 | ||
241 | char footer_space[Footer::kMaxEncodedLength]; | |
242 | Slice footer_input; | |
243 | size_t read_offset = | |
244 | (file_size > Footer::kMaxEncodedLength) | |
245 | ? static_cast<size_t>(file_size - Footer::kMaxEncodedLength) | |
246 | : 0; | |
247 | Status s; | |
248 | if (prefetch_buffer == nullptr || | |
249 | !prefetch_buffer->TryReadFromCache(read_offset, Footer::kMaxEncodedLength, | |
250 | &footer_input)) { | |
251 | s = file->Read(read_offset, Footer::kMaxEncodedLength, &footer_input, | |
252 | footer_space); | |
253 | if (!s.ok()) return s; | |
254 | } | |
255 | ||
256 | // Check that we actually read the whole footer from the file. It may be | |
257 | // that size isn't correct. | |
258 | if (footer_input.size() < Footer::kMinEncodedLength) { | |
259 | return Status::Corruption( | |
260 | "file is too short (" + ToString(file_size) + " bytes) to be an " | |
261 | "sstable" + file->file_name()); | |
262 | } | |
263 | ||
264 | s = footer->DecodeFrom(&footer_input); | |
265 | if (!s.ok()) { | |
266 | return s; | |
267 | } | |
268 | if (enforce_table_magic_number != 0 && | |
269 | enforce_table_magic_number != footer->table_magic_number()) { | |
270 | return Status::Corruption( | |
271 | "Bad table magic number: expected " | |
272 | + ToString(enforce_table_magic_number) + ", found " | |
273 | + ToString(footer->table_magic_number()) | |
274 | + " in " + file->file_name()); | |
275 | } | |
276 | return Status::OK(); | |
277 | } | |
278 | ||
279 | Status UncompressBlockContentsForCompressionType( | |
280 | const UncompressionContext& uncompression_ctx, const char* data, size_t n, | |
281 | BlockContents* contents, uint32_t format_version, | |
282 | const ImmutableCFOptions& ioptions) { | |
283 | std::unique_ptr<char[]> ubuf; | |
284 | ||
285 | assert(uncompression_ctx.type() != kNoCompression && | |
286 | "Invalid compression type"); | |
287 | ||
288 | StopWatchNano timer(ioptions.env, | |
289 | ShouldReportDetailedTime(ioptions.env, ioptions.statistics)); | |
290 | int decompress_size = 0; | |
291 | switch (uncompression_ctx.type()) { | |
292 | case kSnappyCompression: { | |
293 | size_t ulength = 0; | |
294 | static char snappy_corrupt_msg[] = | |
295 | "Snappy not supported or corrupted Snappy compressed block contents"; | |
296 | if (!Snappy_GetUncompressedLength(data, n, &ulength)) { | |
297 | return Status::Corruption(snappy_corrupt_msg); | |
298 | } | |
299 | ubuf.reset(new char[ulength]); | |
300 | if (!Snappy_Uncompress(data, n, ubuf.get())) { | |
301 | return Status::Corruption(snappy_corrupt_msg); | |
302 | } | |
303 | *contents = BlockContents(std::move(ubuf), ulength, true, kNoCompression); | |
304 | break; | |
305 | } | |
306 | case kZlibCompression: | |
307 | ubuf.reset(Zlib_Uncompress( | |
308 | uncompression_ctx, data, n, &decompress_size, | |
309 | GetCompressFormatForVersion(kZlibCompression, format_version))); | |
310 | if (!ubuf) { | |
311 | static char zlib_corrupt_msg[] = | |
312 | "Zlib not supported or corrupted Zlib compressed block contents"; | |
313 | return Status::Corruption(zlib_corrupt_msg); | |
314 | } | |
315 | *contents = | |
316 | BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); | |
317 | break; | |
318 | case kBZip2Compression: | |
319 | ubuf.reset(BZip2_Uncompress( | |
320 | data, n, &decompress_size, | |
321 | GetCompressFormatForVersion(kBZip2Compression, format_version))); | |
322 | if (!ubuf) { | |
323 | static char bzip2_corrupt_msg[] = | |
324 | "Bzip2 not supported or corrupted Bzip2 compressed block contents"; | |
325 | return Status::Corruption(bzip2_corrupt_msg); | |
326 | } | |
327 | *contents = | |
328 | BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); | |
329 | break; | |
330 | case kLZ4Compression: | |
331 | ubuf.reset(LZ4_Uncompress( | |
332 | uncompression_ctx, data, n, &decompress_size, | |
333 | GetCompressFormatForVersion(kLZ4Compression, format_version))); | |
334 | if (!ubuf) { | |
335 | static char lz4_corrupt_msg[] = | |
336 | "LZ4 not supported or corrupted LZ4 compressed block contents"; | |
337 | return Status::Corruption(lz4_corrupt_msg); | |
338 | } | |
339 | *contents = | |
340 | BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); | |
341 | break; | |
342 | case kLZ4HCCompression: | |
343 | ubuf.reset(LZ4_Uncompress( | |
344 | uncompression_ctx, data, n, &decompress_size, | |
345 | GetCompressFormatForVersion(kLZ4HCCompression, format_version))); | |
346 | if (!ubuf) { | |
347 | static char lz4hc_corrupt_msg[] = | |
348 | "LZ4HC not supported or corrupted LZ4HC compressed block contents"; | |
349 | return Status::Corruption(lz4hc_corrupt_msg); | |
350 | } | |
351 | *contents = | |
352 | BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); | |
353 | break; | |
354 | case kXpressCompression: | |
355 | ubuf.reset(XPRESS_Uncompress(data, n, &decompress_size)); | |
356 | if (!ubuf) { | |
357 | static char xpress_corrupt_msg[] = | |
358 | "XPRESS not supported or corrupted XPRESS compressed block contents"; | |
359 | return Status::Corruption(xpress_corrupt_msg); | |
360 | } | |
361 | *contents = | |
362 | BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); | |
363 | break; | |
364 | case kZSTD: | |
365 | case kZSTDNotFinalCompression: | |
366 | ubuf.reset(ZSTD_Uncompress(uncompression_ctx, data, n, &decompress_size)); | |
367 | if (!ubuf) { | |
368 | static char zstd_corrupt_msg[] = | |
369 | "ZSTD not supported or corrupted ZSTD compressed block contents"; | |
370 | return Status::Corruption(zstd_corrupt_msg); | |
371 | } | |
372 | *contents = | |
373 | BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); | |
374 | break; | |
375 | default: | |
376 | return Status::Corruption("bad block type"); | |
377 | } | |
378 | ||
379 | if(ShouldReportDetailedTime(ioptions.env, ioptions.statistics)){ | |
380 | MeasureTime(ioptions.statistics, DECOMPRESSION_TIMES_NANOS, | |
381 | timer.ElapsedNanos()); | |
382 | } | |
383 | MeasureTime(ioptions.statistics, BYTES_DECOMPRESSED, contents->data.size()); | |
384 | RecordTick(ioptions.statistics, NUMBER_BLOCK_DECOMPRESSED); | |
385 | ||
386 | return Status::OK(); | |
387 | } | |
388 | ||
//
// The 'data' points to the raw block contents that was read in from file.
// The raw block contents are uncompressed by
// UncompressBlockContentsForCompressionType() and the uncompressed data is
// handed back through 'contents'.
// The byte at data[n], i.e. the one just past the 'n' bytes of block data,
// must hold the compression type: it is asserted to differ from
// kNoCompression and to match uncompression_ctx.type().
// format_version is the block format as defined in include/rocksdb/table.h
Status UncompressBlockContents(const UncompressionContext& uncompression_ctx,
                               const char* data, size_t n,
                               BlockContents* contents, uint32_t format_version,
                               const ImmutableCFOptions& ioptions) {
  assert(data[n] != kNoCompression);
  assert(data[n] == uncompression_ctx.type());
  return UncompressBlockContentsForCompressionType(
      uncompression_ctx, data, n, contents, format_version, ioptions);
}
405 | ||
406 | } // namespace rocksdb |