]>
Commit | Line | Data |
---|---|---|
7c673cae | 1 | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
11fdf7f2 TL |
2 | // This source code is licensed under both the GPLv2 (found in the |
3 | // COPYING file in the root directory) and Apache 2.0 License | |
4 | // (found in the LICENSE.Apache file in the root directory). | |
7c673cae FG |
5 | // |
6 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |
7 | // Use of this source code is governed by a BSD-style license that can be | |
8 | // found in the LICENSE file. See the AUTHORS file for names of contributors. | |
9 | ||
10 | #include "table/format.h" | |
11 | ||
f67539c2 | 12 | #include <cinttypes> |
494da23a | 13 | #include <string> |
7c673cae | 14 | |
f67539c2 TL |
15 | #include "block_fetcher.h" |
16 | #include "file/random_access_file_reader.h" | |
f67539c2 | 17 | #include "memory/memory_allocator.h" |
7c673cae FG |
18 | #include "monitoring/perf_context_imp.h" |
19 | #include "monitoring/statistics.h" | |
20 | #include "rocksdb/env.h" | |
20effc67 | 21 | #include "rocksdb/options.h" |
f67539c2 TL |
22 | #include "table/block_based/block.h" |
23 | #include "table/block_based/block_based_table_reader.h" | |
7c673cae FG |
24 | #include "table/persistent_cache_helper.h" |
25 | #include "util/coding.h" | |
26 | #include "util/compression.h" | |
27 | #include "util/crc32c.h" | |
7c673cae FG |
28 | #include "util/stop_watch.h" |
29 | #include "util/string_util.h" | |
7c673cae | 30 | |
f67539c2 | 31 | namespace ROCKSDB_NAMESPACE { |
7c673cae FG |
32 | |
// Table magic numbers used by the footer code below. The block-based ones are
// only declared here; their definitions live in another translation unit.
extern const uint64_t kLegacyBlockBasedTableMagicNumber;
extern const uint64_t kBlockBasedTableMagicNumber;

#ifdef ROCKSDB_LITE
// ROCKSDB_LITE doesn't have plain table, so the plain-table magic numbers are
// defined locally as zero instead of being pulled in from elsewhere.
const uint64_t kLegacyPlainTableMagicNumber = 0;
const uint64_t kPlainTableMagicNumber = 0;
#else
extern const uint64_t kLegacyPlainTableMagicNumber;
extern const uint64_t kPlainTableMagicNumber;
#endif
// Keyword that ReifyDbHostIdProperty() compares a db_host_id against.
const char* kHostnameForDbHostId = "__hostname__";
7c673cae FG |
45 | |
46 | bool ShouldReportDetailedTime(Env* env, Statistics* stats) { | |
47 | return env != nullptr && stats != nullptr && | |
494da23a | 48 | stats->get_stats_level() > kExceptDetailedTimers; |
7c673cae FG |
49 | } |
50 | ||
51 | void BlockHandle::EncodeTo(std::string* dst) const { | |
52 | // Sanity check that all fields have been set | |
53 | assert(offset_ != ~static_cast<uint64_t>(0)); | |
54 | assert(size_ != ~static_cast<uint64_t>(0)); | |
55 | PutVarint64Varint64(dst, offset_, size_); | |
56 | } | |
57 | ||
58 | Status BlockHandle::DecodeFrom(Slice* input) { | |
494da23a | 59 | if (GetVarint64(input, &offset_) && GetVarint64(input, &size_)) { |
7c673cae FG |
60 | return Status::OK(); |
61 | } else { | |
62 | // reset in case failure after partially decoding | |
63 | offset_ = 0; | |
64 | size_ = 0; | |
65 | return Status::Corruption("bad block handle"); | |
66 | } | |
67 | } | |
68 | ||
11fdf7f2 TL |
69 | Status BlockHandle::DecodeSizeFrom(uint64_t _offset, Slice* input) { |
70 | if (GetVarint64(input, &size_)) { | |
71 | offset_ = _offset; | |
72 | return Status::OK(); | |
73 | } else { | |
74 | // reset in case failure after partially decoding | |
75 | offset_ = 0; | |
76 | size_ = 0; | |
77 | return Status::Corruption("bad block handle"); | |
78 | } | |
79 | } | |
80 | ||
7c673cae FG |
81 | // Return a string that contains the copy of handle. |
82 | std::string BlockHandle::ToString(bool hex) const { | |
83 | std::string handle_str; | |
84 | EncodeTo(&handle_str); | |
85 | if (hex) { | |
86 | return Slice(handle_str).ToString(true); | |
87 | } else { | |
88 | return handle_str; | |
89 | } | |
90 | } | |
91 | ||
// Shared constant handle constructed with offset 0 and size 0.
const BlockHandle BlockHandle::kNullBlockHandle(0, 0);
93 | ||
f67539c2 TL |
94 | void IndexValue::EncodeTo(std::string* dst, bool have_first_key, |
95 | const BlockHandle* previous_handle) const { | |
96 | if (previous_handle) { | |
97 | assert(handle.offset() == previous_handle->offset() + | |
98 | previous_handle->size() + kBlockTrailerSize); | |
99 | PutVarsignedint64(dst, handle.size() - previous_handle->size()); | |
100 | } else { | |
101 | handle.EncodeTo(dst); | |
102 | } | |
103 | assert(dst->size() != 0); | |
104 | ||
105 | if (have_first_key) { | |
106 | PutLengthPrefixedSlice(dst, first_internal_key); | |
107 | } | |
108 | } | |
109 | ||
110 | Status IndexValue::DecodeFrom(Slice* input, bool have_first_key, | |
111 | const BlockHandle* previous_handle) { | |
112 | if (previous_handle) { | |
113 | int64_t delta; | |
114 | if (!GetVarsignedint64(input, &delta)) { | |
115 | return Status::Corruption("bad delta-encoded index value"); | |
116 | } | |
117 | handle = BlockHandle( | |
118 | previous_handle->offset() + previous_handle->size() + kBlockTrailerSize, | |
119 | previous_handle->size() + delta); | |
120 | } else { | |
121 | Status s = handle.DecodeFrom(input); | |
122 | if (!s.ok()) { | |
123 | return s; | |
124 | } | |
125 | } | |
126 | ||
127 | if (!have_first_key) { | |
128 | first_internal_key = Slice(); | |
129 | } else if (!GetLengthPrefixedSlice(input, &first_internal_key)) { | |
130 | return Status::Corruption("bad first key in block info"); | |
131 | } | |
132 | ||
133 | return Status::OK(); | |
134 | } | |
135 | ||
136 | std::string IndexValue::ToString(bool hex, bool have_first_key) const { | |
137 | std::string s; | |
138 | EncodeTo(&s, have_first_key, nullptr); | |
139 | if (hex) { | |
140 | return Slice(s).ToString(true); | |
141 | } else { | |
142 | return s; | |
143 | } | |
144 | } | |
145 | ||
7c673cae FG |
146 | namespace { |
147 | inline bool IsLegacyFooterFormat(uint64_t magic_number) { | |
148 | return magic_number == kLegacyBlockBasedTableMagicNumber || | |
149 | magic_number == kLegacyPlainTableMagicNumber; | |
150 | } | |
151 | inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) { | |
152 | if (magic_number == kLegacyBlockBasedTableMagicNumber) { | |
153 | return kBlockBasedTableMagicNumber; | |
154 | } | |
155 | if (magic_number == kLegacyPlainTableMagicNumber) { | |
156 | return kPlainTableMagicNumber; | |
157 | } | |
158 | assert(false); | |
159 | return 0; | |
160 | } | |
161 | } // namespace | |
162 | ||
163 | // legacy footer format: | |
164 | // metaindex handle (varint64 offset, varint64 size) | |
165 | // index handle (varint64 offset, varint64 size) | |
166 | // <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength | |
167 | // table_magic_number (8 bytes) | |
168 | // new footer format: | |
11fdf7f2 | 169 | // checksum type (char, 1 byte) |
7c673cae FG |
170 | // metaindex handle (varint64 offset, varint64 size) |
171 | // index handle (varint64 offset, varint64 size) | |
172 | // <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength + 1 | |
173 | // footer version (4 bytes) | |
174 | // table_magic_number (8 bytes) | |
175 | void Footer::EncodeTo(std::string* dst) const { | |
176 | assert(HasInitializedTableMagicNumber()); | |
177 | if (IsLegacyFooterFormat(table_magic_number())) { | |
178 | // has to be default checksum with legacy footer | |
179 | assert(checksum_ == kCRC32c); | |
180 | const size_t original_size = dst->size(); | |
181 | metaindex_handle_.EncodeTo(dst); | |
182 | index_handle_.EncodeTo(dst); | |
183 | dst->resize(original_size + 2 * BlockHandle::kMaxEncodedLength); // Padding | |
184 | PutFixed32(dst, static_cast<uint32_t>(table_magic_number() & 0xffffffffu)); | |
185 | PutFixed32(dst, static_cast<uint32_t>(table_magic_number() >> 32)); | |
186 | assert(dst->size() == original_size + kVersion0EncodedLength); | |
187 | } else { | |
188 | const size_t original_size = dst->size(); | |
189 | dst->push_back(static_cast<char>(checksum_)); | |
190 | metaindex_handle_.EncodeTo(dst); | |
191 | index_handle_.EncodeTo(dst); | |
192 | dst->resize(original_size + kNewVersionsEncodedLength - 12); // Padding | |
193 | PutFixed32(dst, version()); | |
194 | PutFixed32(dst, static_cast<uint32_t>(table_magic_number() & 0xffffffffu)); | |
195 | PutFixed32(dst, static_cast<uint32_t>(table_magic_number() >> 32)); | |
196 | assert(dst->size() == original_size + kNewVersionsEncodedLength); | |
197 | } | |
198 | } | |
199 | ||
200 | Footer::Footer(uint64_t _table_magic_number, uint32_t _version) | |
201 | : version_(_version), | |
202 | checksum_(kCRC32c), | |
203 | table_magic_number_(_table_magic_number) { | |
204 | // This should be guaranteed by constructor callers | |
205 | assert(!IsLegacyFooterFormat(_table_magic_number) || version_ == 0); | |
206 | } | |
207 | ||
208 | Status Footer::DecodeFrom(Slice* input) { | |
209 | assert(!HasInitializedTableMagicNumber()); | |
210 | assert(input != nullptr); | |
211 | assert(input->size() >= kMinEncodedLength); | |
212 | ||
494da23a | 213 | const char* magic_ptr = |
7c673cae FG |
214 | input->data() + input->size() - kMagicNumberLengthByte; |
215 | const uint32_t magic_lo = DecodeFixed32(magic_ptr); | |
216 | const uint32_t magic_hi = DecodeFixed32(magic_ptr + 4); | |
217 | uint64_t magic = ((static_cast<uint64_t>(magic_hi) << 32) | | |
218 | (static_cast<uint64_t>(magic_lo))); | |
219 | ||
220 | // We check for legacy formats here and silently upconvert them | |
221 | bool legacy = IsLegacyFooterFormat(magic); | |
222 | if (legacy) { | |
223 | magic = UpconvertLegacyFooterFormat(magic); | |
224 | } | |
225 | set_table_magic_number(magic); | |
226 | ||
227 | if (legacy) { | |
228 | // The size is already asserted to be at least kMinEncodedLength | |
229 | // at the beginning of the function | |
230 | input->remove_prefix(input->size() - kVersion0EncodedLength); | |
231 | version_ = 0 /* legacy */; | |
232 | checksum_ = kCRC32c; | |
233 | } else { | |
234 | version_ = DecodeFixed32(magic_ptr - 4); | |
235 | // Footer version 1 and higher will always occupy exactly this many bytes. | |
236 | // It consists of the checksum type, two block handles, padding, | |
237 | // a version number, and a magic number | |
238 | if (input->size() < kNewVersionsEncodedLength) { | |
239 | return Status::Corruption("input is too short to be an sstable"); | |
240 | } else { | |
241 | input->remove_prefix(input->size() - kNewVersionsEncodedLength); | |
242 | } | |
243 | uint32_t chksum; | |
244 | if (!GetVarint32(input, &chksum)) { | |
245 | return Status::Corruption("bad checksum type"); | |
246 | } | |
247 | checksum_ = static_cast<ChecksumType>(chksum); | |
248 | } | |
249 | ||
250 | Status result = metaindex_handle_.DecodeFrom(input); | |
251 | if (result.ok()) { | |
252 | result = index_handle_.DecodeFrom(input); | |
253 | } | |
254 | if (result.ok()) { | |
255 | // We skip over any leftover data (just padding for now) in "input" | |
256 | const char* end = magic_ptr + kMagicNumberLengthByte; | |
257 | *input = Slice(end, input->data() + input->size() - end); | |
258 | } | |
259 | return result; | |
260 | } | |
261 | ||
262 | std::string Footer::ToString() const { | |
11fdf7f2 | 263 | std::string result; |
7c673cae FG |
264 | result.reserve(1024); |
265 | ||
266 | bool legacy = IsLegacyFooterFormat(table_magic_number_); | |
267 | if (legacy) { | |
268 | result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n "); | |
269 | result.append("index handle: " + index_handle_.ToString() + "\n "); | |
270 | result.append("table_magic_number: " + | |
f67539c2 | 271 | ROCKSDB_NAMESPACE::ToString(table_magic_number_) + "\n "); |
7c673cae | 272 | } else { |
f67539c2 TL |
273 | result.append("checksum: " + ROCKSDB_NAMESPACE::ToString(checksum_) + |
274 | "\n "); | |
7c673cae FG |
275 | result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n "); |
276 | result.append("index handle: " + index_handle_.ToString() + "\n "); | |
f67539c2 TL |
277 | result.append("footer version: " + ROCKSDB_NAMESPACE::ToString(version_) + |
278 | "\n "); | |
7c673cae | 279 | result.append("table_magic_number: " + |
f67539c2 | 280 | ROCKSDB_NAMESPACE::ToString(table_magic_number_) + "\n "); |
7c673cae FG |
281 | } |
282 | return result; | |
283 | } | |
284 | ||
20effc67 | 285 | Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file, |
11fdf7f2 TL |
286 | FilePrefetchBuffer* prefetch_buffer, |
287 | uint64_t file_size, Footer* footer, | |
288 | uint64_t enforce_table_magic_number) { | |
7c673cae | 289 | if (file_size < Footer::kMinEncodedLength) { |
494da23a TL |
290 | return Status::Corruption("file is too short (" + ToString(file_size) + |
291 | " bytes) to be an " | |
292 | "sstable: " + | |
293 | file->file_name()); | |
7c673cae FG |
294 | } |
295 | ||
20effc67 TL |
296 | std::string footer_buf; |
297 | AlignedBuf internal_buf; | |
7c673cae FG |
298 | Slice footer_input; |
299 | size_t read_offset = | |
300 | (file_size > Footer::kMaxEncodedLength) | |
301 | ? static_cast<size_t>(file_size - Footer::kMaxEncodedLength) | |
302 | : 0; | |
11fdf7f2 | 303 | Status s; |
20effc67 TL |
304 | // TODO: Need to pass appropriate deadline to TryReadFromCache(). Right now, |
305 | // there is no readahead for point lookups, so TryReadFromCache will fail if | |
306 | // the required data is not in the prefetch buffer. Once deadline is enabled | |
307 | // for iterator, TryReadFromCache might do a readahead. Revisit to see if we | |
308 | // need to pass a timeout at that point | |
11fdf7f2 | 309 | if (prefetch_buffer == nullptr || |
20effc67 TL |
310 | !prefetch_buffer->TryReadFromCache( |
311 | IOOptions(), read_offset, Footer::kMaxEncodedLength, &footer_input)) { | |
312 | if (file->use_direct_io()) { | |
313 | s = file->Read(opts, read_offset, Footer::kMaxEncodedLength, | |
314 | &footer_input, nullptr, &internal_buf); | |
315 | } else { | |
316 | footer_buf.reserve(Footer::kMaxEncodedLength); | |
317 | s = file->Read(opts, read_offset, Footer::kMaxEncodedLength, | |
318 | &footer_input, &footer_buf[0], nullptr); | |
319 | } | |
11fdf7f2 TL |
320 | if (!s.ok()) return s; |
321 | } | |
7c673cae FG |
322 | |
323 | // Check that we actually read the whole footer from the file. It may be | |
324 | // that size isn't correct. | |
325 | if (footer_input.size() < Footer::kMinEncodedLength) { | |
494da23a TL |
326 | return Status::Corruption("file is too short (" + ToString(file_size) + |
327 | " bytes) to be an " | |
328 | "sstable" + | |
329 | file->file_name()); | |
7c673cae FG |
330 | } |
331 | ||
332 | s = footer->DecodeFrom(&footer_input); | |
333 | if (!s.ok()) { | |
334 | return s; | |
335 | } | |
336 | if (enforce_table_magic_number != 0 && | |
337 | enforce_table_magic_number != footer->table_magic_number()) { | |
11fdf7f2 | 338 | return Status::Corruption( |
494da23a TL |
339 | "Bad table magic number: expected " + |
340 | ToString(enforce_table_magic_number) + ", found " + | |
341 | ToString(footer->table_magic_number()) + " in " + file->file_name()); | |
7c673cae FG |
342 | } |
343 | return Status::OK(); | |
344 | } | |
345 | ||
7c673cae | 346 | Status UncompressBlockContentsForCompressionType( |
494da23a | 347 | const UncompressionInfo& uncompression_info, const char* data, size_t n, |
11fdf7f2 | 348 | BlockContents* contents, uint32_t format_version, |
494da23a | 349 | const ImmutableCFOptions& ioptions, MemoryAllocator* allocator) { |
20effc67 | 350 | Status ret = Status::OK(); |
7c673cae | 351 | |
494da23a | 352 | assert(uncompression_info.type() != kNoCompression && |
11fdf7f2 | 353 | "Invalid compression type"); |
7c673cae | 354 | |
494da23a TL |
355 | StopWatchNano timer(ioptions.env, ShouldReportDetailedTime( |
356 | ioptions.env, ioptions.statistics)); | |
20effc67 TL |
357 | size_t uncompressed_size = 0; |
358 | CacheAllocationPtr ubuf = | |
359 | UncompressData(uncompression_info, data, n, &uncompressed_size, | |
360 | GetCompressFormatForVersion(format_version), allocator); | |
361 | if (!ubuf) { | |
362 | return Status::Corruption( | |
363 | "Unsupported compression method or corrupted compressed block contents", | |
364 | CompressionTypeToString(uncompression_info.type())); | |
7c673cae FG |
365 | } |
366 | ||
20effc67 TL |
367 | *contents = BlockContents(std::move(ubuf), uncompressed_size); |
368 | ||
494da23a TL |
369 | if (ShouldReportDetailedTime(ioptions.env, ioptions.statistics)) { |
370 | RecordTimeToHistogram(ioptions.statistics, DECOMPRESSION_TIMES_NANOS, | |
371 | timer.ElapsedNanos()); | |
7c673cae | 372 | } |
494da23a TL |
373 | RecordTimeToHistogram(ioptions.statistics, BYTES_DECOMPRESSED, |
374 | contents->data.size()); | |
11fdf7f2 | 375 | RecordTick(ioptions.statistics, NUMBER_BLOCK_DECOMPRESSED); |
7c673cae | 376 | |
20effc67 TL |
377 | TEST_SYNC_POINT_CALLBACK( |
378 | "UncompressBlockContentsForCompressionType:TamperWithReturnValue", | |
379 | static_cast<void*>(&ret)); | |
380 | TEST_SYNC_POINT_CALLBACK( | |
381 | "UncompressBlockContentsForCompressionType:" | |
382 | "TamperWithDecompressionOutput", | |
383 | static_cast<void*>(contents)); | |
384 | ||
385 | return ret; | |
7c673cae FG |
386 | } |
387 | ||
388 | // | |
389 | // The 'data' points to the raw block contents that was read in from file. | |
390 | // This method allocates a new heap buffer and the raw block | |
391 | // contents are uncompresed into this buffer. This | |
392 | // buffer is returned via 'result' and it is upto the caller to | |
393 | // free this buffer. | |
394 | // format_version is the block format as defined in include/rocksdb/table.h | |
494da23a | 395 | Status UncompressBlockContents(const UncompressionInfo& uncompression_info, |
11fdf7f2 | 396 | const char* data, size_t n, |
7c673cae | 397 | BlockContents* contents, uint32_t format_version, |
494da23a TL |
398 | const ImmutableCFOptions& ioptions, |
399 | MemoryAllocator* allocator) { | |
7c673cae | 400 | assert(data[n] != kNoCompression); |
20effc67 | 401 | assert(data[n] == static_cast<char>(uncompression_info.type())); |
494da23a TL |
402 | return UncompressBlockContentsForCompressionType(uncompression_info, data, n, |
403 | contents, format_version, | |
404 | ioptions, allocator); | |
7c673cae FG |
405 | } |
406 | ||
20effc67 TL |
407 | // Replace the contents of db_host_id with the actual hostname, if db_host_id |
408 | // matches the keyword kHostnameForDbHostId | |
409 | Status ReifyDbHostIdProperty(Env* env, std::string* db_host_id) { | |
410 | assert(db_host_id); | |
411 | if (*db_host_id == kHostnameForDbHostId) { | |
412 | Status s = env->GetHostNameString(db_host_id); | |
413 | if (!s.ok()) { | |
414 | db_host_id->clear(); | |
415 | } | |
416 | return s; | |
417 | } | |
418 | ||
419 | return Status::OK(); | |
420 | } | |
f67539c2 | 421 | } // namespace ROCKSDB_NAMESPACE |