]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/table/format.cc
import quincy beta 17.1.0
[ceph.git] / ceph / src / rocksdb / table / format.cc
CommitLineData
7c673cae 1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5//
6// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7// Use of this source code is governed by a BSD-style license that can be
8// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10#include "table/format.h"
11
f67539c2 12#include <cinttypes>
494da23a 13#include <string>
7c673cae 14
f67539c2
TL
15#include "block_fetcher.h"
16#include "file/random_access_file_reader.h"
f67539c2 17#include "memory/memory_allocator.h"
7c673cae
FG
18#include "monitoring/perf_context_imp.h"
19#include "monitoring/statistics.h"
20#include "rocksdb/env.h"
20effc67 21#include "rocksdb/options.h"
f67539c2
TL
22#include "table/block_based/block.h"
23#include "table/block_based/block_based_table_reader.h"
7c673cae
FG
24#include "table/persistent_cache_helper.h"
25#include "util/coding.h"
26#include "util/compression.h"
27#include "util/crc32c.h"
7c673cae
FG
28#include "util/stop_watch.h"
29#include "util/string_util.h"
7c673cae 30
f67539c2 31namespace ROCKSDB_NAMESPACE {
7c673cae
FG
32
// Table magic numbers. These are only declared here; the definitions live in
// other translation units.
extern const uint64_t kLegacyBlockBasedTableMagicNumber;
extern const uint64_t kBlockBasedTableMagicNumber;

#ifndef ROCKSDB_LITE
extern const uint64_t kLegacyPlainTableMagicNumber;
extern const uint64_t kPlainTableMagicNumber;
#else
// ROCKSDB_LITE doesn't have plain table, so the plain-table magic numbers are
// defined here as zero placeholders.
const uint64_t kLegacyPlainTableMagicNumber = 0;
const uint64_t kPlainTableMagicNumber = 0;
#endif

// Keyword that ReifyDbHostIdProperty() replaces with the actual host name.
const char* kHostnameForDbHostId = "__hostname__";
7c673cae
FG
45
46bool ShouldReportDetailedTime(Env* env, Statistics* stats) {
47 return env != nullptr && stats != nullptr &&
494da23a 48 stats->get_stats_level() > kExceptDetailedTimers;
7c673cae
FG
49}
50
51void BlockHandle::EncodeTo(std::string* dst) const {
52 // Sanity check that all fields have been set
53 assert(offset_ != ~static_cast<uint64_t>(0));
54 assert(size_ != ~static_cast<uint64_t>(0));
55 PutVarint64Varint64(dst, offset_, size_);
56}
57
58Status BlockHandle::DecodeFrom(Slice* input) {
494da23a 59 if (GetVarint64(input, &offset_) && GetVarint64(input, &size_)) {
7c673cae
FG
60 return Status::OK();
61 } else {
62 // reset in case failure after partially decoding
63 offset_ = 0;
64 size_ = 0;
65 return Status::Corruption("bad block handle");
66 }
67}
68
11fdf7f2
TL
69Status BlockHandle::DecodeSizeFrom(uint64_t _offset, Slice* input) {
70 if (GetVarint64(input, &size_)) {
71 offset_ = _offset;
72 return Status::OK();
73 } else {
74 // reset in case failure after partially decoding
75 offset_ = 0;
76 size_ = 0;
77 return Status::Corruption("bad block handle");
78 }
79}
80
7c673cae
FG
81// Return a string that contains the copy of handle.
82std::string BlockHandle::ToString(bool hex) const {
83 std::string handle_str;
84 EncodeTo(&handle_str);
85 if (hex) {
86 return Slice(handle_str).ToString(true);
87 } else {
88 return handle_str;
89 }
90}
91
92const BlockHandle BlockHandle::kNullBlockHandle(0, 0);
93
f67539c2
TL
94void IndexValue::EncodeTo(std::string* dst, bool have_first_key,
95 const BlockHandle* previous_handle) const {
96 if (previous_handle) {
97 assert(handle.offset() == previous_handle->offset() +
98 previous_handle->size() + kBlockTrailerSize);
99 PutVarsignedint64(dst, handle.size() - previous_handle->size());
100 } else {
101 handle.EncodeTo(dst);
102 }
103 assert(dst->size() != 0);
104
105 if (have_first_key) {
106 PutLengthPrefixedSlice(dst, first_internal_key);
107 }
108}
109
110Status IndexValue::DecodeFrom(Slice* input, bool have_first_key,
111 const BlockHandle* previous_handle) {
112 if (previous_handle) {
113 int64_t delta;
114 if (!GetVarsignedint64(input, &delta)) {
115 return Status::Corruption("bad delta-encoded index value");
116 }
117 handle = BlockHandle(
118 previous_handle->offset() + previous_handle->size() + kBlockTrailerSize,
119 previous_handle->size() + delta);
120 } else {
121 Status s = handle.DecodeFrom(input);
122 if (!s.ok()) {
123 return s;
124 }
125 }
126
127 if (!have_first_key) {
128 first_internal_key = Slice();
129 } else if (!GetLengthPrefixedSlice(input, &first_internal_key)) {
130 return Status::Corruption("bad first key in block info");
131 }
132
133 return Status::OK();
134}
135
136std::string IndexValue::ToString(bool hex, bool have_first_key) const {
137 std::string s;
138 EncodeTo(&s, have_first_key, nullptr);
139 if (hex) {
140 return Slice(s).ToString(true);
141 } else {
142 return s;
143 }
144}
145
7c673cae
FG
146namespace {
147inline bool IsLegacyFooterFormat(uint64_t magic_number) {
148 return magic_number == kLegacyBlockBasedTableMagicNumber ||
149 magic_number == kLegacyPlainTableMagicNumber;
150}
151inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) {
152 if (magic_number == kLegacyBlockBasedTableMagicNumber) {
153 return kBlockBasedTableMagicNumber;
154 }
155 if (magic_number == kLegacyPlainTableMagicNumber) {
156 return kPlainTableMagicNumber;
157 }
158 assert(false);
159 return 0;
160}
161} // namespace
162
163// legacy footer format:
164// metaindex handle (varint64 offset, varint64 size)
165// index handle (varint64 offset, varint64 size)
166// <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength
167// table_magic_number (8 bytes)
168// new footer format:
11fdf7f2 169// checksum type (char, 1 byte)
7c673cae
FG
170// metaindex handle (varint64 offset, varint64 size)
171// index handle (varint64 offset, varint64 size)
172// <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength + 1
173// footer version (4 bytes)
174// table_magic_number (8 bytes)
175void Footer::EncodeTo(std::string* dst) const {
176 assert(HasInitializedTableMagicNumber());
177 if (IsLegacyFooterFormat(table_magic_number())) {
178 // has to be default checksum with legacy footer
179 assert(checksum_ == kCRC32c);
180 const size_t original_size = dst->size();
181 metaindex_handle_.EncodeTo(dst);
182 index_handle_.EncodeTo(dst);
183 dst->resize(original_size + 2 * BlockHandle::kMaxEncodedLength); // Padding
184 PutFixed32(dst, static_cast<uint32_t>(table_magic_number() & 0xffffffffu));
185 PutFixed32(dst, static_cast<uint32_t>(table_magic_number() >> 32));
186 assert(dst->size() == original_size + kVersion0EncodedLength);
187 } else {
188 const size_t original_size = dst->size();
189 dst->push_back(static_cast<char>(checksum_));
190 metaindex_handle_.EncodeTo(dst);
191 index_handle_.EncodeTo(dst);
192 dst->resize(original_size + kNewVersionsEncodedLength - 12); // Padding
193 PutFixed32(dst, version());
194 PutFixed32(dst, static_cast<uint32_t>(table_magic_number() & 0xffffffffu));
195 PutFixed32(dst, static_cast<uint32_t>(table_magic_number() >> 32));
196 assert(dst->size() == original_size + kNewVersionsEncodedLength);
197 }
198}
199
200Footer::Footer(uint64_t _table_magic_number, uint32_t _version)
201 : version_(_version),
202 checksum_(kCRC32c),
203 table_magic_number_(_table_magic_number) {
204 // This should be guaranteed by constructor callers
205 assert(!IsLegacyFooterFormat(_table_magic_number) || version_ == 0);
206}
207
208Status Footer::DecodeFrom(Slice* input) {
209 assert(!HasInitializedTableMagicNumber());
210 assert(input != nullptr);
211 assert(input->size() >= kMinEncodedLength);
212
494da23a 213 const char* magic_ptr =
7c673cae
FG
214 input->data() + input->size() - kMagicNumberLengthByte;
215 const uint32_t magic_lo = DecodeFixed32(magic_ptr);
216 const uint32_t magic_hi = DecodeFixed32(magic_ptr + 4);
217 uint64_t magic = ((static_cast<uint64_t>(magic_hi) << 32) |
218 (static_cast<uint64_t>(magic_lo)));
219
220 // We check for legacy formats here and silently upconvert them
221 bool legacy = IsLegacyFooterFormat(magic);
222 if (legacy) {
223 magic = UpconvertLegacyFooterFormat(magic);
224 }
225 set_table_magic_number(magic);
226
227 if (legacy) {
228 // The size is already asserted to be at least kMinEncodedLength
229 // at the beginning of the function
230 input->remove_prefix(input->size() - kVersion0EncodedLength);
231 version_ = 0 /* legacy */;
232 checksum_ = kCRC32c;
233 } else {
234 version_ = DecodeFixed32(magic_ptr - 4);
235 // Footer version 1 and higher will always occupy exactly this many bytes.
236 // It consists of the checksum type, two block handles, padding,
237 // a version number, and a magic number
238 if (input->size() < kNewVersionsEncodedLength) {
239 return Status::Corruption("input is too short to be an sstable");
240 } else {
241 input->remove_prefix(input->size() - kNewVersionsEncodedLength);
242 }
243 uint32_t chksum;
244 if (!GetVarint32(input, &chksum)) {
245 return Status::Corruption("bad checksum type");
246 }
247 checksum_ = static_cast<ChecksumType>(chksum);
248 }
249
250 Status result = metaindex_handle_.DecodeFrom(input);
251 if (result.ok()) {
252 result = index_handle_.DecodeFrom(input);
253 }
254 if (result.ok()) {
255 // We skip over any leftover data (just padding for now) in "input"
256 const char* end = magic_ptr + kMagicNumberLengthByte;
257 *input = Slice(end, input->data() + input->size() - end);
258 }
259 return result;
260}
261
262std::string Footer::ToString() const {
11fdf7f2 263 std::string result;
7c673cae
FG
264 result.reserve(1024);
265
266 bool legacy = IsLegacyFooterFormat(table_magic_number_);
267 if (legacy) {
268 result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n ");
269 result.append("index handle: " + index_handle_.ToString() + "\n ");
270 result.append("table_magic_number: " +
f67539c2 271 ROCKSDB_NAMESPACE::ToString(table_magic_number_) + "\n ");
7c673cae 272 } else {
f67539c2
TL
273 result.append("checksum: " + ROCKSDB_NAMESPACE::ToString(checksum_) +
274 "\n ");
7c673cae
FG
275 result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n ");
276 result.append("index handle: " + index_handle_.ToString() + "\n ");
f67539c2
TL
277 result.append("footer version: " + ROCKSDB_NAMESPACE::ToString(version_) +
278 "\n ");
7c673cae 279 result.append("table_magic_number: " +
f67539c2 280 ROCKSDB_NAMESPACE::ToString(table_magic_number_) + "\n ");
7c673cae
FG
281 }
282 return result;
283}
284
20effc67 285Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file,
11fdf7f2
TL
286 FilePrefetchBuffer* prefetch_buffer,
287 uint64_t file_size, Footer* footer,
288 uint64_t enforce_table_magic_number) {
7c673cae 289 if (file_size < Footer::kMinEncodedLength) {
494da23a
TL
290 return Status::Corruption("file is too short (" + ToString(file_size) +
291 " bytes) to be an "
292 "sstable: " +
293 file->file_name());
7c673cae
FG
294 }
295
20effc67
TL
296 std::string footer_buf;
297 AlignedBuf internal_buf;
7c673cae
FG
298 Slice footer_input;
299 size_t read_offset =
300 (file_size > Footer::kMaxEncodedLength)
301 ? static_cast<size_t>(file_size - Footer::kMaxEncodedLength)
302 : 0;
11fdf7f2 303 Status s;
20effc67
TL
304 // TODO: Need to pass appropriate deadline to TryReadFromCache(). Right now,
305 // there is no readahead for point lookups, so TryReadFromCache will fail if
306 // the required data is not in the prefetch buffer. Once deadline is enabled
307 // for iterator, TryReadFromCache might do a readahead. Revisit to see if we
308 // need to pass a timeout at that point
11fdf7f2 309 if (prefetch_buffer == nullptr ||
20effc67
TL
310 !prefetch_buffer->TryReadFromCache(
311 IOOptions(), read_offset, Footer::kMaxEncodedLength, &footer_input)) {
312 if (file->use_direct_io()) {
313 s = file->Read(opts, read_offset, Footer::kMaxEncodedLength,
314 &footer_input, nullptr, &internal_buf);
315 } else {
316 footer_buf.reserve(Footer::kMaxEncodedLength);
317 s = file->Read(opts, read_offset, Footer::kMaxEncodedLength,
318 &footer_input, &footer_buf[0], nullptr);
319 }
11fdf7f2
TL
320 if (!s.ok()) return s;
321 }
7c673cae
FG
322
323 // Check that we actually read the whole footer from the file. It may be
324 // that size isn't correct.
325 if (footer_input.size() < Footer::kMinEncodedLength) {
494da23a
TL
326 return Status::Corruption("file is too short (" + ToString(file_size) +
327 " bytes) to be an "
328 "sstable" +
329 file->file_name());
7c673cae
FG
330 }
331
332 s = footer->DecodeFrom(&footer_input);
333 if (!s.ok()) {
334 return s;
335 }
336 if (enforce_table_magic_number != 0 &&
337 enforce_table_magic_number != footer->table_magic_number()) {
11fdf7f2 338 return Status::Corruption(
494da23a
TL
339 "Bad table magic number: expected " +
340 ToString(enforce_table_magic_number) + ", found " +
341 ToString(footer->table_magic_number()) + " in " + file->file_name());
7c673cae
FG
342 }
343 return Status::OK();
344}
345
7c673cae 346Status UncompressBlockContentsForCompressionType(
494da23a 347 const UncompressionInfo& uncompression_info, const char* data, size_t n,
11fdf7f2 348 BlockContents* contents, uint32_t format_version,
494da23a 349 const ImmutableCFOptions& ioptions, MemoryAllocator* allocator) {
20effc67 350 Status ret = Status::OK();
7c673cae 351
494da23a 352 assert(uncompression_info.type() != kNoCompression &&
11fdf7f2 353 "Invalid compression type");
7c673cae 354
494da23a
TL
355 StopWatchNano timer(ioptions.env, ShouldReportDetailedTime(
356 ioptions.env, ioptions.statistics));
20effc67
TL
357 size_t uncompressed_size = 0;
358 CacheAllocationPtr ubuf =
359 UncompressData(uncompression_info, data, n, &uncompressed_size,
360 GetCompressFormatForVersion(format_version), allocator);
361 if (!ubuf) {
362 return Status::Corruption(
363 "Unsupported compression method or corrupted compressed block contents",
364 CompressionTypeToString(uncompression_info.type()));
7c673cae
FG
365 }
366
20effc67
TL
367 *contents = BlockContents(std::move(ubuf), uncompressed_size);
368
494da23a
TL
369 if (ShouldReportDetailedTime(ioptions.env, ioptions.statistics)) {
370 RecordTimeToHistogram(ioptions.statistics, DECOMPRESSION_TIMES_NANOS,
371 timer.ElapsedNanos());
7c673cae 372 }
494da23a
TL
373 RecordTimeToHistogram(ioptions.statistics, BYTES_DECOMPRESSED,
374 contents->data.size());
11fdf7f2 375 RecordTick(ioptions.statistics, NUMBER_BLOCK_DECOMPRESSED);
7c673cae 376
20effc67
TL
377 TEST_SYNC_POINT_CALLBACK(
378 "UncompressBlockContentsForCompressionType:TamperWithReturnValue",
379 static_cast<void*>(&ret));
380 TEST_SYNC_POINT_CALLBACK(
381 "UncompressBlockContentsForCompressionType:"
382 "TamperWithDecompressionOutput",
383 static_cast<void*>(contents));
384
385 return ret;
7c673cae
FG
386}
387
388//
389// The 'data' points to the raw block contents that was read in from file.
390// This method allocates a new heap buffer and the raw block
391// contents are uncompresed into this buffer. This
392// buffer is returned via 'result' and it is upto the caller to
393// free this buffer.
394// format_version is the block format as defined in include/rocksdb/table.h
494da23a 395Status UncompressBlockContents(const UncompressionInfo& uncompression_info,
11fdf7f2 396 const char* data, size_t n,
7c673cae 397 BlockContents* contents, uint32_t format_version,
494da23a
TL
398 const ImmutableCFOptions& ioptions,
399 MemoryAllocator* allocator) {
7c673cae 400 assert(data[n] != kNoCompression);
20effc67 401 assert(data[n] == static_cast<char>(uncompression_info.type()));
494da23a
TL
402 return UncompressBlockContentsForCompressionType(uncompression_info, data, n,
403 contents, format_version,
404 ioptions, allocator);
7c673cae
FG
405}
406
20effc67
TL
407// Replace the contents of db_host_id with the actual hostname, if db_host_id
408// matches the keyword kHostnameForDbHostId
409Status ReifyDbHostIdProperty(Env* env, std::string* db_host_id) {
410 assert(db_host_id);
411 if (*db_host_id == kHostnameForDbHostId) {
412 Status s = env->GetHostNameString(db_host_id);
413 if (!s.ok()) {
414 db_host_id->clear();
415 }
416 return s;
417 }
418
419 return Status::OK();
420}
f67539c2 421} // namespace ROCKSDB_NAMESPACE