]> git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/table/sst_file_dumper.cc
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / rocksdb / table / sst_file_dumper.cc
1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5 //
6 #ifndef ROCKSDB_LITE
7
8 #include "table/sst_file_dumper.h"
9
10 #include <chrono>
11 #include <cinttypes>
12 #include <iostream>
13 #include <map>
14 #include <memory>
15 #include <sstream>
16 #include <vector>
17
18 #include "db/blob/blob_index.h"
19 #include "db/memtable.h"
20 #include "db/write_batch_internal.h"
21 #include "options/cf_options.h"
22 #include "port/port.h"
23 #include "rocksdb/db.h"
24 #include "rocksdb/env.h"
25 #include "rocksdb/iterator.h"
26 #include "rocksdb/slice_transform.h"
27 #include "rocksdb/status.h"
28 #include "rocksdb/table_properties.h"
29 #include "rocksdb/utilities/ldb_cmd.h"
30 #include "table/block_based/block.h"
31 #include "table/block_based/block_based_table_builder.h"
32 #include "table/block_based/block_based_table_factory.h"
33 #include "table/block_based/block_builder.h"
34 #include "table/format.h"
35 #include "table/meta_blocks.h"
36 #include "table/plain/plain_table_factory.h"
37 #include "table/table_reader.h"
38 #include "util/compression.h"
39 #include "util/random.h"
40
41 namespace ROCKSDB_NAMESPACE {
42
43 SstFileDumper::SstFileDumper(const Options& options,
44 const std::string& file_path,
45 Temperature file_temp, size_t readahead_size,
46 bool verify_checksum, bool output_hex,
47 bool decode_blob_index, const EnvOptions& soptions,
48 bool silent)
49 : file_name_(file_path),
50 read_num_(0),
51 file_temp_(file_temp),
52 output_hex_(output_hex),
53 decode_blob_index_(decode_blob_index),
54 soptions_(soptions),
55 silent_(silent),
56 options_(options),
57 ioptions_(options_),
58 moptions_(ColumnFamilyOptions(options_)),
59 read_options_(verify_checksum, false),
60 internal_comparator_(BytewiseComparator()) {
61 read_options_.readahead_size = readahead_size;
62 if (!silent_) {
63 fprintf(stdout, "Process %s\n", file_path.c_str());
64 }
65 init_result_ = GetTableReader(file_name_);
66 }
67
// Magic numbers used below to recognise the table format of an SST file.
// These are declarations only; the definitions live outside this file.
extern const uint64_t kPlainTableMagicNumber;
extern const uint64_t kLegacyPlainTableMagicNumber;
extern const uint64_t kBlockBasedTableMagicNumber;
extern const uint64_t kLegacyBlockBasedTableMagicNumber;

// Name of the scratch file written by CalculateCompressedTableSize().
const char* testFileName = "test_file_name";
74
75 Status SstFileDumper::GetTableReader(const std::string& file_path) {
76 // Warning about 'magic_number' being uninitialized shows up only in UBsan
77 // builds. Though access is guarded by 's.ok()' checks, fix the issue to
78 // avoid any warnings.
79 uint64_t magic_number = Footer::kNullTableMagicNumber;
80
81 // read table magic number
82 Footer footer;
83
84 const auto& fs = options_.env->GetFileSystem();
85 std::unique_ptr<FSRandomAccessFile> file;
86 uint64_t file_size = 0;
87 FileOptions fopts = soptions_;
88 fopts.temperature = file_temp_;
89 Status s = fs->NewRandomAccessFile(file_path, fopts, &file, nullptr);
90 if (s.ok()) {
91 s = fs->GetFileSize(file_path, IOOptions(), &file_size, nullptr);
92 }
93
94 // check empty file
95 // if true, skip further processing of this file
96 if (file_size == 0) {
97 return Status::Aborted(file_path, "Empty file");
98 }
99
100 file_.reset(new RandomAccessFileReader(std::move(file), file_path));
101
102 FilePrefetchBuffer prefetch_buffer(
103 0 /* readahead_size */, 0 /* max_readahead_size */, true /* enable */,
104 false /* track_min_offset */);
105 if (s.ok()) {
106 const uint64_t kSstDumpTailPrefetchSize = 512 * 1024;
107 uint64_t prefetch_size = (file_size > kSstDumpTailPrefetchSize)
108 ? kSstDumpTailPrefetchSize
109 : file_size;
110 uint64_t prefetch_off = file_size - prefetch_size;
111 IOOptions opts;
112 s = prefetch_buffer.Prefetch(opts, file_.get(), prefetch_off,
113 static_cast<size_t>(prefetch_size),
114 Env::IO_TOTAL /* rate_limiter_priority */);
115
116 s = ReadFooterFromFile(opts, file_.get(), &prefetch_buffer, file_size,
117 &footer);
118 }
119 if (s.ok()) {
120 magic_number = footer.table_magic_number();
121 }
122
123 if (s.ok()) {
124 if (magic_number == kPlainTableMagicNumber ||
125 magic_number == kLegacyPlainTableMagicNumber) {
126 soptions_.use_mmap_reads = true;
127
128 fs->NewRandomAccessFile(file_path, fopts, &file, nullptr);
129 file_.reset(new RandomAccessFileReader(std::move(file), file_path));
130 }
131
132 // For old sst format, ReadTableProperties might fail but file can be read
133 if (ReadTableProperties(magic_number, file_.get(), file_size,
134 (magic_number == kBlockBasedTableMagicNumber)
135 ? &prefetch_buffer
136 : nullptr)
137 .ok()) {
138 s = SetTableOptionsByMagicNumber(magic_number);
139 if (s.ok()) {
140 if (table_properties_ && !table_properties_->comparator_name.empty()) {
141 ConfigOptions config_options;
142 const Comparator* user_comparator = nullptr;
143 s = Comparator::CreateFromString(config_options,
144 table_properties_->comparator_name,
145 &user_comparator);
146 if (s.ok()) {
147 assert(user_comparator);
148 internal_comparator_ = InternalKeyComparator(user_comparator);
149 }
150 }
151 }
152 } else {
153 s = SetOldTableOptions();
154 }
155 options_.comparator = internal_comparator_.user_comparator();
156 }
157
158 if (s.ok()) {
159 s = NewTableReader(ioptions_, soptions_, internal_comparator_, file_size,
160 &table_reader_);
161 }
162 return s;
163 }
164
165 Status SstFileDumper::NewTableReader(
166 const ImmutableOptions& /*ioptions*/, const EnvOptions& /*soptions*/,
167 const InternalKeyComparator& /*internal_comparator*/, uint64_t file_size,
168 std::unique_ptr<TableReader>* /*table_reader*/) {
169 auto t_opt =
170 TableReaderOptions(ioptions_, moptions_.prefix_extractor, soptions_,
171 internal_comparator_, false /* skip_filters */,
172 false /* imortal */, true /* force_direct_prefetch */);
173 // Allow open file with global sequence number for backward compatibility.
174 t_opt.largest_seqno = kMaxSequenceNumber;
175
176 // We need to turn off pre-fetching of index and filter nodes for
177 // BlockBasedTable
178 if (options_.table_factory->IsInstanceOf(
179 TableFactory::kBlockBasedTableName())) {
180 return options_.table_factory->NewTableReader(t_opt, std::move(file_),
181 file_size, &table_reader_,
182 /*enable_prefetch=*/false);
183 }
184
185 // For all other factory implementation
186 return options_.table_factory->NewTableReader(t_opt, std::move(file_),
187 file_size, &table_reader_);
188 }
189
190 Status SstFileDumper::VerifyChecksum() {
191 // We could pass specific readahead setting into read options if needed.
192 return table_reader_->VerifyChecksum(read_options_,
193 TableReaderCaller::kSSTDumpTool);
194 }
195
196 Status SstFileDumper::DumpTable(const std::string& out_filename) {
197 std::unique_ptr<WritableFile> out_file;
198 Env* env = options_.env;
199 Status s = env->NewWritableFile(out_filename, &out_file, soptions_);
200 if (s.ok()) {
201 s = table_reader_->DumpTable(out_file.get());
202 }
203 if (!s.ok()) {
204 // close the file before return error, ignore the close error if there's any
205 out_file->Close().PermitUncheckedError();
206 return s;
207 }
208 return out_file->Close();
209 }
210
211 Status SstFileDumper::CalculateCompressedTableSize(
212 const TableBuilderOptions& tb_options, size_t block_size,
213 uint64_t* num_data_blocks, uint64_t* compressed_table_size) {
214 std::unique_ptr<Env> env(NewMemEnv(options_.env));
215 std::unique_ptr<WritableFileWriter> dest_writer;
216 Status s =
217 WritableFileWriter::Create(env->GetFileSystem(), testFileName,
218 FileOptions(soptions_), &dest_writer, nullptr);
219 if (!s.ok()) {
220 return s;
221 }
222 BlockBasedTableOptions table_options;
223 table_options.block_size = block_size;
224 BlockBasedTableFactory block_based_tf(table_options);
225 std::unique_ptr<TableBuilder> table_builder;
226 table_builder.reset(
227 block_based_tf.NewTableBuilder(tb_options, dest_writer.get()));
228 std::unique_ptr<InternalIterator> iter(table_reader_->NewIterator(
229 read_options_, moptions_.prefix_extractor.get(), /*arena=*/nullptr,
230 /*skip_filters=*/false, TableReaderCaller::kSSTDumpTool));
231 for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
232 table_builder->Add(iter->key(), iter->value());
233 }
234 s = iter->status();
235 if (!s.ok()) {
236 return s;
237 }
238 s = table_builder->Finish();
239 if (!s.ok()) {
240 return s;
241 }
242 *compressed_table_size = table_builder->FileSize();
243 assert(num_data_blocks != nullptr);
244 *num_data_blocks = table_builder->GetTableProperties().num_data_blocks;
245 return env->DeleteFile(testFileName);
246 }
247
248 Status SstFileDumper::ShowAllCompressionSizes(
249 size_t block_size,
250 const std::vector<std::pair<CompressionType, const char*>>&
251 compression_types,
252 int32_t compress_level_from, int32_t compress_level_to,
253 uint32_t max_dict_bytes, uint32_t zstd_max_train_bytes,
254 uint64_t max_dict_buffer_bytes, bool use_zstd_dict_trainer) {
255 fprintf(stdout, "Block Size: %" ROCKSDB_PRIszt "\n", block_size);
256 for (auto& i : compression_types) {
257 if (CompressionTypeSupported(i.first)) {
258 fprintf(stdout, "Compression: %-24s\n", i.second);
259 CompressionOptions compress_opt;
260 compress_opt.max_dict_bytes = max_dict_bytes;
261 compress_opt.zstd_max_train_bytes = zstd_max_train_bytes;
262 compress_opt.max_dict_buffer_bytes = max_dict_buffer_bytes;
263 compress_opt.use_zstd_dict_trainer = use_zstd_dict_trainer;
264 for (int32_t j = compress_level_from; j <= compress_level_to; j++) {
265 fprintf(stdout, "Compression level: %d", j);
266 compress_opt.level = j;
267 Status s = ShowCompressionSize(block_size, i.first, compress_opt);
268 if (!s.ok()) {
269 return s;
270 }
271 }
272 } else {
273 fprintf(stdout, "Unsupported compression type: %s.\n", i.second);
274 }
275 }
276 return Status::OK();
277 }
278
279 Status SstFileDumper::ShowCompressionSize(
280 size_t block_size, CompressionType compress_type,
281 const CompressionOptions& compress_opt) {
282 Options opts;
283 opts.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
284 opts.statistics->set_stats_level(StatsLevel::kAll);
285 const ImmutableOptions imoptions(opts);
286 const ColumnFamilyOptions cfo(opts);
287 const MutableCFOptions moptions(cfo);
288 ROCKSDB_NAMESPACE::InternalKeyComparator ikc(opts.comparator);
289 IntTblPropCollectorFactories block_based_table_factories;
290
291 std::string column_family_name;
292 int unknown_level = -1;
293 TableBuilderOptions tb_opts(
294 imoptions, moptions, ikc, &block_based_table_factories, compress_type,
295 compress_opt,
296 TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
297 column_family_name, unknown_level);
298 uint64_t num_data_blocks = 0;
299 std::chrono::steady_clock::time_point start =
300 std::chrono::steady_clock::now();
301 uint64_t file_size;
302 Status s = CalculateCompressedTableSize(tb_opts, block_size, &num_data_blocks,
303 &file_size);
304 if (!s.ok()) {
305 return s;
306 }
307
308 std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
309 fprintf(stdout, " Size: %10" PRIu64, file_size);
310 fprintf(stdout, " Blocks: %6" PRIu64, num_data_blocks);
311 fprintf(stdout, " Time Taken: %10s microsecs",
312 std::to_string(
313 std::chrono::duration_cast<std::chrono::microseconds>(end - start)
314 .count())
315 .c_str());
316 const uint64_t compressed_blocks =
317 opts.statistics->getAndResetTickerCount(NUMBER_BLOCK_COMPRESSED);
318 const uint64_t not_compressed_blocks =
319 opts.statistics->getAndResetTickerCount(NUMBER_BLOCK_NOT_COMPRESSED);
320 // When the option enable_index_compression is true,
321 // NUMBER_BLOCK_COMPRESSED is incremented for index block(s).
322 if ((compressed_blocks + not_compressed_blocks) > num_data_blocks) {
323 num_data_blocks = compressed_blocks + not_compressed_blocks;
324 }
325
326 const uint64_t ratio_not_compressed_blocks =
327 (num_data_blocks - compressed_blocks) - not_compressed_blocks;
328 const double compressed_pcnt =
329 (0 == num_data_blocks) ? 0.0
330 : ((static_cast<double>(compressed_blocks) /
331 static_cast<double>(num_data_blocks)) *
332 100.0);
333 const double ratio_not_compressed_pcnt =
334 (0 == num_data_blocks)
335 ? 0.0
336 : ((static_cast<double>(ratio_not_compressed_blocks) /
337 static_cast<double>(num_data_blocks)) *
338 100.0);
339 const double not_compressed_pcnt =
340 (0 == num_data_blocks) ? 0.0
341 : ((static_cast<double>(not_compressed_blocks) /
342 static_cast<double>(num_data_blocks)) *
343 100.0);
344 fprintf(stdout, " Compressed: %6" PRIu64 " (%5.1f%%)", compressed_blocks,
345 compressed_pcnt);
346 fprintf(stdout, " Not compressed (ratio): %6" PRIu64 " (%5.1f%%)",
347 ratio_not_compressed_blocks, ratio_not_compressed_pcnt);
348 fprintf(stdout, " Not compressed (abort): %6" PRIu64 " (%5.1f%%)\n",
349 not_compressed_blocks, not_compressed_pcnt);
350 return Status::OK();
351 }
352
353 // Reads TableProperties prior to opening table reader in order to set up
354 // options.
355 Status SstFileDumper::ReadTableProperties(uint64_t table_magic_number,
356 RandomAccessFileReader* file,
357 uint64_t file_size,
358 FilePrefetchBuffer* prefetch_buffer) {
359 Status s = ROCKSDB_NAMESPACE::ReadTableProperties(
360 file, file_size, table_magic_number, ioptions_, &table_properties_,
361 /* memory_allocator= */ nullptr, prefetch_buffer);
362 if (!s.ok()) {
363 if (!silent_) {
364 fprintf(stdout, "Not able to read table properties\n");
365 }
366 }
367 return s;
368 }
369
370 Status SstFileDumper::SetTableOptionsByMagicNumber(
371 uint64_t table_magic_number) {
372 assert(table_properties_);
373 if (table_magic_number == kBlockBasedTableMagicNumber ||
374 table_magic_number == kLegacyBlockBasedTableMagicNumber) {
375 BlockBasedTableFactory* bbtf = new BlockBasedTableFactory();
376 // To force tail prefetching, we fake reporting two useful reads of 512KB
377 // from the tail.
378 // It needs at least two data points to warm up the stats.
379 bbtf->tail_prefetch_stats()->RecordEffectiveSize(512 * 1024);
380 bbtf->tail_prefetch_stats()->RecordEffectiveSize(512 * 1024);
381
382 options_.table_factory.reset(bbtf);
383 if (!silent_) {
384 fprintf(stdout, "Sst file format: block-based\n");
385 }
386
387 auto& props = table_properties_->user_collected_properties;
388 auto pos = props.find(BlockBasedTablePropertyNames::kIndexType);
389 if (pos != props.end()) {
390 auto index_type_on_file = static_cast<BlockBasedTableOptions::IndexType>(
391 DecodeFixed32(pos->second.c_str()));
392 if (index_type_on_file ==
393 BlockBasedTableOptions::IndexType::kHashSearch) {
394 options_.prefix_extractor.reset(NewNoopTransform());
395 }
396 }
397 } else if (table_magic_number == kPlainTableMagicNumber ||
398 table_magic_number == kLegacyPlainTableMagicNumber) {
399 options_.allow_mmap_reads = true;
400
401 PlainTableOptions plain_table_options;
402 plain_table_options.user_key_len = kPlainTableVariableLength;
403 plain_table_options.bloom_bits_per_key = 0;
404 plain_table_options.hash_table_ratio = 0;
405 plain_table_options.index_sparseness = 1;
406 plain_table_options.huge_page_tlb_size = 0;
407 plain_table_options.encoding_type = kPlain;
408 plain_table_options.full_scan_mode = true;
409
410 options_.table_factory.reset(NewPlainTableFactory(plain_table_options));
411 if (!silent_) {
412 fprintf(stdout, "Sst file format: plain table\n");
413 }
414 } else {
415 char error_msg_buffer[80];
416 snprintf(error_msg_buffer, sizeof(error_msg_buffer) - 1,
417 "Unsupported table magic number --- %lx",
418 (long)table_magic_number);
419 return Status::InvalidArgument(error_msg_buffer);
420 }
421
422 return Status::OK();
423 }
424
425 Status SstFileDumper::SetOldTableOptions() {
426 assert(table_properties_ == nullptr);
427 options_.table_factory = std::make_shared<BlockBasedTableFactory>();
428 if (!silent_) {
429 fprintf(stdout, "Sst file format: block-based(old version)\n");
430 }
431
432 return Status::OK();
433 }
434
435 Status SstFileDumper::ReadSequential(bool print_kv, uint64_t read_num,
436 bool has_from, const std::string& from_key,
437 bool has_to, const std::string& to_key,
438 bool use_from_as_prefix) {
439 if (!table_reader_) {
440 return init_result_;
441 }
442
443 InternalIterator* iter = table_reader_->NewIterator(
444 read_options_, moptions_.prefix_extractor.get(),
445 /*arena=*/nullptr, /*skip_filters=*/false,
446 TableReaderCaller::kSSTDumpTool);
447 uint64_t i = 0;
448 if (has_from) {
449 InternalKey ikey;
450 ikey.SetMinPossibleForUserKey(from_key);
451 iter->Seek(ikey.Encode());
452 } else {
453 iter->SeekToFirst();
454 }
455 for (; iter->Valid(); iter->Next()) {
456 Slice key = iter->key();
457 Slice value = iter->value();
458 ++i;
459 if (read_num > 0 && i > read_num) break;
460
461 ParsedInternalKey ikey;
462 Status pik_status = ParseInternalKey(key, &ikey, true /* log_err_key */);
463 if (!pik_status.ok()) {
464 std::cerr << pik_status.getState() << "\n";
465 continue;
466 }
467
468 // the key returned is not prefixed with out 'from' key
469 if (use_from_as_prefix && !ikey.user_key.starts_with(from_key)) {
470 break;
471 }
472
473 // If end marker was specified, we stop before it
474 if (has_to && BytewiseComparator()->Compare(ikey.user_key, to_key) >= 0) {
475 break;
476 }
477
478 if (print_kv) {
479 if (!decode_blob_index_ || ikey.type != kTypeBlobIndex) {
480 fprintf(stdout, "%s => %s\n",
481 ikey.DebugString(true, output_hex_).c_str(),
482 value.ToString(output_hex_).c_str());
483 } else {
484 BlobIndex blob_index;
485
486 const Status s = blob_index.DecodeFrom(value);
487 if (!s.ok()) {
488 fprintf(stderr, "%s => error decoding blob index\n",
489 ikey.DebugString(true, output_hex_).c_str());
490 continue;
491 }
492
493 fprintf(stdout, "%s => %s\n",
494 ikey.DebugString(true, output_hex_).c_str(),
495 blob_index.DebugString(output_hex_).c_str());
496 }
497 }
498 }
499
500 read_num_ += i;
501
502 Status ret = iter->status();
503 delete iter;
504 return ret;
505 }
506
507 // Provides TableProperties to API user
508 Status SstFileDumper::ReadTableProperties(
509 std::shared_ptr<const TableProperties>* table_properties) {
510 if (!table_reader_) {
511 return init_result_;
512 }
513
514 *table_properties = table_reader_->GetTableProperties();
515 return init_result_;
516 }
517 } // namespace ROCKSDB_NAMESPACE
518
519 #endif // ROCKSDB_LITE