1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
8 #include "table/sst_file_dumper.h"
18 #include "db/blob/blob_index.h"
19 #include "db/memtable.h"
20 #include "db/write_batch_internal.h"
21 #include "options/cf_options.h"
22 #include "port/port.h"
23 #include "rocksdb/db.h"
24 #include "rocksdb/env.h"
25 #include "rocksdb/iterator.h"
26 #include "rocksdb/slice_transform.h"
27 #include "rocksdb/status.h"
28 #include "rocksdb/table_properties.h"
29 #include "rocksdb/utilities/ldb_cmd.h"
30 #include "table/block_based/block.h"
31 #include "table/block_based/block_based_table_builder.h"
32 #include "table/block_based/block_based_table_factory.h"
33 #include "table/block_based/block_builder.h"
34 #include "table/format.h"
35 #include "table/meta_blocks.h"
36 #include "table/plain/plain_table_factory.h"
37 #include "table/table_reader.h"
38 #include "util/compression.h"
39 #include "util/random.h"
41 namespace ROCKSDB_NAMESPACE
{
// Builds a dumper for the SST file at `file_path`.
// The visible initializers record the path, temperature and output flags,
// derive moptions_ from options_, build read_options_ from `verify_checksum`
// and start with a bytewise internal comparator (GetTableReader may replace
// it once the table properties are known).
// The body applies `readahead_size`, prints "Process <path>" to stdout and
// stores the outcome of GetTableReader(file_name_) in init_result_.
// NOTE(review): the embedded numbering skips (47->49, 49->51, 53->58, ...),
// so the tail of the parameter list and several member initializers are not
// visible in this listing.
43 SstFileDumper::SstFileDumper(const Options
& options
,
44 const std::string
& file_path
,
45 Temperature file_temp
, size_t readahead_size
,
46 bool verify_checksum
, bool output_hex
,
47 bool decode_blob_index
, const EnvOptions
& soptions
,
49 : file_name_(file_path
),
51 file_temp_(file_temp
),
52 output_hex_(output_hex
),
53 decode_blob_index_(decode_blob_index
),
58 moptions_(ColumnFamilyOptions(options_
)),
// NOTE(review): the second argument is presumably fill_cache = false --
// confirm against the ReadOptions constructor.
59 read_options_(verify_checksum
, false),
60 internal_comparator_(BytewiseComparator()) {
61 read_options_
.readahead_size
= readahead_size
;
// NOTE(review): embedded lines 62 and 64 are missing around this print, so
// any guard on it is not visible here.
63 fprintf(stdout
, "Process %s\n", file_path
.c_str());
// The open result is kept in init_result_ rather than reported from the
// constructor.
65 init_result_
= GetTableReader(file_name_
);
// Table magic numbers, declared extern here (their definitions are not in
// this listing). GetTableReader and SetTableOptionsByMagicNumber compare the
// footer's magic number against them to tell block-based from plain tables.
68 extern const uint64_t kBlockBasedTableMagicNumber
;
69 extern const uint64_t kLegacyBlockBasedTableMagicNumber
;
70 extern const uint64_t kPlainTableMagicNumber
;
71 extern const uint64_t kLegacyPlainTableMagicNumber
;
// Name of the scratch file that CalculateCompressedTableSize creates in its
// NewMemEnv-backed Env and deletes again before returning.
73 const char* testFileName
= "test_file_name";
// Opens `file_path` and prepares everything needed to read it as a table.
// Visible steps, in order:
//   1. open the file through the Env's FileSystem and query its size;
//   2. prefetch the tail of the file and read the footer to obtain the table
//      magic number;
//   3. for plain-table magic numbers, turn on mmap reads and reopen the file;
//   4. read the table properties, pick table options from the magic number
//      (or fall back to SetOldTableOptions) and, if the properties name a
//      comparator, rebuild internal_comparator_ around it;
//   5. call NewTableReader().
// Returns a Status; Status::Aborted(file_path, "Empty file") is one visible
// early return.
// NOTE(review): the embedded numbering skips many lines, so the guards
// around most statements below and the declarations of `opts` and `footer`
// are not visible in this listing.
75 Status
SstFileDumper::GetTableReader(const std::string
& file_path
) {
76 // Warning about 'magic_number' being uninitialized shows up only in UBsan
77 // builds. Though access is guarded by 's.ok()' checks, fix the issue to
78 // avoid any warnings.
79 uint64_t magic_number
= Footer::kNullTableMagicNumber
;
81 // read table magic number
84 const auto& fs
= options_
.env
->GetFileSystem();
85 std::unique_ptr
<FSRandomAccessFile
> file
;
86 uint64_t file_size
= 0;
// fopts is a copy of soptions_ taken at this point, with the file
// temperature applied on top.
87 FileOptions fopts
= soptions_
;
88 fopts
.temperature
= file_temp_
;
89 Status s
= fs
->NewRandomAccessFile(file_path
, fopts
, &file
, nullptr);
91 s
= fs
->GetFileSize(file_path
, IOOptions(), &file_size
, nullptr);
95 // if true, skip further processing of this file
97 return Status::Aborted(file_path
, "Empty file");
// Hand ownership of the opened file to the reader wrapper stored in file_.
100 file_
.reset(new RandomAccessFileReader(std::move(file
), file_path
));
102 FilePrefetchBuffer
prefetch_buffer(
103 0 /* readahead_size */, 0 /* max_readahead_size */, true /* enable */,
104 false /* track_min_offset */);
// The prefetch is capped at 512KB; the offset is chosen so that the
// prefetched range ends at the end of the file.
// NOTE(review): the else-branch of the conditional (embedded line 109) is
// not visible here.
106 const uint64_t kSstDumpTailPrefetchSize
= 512 * 1024;
107 uint64_t prefetch_size
= (file_size
> kSstDumpTailPrefetchSize
)
108 ? kSstDumpTailPrefetchSize
110 uint64_t prefetch_off
= file_size
- prefetch_size
;
112 s
= prefetch_buffer
.Prefetch(opts
, file_
.get(), prefetch_off
,
113 static_cast<size_t>(prefetch_size
),
114 Env::IO_TOTAL
/* rate_limiter_priority */);
116 s
= ReadFooterFromFile(opts
, file_
.get(), &prefetch_buffer
, file_size
,
120 magic_number
= footer
.table_magic_number();
// Plain tables: enable mmap reads and reopen the file.
// NOTE(review): the reopen passes `fopts`, which was copied from soptions_
// before use_mmap_reads was set, and the Status returned by
// NewRandomAccessFile is not captured in the visible lines -- confirm both
// are intended.
124 if (magic_number
== kPlainTableMagicNumber
||
125 magic_number
== kLegacyPlainTableMagicNumber
) {
126 soptions_
.use_mmap_reads
= true;
128 fs
->NewRandomAccessFile(file_path
, fopts
, &file
, nullptr);
129 file_
.reset(new RandomAccessFileReader(std::move(file
), file_path
));
132 // For old sst format, ReadTableProperties might fail but file can be read
133 if (ReadTableProperties(magic_number
, file_
.get(), file_size
,
134 (magic_number
== kBlockBasedTableMagicNumber
)
138 s
= SetTableOptionsByMagicNumber(magic_number
);
// When the properties name a comparator, create it from that name and use
// it for internal key ordering instead of the bytewise default.
140 if (table_properties_
&& !table_properties_
->comparator_name
.empty()) {
141 ConfigOptions config_options
;
142 const Comparator
* user_comparator
= nullptr;
143 s
= Comparator::CreateFromString(config_options
,
144 table_properties_
->comparator_name
,
147 assert(user_comparator
);
148 internal_comparator_
= InternalKeyComparator(user_comparator
);
// NOTE(review): presumably the fallback branch for files whose properties
// could not be read (see the "old sst format" comment above) -- the branch
// structure is not visible here.
153 s
= SetOldTableOptions();
// Keep options_.comparator in sync with the comparator chosen above.
155 options_
.comparator
= internal_comparator_
.user_comparator();
159 s
= NewTableReader(ioptions_
, soptions_
, internal_comparator_
, file_size
,
// Creates the table reader for the already-opened file_ and stores it in
// table_reader_.
// Only `file_size` is used from the parameter list: the other parameter
// names are commented out, and the body reads the members ioptions_,
// soptions_, internal_comparator_ and writes table_reader_ instead.
// Ownership of file_ is moved into the table factory call.
// Returns the Status of the factory's NewTableReader().
// NOTE(review): the declaration of `t_opt` (embedded line 169) is not
// visible in this listing.
165 Status
SstFileDumper::NewTableReader(
166 const ImmutableOptions
& /*ioptions*/, const EnvOptions
& /*soptions*/,
167 const InternalKeyComparator
& /*internal_comparator*/, uint64_t file_size
,
168 std::unique_ptr
<TableReader
>* /*table_reader*/) {
170 TableReaderOptions(ioptions_
, moptions_
.prefix_extractor
, soptions_
,
171 internal_comparator_
, false /* skip_filters */,
172 false /* immortal */, true /* force_direct_prefetch */);
173 // Allow open file with global sequence number for backward compatibility.
174 t_opt
.largest_seqno
= kMaxSequenceNumber
;
176 // We need to turn off pre-fetching of index and filter nodes for
// Block-based factories get the extra trailing argument that disables
// prefetching; every other factory uses the plain call further down.
178 if (options_
.table_factory
->IsInstanceOf(
179 TableFactory::kBlockBasedTableName())) {
180 return options_
.table_factory
->NewTableReader(t_opt
, std::move(file_
),
181 file_size
, &table_reader_
,
182 /*enable_prefetch=*/false);
185 // For all other factory implementation
186 return options_
.table_factory
->NewTableReader(t_opt
, std::move(file_
),
187 file_size
, &table_reader_
);
// Verifies the table's checksums by delegating to table_reader_ with the
// dumper's read_options_, tagged as coming from the SST dump tool.
// Returns the Status reported by the table reader.
// NOTE(review): no null check on table_reader_ is visible here, unlike
// ReadSequential -- confirm callers only invoke this after a successful
// open.
190 Status
SstFileDumper::VerifyChecksum() {
191 // We could pass specific readahead setting into read options if needed.
192 return table_reader_
->VerifyChecksum(read_options_
,
193 TableReaderCaller::kSSTDumpTool
);
// Writes a dump of the table to a new file named `out_filename`.
// The file is created through options_.env using soptions_, then
// table_reader_->DumpTable() writes into it.
// Two exits are visible: one closes the file while deliberately ignoring
// the close status (per the existing comment, the error path), the other
// returns the Status of Close() itself.
// NOTE(review): the conditions selecting between these exits are not
// visible in this listing (embedded numbering skips 199->201->204).
196 Status
SstFileDumper::DumpTable(const std::string
& out_filename
) {
197 std::unique_ptr
<WritableFile
> out_file
;
198 Env
* env
= options_
.env
;
199 Status s
= env
->NewWritableFile(out_filename
, &out_file
, soptions_
);
201 s
= table_reader_
->DumpTable(out_file
.get());
204 // close the file before return error, ignore the close error if there's any
205 out_file
->Close().PermitUncheckedError();
208 return out_file
->Close();
// Re-encodes every entry of the opened table into a fresh block-based table
// and reports how large the result is.
//   tb_options            - builder options passed to NewTableBuilder().
//   block_size            - block size set on the BlockBasedTableOptions.
//   num_data_blocks       - out: num_data_blocks from the builder's table
//                           properties; asserted non-null.
//   compressed_table_size - out: FileSize() of the builder after Finish().
// The new table is written to `testFileName` inside an Env created by
// NewMemEnv(options_.env); the function returns the Status of deleting that
// file again.
// NOTE(review): error checks between the visible statements (embedded lines
// 216, 219-221, 233-237, 239-241) are not part of this listing.
211 Status
SstFileDumper::CalculateCompressedTableSize(
212 const TableBuilderOptions
& tb_options
, size_t block_size
,
213 uint64_t* num_data_blocks
, uint64_t* compressed_table_size
) {
214 std::unique_ptr
<Env
> env(NewMemEnv(options_
.env
));
215 std::unique_ptr
<WritableFileWriter
> dest_writer
;
217 WritableFileWriter::Create(env
->GetFileSystem(), testFileName
,
218 FileOptions(soptions_
), &dest_writer
, nullptr);
222 BlockBasedTableOptions table_options
;
223 table_options
.block_size
= block_size
;
224 BlockBasedTableFactory
block_based_tf(table_options
);
225 std::unique_ptr
<TableBuilder
> table_builder
;
227 block_based_tf
.NewTableBuilder(tb_options
, dest_writer
.get()));
228 std::unique_ptr
<InternalIterator
> iter(table_reader_
->NewIterator(
229 read_options_
, moptions_
.prefix_extractor
.get(), /*arena=*/nullptr,
230 /*skip_filters=*/false, TableReaderCaller::kSSTDumpTool
));
// Copy every key/value pair of the source table into the new builder.
231 for (iter
->SeekToFirst(); iter
->Valid(); iter
->Next()) {
232 table_builder
->Add(iter
->key(), iter
->value());
238 s
= table_builder
->Finish();
// NOTE(review): num_data_blocks is asserted non-null below, but no matching
// assert for compressed_table_size is visible before this dereference.
242 *compressed_table_size
= table_builder
->FileSize();
243 assert(num_data_blocks
!= nullptr);
244 *num_data_blocks
= table_builder
->GetTableProperties().num_data_blocks
;
245 return env
->DeleteFile(testFileName
);
// Prints, for every (compression type, display name) pair in
// `compression_types`, the result of ShowCompressionSize() at each level in
// the inclusive range [compress_level_from, compress_level_to].
// The dictionary-related arguments are copied into a CompressionOptions
// that is reused across levels, with only `level` changing per iteration.
// Types for which CompressionTypeSupported() is false get an
// "Unsupported compression type" line instead.
// NOTE(review): the declarations of the `block_size` and
// `compression_types` parameters (embedded lines 249 and 251) and the
// handling of the per-level Status are not visible in this listing.
248 Status
SstFileDumper::ShowAllCompressionSizes(
250 const std::vector
<std::pair
<CompressionType
, const char*>>&
252 int32_t compress_level_from
, int32_t compress_level_to
,
253 uint32_t max_dict_bytes
, uint32_t zstd_max_train_bytes
,
254 uint64_t max_dict_buffer_bytes
, bool use_zstd_dict_trainer
) {
255 fprintf(stdout
, "Block Size: %" ROCKSDB_PRIszt
"\n", block_size
);
// i.first is the compression type, i.second its printable name.
256 for (auto& i
: compression_types
) {
257 if (CompressionTypeSupported(i
.first
)) {
258 fprintf(stdout
, "Compression: %-24s\n", i
.second
);
259 CompressionOptions compress_opt
;
260 compress_opt
.max_dict_bytes
= max_dict_bytes
;
261 compress_opt
.zstd_max_train_bytes
= zstd_max_train_bytes
;
262 compress_opt
.max_dict_buffer_bytes
= max_dict_buffer_bytes
;
263 compress_opt
.use_zstd_dict_trainer
= use_zstd_dict_trainer
;
264 for (int32_t j
= compress_level_from
; j
<= compress_level_to
; j
++) {
// No newline here: ShowCompressionSize() continues on the same output line.
265 fprintf(stdout
, "Compression level: %d", j
);
266 compress_opt
.level
= j
;
267 Status s
= ShowCompressionSize(block_size
, i
.first
, compress_opt
);
273 fprintf(stdout
, "Unsupported compression type: %s.\n", i
.second
);
// Rebuilds the table with one compression type and option set, then prints
// the resulting size, block count, elapsed time and how many blocks were
// compressed or left uncompressed.
//   block_size    - forwarded to CalculateCompressedTableSize().
//   compress_type - compression type placed in the TableBuilderOptions.
//   compress_opt  - compression options for the rebuild.
// Statistics are created at StatsLevel::kAll so the block-compression
// tickers read below are populated by the rebuild.
// NOTE(review): the declaration of `opts`, `file_size` and the remaining
// TableBuilderOptions arguments (embedded lines 282, 290, 295, 303-307) are
// not visible in this listing.
279 Status
SstFileDumper::ShowCompressionSize(
280 size_t block_size
, CompressionType compress_type
,
281 const CompressionOptions
& compress_opt
) {
283 opts
.statistics
= ROCKSDB_NAMESPACE::CreateDBStatistics();
284 opts
.statistics
->set_stats_level(StatsLevel::kAll
);
285 const ImmutableOptions
imoptions(opts
);
286 const ColumnFamilyOptions
cfo(opts
);
287 const MutableCFOptions
moptions(cfo
);
288 ROCKSDB_NAMESPACE::InternalKeyComparator
ikc(opts
.comparator
);
289 IntTblPropCollectorFactories block_based_table_factories
;
291 std::string column_family_name
;
292 int unknown_level
= -1;
293 TableBuilderOptions
tb_opts(
294 imoptions
, moptions
, ikc
, &block_based_table_factories
, compress_type
,
296 TablePropertiesCollectorFactory::Context::kUnknownColumnFamily
,
297 column_family_name
, unknown_level
);
298 uint64_t num_data_blocks
= 0;
// Time the rebuild with a monotonic clock.
299 std::chrono::steady_clock::time_point start
=
300 std::chrono::steady_clock::now();
302 Status s
= CalculateCompressedTableSize(tb_opts
, block_size
, &num_data_blocks
,
308 std::chrono::steady_clock::time_point end
= std::chrono::steady_clock::now();
309 fprintf(stdout
, " Size: %10" PRIu64
, file_size
);
310 fprintf(stdout
, " Blocks: %6" PRIu64
, num_data_blocks
);
// NOTE(review): the format uses %10s, so the elapsed microseconds are
// presumably converted to a string on the lines missing from this listing.
311 fprintf(stdout
, " Time Taken: %10s microsecs",
313 std::chrono::duration_cast
<std::chrono::microseconds
>(end
- start
)
316 const uint64_t compressed_blocks
=
317 opts
.statistics
->getAndResetTickerCount(NUMBER_BLOCK_COMPRESSED
);
318 const uint64_t not_compressed_blocks
=
319 opts
.statistics
->getAndResetTickerCount(NUMBER_BLOCK_NOT_COMPRESSED
);
320 // When the option enable_index_compression is true,
321 // NUMBER_BLOCK_COMPRESSED is incremented for index block(s).
// Raising num_data_blocks to at least the sum of both tickers also keeps
// the unsigned subtraction below from wrapping around.
322 if ((compressed_blocks
+ not_compressed_blocks
) > num_data_blocks
) {
323 num_data_blocks
= compressed_blocks
+ not_compressed_blocks
;
// Blocks counted by neither ticker; printed below as
// "Not compressed (ratio)".
326 const uint64_t ratio_not_compressed_blocks
=
327 (num_data_blocks
- compressed_blocks
) - not_compressed_blocks
;
// Each percentage falls back to 0.0 when there are no blocks, avoiding a
// division by zero.
328 const double compressed_pcnt
=
329 (0 == num_data_blocks
) ? 0.0
330 : ((static_cast<double>(compressed_blocks
) /
331 static_cast<double>(num_data_blocks
)) *
333 const double ratio_not_compressed_pcnt
=
334 (0 == num_data_blocks
)
336 : ((static_cast<double>(ratio_not_compressed_blocks
) /
337 static_cast<double>(num_data_blocks
)) *
339 const double not_compressed_pcnt
=
340 (0 == num_data_blocks
) ? 0.0
341 : ((static_cast<double>(not_compressed_blocks
) /
342 static_cast<double>(num_data_blocks
)) *
344 fprintf(stdout
, " Compressed: %6" PRIu64
" (%5.1f%%)", compressed_blocks
,
346 fprintf(stdout
, " Not compressed (ratio): %6" PRIu64
" (%5.1f%%)",
347 ratio_not_compressed_blocks
, ratio_not_compressed_pcnt
);
348 fprintf(stdout
, " Not compressed (abort): %6" PRIu64
" (%5.1f%%)\n",
349 not_compressed_blocks
, not_compressed_pcnt
);
353 // Reads TableProperties prior to opening table reader in order to set up
// Delegates to ROCKSDB_NAMESPACE::ReadTableProperties(), which fills the
// member table_properties_.
//   table_magic_number - magic number taken from the file footer.
//   file               - reader for the opened SST file.
//   prefetch_buffer    - forwarded unchanged to the namespace-level function.
// NOTE(review): the `file_size` parameter declaration (embedded line 357),
// the condition guarding the message below and the return statement are not
// visible in this listing; the message presumably belongs to the failure
// path.
355 Status
SstFileDumper::ReadTableProperties(uint64_t table_magic_number
,
356 RandomAccessFileReader
* file
,
358 FilePrefetchBuffer
* prefetch_buffer
) {
359 Status s
= ROCKSDB_NAMESPACE::ReadTableProperties(
360 file
, file_size
, table_magic_number
, ioptions_
, &table_properties_
,
361 /* memory_allocator= */ nullptr, prefetch_buffer
);
364 fprintf(stdout
, "Not able to read table properties\n");
// Chooses options_.table_factory from the footer's magic number. Requires
// table_properties_ to be set (asserted).
//   - Block-based (current or legacy) magic: installs a
//     BlockBasedTableFactory with its tail-prefetch stats pre-seeded, and
//     installs a no-op prefix extractor when the file's recorded index type
//     is kHashSearch.
//   - Plain-table (current or legacy) magic: enables mmap reads on options_
//     and installs a plain table factory in full-scan mode.
//   - A further visible path returns Status::InvalidArgument with the
//     magic number formatted in hex.
// NOTE(review): guards around the prints and the success return are not
// visible in this listing.
370 Status
SstFileDumper::SetTableOptionsByMagicNumber(
371 uint64_t table_magic_number
) {
372 assert(table_properties_
);
373 if (table_magic_number
== kBlockBasedTableMagicNumber
||
374 table_magic_number
== kLegacyBlockBasedTableMagicNumber
) {
// The raw pointer is handed to options_.table_factory via reset() below,
// which takes ownership.
375 BlockBasedTableFactory
* bbtf
= new BlockBasedTableFactory();
376 // To force tail prefetching, we fake reporting two useful reads of 512KB
378 // It needs at least two data points to warm up the stats.
379 bbtf
->tail_prefetch_stats()->RecordEffectiveSize(512 * 1024);
380 bbtf
->tail_prefetch_stats()->RecordEffectiveSize(512 * 1024);
382 options_
.table_factory
.reset(bbtf
);
384 fprintf(stdout
, "Sst file format: block-based\n");
// Look up the index type recorded in the file's user-collected properties.
387 auto& props
= table_properties_
->user_collected_properties
;
388 auto pos
= props
.find(BlockBasedTablePropertyNames::kIndexType
);
389 if (pos
!= props
.end()) {
// NOTE(review): no length check on the property value is visible before
// DecodeFixed32 reads it -- confirm the value always holds at least 4
// bytes.
390 auto index_type_on_file
= static_cast<BlockBasedTableOptions::IndexType
>(
391 DecodeFixed32(pos
->second
.c_str()));
392 if (index_type_on_file
==
393 BlockBasedTableOptions::IndexType::kHashSearch
) {
394 options_
.prefix_extractor
.reset(NewNoopTransform());
397 } else if (table_magic_number
== kPlainTableMagicNumber
||
398 table_magic_number
== kLegacyPlainTableMagicNumber
) {
399 options_
.allow_mmap_reads
= true;
401 PlainTableOptions plain_table_options
;
402 plain_table_options
.user_key_len
= kPlainTableVariableLength
;
403 plain_table_options
.bloom_bits_per_key
= 0;
404 plain_table_options
.hash_table_ratio
= 0;
405 plain_table_options
.index_sparseness
= 1;
406 plain_table_options
.huge_page_tlb_size
= 0;
407 plain_table_options
.encoding_type
= kPlain
;
408 plain_table_options
.full_scan_mode
= true;
410 options_
.table_factory
.reset(NewPlainTableFactory(plain_table_options
));
412 fprintf(stdout
, "Sst file format: plain table\n");
// NOTE(review): "%lx" with a C-style (long) cast truncates the 64-bit magic
// number where long is 32 bits; PRIx64 (this file already uses PRIu64)
// would print it in full. The "- 1" on the size is also unnecessary, since
// snprintf already reserves room for the terminating NUL.
415 char error_msg_buffer
[80];
416 snprintf(error_msg_buffer
, sizeof(error_msg_buffer
) - 1,
417 "Unsupported table magic number --- %lx",
418 (long)table_magic_number
);
419 return Status::InvalidArgument(error_msg_buffer
);
// Installs a default-constructed BlockBasedTableFactory in options_ for
// files that have no table properties (asserted: table_properties_ is
// null), and prints that the file is an old-version block-based table.
// NOTE(review): any guard on the print and the return statement are not
// visible in this listing.
425 Status
SstFileDumper::SetOldTableOptions() {
426 assert(table_properties_
== nullptr);
427 options_
.table_factory
= std::make_shared
<BlockBasedTableFactory
>();
429 fprintf(stdout
, "Sst file format: block-based(old version)\n");
// Scans the table in key order and prints entries to stdout.
//   print_kv           - NOTE(review): presumably gates the printing below;
//                        its use is not visible in this listing.
//   read_num           - maximum number of entries; 0 means no limit (the
//                        limit check only fires when read_num > 0).
//   has_from/from_key  - optional start key; the iterator seeks to the
//                        smallest internal key for `from_key`.
//   has_to/to_key      - optional end key; the visible check tests for user
//                        keys >= `to_key` (per the comment, the scan stops
//                        before it).
//   use_from_as_prefix - when set, the visible check tests whether the user
//                        key starts with `from_key`.
// Blob index values are decoded and printed via BlobIndex when
// decode_blob_index_ is set; decode failures are reported on stderr.
// The Status of the iterator is read into `ret` at the end.
// NOTE(review): many lines are missing from this listing (e.g. the
// declaration of `i`, the bodies of the early exits, and the final return).
435 Status
SstFileDumper::ReadSequential(bool print_kv
, uint64_t read_num
,
436 bool has_from
, const std::string
& from_key
,
437 bool has_to
, const std::string
& to_key
,
438 bool use_from_as_prefix
) {
439 if (!table_reader_
) {
// NOTE(review): `iter` is a raw pointer and its release is not visible in
// this listing -- confirm it is deleted on every return path (compare the
// std::unique_ptr used in CalculateCompressedTableSize).
443 InternalIterator
* iter
= table_reader_
->NewIterator(
444 read_options_
, moptions_
.prefix_extractor
.get(),
445 /*arena=*/nullptr, /*skip_filters=*/false,
446 TableReaderCaller::kSSTDumpTool
);
450 ikey
.SetMinPossibleForUserKey(from_key
);
451 iter
->Seek(ikey
.Encode());
455 for (; iter
->Valid(); iter
->Next()) {
456 Slice key
= iter
->key();
457 Slice value
= iter
->value();
459 if (read_num
> 0 && i
> read_num
) break;
461 ParsedInternalKey ikey
;
462 Status pik_status
= ParseInternalKey(key
, &ikey
, true /* log_err_key */);
463 if (!pik_status
.ok()) {
464 std::cerr
<< pik_status
.getState() << "\n";
468 // the key returned is not prefixed with our 'from' key
469 if (use_from_as_prefix
&& !ikey
.user_key
.starts_with(from_key
)) {
473 // If end marker was specified, we stop before it
// NOTE(review): this comparison uses BytewiseComparator() rather than the
// comparator held by internal_comparator_, which GetTableReader may have
// replaced with the one named in the table properties -- confirm that is
// intended.
474 if (has_to
&& BytewiseComparator()->Compare(ikey
.user_key
, to_key
) >= 0) {
479 if (!decode_blob_index_
|| ikey
.type
!= kTypeBlobIndex
) {
480 fprintf(stdout
, "%s => %s\n",
481 ikey
.DebugString(true, output_hex_
).c_str(),
482 value
.ToString(output_hex_
).c_str());
484 BlobIndex blob_index
;
486 const Status s
= blob_index
.DecodeFrom(value
);
488 fprintf(stderr
, "%s => error decoding blob index\n",
489 ikey
.DebugString(true, output_hex_
).c_str());
493 fprintf(stdout
, "%s => %s\n",
494 ikey
.DebugString(true, output_hex_
).c_str(),
495 blob_index
.DebugString(output_hex_
).c_str());
502 Status ret
= iter
->status();
507 // Provides TableProperties to API user
// Copies the shared pointer returned by table_reader_->GetTableProperties()
// into `*table_properties`.
// NOTE(review): the body of the `!table_reader_` branch and the final
// return are not visible in this listing; the branch presumably returns an
// error when the table reader has not been initialized.
508 Status
SstFileDumper::ReadTableProperties(
509 std::shared_ptr
<const TableProperties
>* table_properties
) {
510 if (!table_reader_
) {
514 *table_properties
= table_reader_
->GetTableProperties();
517 } // namespace ROCKSDB_NAMESPACE
519 #endif // ROCKSDB_LITE