ceph/src/rocksdb/table/block_based_table_builder.cc
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "table/block_based_table_builder.h"

#include <assert.h>
#include <inttypes.h>
#include <stdio.h>

#include <list>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>

#include "db/dbformat.h"

#include "rocksdb/cache.h"
#include "rocksdb/comparator.h"
#include "rocksdb/env.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/flush_block_policy.h"
#include "rocksdb/merge_operator.h"
#include "rocksdb/table.h"

#include "table/block.h"
#include "table/block_based_filter_block.h"
#include "table/block_based_table_factory.h"
#include "table/block_based_table_reader.h"
#include "table/block_builder.h"
#include "table/filter_block.h"
#include "table/format.h"
#include "table/full_filter_block.h"
#include "table/meta_blocks.h"
#include "table/table_builder.h"

#include "util/string_util.h"
#include "util/coding.h"
#include "util/compression.h"
#include "util/crc32c.h"
#include "util/stop_watch.h"
#include "util/xxhash.h"

#include "table/index_builder.h"
#include "table/partitioned_filter_block.h"

namespace rocksdb {

extern const std::string kHashIndexPrefixesBlock;
extern const std::string kHashIndexPrefixesMetadataBlock;

typedef BlockBasedTableOptions::IndexType IndexType;

// Without an anonymous namespace here, we would trigger the
// -Wmissing-prototypes warning.
namespace {

// Create a filter block builder based on its type.
FilterBlockBuilder* CreateFilterBlockBuilder(
    const ImmutableCFOptions& opt, const BlockBasedTableOptions& table_opt,
    PartitionedIndexBuilder* const p_index_builder) {
  if (table_opt.filter_policy == nullptr) return nullptr;

  FilterBitsBuilder* filter_bits_builder =
      table_opt.filter_policy->GetFilterBitsBuilder();
  if (filter_bits_builder == nullptr) {
    return new BlockBasedFilterBlockBuilder(opt.prefix_extractor, table_opt);
  } else {
    if (table_opt.partition_filters) {
      assert(p_index_builder != nullptr);
      return new PartitionedFilterBlockBuilder(
          opt.prefix_extractor, table_opt.whole_key_filtering,
          filter_bits_builder, table_opt.index_block_restart_interval,
          p_index_builder);
    } else {
      return new FullFilterBlockBuilder(opt.prefix_extractor,
                                        table_opt.whole_key_filtering,
                                        filter_bits_builder);
    }
  }
}

bool GoodCompressionRatio(size_t compressed_size, size_t raw_size) {
  // Check whether compression saved more than 12.5% of the raw size.
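  // For example, a 4096-byte raw block is kept in compressed form only if it
  // compresses to fewer than 3584 bytes (4096 - 4096 / 8).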
  return compressed_size < raw_size - (raw_size / 8u);
}

}  // namespace

// format_version is the block format as defined in include/rocksdb/table.h
Slice CompressBlock(const Slice& raw,
                    const CompressionOptions& compression_options,
                    CompressionType* type, uint32_t format_version,
                    const Slice& compression_dict,
                    std::string* compressed_output) {
  if (*type == kNoCompression) {
    return raw;
  }

  // Will return compressed block contents if (1) the compression method is
  // supported on this platform and (2) the compression rate is "good enough".
  switch (*type) {
    case kSnappyCompression:
      if (Snappy_Compress(compression_options, raw.data(), raw.size(),
                          compressed_output) &&
          GoodCompressionRatio(compressed_output->size(), raw.size())) {
        return *compressed_output;
      }
      break;  // fall back to no compression.
    case kZlibCompression:
      if (Zlib_Compress(
              compression_options,
              GetCompressFormatForVersion(kZlibCompression, format_version),
              raw.data(), raw.size(), compressed_output, compression_dict) &&
          GoodCompressionRatio(compressed_output->size(), raw.size())) {
        return *compressed_output;
      }
      break;  // fall back to no compression.
    case kBZip2Compression:
      if (BZip2_Compress(
              compression_options,
              GetCompressFormatForVersion(kBZip2Compression, format_version),
              raw.data(), raw.size(), compressed_output) &&
          GoodCompressionRatio(compressed_output->size(), raw.size())) {
        return *compressed_output;
      }
      break;  // fall back to no compression.
    case kLZ4Compression:
      if (LZ4_Compress(
              compression_options,
              GetCompressFormatForVersion(kLZ4Compression, format_version),
              raw.data(), raw.size(), compressed_output, compression_dict) &&
          GoodCompressionRatio(compressed_output->size(), raw.size())) {
        return *compressed_output;
      }
      break;  // fall back to no compression.
    case kLZ4HCCompression:
      if (LZ4HC_Compress(
              compression_options,
              GetCompressFormatForVersion(kLZ4HCCompression, format_version),
              raw.data(), raw.size(), compressed_output, compression_dict) &&
          GoodCompressionRatio(compressed_output->size(), raw.size())) {
        return *compressed_output;
      }
      break;  // fall back to no compression.
    case kXpressCompression:
      if (XPRESS_Compress(raw.data(), raw.size(),
                          compressed_output) &&
          GoodCompressionRatio(compressed_output->size(), raw.size())) {
        return *compressed_output;
      }
      break;
    case kZSTD:
    case kZSTDNotFinalCompression:
      if (ZSTD_Compress(compression_options, raw.data(), raw.size(),
                        compressed_output, compression_dict) &&
          GoodCompressionRatio(compressed_output->size(), raw.size())) {
        return *compressed_output;
      }
      break;  // fall back to no compression.
    default: {}  // Do not recognize this compression type
  }

  // The compression method is not supported, or the compression ratio is not
  // good enough, so just fall back to the uncompressed form.
  *type = kNoCompression;
  return raw;
}

// kBlockBasedTableMagicNumber was picked by running
//    echo rocksdb.table.block_based | sha1sum
// and taking the leading 64 bits.
// Please note that kBlockBasedTableMagicNumber may also be accessed by other
// .cc files, so we declare it extern in the header; but to get the space
// allocated, it must be non-extern in exactly one place.
const uint64_t kBlockBasedTableMagicNumber = 0x88e241b785f4cff7ull;
// We also support reading and writing the legacy block-based table format (for
// backwards compatibility).
const uint64_t kLegacyBlockBasedTableMagicNumber = 0xdb4775248b80fb57ull;

// A collector that collects properties of interest to block-based tables.
// For now this class looks heavyweight since we only write one additional
// property, but in the foreseeable future we will add more and more properties
// that are specific to block-based tables.
class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector
    : public IntTblPropCollector {
 public:
  explicit BlockBasedTablePropertiesCollector(
      BlockBasedTableOptions::IndexType index_type, bool whole_key_filtering,
      bool prefix_filtering)
      : index_type_(index_type),
        whole_key_filtering_(whole_key_filtering),
        prefix_filtering_(prefix_filtering) {}

  virtual Status InternalAdd(const Slice& key, const Slice& value,
                             uint64_t file_size) override {
    // Intentionally left blank. Have no interest in collecting stats for
    // individual key/value pairs.
    return Status::OK();
  }

  virtual Status Finish(UserCollectedProperties* properties) override {
    std::string val;
    PutFixed32(&val, static_cast<uint32_t>(index_type_));
    properties->insert({BlockBasedTablePropertyNames::kIndexType, val});
    properties->insert({BlockBasedTablePropertyNames::kWholeKeyFiltering,
                        whole_key_filtering_ ? kPropTrue : kPropFalse});
    properties->insert({BlockBasedTablePropertyNames::kPrefixFiltering,
                        prefix_filtering_ ? kPropTrue : kPropFalse});
    return Status::OK();
  }

  // The name of the properties collector can be used for debugging purposes.
  virtual const char* Name() const override {
    return "BlockBasedTablePropertiesCollector";
  }

  virtual UserCollectedProperties GetReadableProperties() const override {
    // Intentionally left blank.
    return UserCollectedProperties();
  }

 private:
  BlockBasedTableOptions::IndexType index_type_;
  bool whole_key_filtering_;
  bool prefix_filtering_;
};

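// Rep ("representation") gathers all of the builder's internal state behind a
// single pointer, keeping the public BlockBasedTableBuilder interface small
// and the header free of implementation details.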
struct BlockBasedTableBuilder::Rep {
  const ImmutableCFOptions ioptions;
  const BlockBasedTableOptions table_options;
  const InternalKeyComparator& internal_comparator;
  WritableFileWriter* file;
  uint64_t offset = 0;
  Status status;
  BlockBuilder data_block;
  BlockBuilder range_del_block;

  InternalKeySliceTransform internal_prefix_transform;
  std::unique_ptr<IndexBuilder> index_builder;

  std::string last_key;
  const CompressionType compression_type;
  const CompressionOptions compression_opts;
  // Data for presetting the compression library's dictionary, or nullptr.
  const std::string* compression_dict;
  TableProperties props;

  bool closed = false;  // Either Finish() or Abandon() has been called.
  std::unique_ptr<FilterBlockBuilder> filter_builder;
  char compressed_cache_key_prefix[BlockBasedTable::kMaxCacheKeyPrefixSize];
  size_t compressed_cache_key_prefix_size;

  BlockHandle pending_handle;  // Handle to add to index block

  std::string compressed_output;
  std::unique_ptr<FlushBlockPolicy> flush_block_policy;
  uint32_t column_family_id;
  const std::string& column_family_name;

  std::vector<std::unique_ptr<IntTblPropCollector>> table_properties_collectors;

  Rep(const ImmutableCFOptions& _ioptions,
      const BlockBasedTableOptions& table_opt,
      const InternalKeyComparator& icomparator,
      const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
          int_tbl_prop_collector_factories,
      uint32_t _column_family_id, WritableFileWriter* f,
      const CompressionType _compression_type,
      const CompressionOptions& _compression_opts,
      const std::string* _compression_dict, const bool skip_filters,
      const std::string& _column_family_name)
      : ioptions(_ioptions),
        table_options(table_opt),
        internal_comparator(icomparator),
        file(f),
        data_block(table_options.block_restart_interval,
                   table_options.use_delta_encoding),
        range_del_block(1),  // TODO(andrewkr): restart_interval unnecessary
        internal_prefix_transform(_ioptions.prefix_extractor),
        compression_type(_compression_type),
        compression_opts(_compression_opts),
        compression_dict(_compression_dict),
        flush_block_policy(
            table_options.flush_block_policy_factory->NewFlushBlockPolicy(
                table_options, data_block)),
        column_family_id(_column_family_id),
        column_family_name(_column_family_name) {
    PartitionedIndexBuilder* p_index_builder = nullptr;
    if (table_options.index_type ==
        BlockBasedTableOptions::kTwoLevelIndexSearch) {
      p_index_builder = PartitionedIndexBuilder::CreateIndexBuilder(
          &internal_comparator, table_options);
      index_builder.reset(p_index_builder);
    } else {
      index_builder.reset(IndexBuilder::CreateIndexBuilder(
          table_options.index_type, &internal_comparator,
          &this->internal_prefix_transform, table_options));
    }
    if (skip_filters) {
      filter_builder = nullptr;
    } else {
      filter_builder.reset(
          CreateFilterBlockBuilder(_ioptions, table_options, p_index_builder));
    }

    for (auto& collector_factories : *int_tbl_prop_collector_factories) {
      table_properties_collectors.emplace_back(
          collector_factories->CreateIntTblPropCollector(column_family_id));
    }
    table_properties_collectors.emplace_back(
        new BlockBasedTablePropertiesCollector(
            table_options.index_type, table_options.whole_key_filtering,
            _ioptions.prefix_extractor != nullptr));
  }
};

BlockBasedTableBuilder::BlockBasedTableBuilder(
    const ImmutableCFOptions& ioptions,
    const BlockBasedTableOptions& table_options,
    const InternalKeyComparator& internal_comparator,
    const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
        int_tbl_prop_collector_factories,
    uint32_t column_family_id, WritableFileWriter* file,
    const CompressionType compression_type,
    const CompressionOptions& compression_opts,
    const std::string* compression_dict, const bool skip_filters,
    const std::string& column_family_name) {
  BlockBasedTableOptions sanitized_table_options(table_options);
  if (sanitized_table_options.format_version == 0 &&
      sanitized_table_options.checksum != kCRC32c) {
    ROCKS_LOG_WARN(
        ioptions.info_log,
        "Silently converting format_version to 1 because checksum is "
        "non-default");
    // silently convert format_version to 1 to keep consistent with current
    // behavior
    sanitized_table_options.format_version = 1;
  }

  rep_ = new Rep(ioptions, sanitized_table_options, internal_comparator,
                 int_tbl_prop_collector_factories, column_family_id, file,
                 compression_type, compression_opts, compression_dict,
                 skip_filters, column_family_name);

  if (rep_->filter_builder != nullptr) {
    rep_->filter_builder->StartBlock(0);
  }
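  // Pre-compute the cache key prefix for the compressed block cache; per-block
  // cache keys are later formed by appending each block's file offset to it.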
  if (table_options.block_cache_compressed.get() != nullptr) {
    BlockBasedTable::GenerateCachePrefix(
        table_options.block_cache_compressed.get(), file->writable_file(),
        &rep_->compressed_cache_key_prefix[0],
        &rep_->compressed_cache_key_prefix_size);
  }
}

BlockBasedTableBuilder::~BlockBasedTableBuilder() {
  assert(rep_->closed);  // Catch errors where caller forgot to call Finish()
  delete rep_;
}

void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
  Rep* r = rep_;
  assert(!r->closed);
  if (!ok()) return;
  ValueType value_type = ExtractValueType(key);
  if (IsValueType(value_type)) {
    if (r->props.num_entries > 0) {
      assert(r->internal_comparator.Compare(key, Slice(r->last_key)) > 0);
    }

    auto should_flush = r->flush_block_policy->Update(key, value);
    if (should_flush) {
      assert(!r->data_block.empty());
      Flush();

      // Add item to index block.
      // We do not emit the index entry for a block until we have seen the
      // first key for the next data block. This allows us to use shorter
      // keys in the index block. For example, consider a block boundary
      // between the keys "the quick brown fox" and "the who". We can use
      // "the r" as the key for the index block entry since it is >= all
      // entries in the first block and < all entries in subsequent
      // blocks.
      if (ok()) {
        r->index_builder->AddIndexEntry(&r->last_key, &key, r->pending_handle);
      }
    }

    // Note: PartitionedFilterBlockBuilder requires the key to be added to the
    // filter builder after it has been added to the index builder.
    if (r->filter_builder != nullptr) {
      r->filter_builder->Add(ExtractUserKey(key));
    }

    r->last_key.assign(key.data(), key.size());
    r->data_block.Add(key, value);
    r->props.num_entries++;
    r->props.raw_key_size += key.size();
    r->props.raw_value_size += value.size();

    r->index_builder->OnKeyAdded(key);
    NotifyCollectTableCollectorsOnAdd(key, value, r->offset,
                                      r->table_properties_collectors,
                                      r->ioptions.info_log);

  } else if (value_type == kTypeRangeDeletion) {
    // TODO(wanning&andrewkr) add num_tombstones to table properties
    r->range_del_block.Add(key, value);
    ++r->props.num_entries;
    r->props.raw_key_size += key.size();
    r->props.raw_value_size += value.size();
    NotifyCollectTableCollectorsOnAdd(key, value, r->offset,
                                      r->table_properties_collectors,
                                      r->ioptions.info_log);
  } else {
    assert(false);
  }
}

void BlockBasedTableBuilder::Flush() {
  Rep* r = rep_;
  assert(!r->closed);
  if (!ok()) return;
  if (r->data_block.empty()) return;
  WriteBlock(&r->data_block, &r->pending_handle, true /* is_data_block */);
  if (r->filter_builder != nullptr) {
    r->filter_builder->StartBlock(r->offset);
  }
  r->props.data_size = r->offset;
  ++r->props.num_data_blocks;
}

void BlockBasedTableBuilder::WriteBlock(BlockBuilder* block,
                                        BlockHandle* handle,
                                        bool is_data_block) {
  WriteBlock(block->Finish(), handle, is_data_block);
  block->Reset();
}

void BlockBasedTableBuilder::WriteBlock(const Slice& raw_block_contents,
                                        BlockHandle* handle,
                                        bool is_data_block) {
  // File format contains a sequence of blocks where each block has:
  //    block_data: uint8[n]
  //    type: uint8
  //    crc: uint32
  assert(ok());
  Rep* r = rep_;

  auto type = r->compression_type;
  Slice block_contents;
  bool abort_compression = false;

  StopWatchNano timer(r->ioptions.env,
      ShouldReportDetailedTime(r->ioptions.env, r->ioptions.statistics));

  if (raw_block_contents.size() < kCompressionSizeLimit) {
    Slice compression_dict;
    if (is_data_block && r->compression_dict && r->compression_dict->size()) {
      compression_dict = *r->compression_dict;
    }

    block_contents = CompressBlock(raw_block_contents, r->compression_opts,
                                   &type, r->table_options.format_version,
                                   compression_dict, &r->compressed_output);

    // Some of the compression algorithms are known to be unreliable. If
    // the verify_compression flag is set then try to de-compress the
    // compressed data and compare to the input.
    if (type != kNoCompression && r->table_options.verify_compression) {
      // Retrieve the uncompressed contents into a new buffer
      BlockContents contents;
      Status stat = UncompressBlockContentsForCompressionType(
          block_contents.data(), block_contents.size(), &contents,
          r->table_options.format_version, compression_dict, type,
          r->ioptions);

      if (stat.ok()) {
        bool compressed_ok = contents.data.compare(raw_block_contents) == 0;
        if (!compressed_ok) {
          // The result of the compression was invalid. Abort.
          abort_compression = true;
          ROCKS_LOG_ERROR(r->ioptions.info_log,
                          "Decompressed block did not match raw block");
          r->status =
              Status::Corruption("Decompressed block did not match raw block");
        }
      } else {
        // Decompression reported an error. Abort.
        r->status = Status::Corruption("Could not decompress");
        abort_compression = true;
      }
    }
  } else {
    // Block is too big to be compressed.
    abort_compression = true;
  }

  // Abort compression if the block is too big, or did not pass
  // verification.
  if (abort_compression) {
    RecordTick(r->ioptions.statistics, NUMBER_BLOCK_NOT_COMPRESSED);
    type = kNoCompression;
    block_contents = raw_block_contents;
  } else if (type != kNoCompression &&
             ShouldReportDetailedTime(r->ioptions.env,
                                      r->ioptions.statistics)) {
    MeasureTime(r->ioptions.statistics, COMPRESSION_TIMES_NANOS,
                timer.ElapsedNanos());
    MeasureTime(r->ioptions.statistics, BYTES_COMPRESSED,
                raw_block_contents.size());
    RecordTick(r->ioptions.statistics, NUMBER_BLOCK_COMPRESSED);
  }

  WriteRawBlock(block_contents, type, handle);
  r->compressed_output.clear();
}

void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents,
                                           CompressionType type,
                                           BlockHandle* handle) {
  Rep* r = rep_;
  StopWatch sw(r->ioptions.env, r->ioptions.statistics, WRITE_RAW_BLOCK_MICROS);
  handle->set_offset(r->offset);
  handle->set_size(block_contents.size());
  r->status = r->file->Append(block_contents);
  if (r->status.ok()) {
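    // The block trailer is kBlockTrailerSize (5) bytes: a 1-byte compression
    // type followed by a 4-byte checksum that covers the block contents plus
    // the type byte.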
    char trailer[kBlockTrailerSize];
    trailer[0] = type;
    char* trailer_without_type = trailer + 1;
    switch (r->table_options.checksum) {
      case kNoChecksum:
        // we don't support no checksum yet
        assert(false);
        // intentional fallthrough
      case kCRC32c: {
        auto crc = crc32c::Value(block_contents.data(), block_contents.size());
        crc = crc32c::Extend(crc, trailer, 1);  // Extend to cover block type
        EncodeFixed32(trailer_without_type, crc32c::Mask(crc));
        break;
      }
      case kxxHash: {
        void* xxh = XXH32_init(0);
        XXH32_update(xxh, block_contents.data(),
                     static_cast<uint32_t>(block_contents.size()));
        XXH32_update(xxh, trailer, 1);  // Extend to cover block type
        EncodeFixed32(trailer_without_type, XXH32_digest(xxh));
        break;
      }
    }

    r->status = r->file->Append(Slice(trailer, kBlockTrailerSize));
    if (r->status.ok()) {
      r->status = InsertBlockInCache(block_contents, type, handle);
    }
    if (r->status.ok()) {
      r->offset += block_contents.size() + kBlockTrailerSize;
    }
  }
}

Status BlockBasedTableBuilder::status() const {
  return rep_->status;
}

static void DeleteCachedBlock(const Slice& key, void* value) {
  Block* block = reinterpret_cast<Block*>(value);
  delete block;
}

//
// Make a copy of the block contents and insert into compressed block cache
//
Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents,
                                                  const CompressionType type,
                                                  const BlockHandle* handle) {
  Rep* r = rep_;
  Cache* block_cache_compressed = r->table_options.block_cache_compressed.get();

  if (type != kNoCompression && block_cache_compressed != nullptr) {

    size_t size = block_contents.size();

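    // Copy the block payload and append the compression type byte so the
    // cached buffer mirrors the on-disk block layout (data followed by type).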
    std::unique_ptr<char[]> ubuf(new char[size + 1]);
    memcpy(ubuf.get(), block_contents.data(), size);
    ubuf[size] = type;

    BlockContents results(std::move(ubuf), size, true, type);

    Block* block = new Block(std::move(results), kDisableGlobalSequenceNumber);

    // make cache key by appending the file offset to the cache prefix id
    char* end = EncodeVarint64(
        r->compressed_cache_key_prefix +
            r->compressed_cache_key_prefix_size,
        handle->offset());
    Slice key(r->compressed_cache_key_prefix,
              static_cast<size_t>(end - r->compressed_cache_key_prefix));

    // Insert into compressed block cache.
    block_cache_compressed->Insert(key, block, block->usable_size(),
                                   &DeleteCachedBlock);

    // Invalidate OS cache.
    r->file->InvalidateCache(static_cast<size_t>(r->offset), size);
  }
  return Status::OK();
}

Status BlockBasedTableBuilder::Finish() {
  Rep* r = rep_;
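  // Capture whether the final data block is empty before Flush() clears it;
  // a trailing index entry is added below only if there was data to point at.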
  bool empty_data_block = r->data_block.empty();
  Flush();
  assert(!r->closed);
  r->closed = true;

  // To make sure the properties block records the accurate size of the index
  // block, we finish writing all index entries here and flush them to storage
  // after the metaindex block is written.
  if (ok() && !empty_data_block) {
    r->index_builder->AddIndexEntry(
        &r->last_key, nullptr /* no next data block */, r->pending_handle);
  }

  BlockHandle filter_block_handle, metaindex_block_handle, index_block_handle,
      compression_dict_block_handle, range_del_block_handle;
  // Write filter block
  if (ok() && r->filter_builder != nullptr) {
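    // For partitioned filters, Finish() is expected to hand back one partition
    // at a time, returning Status::Incomplete() until the last partition has
    // been emitted; each returned slice is written as its own raw block.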
    Status s = Status::Incomplete();
    while (s.IsIncomplete()) {
      Slice filter_content = r->filter_builder->Finish(filter_block_handle, &s);
      assert(s.ok() || s.IsIncomplete());
      r->props.filter_size += filter_content.size();
      WriteRawBlock(filter_content, kNoCompression, &filter_block_handle);
    }
  }

  IndexBuilder::IndexBlocks index_blocks;
  auto index_builder_status = r->index_builder->Finish(&index_blocks);
  if (index_builder_status.IsIncomplete()) {
    // When we have more than one index partition, meta_blocks are not
    // supported for the index. Currently meta_blocks are used only by
    // HashIndexBuilder, which is not multi-partition.
    assert(index_blocks.meta_blocks.empty());
  } else if (!index_builder_status.ok()) {
    return index_builder_status;
  }

  // Write meta blocks and metaindex block in the following order.
  //    1. [meta block: filter]
  //    2. [meta block: properties]
  //    3. [meta block: compression dictionary]
  //    4. [meta block: range deletion tombstone]
  //    5. [metaindex block]
  // write meta blocks
  MetaIndexBuilder meta_index_builder;
  for (const auto& item : index_blocks.meta_blocks) {
    BlockHandle block_handle;
    WriteBlock(item.second, &block_handle, false /* is_data_block */);
    meta_index_builder.Add(item.first, block_handle);
  }

  if (ok()) {
    if (r->filter_builder != nullptr) {
      // Add mapping from "<filter_block_prefix>.Name" to location
      // of filter data.
      std::string key;
      if (r->filter_builder->IsBlockBased()) {
        key = BlockBasedTable::kFilterBlockPrefix;
      } else {
        key = r->table_options.partition_filters
                  ? BlockBasedTable::kPartitionedFilterBlockPrefix
                  : BlockBasedTable::kFullFilterBlockPrefix;
      }
      key.append(r->table_options.filter_policy->Name());
      meta_index_builder.Add(key, filter_block_handle);
    }

    // Write properties and compression dictionary blocks.
    {
      PropertyBlockBuilder property_block_builder;
      r->props.column_family_id = r->column_family_id;
      r->props.column_family_name = r->column_family_name;
      r->props.filter_policy_name = r->table_options.filter_policy != nullptr ?
          r->table_options.filter_policy->Name() : "";
      r->props.index_size =
          r->index_builder->EstimatedSize() + kBlockTrailerSize;
      r->props.comparator_name = r->ioptions.user_comparator != nullptr
                                     ? r->ioptions.user_comparator->Name()
                                     : "nullptr";
      r->props.merge_operator_name = r->ioptions.merge_operator != nullptr
                                         ? r->ioptions.merge_operator->Name()
                                         : "nullptr";
      r->props.compression_name = CompressionTypeToString(r->compression_type);
      r->props.prefix_extractor_name =
          r->ioptions.prefix_extractor != nullptr
              ? r->ioptions.prefix_extractor->Name()
              : "nullptr";

      std::string property_collectors_names = "[";
      for (size_t i = 0;
           i < r->ioptions.table_properties_collector_factories.size(); ++i) {
        if (i != 0) {
          property_collectors_names += ",";
        }
        property_collectors_names +=
            r->ioptions.table_properties_collector_factories[i]->Name();
      }
      property_collectors_names += "]";
      r->props.property_collectors_names = property_collectors_names;

      // Add basic properties
      property_block_builder.AddTableProperty(r->props);

      // Add user collected properties
      NotifyCollectTableCollectorsOnFinish(r->table_properties_collectors,
                                           r->ioptions.info_log,
                                           &property_block_builder);

      BlockHandle properties_block_handle;
      WriteRawBlock(property_block_builder.Finish(), kNoCompression,
                    &properties_block_handle);
      meta_index_builder.Add(kPropertiesBlock, properties_block_handle);

      // Write compression dictionary block
      if (r->compression_dict && r->compression_dict->size()) {
        WriteRawBlock(*r->compression_dict, kNoCompression,
                      &compression_dict_block_handle);
        meta_index_builder.Add(kCompressionDictBlock,
                               compression_dict_block_handle);
      }
    }  // end of properties/compression dictionary block writing

    if (ok() && !r->range_del_block.empty()) {
      WriteRawBlock(r->range_del_block.Finish(), kNoCompression,
                    &range_del_block_handle);
      meta_index_builder.Add(kRangeDelBlock, range_del_block_handle);
    }  // range deletion tombstone meta block
  }    // meta blocks

  // Write index block
  if (ok()) {
    // flush the meta index block
    WriteRawBlock(meta_index_builder.Finish(), kNoCompression,
                  &metaindex_block_handle);

    const bool is_data_block = true;
    WriteBlock(index_blocks.index_block_contents, &index_block_handle,
               !is_data_block);
    // If there are more index partitions, finish them and write them out
    Status& s = index_builder_status;
    while (s.IsIncomplete()) {
      s = r->index_builder->Finish(&index_blocks, index_block_handle);
      if (!s.ok() && !s.IsIncomplete()) {
        return s;
      }
      WriteBlock(index_blocks.index_block_contents, &index_block_handle,
                 !is_data_block);
      // The last index_block_handle will be for the partition index block
    }
  }

  // Write footer
  if (ok()) {
    // No need to write out new footer if we're using default checksum.
    // We write the legacy magic number because we want old versions of RocksDB
    // to be able to read files generated with a new release (just in case
    // somebody wants to roll back after an upgrade).
    // TODO(icanadi) at some point in the future, when we're absolutely sure
    // nobody will roll back to RocksDB 2.x versions, retire the legacy magic
    // number and always write new table files with the new magic number.
    bool legacy = (r->table_options.format_version == 0);
    // this is guaranteed by BlockBasedTableBuilder's constructor
    assert(r->table_options.checksum == kCRC32c ||
           r->table_options.format_version != 0);
    Footer footer(legacy ? kLegacyBlockBasedTableMagicNumber
                         : kBlockBasedTableMagicNumber,
                  r->table_options.format_version);
    footer.set_metaindex_handle(metaindex_block_handle);
    footer.set_index_handle(index_block_handle);
    footer.set_checksum(r->table_options.checksum);
    std::string footer_encoding;
    footer.EncodeTo(&footer_encoding);
    r->status = r->file->Append(footer_encoding);
    if (r->status.ok()) {
      r->offset += footer_encoding.size();
    }
  }

  return r->status;
}

void BlockBasedTableBuilder::Abandon() {
  Rep* r = rep_;
  assert(!r->closed);
  r->closed = true;
}

uint64_t BlockBasedTableBuilder::NumEntries() const {
  return rep_->props.num_entries;
}

uint64_t BlockBasedTableBuilder::FileSize() const {
  return rep_->offset;
}

bool BlockBasedTableBuilder::NeedCompact() const {
  for (const auto& collector : rep_->table_properties_collectors) {
    if (collector->NeedCompact()) {
      return true;
    }
  }
  return false;
}

TableProperties BlockBasedTableBuilder::GetTableProperties() const {
  TableProperties ret = rep_->props;
  for (const auto& collector : rep_->table_properties_collectors) {
    for (const auto& prop : collector->GetReadableProperties()) {
      ret.readable_properties.insert(prop);
    }
    collector->Finish(&ret.user_collected_properties);
  }
  return ret;
}

const std::string BlockBasedTable::kFilterBlockPrefix = "filter.";
const std::string BlockBasedTable::kFullFilterBlockPrefix = "fullfilter.";
const std::string BlockBasedTable::kPartitionedFilterBlockPrefix =
    "partitionedfilter.";
}  // namespace rocksdb