1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
14 #include <unordered_map>
17 #include "cache/cache_reservation_manager.h"
18 #include "db/memtable_list.h"
19 #include "db/table_cache.h"
20 #include "db/table_properties_collector.h"
21 #include "db/write_batch_internal.h"
22 #include "db/write_controller.h"
23 #include "options/cf_options.h"
24 #include "rocksdb/compaction_job_stats.h"
25 #include "rocksdb/db.h"
26 #include "rocksdb/env.h"
27 #include "rocksdb/options.h"
28 #include "trace_replay/block_cache_tracer.h"
29 #include "util/hash_containers.h"
30 #include "util/thread_local.h"
32 namespace ROCKSDB_NAMESPACE
{
36 class VersionStorageInfo
;
38 class MemTableListVersion
;
39 class CompactionPicker
;
43 class ColumnFamilyData
;
46 class InstrumentedMutex
;
47 class InstrumentedMutexLock
;
48 struct SuperVersionContext
;
52 extern const double kIncSlowdownRatio
;
53 // This file contains a list of data structures for managing column family
56 // The basic relationships among classes declared here are illustrated as
59 // +----------------------+ +----------------------+ +--------+
60 // +---+ ColumnFamilyHandle 1 | +--+ ColumnFamilyHandle 2 | | DBImpl |
61 // | +----------------------+ | +----------------------+ +----+---+
62 // | +--------------------------+ |
63 // | | +-----------------------------+
65 // | | +-----------------------------v-------------------------------+
67 // | | | ColumnFamilySet |
69 // | | +-------------+--------------------------+----------------+---+
71 // | +-------------------------------------+ | |
73 // | +-------------v-------------+ +-----v----v---------+
75 // | | ColumnFamilyData 1 | | ColumnFamilyData 2 | ......
81 // +--------+---+--+-+----+----+ +--------------------++
84 // | | | +-----------------------+
85 // | | +-----------+ |
87 // +--------+--------+ | | |
88 // | | | | +----------v----------+
89 // +---> |SuperVersion 1.a +-----------------> |
90 // | +------+ | | MemTableListVersion |
91 // +---+-------------+ | | | | |
92 // | | | | +----+------------+---+
93 // | current | | | | |
94 // | +-------------+ | |mem | |
96 // +-v---v-------+ +---v--v---+ +-----v----+ +----v-----+
98 // | Version 1.a | | memtable | | memtable | | memtable |
99 // | | | 1.a | | 1.b | | 1.c |
100 // +-------------+ | | | | | |
101 // +----------+ +----------+ +----------+
103 // DBImpl keeps a ColumnFamilySet, which references all column families by
104 // pointing to respective ColumnFamilyData object of each column family.
105 // This is how DBImpl can list and operate on all the column families.
106 // ColumnFamilyHandle also points to ColumnFamilyData directly, so that
107 // when a user executes a query, it can directly find memtables and Version
108 // as well as SuperVersion to the column family, without going through
111 // ColumnFamilySet points to the latest view of the LSM-tree (list of memtables
112 // and SST files) indirectly, while ongoing operations may hold references
113 // to a current or an out-of-date SuperVersion, which in turn points to a
114 // point-in-time view of the LSM-tree. This guarantees the memtables and SST
115 // files being operated on will not go away, until the SuperVersion is
116 // unreferenced to 0 and destroyed.
118 // The following graph illustrates one possible set of referencing relationships:
120 // Column +--------------+ current +-----------+
121 // Family +---->+ +------------------->+ |
122 // Data | SuperVersion +----------+ | Version A |
124 // Iter2 +----->+ | +-------v------+ +-----------+
125 // +-----+--------+ | MemtableList +----------------> Empty
126 // | | Version r | +-----------+
127 // | +--------------+ | |
128 // +------------------+ current| Version B |
129 // +--------------+ | +----->+ |
130 // | | | | +-----+-----+
131 // Compaction +>+ SuperVersion +-------------+ ^
132 // Job | 2 +------+ | |current
133 // | +----+ | | mem | +------------+
134 // +--------------+ | | +---------------------> |
135 // | +------------------------> MemTable a |
137 // +--------------+ | | +------------+
138 // | +--------------------------+
139 // Iter1 +-----> SuperVersion | | +------------+
140 // | 1 +------------------------------>+ |
141 // | +-+ | mem | MemTable b |
142 // +--------------+ | | | |
143 // | | +--------------+ +-----^------+
144 // | |imm | MemtableList | |
145 // | +--->+ Version s +------------+
146 // | +--------------+
147 // | +--------------+
148 // | | MemtableList |
149 // +------>+ Version t +--------> Empty
150 // imm +--------------+
152 // In this example, even if the current LSM-tree consists of Version A and
153 // memtable a, which is also referenced by SuperVersion, two older SuperVersions,
154 // SuperVersion2 and SuperVersion1, still exist, and are referenced by a
155 // compaction job and an old iterator Iter1, respectively. SuperVersion2
156 // contains Version B, memtable a and memtable b; SuperVersion1 contains
157 // Version B and memtable b (mutable). As a result, Version B and memtable b
158 // are prevented from being destroyed or deleted.
160 // ColumnFamilyHandleImpl is the class that clients use to access different
161 // column families. It has a non-trivial destructor, which gets called when the
162 // client is done using the column family.
// Concrete ColumnFamilyHandle that wraps a ColumnFamilyData pointer (cfd_) and
// an InstrumentedMutex pointer (mutex_).
// NOTE(review): access specifiers, the closing brace and any member holding the
// `db` constructor argument are not visible in this extract -- confirm against
// the full header.
163 class ColumnFamilyHandleImpl
: public ColumnFamilyHandle
{
165 // create while holding the mutex
166 ColumnFamilyHandleImpl(ColumnFamilyData
* cfd
, DBImpl
* db
,
167 InstrumentedMutex
* mutex
);
168 // destroy without mutex
169 virtual ~ColumnFamilyHandleImpl();
// Returns the wrapped ColumnFamilyData pointer. Virtual so that a subclass can
// return a different pointer (ColumnFamilyHandleInternal does).
170 virtual ColumnFamilyData
* cfd() const { return cfd_
; }
// Overrides of the ColumnFamilyHandle interface; declared here and defined out
// of line.
172 virtual uint32_t GetID() const override
;
173 virtual const std::string
& GetName() const override
;
174 virtual Status
GetDescriptor(ColumnFamilyDescriptor
* desc
) override
;
175 virtual const Comparator
* GetComparator() const override
;
// Column family data returned by cfd().
178 ColumnFamilyData
* cfd_
;
// NOTE(review): presumably the `mutex` constructor argument (the DB mutex) --
// confirm in the constructor definition.
180 InstrumentedMutex
* mutex_
;
183 // Does not ref-count ColumnFamilyData
184 // We use this dummy ColumnFamilyHandleImpl because sometimes MemTableInserter
185 // calls DBImpl methods. When this happens, MemTableInserter needs access to
186 // ColumnFamilyHandle (same as the client would need). In that case, we feed
187 // MemTableInserter dummy ColumnFamilyHandle and enable it to call DBImpl
// Handle whose target ColumnFamilyData can be re-pointed with SetCFD(). The
// base class is constructed with all-null arguments, and cfd() is overridden to
// return internal_cfd_ rather than the base class's cfd_.
189 class ColumnFamilyHandleInternal
: public ColumnFamilyHandleImpl
{
// Starts with no column family selected (internal_cfd_ == nullptr).
191 ColumnFamilyHandleInternal()
192 : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr),
193 internal_cfd_(nullptr) {}
// Points this handle at _cfd; only the pointer is stored.
195 void SetCFD(ColumnFamilyData
* _cfd
) { internal_cfd_
= _cfd
; }
// Returns the pointer last passed to SetCFD(), or nullptr if none was set.
196 virtual ColumnFamilyData
* cfd() const override
{ return internal_cfd_
; }
// Current target column family; assigned by SetCFD().
199 ColumnFamilyData
* internal_cfd_
;
202 // holds references to memtable, all immutable memtables and version
// NOTE(review): several declarations that the comments below refer to (Unref(),
// Cleanup(), the `dummy` member, and the mem/current members) are not visible
// in this extract -- read those comments against the full header.
203 struct SuperVersion
{
204 // Accessing members of this class is not thread-safe and requires external
205 // synchronization (ie db mutex held or on write thread).
// Column family this SuperVersion is associated with.
206 ColumnFamilyData
* cfd
;
// Immutable-memtable list version referenced by this SuperVersion.
208 MemTableListVersion
* imm
;
// Stored by value, i.e. this SuperVersion carries its own copy of the options.
210 MutableCFOptions mutable_cf_options
;
211 // Version number of the current SuperVersion
212 uint64_t version_number
;
// NOTE(review): presumably the write-stall state in effect when this
// SuperVersion was installed -- confirm where it is assigned.
213 WriteStallCondition write_stall_condition
;
215 // should be called outside the mutex
216 SuperVersion() = default;
219 // If Unref() returns true, Cleanup() should be called with mutex held
220 // before deleting this SuperVersion.
223 // call these two methods with db mutex held
224 // Cleanup unrefs mem, imm and current. Also, it stores all memtables
225 // that needs to be deleted in to_delete vector. Unrefing those
226 // objects needs to be done in the mutex
// Declaration only: initializes this SuperVersion from a column family, a
// mutable memtable, an immutable-memtable list version and a Version.
// NOTE(review): whether Init() takes references on its arguments is not
// visible here -- confirm in the definition.
228 void Init(ColumnFamilyData
* new_cfd
, MemTable
* new_mem
,
229 MemTableListVersion
* new_imm
, Version
* new_current
);
231 // The value of dummy is not actually used. kSVInUse takes its address as a
232 // mark in the thread local storage to indicate the SuperVersion is in use
233 // by thread. This way, the value of kSVInUse is guaranteed to have no
234 // conflict with SuperVersion object address and portable on different
237 static void* const kSVInUse
;
// Second sentinel pointer value; defined out of line.
// NOTE(review): presumably marks a thread-local SuperVersion slot as stale --
// confirm at the use sites.
238 static void* const kSVObsolete
;
// Atomic reference count for this SuperVersion.
241 std::atomic
<uint32_t> refs
;
242 // We need to_delete because during Cleanup(), imm->Unref() returns
243 // all memtables that we need to free through this vector. We then
244 // delete all those memtables outside of mutex, during destruction
245 autovector
<MemTable
*> to_delete
;
// Declaration only; returns a Status for the given column family options.
// NOTE(review): presumably reports whether the configured compression types are
// available in this build -- confirm against the definition.
248 extern Status
CheckCompressionSupported(const ColumnFamilyOptions
& cf_options
);
// Declaration only; returns a Status for the given column family options.
// NOTE(review): presumably rejects option combinations that cannot be used with
// concurrent memtable writes -- confirm against the definition.
250 extern Status
CheckConcurrentWritesSupported(
251 const ColumnFamilyOptions
& cf_options
);
// Declaration only; returns a Status computed from both the DB-wide options and
// the column family options.
// NOTE(review): presumably validates cf_paths against the DB configuration --
// confirm against the definition.
253 extern Status
CheckCFPathsSupported(const DBOptions
& db_options
,
254 const ColumnFamilyOptions
& cf_options
);
// Declaration only; returns a new ColumnFamilyOptions by value, derived from
// `src` and the immutable DB options. `src` is taken by const reference and is
// therefore not modified.
// NOTE(review): presumably clamps or corrects out-of-range settings -- confirm
// against the definition.
256 extern ColumnFamilyOptions
SanitizeOptions(const ImmutableDBOptions
& db_options
,
257 const ColumnFamilyOptions
& src
);
258 // Wrap user defined table properties collector factories `from cf_options`
259 // into internal ones in int_tbl_prop_collector_factories. Add a system internal
// NOTE(review): the sentence above is cut off in this extract; consult the full
// header for its ending.
// The result is written through the int_tbl_prop_collector_factories
// out-parameter; the function itself returns void.
261 extern void GetIntTblPropCollectorFactory(
262 const ImmutableCFOptions
& ioptions
,
263 IntTblPropCollectorFactories
* int_tbl_prop_collector_factories
);
265 class ColumnFamilySet
;
267 // This class keeps all the data that a column family needs.
268 // Most methods require DB mutex held, unless otherwise noted
269 class ColumnFamilyData
{
// Returns the numeric ID of this column family (id_).
274 uint32_t GetID() const { return id_
; }
// Returns the column family name by const reference. name_ is declared const,
// so the referenced string is never modified after construction.
276 const std::string
& GetName() const { return name_
; }
278 // Ref() can only be called from a context where the caller can guarantee
279 // that ColumnFamilyData is alive (while holding a non-zero ref already,
280 // holding a DB mutex, or as the leader in a write batch group).
// Atomically adds one to refs_ using the default (sequentially consistent)
// memory order; the previous count is discarded.
281 void Ref() { refs_
.fetch_add(1); }
283 // UnrefAndTryDelete() decreases the reference count and frees this object if
284 // needed; returns true if it was freed, false otherwise. UnrefAndTryDelete()
285 // can only be called while holding a DB mutex, or during single-threaded recovery.
286 bool UnrefAndTryDelete();
288 // SetDropped() can only be called under following conditions:
289 // 1) Holding a DB mutex,
290 // 2) from single-threaded write thread, AND
291 // 3) from single-threaded VersionSet::LogAndApply()
292 // After dropping column family no other operation on that column family
293 // will be executed. All the files and memory will be, however, kept around
294 // until client drops the column family handle. That way, client can still
295 // access data from dropped column family.
296 // Column family can be dropped and still alive. In that state:
297 // *) Compaction and flush are not executed on the dropped column family.
298 // *) Client can continue reading from column family. Writes will fail unless
299 // WriteOptions::ignore_missing_column_families is true
300 // When the dropped column family is unreferenced, then we:
301 // *) Remove column family from the linked list maintained by ColumnFamilySet
302 // *) delete all memory associated with that column family
303 // *) delete all the files associated with that column family
// Returns the current value of the dropped_ flag. The load is relaxed, so it
// imposes no ordering on other memory accesses around it.
305 bool IsDropped() const { return dropped_
.load(std::memory_order_relaxed
); }
// Returns ioptions_.num_levels, the level count stored in this column family's
// immutable options.
308 int NumberLevels() const { return ioptions_
.num_levels
; }
// Stores log_number in log_number_, which is a plain (non-atomic) uint64_t
// member, so callers are responsible for any required synchronization.
310 void SetLogNumber(uint64_t log_number
) { log_number_
= log_number
; }
// Returns log_number_, described at its declaration as the earliest log file
// number that contains data from this column family.
311 uint64_t GetLogNumber() const { return log_number_
; }
313 void SetFlushReason(FlushReason flush_reason
) {
314 flush_reason_
= flush_reason
;
316 FlushReason
GetFlushReason() const { return flush_reason_
; }
318 const FileOptions
* soptions() const;
// Returns the address of the ioptions_ member; never null, and valid for as
// long as this ColumnFamilyData is alive.
319 const ImmutableOptions
* ioptions() const { return &ioptions_
; }
320 // REQUIRES: DB mutex held
321 // This returns the MutableCFOptions used by current SuperVersion
322 // You should use this API to reference MutableCFOptions most of the time.
323 const MutableCFOptions
* GetCurrentMutableCFOptions() const {
324 return &(super_version_
->mutable_cf_options
);
326 // REQUIRES: DB mutex held
327 // This returns the latest MutableCFOptions, which may be not in effect yet.
328 const MutableCFOptions
* GetLatestMutableCFOptions() const {
329 return &mutable_cf_options_
;
332 // REQUIRES: DB mutex held
333 // Build ColumnFamiliesOptions with immutable options and latest mutable
335 ColumnFamilyOptions
GetLatestCFOptions() const;
// Returns is_delete_range_supported_, a const bool member fixed at
// construction.
// NOTE(review): this accessor only reads state but is not const-qualified.
337 bool is_delete_range_supported() { return is_delete_range_supported_
; }
339 // Validate CF options against DB options
340 static Status
ValidateOptions(const DBOptions
& db_options
,
341 const ColumnFamilyOptions
& cf_options
);
343 // REQUIRES: DB mutex held
345 const DBOptions
& db_options
,
346 const std::unordered_map
<std::string
, std::string
>& options_map
);
347 #endif // ROCKSDB_LITE
// Returns a non-owning pointer to the InternalStats object held by the
// internal_stats_ unique_ptr.
349 InternalStats
* internal_stats() { return internal_stats_
.get(); }
// Returns the address of the imm_ member (the immutable memtable list).
351 MemTableList
* imm() { return &imm_
; }
// Returns the mem_ pointer (the current memtable) without changing its
// reference count.
352 MemTable
* mem() { return mem_
; }
355 return mem()->GetFirstSequenceNumber() == 0 && imm()->NumNotFlushed() == 0;
// Returns current_, the current Version; per its declaration it equals
// dummy_versions->prev_.
358 Version
* current() { return current_
; }
// Returns dummy_versions_, the head of the circular doubly-linked list of
// versions.
359 Version
* dummy_versions() { return dummy_versions_
; }
360 void SetCurrent(Version
* _current
);
361 uint64_t GetNumLiveVersions() const; // REQUIRE: DB mutex held
362 uint64_t GetTotalSstFilesSize() const; // REQUIRE: DB mutex held
363 uint64_t GetLiveSstFilesSize() const; // REQUIRE: DB mutex held
364 uint64_t GetTotalBlobFileSize() const; // REQUIRE: DB mutex held
365 void SetMemtable(MemTable
* new_mem
) {
366 uint64_t memtable_id
= last_memtable_id_
.fetch_add(1) + 1;
367 new_mem
->SetID(memtable_id
);
371 // calculate the oldest log needed for the durability of this column family
372 uint64_t OldestLogToKeep();
374 // See Memtable constructor for explanation of earliest_seq param.
375 MemTable
* ConstructNewMemtable(const MutableCFOptions
& mutable_cf_options
,
376 SequenceNumber earliest_seq
);
377 void CreateNewMemtable(const MutableCFOptions
& mutable_cf_options
,
378 SequenceNumber earliest_seq
);
// Returns a non-owning pointer to the TableCache held by the table_cache_
// unique_ptr.
380 TableCache
* table_cache() const { return table_cache_
.get(); }
// Returns a non-owning pointer to the BlobSource held by the blob_source_
// unique_ptr.
381 BlobSource
* blob_source() const { return blob_source_
.get(); }
383 // See documentation in compaction_picker.h
384 // REQUIRES: DB mutex held
385 bool NeedsCompaction() const;
386 // REQUIRES: DB mutex held
387 Compaction
* PickCompaction(const MutableCFOptions
& mutable_options
,
388 const MutableDBOptions
& mutable_db_options
,
389 LogBuffer
* log_buffer
);
391 // Check if the passed range overlap with any running compactions.
392 // REQUIRES: DB mutex held
393 bool RangeOverlapWithCompaction(const Slice
& smallest_user_key
,
394 const Slice
& largest_user_key
,
397 // Check if the passed ranges overlap with any unflushed memtables
398 // (immutable or mutable).
400 // @param super_version A referenced SuperVersion that will be held for the
401 // duration of this function.
404 Status
RangesOverlapWithMemtables(const autovector
<Range
>& ranges
,
405 SuperVersion
* super_version
,
406 bool allow_data_in_errors
, bool* overlap
);
408 // A flag to tell a manual compaction is to compact all levels together
409 // instead of a specific level.
410 static const int kCompactAllLevels
;
411 // A flag to tell a manual compaction's output is base level.
412 static const int kCompactToBaseLevel
;
413 // REQUIRES: DB mutex held
414 Compaction
* CompactRange(const MutableCFOptions
& mutable_cf_options
,
415 const MutableDBOptions
& mutable_db_options
,
416 int input_level
, int output_level
,
417 const CompactRangeOptions
& compact_range_options
,
418 const InternalKey
* begin
, const InternalKey
* end
,
419 InternalKey
** compaction_end
, bool* manual_conflict
,
420 uint64_t max_file_num_to_ignore
,
421 const std::string
& trim_ts
);
// Returns a non-owning pointer to the CompactionPicker held by the
// compaction_picker_ unique_ptr.
423 CompactionPicker
* compaction_picker() { return compaction_picker_
.get(); }
425 const Comparator
* user_comparator() const {
426 return internal_comparator_
.user_comparator();
429 const InternalKeyComparator
& internal_comparator() const {
430 return internal_comparator_
;
433 const IntTblPropCollectorFactories
* int_tbl_prop_collector_factories() const {
434 return &int_tbl_prop_collector_factories_
;
// Returns the raw super_version_ pointer as-is; no reference is taken here, in
// contrast to GetReferencedSuperVersion() declared below.
437 SuperVersion
* GetSuperVersion() { return super_version_
; }
439 // Return an already referenced SuperVersion to be used safely.
440 SuperVersion
* GetReferencedSuperVersion(DBImpl
* db
);
442 // Get SuperVersion stored in thread local storage. If it does not exist,
443 // get a reference from a current SuperVersion.
444 SuperVersion
* GetThreadLocalSuperVersion(DBImpl
* db
);
445 // Try to return SuperVersion back to thread local storage. Return true on
446 // success and false on failure. It fails when the thread local storage
447 // contains anything other than SuperVersion::kSVInUse flag.
448 bool ReturnThreadLocalSuperVersion(SuperVersion
* sv
);
450 uint64_t GetSuperVersionNumber() const {
451 return super_version_number_
.load();
453 // will return a pointer to SuperVersion* if previous SuperVersion
454 // if its reference count is zero and needs deletion or nullptr if not
455 // As argument takes a pointer to allocated SuperVersion to enable
456 // the clients to allocate SuperVersion outside of mutex.
457 // IMPORTANT: Only call this from DBImpl::InstallSuperVersion()
// NOTE(review): both overloads below return void, so the "will return a
// pointer" wording above does not match the visible signatures; presumably the
// previous SuperVersion is handed back through sv_context -- confirm.
// The overloads differ only in their second parameter: one takes a
// MutableCFOptions reference, the other an InstrumentedMutex pointer named
// db_mutex.
458 void InstallSuperVersion(SuperVersionContext
* sv_context
,
459 const MutableCFOptions
& mutable_cf_options
);
460 void InstallSuperVersion(SuperVersionContext
* sv_context
,
461 InstrumentedMutex
* db_mutex
);
463 void ResetThreadLocalSuperVersions();
465 // Protected by DB mutex
// Plain setters/getters for the queued_for_flush_ and queued_for_compaction_
// bools. Per their declarations, those flags record whether this column family
// is currently present in DBImpl::flush_queue_ / DBImpl::compaction_queue_.
// The members are non-atomic, which is why the DB mutex is required.
466 void set_queued_for_flush(bool value
) { queued_for_flush_
= value
; }
467 void set_queued_for_compaction(bool value
) { queued_for_compaction_
= value
; }
468 bool queued_for_flush() { return queued_for_flush_
; }
469 bool queued_for_compaction() { return queued_for_compaction_
; }
471 enum class WriteStallCause
{
475 kPendingCompactionBytes
,
477 static std::pair
<WriteStallCondition
, WriteStallCause
>
478 GetWriteStallConditionAndCause(
479 int num_unflushed_memtables
, int num_l0_files
,
480 uint64_t num_compaction_needed_bytes
,
481 const MutableCFOptions
& mutable_cf_options
,
482 const ImmutableCFOptions
& immutable_cf_options
);
484 // Recalculate some stall conditions, which are changed only during
485 // compaction, adding new memtable and/or recalculation of compaction score.
486 WriteStallCondition
RecalculateWriteStallConditions(
487 const MutableCFOptions
& mutable_cf_options
);
// Sets the atomic initialized_ flag to true (default, sequentially consistent
// store). No method that resets it is visible in this extract.
489 void set_initialized() { initialized_
.store(true); }
// Returns the atomic initialized_ flag (default, sequentially consistent load).
491 bool initialized() const { return initialized_
.load(); }
493 const ColumnFamilyOptions
& initial_cf_options() {
494 return initial_cf_options_
;
497 Env::WriteLifeTimeHint
CalculateSSTWriteHint(int level
);
499 // created_dirs remembers directory created, so that we don't need to call
500 // the same data creation operation again.
501 Status
AddDirectories(
502 std::map
<std::string
, std::shared_ptr
<FSDirectory
>>* created_dirs
);
504 FSDirectory
* GetDataDir(size_t path_id
) const;
506 // full_history_ts_low_ can only increase.
507 void SetFullHistoryTsLow(std::string ts_low
) {
508 assert(!ts_low
.empty());
509 const Comparator
* ucmp
= user_comparator();
511 if (full_history_ts_low_
.empty() ||
512 ucmp
->CompareTimestamp(ts_low
, full_history_ts_low_
) > 0) {
513 full_history_ts_low_
= std::move(ts_low
);
517 const std::string
& GetFullHistoryTsLow() const {
518 return full_history_ts_low_
;
// Returns a non-owning pointer to the ThreadLocalPtr held by local_sv_, which
// is described at its declaration as the thread's local copy of the
// SuperVersion pointer. The TEST_ prefix suggests test-only use.
521 ThreadLocalPtr
* TEST_GetLocalSV() { return local_sv_
.get(); }
// Returns the write_buffer_manager_ pointer as stored; no ownership transfer.
522 WriteBufferManager
* write_buffer_mgr() { return write_buffer_manager_
; }
523 std::shared_ptr
<CacheReservationManager
>
524 GetFileMetadataCacheReservationManager() {
525 return file_metadata_cache_res_mgr_
;
528 SequenceNumber
GetFirstMemtableSequenceNumber() const;
530 static const uint32_t kDummyColumnFamilyDataId
;
532 // Keep track of whether the mempurge feature was ever used.
// One-way latch: the setter only ever writes true, and no reset is visible in
// this extract.
533 void SetMempurgeUsed() { mempurge_used_
= true; }
// Returns mempurge_used_.
// NOTE(review): read-only accessor that is not const-qualified.
534 bool GetMempurgeUsed() { return mempurge_used_
; }
537 friend class ColumnFamilySet
;
// Constructor declaration only; the definition is not in this extract.
// NOTE(review): ColumnFamilySet is declared a friend immediately before this
// constructor, which suggests the constructor is not public and is reached
// through ColumnFamilySet::CreateColumnFamily() -- confirm.
// NOTE(review): `table_cache` is a Cache* whereas the table_cache_ member is a
// unique_ptr<TableCache>, so the member is presumably built on top of this
// cache rather than being the same object -- confirm.
538 ColumnFamilyData(uint32_t id
, const std::string
& name
,
539 Version
* dummy_versions
, Cache
* table_cache
,
540 WriteBufferManager
* write_buffer_manager
,
541 const ColumnFamilyOptions
& options
,
542 const ImmutableDBOptions
& db_options
,
543 const FileOptions
* file_options
,
544 ColumnFamilySet
* column_family_set
,
545 BlockCacheTracer
* const block_cache_tracer
,
546 const std::shared_ptr
<IOTracer
>& io_tracer
,
547 const std::string
& db_id
, const std::string
& db_session_id
);
549 std::vector
<std::string
> GetDbPaths() const;
552 const std::string name_
;
553 Version
* dummy_versions_
; // Head of circular doubly-linked list of versions.
554 Version
* current_
; // == dummy_versions->prev_
556 std::atomic
<int> refs_
; // outstanding references to ColumnFamilyData
557 std::atomic
<bool> initialized_
;
558 std::atomic
<bool> dropped_
; // true if client dropped it
560 const InternalKeyComparator internal_comparator_
;
561 IntTblPropCollectorFactories int_tbl_prop_collector_factories_
;
563 const ColumnFamilyOptions initial_cf_options_
;
564 const ImmutableOptions ioptions_
;
565 MutableCFOptions mutable_cf_options_
;
567 const bool is_delete_range_supported_
;
569 std::unique_ptr
<TableCache
> table_cache_
;
570 std::unique_ptr
<BlobFileCache
> blob_file_cache_
;
571 std::unique_ptr
<BlobSource
> blob_source_
;
573 std::unique_ptr
<InternalStats
> internal_stats_
;
575 WriteBufferManager
* write_buffer_manager_
;
579 SuperVersion
* super_version_
;
581 // An ordinal representing the current SuperVersion. Updated by
582 // InstallSuperVersion(), i.e. incremented every time super_version_
584 std::atomic
<uint64_t> super_version_number_
;
586 // Thread's local copy of SuperVersion pointer
587 // This needs to be destructed before mutex_
588 std::unique_ptr
<ThreadLocalPtr
> local_sv_
;
590 // pointers for a circular linked list. we use it to support iterations over
591 // all column families that are alive (note: dropped column families can also
592 // be alive as long as client holds a reference)
593 ColumnFamilyData
* next_
;
594 ColumnFamilyData
* prev_
;
596 // This is the earliest log file number that contains data from this
597 // Column Family. All earlier log files must be ignored and not
599 uint64_t log_number_
;
601 std::atomic
<FlushReason
> flush_reason_
;
603 // An object that keeps all the compaction stats
604 // and picks the next compaction
605 std::unique_ptr
<CompactionPicker
> compaction_picker_
;
607 ColumnFamilySet
* column_family_set_
;
609 std::unique_ptr
<WriteControllerToken
> write_controller_token_
;
611 // If true --> this ColumnFamily is currently present in DBImpl::flush_queue_
612 bool queued_for_flush_
;
614 // If true --> this ColumnFamily is currently present in
615 // DBImpl::compaction_queue_
616 bool queued_for_compaction_
;
618 uint64_t prev_compaction_needed_bytes_
;
620 // if the database was opened with 2pc enabled
623 // Memtable id to track flush.
624 std::atomic
<uint64_t> last_memtable_id_
;
626 // Directories corresponding to cf_paths.
627 std::vector
<std::shared_ptr
<FSDirectory
>> data_dirs_
;
629 bool db_paths_registered_
;
631 std::string full_history_ts_low_
;
633 // For charging memory usage of file metadata created for newly added files to
634 // a Version associated with this CFD
635 std::shared_ptr
<CacheReservationManager
> file_metadata_cache_res_mgr_
;
639 // ColumnFamilySet has interesting thread-safety requirements
640 // * CreateColumnFamily() or RemoveColumnFamily() -- need to be protected by DB
641 // mutex AND executed in the write thread.
642 // CreateColumnFamily() should ONLY be called from VersionSet::LogAndApply() AND
643 // single-threaded write thread. It is also called during Recovery and in
645 // RemoveColumnFamily() is only called from SetDropped(). DB mutex needs to be
646 // held and it needs to be executed from the write thread. SetDropped() also
647 // guarantees that it will be called only from single-threaded LogAndApply(),
648 // but this condition is not that important.
649 // * Iteration -- hold DB mutex. If you want to release the DB mutex in the
650 // body of the iteration, wrap in a RefedColumnFamilySet.
651 // * GetDefault() -- thread safe
652 // * GetColumnFamily() -- either inside of DB mutex or from a write thread
653 // * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(),
654 // NumberOfColumnFamilies -- inside of DB mutex
655 class ColumnFamilySet
{
657 // ColumnFamilySet supports iteration
// Creates an iterator positioned at cfd. `explicit` prevents implicit
// conversion from a raw ColumnFamilyData pointer.
660 explicit iterator(ColumnFamilyData
* cfd
) : current_(cfd
) {}
661 // NOTE: minimum operators for for-loop iteration
662 iterator
& operator++() {
663 current_
= current_
->next_
;
666 bool operator!=(const iterator
& other
) const {
667 return this->current_
!= other
.current_
;
// Dereferencing yields the ColumnFamilyData pointer itself (not a reference to
// the object), so range-for loops iterate over ColumnFamilyData* values.
669 ColumnFamilyData
* operator*() { return current_
; }
672 ColumnFamilyData
* current_
;
// Constructor declaration only; the definition is not in this extract.
// Most parameters have a same-named member declared in this class
// (db_options_, file_options_, write_buffer_manager_, write_controller_,
// block_cache_tracer_, io_tracer_, db_id_, db_session_id_).
// NOTE(review): db_id is taken by const reference and the db_id_ member is
// itself a reference, so the caller's string presumably has to outlive this
// ColumnFamilySet -- confirm.
675 ColumnFamilySet(const std::string
& dbname
,
676 const ImmutableDBOptions
* db_options
,
677 const FileOptions
& file_options
, Cache
* table_cache
,
678 WriteBufferManager
* _write_buffer_manager
,
679 WriteController
* _write_controller
,
680 BlockCacheTracer
* const block_cache_tracer
,
681 const std::shared_ptr
<IOTracer
>& io_tracer
,
682 const std::string
& db_id
, const std::string
& db_session_id
);
// Returns the default column family's data. The class comment lists
// GetDefault() as thread safe.
// NOTE(review): presumably served from default_cfd_cache_ -- confirm in the
// definition.
685 ColumnFamilyData
* GetDefault() const;
686 // GetColumnFamily() calls return nullptr if column family is not found
// Two lookups: by numeric ID and by name. Per the class comment, call either
// inside the DB mutex or from a write thread.
687 ColumnFamilyData
* GetColumnFamily(uint32_t id
) const;
688 ColumnFamilyData
* GetColumnFamily(const std::string
& name
) const;
689 // this call will return the next available column family ID. it guarantees
690 // that there is no column family with id greater than or equal to the
691 // returned value in the current running instance or anytime in RocksDB
// NOTE(review): the sentence above is cut off in this extract; consult the full
// header for its ending. Per the class comment, call inside the DB mutex.
693 uint32_t GetNextColumnFamilyID();
// Declarations only. Per the class comment, all three must be called inside
// the DB mutex.
// NOTE(review): presumably read and update max_column_family_, and count the
// registered column families -- confirm in the definitions.
694 uint32_t GetMaxColumnFamily();
695 void UpdateMaxColumnFamily(uint32_t new_max_column_family
);
696 size_t NumberOfColumnFamilies() const;
// Declaration only; returns a pointer to the ColumnFamilyData for the given
// name and id. Per the class comment, this must be protected by the DB mutex
// and executed in the write thread, and should only be called from
// VersionSet::LogAndApply() (it is also called during Recovery).
// NOTE(review): ownership and initial reference count of the returned object
// are not visible here -- confirm in the definition.
698 ColumnFamilyData
* CreateColumnFamily(const std::string
& name
, uint32_t id
,
699 Version
* dummy_version
,
700 const ColumnFamilyOptions
& options
);
// Iteration starts at the node after dummy_cfd_ and stops when it reaches
// dummy_cfd_ again, i.e. the dummy node is the sentinel of the circular list
// and is never yielded. Per the class comment, hold the DB mutex while
// iterating.
702 iterator
begin() { return iterator(dummy_cfd_
->next_
); }
703 iterator
end() { return iterator(dummy_cfd_
); }
// Plain accessors that return the stored raw pointers; none of them transfers
// ownership or changes any state.
// NOTE(review): the table_cache_ member read below is not among the member
// declarations visible in this extract.
705 Cache
* get_table_cache() { return table_cache_
; }
707 WriteBufferManager
* write_buffer_manager() { return write_buffer_manager_
; }
709 WriteController
* write_controller() { return write_controller_
; }
712 friend class ColumnFamilyData
;
713 // helper function that gets called from cfd destructor
714 // REQUIRES: DB mutex held
// NOTE(review): the class-level comment states RemoveColumnFamily() is only
// called from SetDropped(), while the comment above says it is called from the
// cfd destructor; one of the two is presumably stale -- confirm the actual
// call sites.
715 void RemoveColumnFamily(ColumnFamilyData
* cfd
);
717 // column_families_ and column_family_data_ need to be protected:
718 // * when mutating both conditions have to be satisfied:
719 // 1. DB mutex locked
720 // 2. thread currently in single-threaded write thread
721 // * when reading, at least one condition needs to be satisfied:
722 // 1. DB mutex locked
723 // 2. accessed from a single-threaded write thread
724 UnorderedMap
<std::string
, uint32_t> column_families_
;
725 UnorderedMap
<uint32_t, ColumnFamilyData
*> column_family_data_
;
727 uint32_t max_column_family_
;
728 const FileOptions file_options_
;
730 ColumnFamilyData
* dummy_cfd_
;
731 // We don't hold the refcount here, since default column family always exists
732 // We are also not responsible for cleaning up default_cfd_cache_. This is
733 // just a cache that makes common case (accessing default column family)
735 ColumnFamilyData
* default_cfd_cache_
;
737 const std::string db_name_
;
738 const ImmutableDBOptions
* const db_options_
;
740 WriteBufferManager
* write_buffer_manager_
;
741 WriteController
* write_controller_
;
742 BlockCacheTracer
* const block_cache_tracer_
;
743 std::shared_ptr
<IOTracer
> io_tracer_
;
// Held as a const reference, not a copy (contrast with db_session_id_, which is
// a std::string value). The referenced string must therefore outlive this
// ColumnFamilySet.
// NOTE(review): presumably intentional so that later changes to the caller's
// string are observed here -- confirm before changing this to a value member.
744 const std::string
& db_id_
;
745 std::string db_session_id_
;
748 // A wrapper for ColumnFamilySet that supports releasing DB mutex during each
749 // iteration over the iterator, because the cfd is Refed and Unrefed during
750 // each iteration to prevent concurrent CF drop from destroying it (until
752 class RefedColumnFamilySet
{
// Wraps an existing ColumnFamilySet; only the pointer is stored, so the wrapped
// set must outlive this object.
754 explicit RefedColumnFamilySet(ColumnFamilySet
* cfs
) : wrapped_(cfs
) {}
758 explicit iterator(ColumnFamilySet::iterator wrapped
) : wrapped_(wrapped
) {
// Passes the column family the iterator currently points at to MaybeUnref(),
// which calls UnrefAndTryDelete() unless it is the dummy column family.
761 ~iterator() { MaybeUnref(*wrapped_
); }
762 inline void MaybeRef(ColumnFamilyData
* cfd
) {
763 if (cfd
->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId
) {
767 inline void MaybeUnref(ColumnFamilyData
* cfd
) {
768 if (cfd
->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId
) {
769 cfd
->UnrefAndTryDelete();
772 // NOTE: minimum operators for for-loop iteration
773 inline iterator
& operator++() {
774 ColumnFamilyData
* old
= *wrapped_
;
776 // Can only unref & potentially free cfd after accessing its next_
781 inline bool operator!=(const iterator
& other
) const {
782 return this->wrapped_
!= other
.wrapped_
;
// Forwards to the wrapped ColumnFamilySet::iterator and yields the
// ColumnFamilyData pointer it currently points at.
784 inline ColumnFamilyData
* operator*() { return *wrapped_
; }
787 ColumnFamilySet::iterator wrapped_
;
// Wrap the underlying set's begin()/end() iterators in this class's iterator
// type.
790 iterator
begin() { return iterator(wrapped_
->begin()); }
791 iterator
end() { return iterator(wrapped_
->end()); }
794 ColumnFamilySet
* wrapped_
;
797 // We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access
798 // memtables of different column families (specified by ID in the write batch)
// ColumnFamilyMemTables implementation backed by a ColumnFamilySet. It keeps a
// cursor (current_) to the column family selected by the last Seek() and owns a
// ColumnFamilyHandleInternal (handle_) by value.
// NOTE(review): access specifiers and the closing brace are not visible in this
// extract.
799 class ColumnFamilyMemTablesImpl
: public ColumnFamilyMemTables
{
// Stores the set pointer and starts with no column family selected.
801 explicit ColumnFamilyMemTablesImpl(ColumnFamilySet
* column_family_set
)
802 : column_family_set_(column_family_set
), current_(nullptr) {}
804 // Constructs a ColumnFamilyMemTablesImpl equivalent to one constructed
805 // with the arguments used to construct *orig.
// Only column_family_set_ is taken from *orig; the new object's current_ starts
// as nullptr regardless of orig's selection.
806 explicit ColumnFamilyMemTablesImpl(ColumnFamilyMemTablesImpl
* orig
)
807 : column_family_set_(orig
->column_family_set_
), current_(nullptr) {}
809 // sets current_ to ColumnFamilyData with column_family_id
810 // returns false if column family doesn't exist
811 // REQUIRES: use this function of DBImpl::column_family_memtables_ should be
812 // under a DB mutex OR from a write thread
813 bool Seek(uint32_t column_family_id
) override
;
815 // Returns log number of the selected column family
816 // REQUIRES: under a DB mutex OR from a write thread
817 uint64_t GetLogNumber() const override
;
819 // REQUIRES: Seek() called first
820 // REQUIRES: use this function of DBImpl::column_family_memtables_ should be
821 // under a DB mutex OR from a write thread
822 virtual MemTable
* GetMemTable() const override
;
824 // Returns column family handle for the selected column family
825 // REQUIRES: use this function of DBImpl::column_family_memtables_ should be
826 // under a DB mutex OR from a write thread
827 virtual ColumnFamilyHandle
* GetColumnFamilyHandle() override
;
829 // Cannot be called while another thread is calling Seek().
830 // REQUIRES: use this function of DBImpl::column_family_memtables_ should be
831 // under a DB mutex OR from a write thread
// Returns the cursor as-is; nullptr until a Seek() has selected a column
// family.
832 virtual ColumnFamilyData
* current() override
{ return current_
; }
// Set that lookups are resolved against; not owned (raw pointer).
835 ColumnFamilySet
* column_family_set_
;
// Column family selected by the most recent Seek(); initialized to nullptr.
836 ColumnFamilyData
* current_
;
// NOTE(review): presumably re-pointed at current_ via SetCFD() and returned
// from GetColumnFamilyHandle() -- confirm in the definition.
837 ColumnFamilyHandleInternal handle_
;
// Declaration only; maps a column family handle to its 32-bit ID.
// NOTE(review): behaviour for a null handle is not visible here -- confirm in
// the definition before passing nullptr.
840 extern uint32_t GetColumnFamilyID(ColumnFamilyHandle
* column_family
);
// Declaration only; returns the Comparator associated with the given column
// family handle as a pointer-to-const.
// NOTE(review): behaviour for a null handle is not visible here -- confirm in
// the definition before passing nullptr.
842 extern const Comparator
* GetColumnFamilyUserComparator(
843 ColumnFamilyHandle
* column_family
);
845 } // namespace ROCKSDB_NAMESPACE