1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
12 #include <unordered_map>
17 #include "db/memtable_list.h"
18 #include "db/table_cache.h"
19 #include "db/table_properties_collector.h"
20 #include "db/write_batch_internal.h"
21 #include "db/write_controller.h"
22 #include "options/cf_options.h"
23 #include "rocksdb/compaction_job_stats.h"
24 #include "rocksdb/db.h"
25 #include "rocksdb/env.h"
26 #include "rocksdb/options.h"
27 #include "util/thread_local.h"
33 class VersionStorageInfo
;
35 class MemTableListVersion
;
36 class CompactionPicker
;
40 class ColumnFamilyData
;
43 class InstrumentedMutex
;
44 class InstrumentedMutexLock
;
45 struct SuperVersionContext
;
47 extern const double kIncSlowdownRatio
;
49 // ColumnFamilyHandleImpl is the class that clients use to access different
50 // column families. It has non-trivial destructor, which gets called when client
51 // is done using the column family
52 class ColumnFamilyHandleImpl
: public ColumnFamilyHandle
{
54 // create while holding the mutex
55 ColumnFamilyHandleImpl(
56 ColumnFamilyData
* cfd
, DBImpl
* db
, InstrumentedMutex
* mutex
);
57 // destroy without mutex
58 virtual ~ColumnFamilyHandleImpl();
// Accessor for the ColumnFamilyData pointer stored in cfd_. It is virtual;
// ColumnFamilyHandleInternal overrides it to return its own pointer instead.
59 virtual ColumnFamilyData
* cfd() const { return cfd_
; }
61 virtual uint32_t GetID() const override
;
62 virtual const std::string
& GetName() const override
;
63 virtual Status
GetDescriptor(ColumnFamilyDescriptor
* desc
) override
;
64 virtual const Comparator
* GetComparator() const override
;
67 ColumnFamilyData
* cfd_
;
69 InstrumentedMutex
* mutex_
;
72 // Does not ref-count ColumnFamilyData
73 // We use this dummy ColumnFamilyHandleImpl because sometimes MemTableInserter
74 // calls DBImpl methods. When this happens, MemTableInserter needs access to
75 // ColumnFamilyHandle (same as the client would need). In that case, we feed
76 // MemTableInserter dummy ColumnFamilyHandle and enable it to call DBImpl
78 class ColumnFamilyHandleInternal
: public ColumnFamilyHandleImpl
{
// Passes null cfd, db and mutex arguments to the base class constructor and
// starts with internal_cfd_ set to null.
80 ColumnFamilyHandleInternal()
81 : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), internal_cfd_(nullptr) {}
// Points this handle at _cfd. Only the raw pointer is stored.
83 void SetCFD(ColumnFamilyData
* _cfd
) { internal_cfd_
= _cfd
; }
// Returns the pointer last stored by SetCFD() (null if it was never called)
// rather than the base class's cfd_ member.
84 virtual ColumnFamilyData
* cfd() const override
{ return internal_cfd_
; }
87 ColumnFamilyData
* internal_cfd_
;
90 // holds references to memtable, all immutable memtables and version
92 // Accessing members of this class is not thread-safe and requires external
93 // synchronization (ie db mutex held or on write thread).
95 MemTableListVersion
* imm
;
97 MutableCFOptions mutable_cf_options
;
98 // Version number of the current SuperVersion
99 uint64_t version_number
;
100 WriteStallCondition write_stall_condition
;
102 InstrumentedMutex
* db_mutex
;
104 // should be called outside the mutex
105 SuperVersion() = default;
108 // If Unref() returns true, Cleanup() should be called with mutex held
109 // before deleting this SuperVersion.
112 // call these two methods with db mutex held
113 // Cleanup unrefs mem, imm and current. Also, it stores all memtables
114 // that need to be deleted in the to_delete vector. Unrefing those
115 // objects needs to be done in the mutex
117 void Init(MemTable
* new_mem
, MemTableListVersion
* new_imm
,
118 Version
* new_current
);
120 // The value of dummy is not actually used. kSVInUse takes its address as a
121 // mark in the thread local storage to indicate the SuperVersion is in use
122 // by thread. This way, the value of kSVInUse is guaranteed to have no
123 // conflict with SuperVersion object address and portable on different
126 static void* const kSVInUse
;
127 static void* const kSVObsolete
;
130 std::atomic
<uint32_t> refs
;
131 // We need to_delete because during Cleanup(), imm->Unref() returns
132 // all memtables that we need to free through this vector. We then
133 // delete all those memtables outside of mutex, during destruction
134 autovector
<MemTable
*> to_delete
;
137 extern Status
CheckCompressionSupported(const ColumnFamilyOptions
& cf_options
);
139 extern Status
CheckConcurrentWritesSupported(
140 const ColumnFamilyOptions
& cf_options
);
142 extern Status
CheckCFPathsSupported(const DBOptions
& db_options
,
143 const ColumnFamilyOptions
& cf_options
);
145 extern ColumnFamilyOptions
SanitizeOptions(const ImmutableDBOptions
& db_options
,
146 const ColumnFamilyOptions
& src
);
147 // Wrap user defined table properties collector factories from `cf_options`
148 // into internal ones in int_tbl_prop_collector_factories. Add a system internal
150 extern void GetIntTblPropCollectorFactory(
151 const ImmutableCFOptions
& ioptions
,
152 std::vector
<std::unique_ptr
<IntTblPropCollectorFactory
>>*
153 int_tbl_prop_collector_factories
);
155 class ColumnFamilySet
;
157 // This class keeps all the data that a column family needs.
158 // Most methods require DB mutex held, unless otherwise noted
159 class ColumnFamilyData
{
164 uint32_t GetID() const { return id_
; }
166 const std::string
& GetName() const { return name_
; }
168 // Ref() can only be called from a context where the caller can guarantee
169 // that ColumnFamilyData is alive (while holding a non-zero ref already,
170 // holding a DB mutex, or as the leader in a write batch group).
171 void Ref() { refs_
.fetch_add(1, std::memory_order_relaxed
); }
173 // Unref decreases the reference count, but does not handle deletion
174 // when the count goes to 0. If this method returns true then the
175 // caller should delete the instance immediately, or later, by calling
176 // FreeDeadColumnFamilies(). Unref() can only be called while holding
177 // a DB mutex, or during single-threaded recovery.
// fetch_sub returns the reference count as it was *before* the decrement.
179 int old_refs
= refs_
.fetch_sub(1, std::memory_order_relaxed
);
// A pre-decrement count of zero or less means Unref() was called more often
// than Ref(); that is a caller bug.
180 assert(old_refs
> 0);
// A pre-decrement count of 1 means this call released the last reference.
181 return old_refs
== 1;
184 // SetDropped() can only be called under the following conditions:
185 // 1) Holding a DB mutex,
186 // 2) from single-threaded write thread, AND
187 // 3) from single-threaded VersionSet::LogAndApply()
188 // After dropping column family no other operation on that column family
189 // will be executed. All the files and memory will be, however, kept around
190 // until client drops the column family handle. That way, client can still
191 // access data from dropped column family.
192 // Column family can be dropped and still alive. In that state:
193 // *) Compaction and flush is not executed on the dropped column family.
194 // *) Client can continue reading from column family. Writes will fail unless
195 // WriteOptions::ignore_missing_column_families is true
196 // When the dropped column family is unreferenced, then we:
197 // *) Remove column family from the linked list maintained by ColumnFamilySet
198 // *) delete all memory associated with that column family
199 // *) delete all the files associated with that column family
201 bool IsDropped() const { return dropped_
.load(std::memory_order_relaxed
); }
204 int NumberLevels() const { return ioptions_
.num_levels
; }
206 void SetLogNumber(uint64_t log_number
) { log_number_
= log_number
; }
207 uint64_t GetLogNumber() const { return log_number_
; }
209 void SetFlushReason(FlushReason flush_reason
) {
210 flush_reason_
= flush_reason
;
212 FlushReason
GetFlushReason() const { return flush_reason_
; }
214 const EnvOptions
* soptions() const;
215 const ImmutableCFOptions
* ioptions() const { return &ioptions_
; }
216 // REQUIRES: DB mutex held
217 // This returns the MutableCFOptions used by current SuperVersion
218 // You should use this API to reference MutableCFOptions most of the time.
219 const MutableCFOptions
* GetCurrentMutableCFOptions() const {
220 return &(super_version_
->mutable_cf_options
);
222 // REQUIRES: DB mutex held
223 // This returns the latest MutableCFOptions, which may be not in effect yet.
224 const MutableCFOptions
* GetLatestMutableCFOptions() const {
225 return &mutable_cf_options_
;
228 // REQUIRES: DB mutex held
229 // Build ColumnFamiliesOptions with immutable options and latest mutable
231 ColumnFamilyOptions
GetLatestCFOptions() const;
233 bool is_delete_range_supported() { return is_delete_range_supported_
; }
236 // REQUIRES: DB mutex held
238 const std::unordered_map
<std::string
, std::string
>& options_map
);
239 #endif // ROCKSDB_LITE
241 InternalStats
* internal_stats() { return internal_stats_
.get(); }
243 MemTableList
* imm() { return &imm_
; }
244 MemTable
* mem() { return mem_
; }
245 Version
* current() { return current_
; }
246 Version
* dummy_versions() { return dummy_versions_
; }
247 void SetCurrent(Version
* _current
);
248 uint64_t GetNumLiveVersions() const; // REQUIRE: DB mutex held
249 uint64_t GetTotalSstFilesSize() const; // REQUIRE: DB mutex held
250 uint64_t GetLiveSstFilesSize() const; // REQUIRE: DB mutex held
// Assigns new_mem the next id drawn from the atomic last_memtable_id_ counter.
251 void SetMemtable(MemTable
* new_mem
) {
// fetch_add returns the counter's previous value, so adding 1 yields the
// value the counter holds after this increment.
252 uint64_t memtable_id
= last_memtable_id_
.fetch_add(1) + 1;
253 new_mem
->SetID(memtable_id
);
257 // calculate the oldest log needed for the durability of this column family
258 uint64_t OldestLogToKeep();
260 // See Memtable constructor for explanation of earliest_seq param.
261 MemTable
* ConstructNewMemtable(const MutableCFOptions
& mutable_cf_options
,
262 SequenceNumber earliest_seq
);
263 void CreateNewMemtable(const MutableCFOptions
& mutable_cf_options
,
264 SequenceNumber earliest_seq
);
266 TableCache
* table_cache() const { return table_cache_
.get(); }
268 // See documentation in compaction_picker.h
269 // REQUIRES: DB mutex held
270 bool NeedsCompaction() const;
271 // REQUIRES: DB mutex held
272 Compaction
* PickCompaction(const MutableCFOptions
& mutable_options
,
273 LogBuffer
* log_buffer
);
275 // Check if the passed range overlap with any running compactions.
276 // REQUIRES: DB mutex held
277 bool RangeOverlapWithCompaction(const Slice
& smallest_user_key
,
278 const Slice
& largest_user_key
,
281 // Check if the passed ranges overlap with any unflushed memtables
282 // (immutable or mutable).
284 // @param super_version A referenced SuperVersion that will be held for the
285 // duration of this function.
288 Status
RangesOverlapWithMemtables(const autovector
<Range
>& ranges
,
289 SuperVersion
* super_version
, bool* overlap
);
291 // A flag to tell a manual compaction is to compact all levels together
292 // instead of a specific level.
293 static const int kCompactAllLevels
;
294 // A flag to tell a manual compaction's output is base level.
295 static const int kCompactToBaseLevel
;
296 // REQUIRES: DB mutex held
297 Compaction
* CompactRange(const MutableCFOptions
& mutable_cf_options
,
298 int input_level
, int output_level
,
299 uint32_t output_path_id
, uint32_t max_subcompactions
,
300 const InternalKey
* begin
, const InternalKey
* end
,
301 InternalKey
** compaction_end
, bool* manual_conflict
);
303 CompactionPicker
* compaction_picker() { return compaction_picker_
.get(); }
305 const Comparator
* user_comparator() const {
306 return internal_comparator_
.user_comparator();
309 const InternalKeyComparator
& internal_comparator() const {
310 return internal_comparator_
;
313 const std::vector
<std::unique_ptr
<IntTblPropCollectorFactory
>>*
314 int_tbl_prop_collector_factories() const {
315 return &int_tbl_prop_collector_factories_
;
318 SuperVersion
* GetSuperVersion() { return super_version_
; }
320 // Return an already referenced SuperVersion to be used safely.
321 SuperVersion
* GetReferencedSuperVersion(InstrumentedMutex
* db_mutex
);
323 // Get SuperVersion stored in thread local storage. If it does not exist,
324 // get a reference from a current SuperVersion.
325 SuperVersion
* GetThreadLocalSuperVersion(InstrumentedMutex
* db_mutex
);
326 // Try to return SuperVersion back to thread local storage. Return true on
327 // success and false on failure. It fails when the thread local storage
328 // contains anything other than SuperVersion::kSVInUse flag.
329 bool ReturnThreadLocalSuperVersion(SuperVersion
* sv
);
// Returns the current value of the atomic super_version_number_ counter.
// load() is called without an explicit memory order, so the std::atomic
// default (sequentially consistent) ordering applies.
331 uint64_t GetSuperVersionNumber() const {
332 return super_version_number_
.load();
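334 // Will return a pointer to the previous SuperVersion if its reference
335 // count is zero and it needs deletion, or nullptr if not.
336 // As argument takes a pointer to allocated SuperVersion to enable
337 // the clients to allocate SuperVersion outside of mutex.
// NOTE(review): both overloads declared below return void and take a
// SuperVersionContext*, so the return-value description above looks stale --
// confirm against the implementation.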
338 // IMPORTANT: Only call this from DBImpl::InstallSuperVersion()
339 void InstallSuperVersion(SuperVersionContext
* sv_context
,
340 InstrumentedMutex
* db_mutex
,
341 const MutableCFOptions
& mutable_cf_options
);
342 void InstallSuperVersion(SuperVersionContext
* sv_context
,
343 InstrumentedMutex
* db_mutex
);
345 void ResetThreadLocalSuperVersions();
347 // Protected by DB mutex
348 void set_queued_for_flush(bool value
) { queued_for_flush_
= value
; }
349 void set_queued_for_compaction(bool value
) { queued_for_compaction_
= value
; }
350 bool queued_for_flush() { return queued_for_flush_
; }
351 bool queued_for_compaction() { return queued_for_compaction_
; }
353 enum class WriteStallCause
{
357 kPendingCompactionBytes
,
359 static std::pair
<WriteStallCondition
, WriteStallCause
>
360 GetWriteStallConditionAndCause(int num_unflushed_memtables
, int num_l0_files
,
361 uint64_t num_compaction_needed_bytes
,
362 const MutableCFOptions
& mutable_cf_options
);
364 // Recalculate some small conditions, which are changed only during
365 // compaction, adding new memtable and/or
366 // recalculation of compaction score. These values are used in
367 // DBImpl::MakeRoomForWrite function to decide, if it need to make
369 WriteStallCondition
RecalculateWriteStallConditions(
370 const MutableCFOptions
& mutable_cf_options
);
// Sets the atomic initialized_ flag to true (default memory ordering).
372 void set_initialized() { initialized_
.store(true); }
// Reads the atomic initialized_ flag (default memory ordering).
374 bool initialized() const { return initialized_
.load(); }
376 const ColumnFamilyOptions
& initial_cf_options() {
377 return initial_cf_options_
;
380 Env::WriteLifeTimeHint
CalculateSSTWriteHint(int level
);
382 Status
AddDirectories();
384 Directory
* GetDataDir(size_t path_id
) const;
386 ThreadLocalPtr
* TEST_GetLocalSV() { return local_sv_
.get(); }
389 friend class ColumnFamilySet
;
390 ColumnFamilyData(uint32_t id
, const std::string
& name
,
391 Version
* dummy_versions
, Cache
* table_cache
,
392 WriteBufferManager
* write_buffer_manager
,
393 const ColumnFamilyOptions
& options
,
394 const ImmutableDBOptions
& db_options
,
395 const EnvOptions
& env_options
,
396 ColumnFamilySet
* column_family_set
);
399 const std::string name_
;
400 Version
* dummy_versions_
; // Head of circular doubly-linked list of versions.
401 Version
* current_
; // == dummy_versions->prev_
403 std::atomic
<int> refs_
; // outstanding references to ColumnFamilyData
404 std::atomic
<bool> initialized_
;
405 std::atomic
<bool> dropped_
; // true if client dropped it
407 const InternalKeyComparator internal_comparator_
;
408 std::vector
<std::unique_ptr
<IntTblPropCollectorFactory
>>
409 int_tbl_prop_collector_factories_
;
411 const ColumnFamilyOptions initial_cf_options_
;
412 const ImmutableCFOptions ioptions_
;
413 MutableCFOptions mutable_cf_options_
;
415 const bool is_delete_range_supported_
;
417 std::unique_ptr
<TableCache
> table_cache_
;
419 std::unique_ptr
<InternalStats
> internal_stats_
;
421 WriteBufferManager
* write_buffer_manager_
;
425 SuperVersion
* super_version_
;
427 // An ordinal representing the current SuperVersion. Updated by
428 // InstallSuperVersion(), i.e. incremented every time super_version_
430 std::atomic
<uint64_t> super_version_number_
;
432 // Thread's local copy of SuperVersion pointer
433 // This needs to be destructed before mutex_
434 std::unique_ptr
<ThreadLocalPtr
> local_sv_
;
436 // pointers for a circular linked list. we use it to support iterations over
437 // all column families that are alive (note: dropped column families can also
438 // be alive as long as client holds a reference)
439 ColumnFamilyData
* next_
;
440 ColumnFamilyData
* prev_
;
442 // This is the earliest log file number that contains data from this
443 // Column Family. All earlier log files must be ignored and not
445 uint64_t log_number_
;
447 std::atomic
<FlushReason
> flush_reason_
;
449 // An object that keeps all the compaction stats
450 // and picks the next compaction
451 std::unique_ptr
<CompactionPicker
> compaction_picker_
;
453 ColumnFamilySet
* column_family_set_
;
455 std::unique_ptr
<WriteControllerToken
> write_controller_token_
;
457 // If true --> this ColumnFamily is currently present in DBImpl::flush_queue_
458 bool queued_for_flush_
;
460 // If true --> this ColumnFamily is currently present in
461 // DBImpl::compaction_queue_
462 bool queued_for_compaction_
;
464 uint64_t prev_compaction_needed_bytes_
;
466 // if the database was opened with 2pc enabled
469 // Memtable id to track flush.
470 std::atomic
<uint64_t> last_memtable_id_
;
472 // Directories corresponding to cf_paths.
473 std::vector
<std::unique_ptr
<Directory
>> data_dirs_
;
476 // ColumnFamilySet has interesting thread-safety requirements
477 // * CreateColumnFamily() or RemoveColumnFamily() -- need to be protected by DB
478 // mutex AND executed in the write thread.
479 // CreateColumnFamily() should ONLY be called from VersionSet::LogAndApply() AND
480 // single-threaded write thread. It is also called during Recovery and in
482 // RemoveColumnFamily() is only called from SetDropped(). DB mutex needs to be
483 // held and it needs to be executed from the write thread. SetDropped() also
484 // guarantees that it will be called only from single-threaded LogAndApply(),
485 // but this condition is not that important.
486 // * Iteration -- hold DB mutex, but you can release it in the body of
487 // iteration. If you release DB mutex in body, reference the column
488 // family before the mutex and unreference after you unlock, since the column
489 // family might get dropped when the DB mutex is released
490 // * GetDefault() -- thread safe
491 // * GetColumnFamily() -- either inside of DB mutex or from a write thread
492 // * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(),
493 // NumberOfColumnFamilies -- inside of DB mutex
494 class ColumnFamilySet
{
496 // ColumnFamilySet supports iteration
499 explicit iterator(ColumnFamilyData
* cfd
)
// Pre-increment: moves current_ along the next_ links, stepping past every
// node whose refs_ count reads as zero.
501 iterator
& operator++() {
502 // dropped column families might still be included in this iteration
503 // (we're only removing them when client drops the last reference to the
// column family).
505 // dummy is never dead, so this will never be infinite
507 current_
= current_
->next_
;
// Keep advancing while the node just reached has no outstanding references.
508 } while (current_
->refs_
.load(std::memory_order_relaxed
) == 0);
511 bool operator!=(const iterator
& other
) {
512 return this->current_
!= other
.current_
;
514 ColumnFamilyData
* operator*() { return current_
; }
517 ColumnFamilyData
* current_
;
520 ColumnFamilySet(const std::string
& dbname
,
521 const ImmutableDBOptions
* db_options
,
522 const EnvOptions
& env_options
, Cache
* table_cache
,
523 WriteBufferManager
* write_buffer_manager
,
524 WriteController
* write_controller
);
527 ColumnFamilyData
* GetDefault() const;
528 // GetColumnFamily() calls return nullptr if column family is not found
529 ColumnFamilyData
* GetColumnFamily(uint32_t id
) const;
530 ColumnFamilyData
* GetColumnFamily(const std::string
& name
) const;
531 // this call will return the next available column family ID. it guarantees
532 // that there is no column family with id greater than or equal to the
533 // returned value in the current running instance or anytime in RocksDB
535 uint32_t GetNextColumnFamilyID();
536 uint32_t GetMaxColumnFamily();
537 void UpdateMaxColumnFamily(uint32_t new_max_column_family
);
538 size_t NumberOfColumnFamilies() const;
540 ColumnFamilyData
* CreateColumnFamily(const std::string
& name
, uint32_t id
,
541 Version
* dummy_version
,
542 const ColumnFamilyOptions
& options
);
// Iteration starts at the node that follows dummy_cfd_ in the list.
544 iterator
begin() { return iterator(dummy_cfd_
->next_
); }
// dummy_cfd_ itself is the end marker: iteration is finished once an iterator
// compares equal to one positioned on dummy_cfd_.
545 iterator
end() { return iterator(dummy_cfd_
); }
547 // REQUIRES: DB mutex held
548 // Don't call while iterating over ColumnFamilySet
549 void FreeDeadColumnFamilies();
551 Cache
* get_table_cache() { return table_cache_
; }
554 friend class ColumnFamilyData
;
555 // helper function that gets called from cfd destructor
556 // REQUIRES: DB mutex held
557 void RemoveColumnFamily(ColumnFamilyData
* cfd
);
559 // column_families_ and column_family_data_ need to be protected:
560 // * when mutating both conditions have to be satisfied:
561 // 1. DB mutex locked
562 // 2. thread currently in single-threaded write thread
563 // * when reading, at least one condition needs to be satisfied:
564 // 1. DB mutex locked
565 // 2. accessed from a single-threaded write thread
566 std::unordered_map
<std::string
, uint32_t> column_families_
;
567 std::unordered_map
<uint32_t, ColumnFamilyData
*> column_family_data_
;
569 uint32_t max_column_family_
;
570 ColumnFamilyData
* dummy_cfd_
;
571 // We don't hold the refcount here, since default column family always exists
572 // We are also not responsible for cleaning up default_cfd_cache_. This is
573 // just a cache that makes common case (accessing default column family)
575 ColumnFamilyData
* default_cfd_cache_
;
577 const std::string db_name_
;
578 const ImmutableDBOptions
* const db_options_
;
579 const EnvOptions env_options_
;
581 WriteBufferManager
* write_buffer_manager_
;
582 WriteController
* write_controller_
;
585 // We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access
586 // memtables of different column families (specified by ID in the write batch)
587 class ColumnFamilyMemTablesImpl
: public ColumnFamilyMemTables
{
// Stores the ColumnFamilySet pointer. current_ starts out null.
589 explicit ColumnFamilyMemTablesImpl(ColumnFamilySet
* column_family_set
)
590 : column_family_set_(column_family_set
), current_(nullptr) {}
592 // Constructs a ColumnFamilyMemTablesImpl equivalent to one constructed
593 // with the arguments used to construct *orig.
// Only column_family_set_ is taken from *orig; current_ is reset to null
// rather than copied.
594 explicit ColumnFamilyMemTablesImpl(ColumnFamilyMemTablesImpl
* orig
)
595 : column_family_set_(orig
->column_family_set_
), current_(nullptr) {}
597 // sets current_ to ColumnFamilyData with column_family_id
598 // returns false if column family doesn't exist
599 // REQUIRES: use this function of DBImpl::column_family_memtables_ should be
600 // under a DB mutex OR from a write thread
601 bool Seek(uint32_t column_family_id
) override
;
603 // Returns log number of the selected column family
604 // REQUIRES: under a DB mutex OR from a write thread
605 uint64_t GetLogNumber() const override
;
607 // REQUIRES: Seek() called first
608 // REQUIRES: use this function of DBImpl::column_family_memtables_ should be
609 // under a DB mutex OR from a write thread
610 virtual MemTable
* GetMemTable() const override
;
612 // Returns column family handle for the selected column family
613 // REQUIRES: use this function of DBImpl::column_family_memtables_ should be
614 // under a DB mutex OR from a write thread
615 virtual ColumnFamilyHandle
* GetColumnFamilyHandle() override
;
617 // Cannot be called while another thread is calling Seek().
618 // REQUIRES: use this function of DBImpl::column_family_memtables_ should be
619 // under a DB mutex OR from a write thread
620 virtual ColumnFamilyData
* current() override
{ return current_
; }
623 ColumnFamilySet
* column_family_set_
;
624 ColumnFamilyData
* current_
;
625 ColumnFamilyHandleInternal handle_
;
628 extern uint32_t GetColumnFamilyID(ColumnFamilyHandle
* column_family
);
630 extern const Comparator
* GetColumnFamilyUserComparator(
631 ColumnFamilyHandle
* column_family
);
633 } // namespace rocksdb