]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/db/column_family.h
update source to Ceph Pacific 16.2.2
[ceph.git] / ceph / src / rocksdb / db / column_family.h
CommitLineData
7c673cae 1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5//
6// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7// Use of this source code is governed by a BSD-style license that can be
8// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10#pragma once
11
12#include <unordered_map>
13#include <string>
14#include <vector>
15#include <atomic>
16
17#include "db/memtable_list.h"
18#include "db/table_cache.h"
19#include "db/table_properties_collector.h"
20#include "db/write_batch_internal.h"
21#include "db/write_controller.h"
22#include "options/cf_options.h"
23#include "rocksdb/compaction_job_stats.h"
24#include "rocksdb/db.h"
25#include "rocksdb/env.h"
26#include "rocksdb/options.h"
f67539c2 27#include "trace_replay/block_cache_tracer.h"
7c673cae
FG
28#include "util/thread_local.h"
29
f67539c2 30namespace ROCKSDB_NAMESPACE {
7c673cae
FG
31
32class Version;
33class VersionSet;
11fdf7f2 34class VersionStorageInfo;
7c673cae
FG
35class MemTable;
36class MemTableListVersion;
37class CompactionPicker;
38class Compaction;
39class InternalKey;
40class InternalStats;
41class ColumnFamilyData;
42class DBImpl;
43class LogBuffer;
44class InstrumentedMutex;
45class InstrumentedMutexLock;
11fdf7f2 46struct SuperVersionContext;
7c673cae
FG
47
48extern const double kIncSlowdownRatio;
f67539c2
TL
49// This file contains a list of data structures for managing column family
50// level metadata.
51//
52// The basic relationships among classes declared here are illustrated as
53// following:
54//
55// +----------------------+ +----------------------+ +--------+
56// +---+ ColumnFamilyHandle 1 | +--+ ColumnFamilyHandle 2 | | DBImpl |
57// | +----------------------+ | +----------------------+ +----+---+
58// | +--------------------------+ |
59// | | +-----------------------------+
60// | | |
61// | | +-----------------------------v-------------------------------+
62// | | | |
63// | | | ColumnFamilySet |
64// | | | |
65// | | +-------------+--------------------------+----------------+---+
66// | | | | |
67// | +-------------------------------------+ | |
68// | | | | v
69// | +-------------v-------------+ +-----v----v---------+
70// | | | | |
71// | | ColumnFamilyData 1 | | ColumnFamilyData 2 | ......
72// | | | | |
73// +---> | | |
74// | +---------+ | |
75// | | MemTable| | |
76// | | List | | |
77// +--------+---+--+-+----+----+ +--------------------++
78// | | | |
79// | | | |
80// | | | +-----------------------+
81// | | +-----------+ |
82// v +--------+ | |
83// +--------+--------+ | | |
84// | | | | +----------v----------+
85// +---> |SuperVersion 1.a +-----------------> |
86// | +------+ | | MemTableListVersion |
87// +---+-------------+ | | | | |
88// | | | | +----+------------+---+
89// | current | | | | |
90// | +-------------+ | |mem | |
91// | | | | | |
92// +-v---v-------+ +---v--v---+ +-----v----+ +----v-----+
93// | | | | | | | |
94// | Version 1.a | | memtable | | memtable | | memtable |
95// | | | 1.a | | 1.b | | 1.c |
96// +-------------+ | | | | | |
97// +----------+ +----------+ +----------+
98//
99// DBImpl keeps a ColumnFamilySet, which references all column families by
100// pointing to the respective ColumnFamilyData object of each column family.
101// This is how DBImpl can list and operate on all the column families.
102// ColumnFamilyHandle also points to ColumnFamilyData directly, so that
103// when a user executes a query, it can directly find memtables and Version
104// as well as SuperVersion to the column family, without going through
105// ColumnFamilySet.
106//
107// ColumnFamilySet points to the latest view of the LSM-tree (list of memtables
108// and SST files) indirectly, while ongoing operations may hold references
109// to a current or an out-of-date SuperVersion, which in turn points to a
110// point-in-time view of the LSM-tree. This guarantees the memtables and SST
111// files being operated on will not go away, until the SuperVersion is
112// unreferenced to 0 and destroyed.
113//
114// The following graph illustrates one possible set of referencing relationships:
115//
116// Column +--------------+ current +-----------+
117// Family +---->+ +------------------->+ |
118// Data | SuperVersion +----------+ | Version A |
119// | 3 | imm | | |
120// Iter2 +----->+ | +-------v------+ +-----------+
121// +-----+--------+ | MemtableList +----------------> Empty
122// | | Version r | +-----------+
123// | +--------------+ | |
124// +------------------+ current| Version B |
125// +--------------+ | +----->+ |
126// | | | | +-----+-----+
127// Compaction +>+ SuperVersion +-------------+ ^
128// Job | 2 +------+ | |current
129// | +----+ | | mem | +------------+
130// +--------------+ | | +---------------------> |
131// | +------------------------> MemTable a |
132// | mem | | |
133// +--------------+ | | +------------+
134// | +--------------------------+
135// Iter1 +-----> SuperVersion | | +------------+
136// | 1 +------------------------------>+ |
137// | +-+ | mem | MemTable b |
138// +--------------+ | | | |
139// | | +--------------+ +-----^------+
140// | |imm | MemtableList | |
141// | +--->+ Version s +------------+
142// | +--------------+
143// | +--------------+
144// | | MemtableList |
145// +------>+ Version t +--------> Empty
146// imm +--------------+
147//
148// In this example, even if the current LSM-tree consists of Version A and
149// memtable a, which is also referenced by SuperVersion, two older SuperVersions,
150// SuperVersion2 and SuperVersion1, still exist, and are referenced by a
151// compaction job and an old iterator Iter1, respectively. SuperVersion2
152// contains Version B, memtable a and memtable b; SuperVersion1 contains
153// Version B and memtable b (mutable). As a result, Version B and memtable b
154// are prevented from being destroyed or deleted.
7c673cae
FG
155
156// ColumnFamilyHandleImpl is the class that clients use to access different
157// column families. It has non-trivial destructor, which gets called when client
158// is done using the column family
159class ColumnFamilyHandleImpl : public ColumnFamilyHandle {
160 public:
161 // create while holding the mutex
162 ColumnFamilyHandleImpl(
163 ColumnFamilyData* cfd, DBImpl* db, InstrumentedMutex* mutex);
164 // destroy without mutex
165 virtual ~ColumnFamilyHandleImpl();
166 virtual ColumnFamilyData* cfd() const { return cfd_; }
167
168 virtual uint32_t GetID() const override;
169 virtual const std::string& GetName() const override;
170 virtual Status GetDescriptor(ColumnFamilyDescriptor* desc) override;
171 virtual const Comparator* GetComparator() const override;
172
173 private:
174 ColumnFamilyData* cfd_;
175 DBImpl* db_;
176 InstrumentedMutex* mutex_;
177};
178
179// Does not ref-count ColumnFamilyData
180// We use this dummy ColumnFamilyHandleImpl because sometimes MemTableInserter
181// calls DBImpl methods. When this happens, MemTableInserter need access to
182// ColumnFamilyHandle (same as the client would need). In that case, we feed
183// MemTableInserter dummy ColumnFamilyHandle and enable it to call DBImpl
184// methods
185class ColumnFamilyHandleInternal : public ColumnFamilyHandleImpl {
186 public:
187 ColumnFamilyHandleInternal()
11fdf7f2 188 : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), internal_cfd_(nullptr) {}
7c673cae
FG
189
190 void SetCFD(ColumnFamilyData* _cfd) { internal_cfd_ = _cfd; }
191 virtual ColumnFamilyData* cfd() const override { return internal_cfd_; }
192
193 private:
194 ColumnFamilyData* internal_cfd_;
195};
196
197// holds references to memtable, all immutable memtables and version
198struct SuperVersion {
199 // Accessing members of this class is not thread-safe and requires external
200 // synchronization (ie db mutex held or on write thread).
f67539c2 201 ColumnFamilyData* cfd;
7c673cae
FG
202 MemTable* mem;
203 MemTableListVersion* imm;
204 Version* current;
205 MutableCFOptions mutable_cf_options;
206 // Version number of the current SuperVersion
207 uint64_t version_number;
11fdf7f2 208 WriteStallCondition write_stall_condition;
7c673cae
FG
209
210 InstrumentedMutex* db_mutex;
211
212 // should be called outside the mutex
213 SuperVersion() = default;
214 ~SuperVersion();
215 SuperVersion* Ref();
216 // If Unref() returns true, Cleanup() should be called with mutex held
217 // before deleting this SuperVersion.
218 bool Unref();
219
220 // call these two methods with db mutex held
221 // Cleanup unrefs mem, imm and current. Also, it stores all memtables
222 // that needs to be deleted in to_delete vector. Unrefing those
223 // objects needs to be done in the mutex
224 void Cleanup();
f67539c2
TL
225 void Init(ColumnFamilyData* new_cfd, MemTable* new_mem,
226 MemTableListVersion* new_imm, Version* new_current);
7c673cae
FG
227
228 // The value of dummy is not actually used. kSVInUse takes its address as a
229 // mark in the thread local storage to indicate the SuperVersion is in use
230 // by thread. This way, the value of kSVInUse is guaranteed to have no
231 // conflict with SuperVersion object address and portable on different
232 // platform.
233 static int dummy;
234 static void* const kSVInUse;
235 static void* const kSVObsolete;
236
237 private:
238 std::atomic<uint32_t> refs;
239 // We need to_delete because during Cleanup(), imm->Unref() returns
240 // all memtables that we need to free through this vector. We then
241 // delete all those memtables outside of mutex, during destruction
242 autovector<MemTable*> to_delete;
243};
244
245extern Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options);
246
247extern Status CheckConcurrentWritesSupported(
248 const ColumnFamilyOptions& cf_options);
249
11fdf7f2
TL
250extern Status CheckCFPathsSupported(const DBOptions& db_options,
251 const ColumnFamilyOptions& cf_options);
252
7c673cae
FG
253extern ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
254 const ColumnFamilyOptions& src);
255// Wrap user defined table properties collector factories from `ioptions`
256// into internal ones in int_tbl_prop_collector_factories. Add a system internal
257// one too.
258extern void GetIntTblPropCollectorFactory(
259 const ImmutableCFOptions& ioptions,
260 std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
261 int_tbl_prop_collector_factories);
262
263class ColumnFamilySet;
264
265// This class keeps all the data that a column family needs.
266// Most methods require DB mutex held, unless otherwise noted
267class ColumnFamilyData {
268 public:
269 ~ColumnFamilyData();
270
271 // thread-safe
272 uint32_t GetID() const { return id_; }
273 // thread-safe
274 const std::string& GetName() const { return name_; }
275
276 // Ref() can only be called from a context where the caller can guarantee
277 // that ColumnFamilyData is alive (while holding a non-zero ref already,
278 // holding a DB mutex, or as the leader in a write batch group).
f67539c2 279 void Ref() { refs_.fetch_add(1); }
7c673cae
FG
280
281 // Unref decreases the reference count, but does not handle deletion
282 // when the count goes to 0. If this method returns true then the
283 // caller should delete the instance immediately, or later, by calling
284 // FreeDeadColumnFamilies(). Unref() can only be called while holding
285 // a DB mutex, or during single-threaded recovery.
286 bool Unref() {
f67539c2 287 int old_refs = refs_.fetch_sub(1);
7c673cae
FG
288 assert(old_refs > 0);
289 return old_refs == 1;
290 }
291
f67539c2
TL
292 // UnrefAndTryDelete() decreases the reference count and do free if needed,
293 // return true if this is freed else false, UnrefAndTryDelete() can only
294 // be called while holding a DB mutex, or during single-threaded recovery.
295 bool UnrefAndTryDelete();
296
7c673cae
FG
297 // SetDropped() can only be called under following conditions:
298 // 1) Holding a DB mutex,
299 // 2) from single-threaded write thread, AND
300 // 3) from single-threaded VersionSet::LogAndApply()
301 // After dropping column family no other operation on that column family
302 // will be executed. All the files and memory will be, however, kept around
303 // until client drops the column family handle. That way, client can still
304 // access data from dropped column family.
305 // Column family can be dropped and still alive. In that state:
306 // *) Compaction and flush is not executed on the dropped column family.
307 // *) Client can continue reading from column family. Writes will fail unless
308 // WriteOptions::ignore_missing_column_families is true
309 // When the dropped column family is unreferenced, then we:
310 // *) Remove column family from the linked list maintained by ColumnFamilySet
311 // *) delete all memory associated with that column family
312 // *) delete all the files associated with that column family
313 void SetDropped();
11fdf7f2 314 bool IsDropped() const { return dropped_.load(std::memory_order_relaxed); }
7c673cae
FG
315
316 // thread-safe
317 int NumberLevels() const { return ioptions_.num_levels; }
318
319 void SetLogNumber(uint64_t log_number) { log_number_ = log_number; }
320 uint64_t GetLogNumber() const { return log_number_; }
321
11fdf7f2
TL
322 void SetFlushReason(FlushReason flush_reason) {
323 flush_reason_ = flush_reason;
324 }
325 FlushReason GetFlushReason() const { return flush_reason_; }
7c673cae 326 // thread-safe
f67539c2 327 const FileOptions* soptions() const;
7c673cae
FG
328 const ImmutableCFOptions* ioptions() const { return &ioptions_; }
329 // REQUIRES: DB mutex held
330 // This returns the MutableCFOptions used by current SuperVersion
331 // You should use this API to reference MutableCFOptions most of the time.
332 const MutableCFOptions* GetCurrentMutableCFOptions() const {
333 return &(super_version_->mutable_cf_options);
334 }
335 // REQUIRES: DB mutex held
336 // This returns the latest MutableCFOptions, which may be not in effect yet.
337 const MutableCFOptions* GetLatestMutableCFOptions() const {
338 return &mutable_cf_options_;
339 }
340
341 // REQUIRES: DB mutex held
342 // Build ColumnFamiliesOptions with immutable options and latest mutable
343 // options.
344 ColumnFamilyOptions GetLatestCFOptions() const;
345
346 bool is_delete_range_supported() { return is_delete_range_supported_; }
347
f67539c2
TL
348 // Validate CF options against DB options
349 static Status ValidateOptions(const DBOptions& db_options,
350 const ColumnFamilyOptions& cf_options);
7c673cae
FG
351#ifndef ROCKSDB_LITE
352 // REQUIRES: DB mutex held
353 Status SetOptions(
f67539c2 354 const DBOptions& db_options,
7c673cae
FG
355 const std::unordered_map<std::string, std::string>& options_map);
356#endif // ROCKSDB_LITE
357
358 InternalStats* internal_stats() { return internal_stats_.get(); }
359
360 MemTableList* imm() { return &imm_; }
361 MemTable* mem() { return mem_; }
362 Version* current() { return current_; }
363 Version* dummy_versions() { return dummy_versions_; }
364 void SetCurrent(Version* _current);
365 uint64_t GetNumLiveVersions() const; // REQUIRE: DB mutex held
366 uint64_t GetTotalSstFilesSize() const; // REQUIRE: DB mutex held
11fdf7f2
TL
367 uint64_t GetLiveSstFilesSize() const; // REQUIRE: DB mutex held
368 void SetMemtable(MemTable* new_mem) {
369 uint64_t memtable_id = last_memtable_id_.fetch_add(1) + 1;
370 new_mem->SetID(memtable_id);
371 mem_ = new_mem;
372 }
7c673cae
FG
373
374 // calculate the oldest log needed for the durability of this column family
375 uint64_t OldestLogToKeep();
376
377 // See Memtable constructor for explanation of earliest_seq param.
378 MemTable* ConstructNewMemtable(const MutableCFOptions& mutable_cf_options,
379 SequenceNumber earliest_seq);
380 void CreateNewMemtable(const MutableCFOptions& mutable_cf_options,
381 SequenceNumber earliest_seq);
382
383 TableCache* table_cache() const { return table_cache_.get(); }
384
385 // See documentation in compaction_picker.h
386 // REQUIRES: DB mutex held
387 bool NeedsCompaction() const;
388 // REQUIRES: DB mutex held
389 Compaction* PickCompaction(const MutableCFOptions& mutable_options,
390 LogBuffer* log_buffer);
391
392 // Check if the passed range overlap with any running compactions.
393 // REQUIRES: DB mutex held
394 bool RangeOverlapWithCompaction(const Slice& smallest_user_key,
395 const Slice& largest_user_key,
396 int level) const;
397
11fdf7f2
TL
398 // Check if the passed ranges overlap with any unflushed memtables
399 // (immutable or mutable).
400 //
401 // @param super_version A referenced SuperVersion that will be held for the
402 // duration of this function.
403 //
404 // Thread-safe
405 Status RangesOverlapWithMemtables(const autovector<Range>& ranges,
406 SuperVersion* super_version, bool* overlap);
407
7c673cae 408 // A flag to tell a manual compaction is to compact all levels together
11fdf7f2 409 // instead of a specific level.
7c673cae
FG
410 static const int kCompactAllLevels;
411 // A flag to tell a manual compaction's output is base level.
412 static const int kCompactToBaseLevel;
413 // REQUIRES: DB mutex held
414 Compaction* CompactRange(const MutableCFOptions& mutable_cf_options,
415 int input_level, int output_level,
f67539c2 416 const CompactRangeOptions& compact_range_options,
11fdf7f2 417 const InternalKey* begin, const InternalKey* end,
f67539c2
TL
418 InternalKey** compaction_end, bool* manual_conflict,
419 uint64_t max_file_num_to_ignore);
7c673cae
FG
420
421 CompactionPicker* compaction_picker() { return compaction_picker_.get(); }
422 // thread-safe
423 const Comparator* user_comparator() const {
424 return internal_comparator_.user_comparator();
425 }
426 // thread-safe
427 const InternalKeyComparator& internal_comparator() const {
428 return internal_comparator_;
429 }
430
431 const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
432 int_tbl_prop_collector_factories() const {
433 return &int_tbl_prop_collector_factories_;
434 }
435
436 SuperVersion* GetSuperVersion() { return super_version_; }
437 // thread-safe
438 // Return a already referenced SuperVersion to be used safely.
f67539c2 439 SuperVersion* GetReferencedSuperVersion(DBImpl* db);
7c673cae
FG
440 // thread-safe
441 // Get SuperVersion stored in thread local storage. If it does not exist,
442 // get a reference from a current SuperVersion.
f67539c2 443 SuperVersion* GetThreadLocalSuperVersion(DBImpl* db);
7c673cae
FG
444 // Try to return SuperVersion back to thread local storage. Retrun true on
445 // success and false on failure. It fails when the thread local storage
446 // contains anything other than SuperVersion::kSVInUse flag.
447 bool ReturnThreadLocalSuperVersion(SuperVersion* sv);
448 // thread-safe
449 uint64_t GetSuperVersionNumber() const {
450 return super_version_number_.load();
451 }
452 // will return a pointer to SuperVersion* if previous SuperVersion
453 // if its reference count is zero and needs deletion or nullptr if not
454 // As argument takes a pointer to allocated SuperVersion to enable
455 // the clients to allocate SuperVersion outside of mutex.
456 // IMPORTANT: Only call this from DBImpl::InstallSuperVersion()
11fdf7f2
TL
457 void InstallSuperVersion(SuperVersionContext* sv_context,
458 InstrumentedMutex* db_mutex,
459 const MutableCFOptions& mutable_cf_options);
460 void InstallSuperVersion(SuperVersionContext* sv_context,
461 InstrumentedMutex* db_mutex);
7c673cae
FG
462
463 void ResetThreadLocalSuperVersions();
464
465 // Protected by DB mutex
11fdf7f2
TL
466 void set_queued_for_flush(bool value) { queued_for_flush_ = value; }
467 void set_queued_for_compaction(bool value) { queued_for_compaction_ = value; }
468 bool queued_for_flush() { return queued_for_flush_; }
469 bool queued_for_compaction() { return queued_for_compaction_; }
470
471 enum class WriteStallCause {
472 kNone,
473 kMemtableLimit,
474 kL0FileCountLimit,
475 kPendingCompactionBytes,
476 };
477 static std::pair<WriteStallCondition, WriteStallCause>
478 GetWriteStallConditionAndCause(int num_unflushed_memtables, int num_l0_files,
479 uint64_t num_compaction_needed_bytes,
480 const MutableCFOptions& mutable_cf_options);
7c673cae
FG
481
482 // Recalculate some small conditions, which are changed only during
483 // compaction, adding new memtable and/or
484 // recalculation of compaction score. These values are used in
485 // DBImpl::MakeRoomForWrite function to decide, if it need to make
486 // a write stall
11fdf7f2 487 WriteStallCondition RecalculateWriteStallConditions(
7c673cae
FG
488 const MutableCFOptions& mutable_cf_options);
489
11fdf7f2
TL
490 void set_initialized() { initialized_.store(true); }
491
492 bool initialized() const { return initialized_.load(); }
493
494 const ColumnFamilyOptions& initial_cf_options() {
495 return initial_cf_options_;
496 }
497
498 Env::WriteLifeTimeHint CalculateSSTWriteHint(int level);
499
f67539c2
TL
500 // created_dirs remembers directory created, so that we don't need to call
501 // the same data creation operation again.
502 Status AddDirectories(
503 std::map<std::string, std::shared_ptr<Directory>>* created_dirs);
11fdf7f2
TL
504
505 Directory* GetDataDir(size_t path_id) const;
506
494da23a
TL
507 ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); }
508
7c673cae
FG
509 private:
510 friend class ColumnFamilySet;
511 ColumnFamilyData(uint32_t id, const std::string& name,
512 Version* dummy_versions, Cache* table_cache,
513 WriteBufferManager* write_buffer_manager,
514 const ColumnFamilyOptions& options,
515 const ImmutableDBOptions& db_options,
f67539c2
TL
516 const FileOptions& file_options,
517 ColumnFamilySet* column_family_set,
518 BlockCacheTracer* const block_cache_tracer);
7c673cae
FG
519
520 uint32_t id_;
521 const std::string name_;
522 Version* dummy_versions_; // Head of circular doubly-linked list of versions.
523 Version* current_; // == dummy_versions->prev_
524
525 std::atomic<int> refs_; // outstanding references to ColumnFamilyData
11fdf7f2
TL
526 std::atomic<bool> initialized_;
527 std::atomic<bool> dropped_; // true if client dropped it
7c673cae
FG
528
529 const InternalKeyComparator internal_comparator_;
530 std::vector<std::unique_ptr<IntTblPropCollectorFactory>>
531 int_tbl_prop_collector_factories_;
532
533 const ColumnFamilyOptions initial_cf_options_;
534 const ImmutableCFOptions ioptions_;
535 MutableCFOptions mutable_cf_options_;
536
537 const bool is_delete_range_supported_;
538
539 std::unique_ptr<TableCache> table_cache_;
540
541 std::unique_ptr<InternalStats> internal_stats_;
542
543 WriteBufferManager* write_buffer_manager_;
544
545 MemTable* mem_;
546 MemTableList imm_;
547 SuperVersion* super_version_;
548
549 // An ordinal representing the current SuperVersion. Updated by
550 // InstallSuperVersion(), i.e. incremented every time super_version_
551 // changes.
552 std::atomic<uint64_t> super_version_number_;
553
554 // Thread's local copy of SuperVersion pointer
555 // This needs to be destructed before mutex_
556 std::unique_ptr<ThreadLocalPtr> local_sv_;
557
558 // pointers for a circular linked list. we use it to support iterations over
559 // all column families that are alive (note: dropped column families can also
560 // be alive as long as client holds a reference)
561 ColumnFamilyData* next_;
562 ColumnFamilyData* prev_;
563
564 // This is the earliest log file number that contains data from this
565 // Column Family. All earlier log files must be ignored and not
566 // recovered from
567 uint64_t log_number_;
568
11fdf7f2
TL
569 std::atomic<FlushReason> flush_reason_;
570
7c673cae
FG
571 // An object that keeps all the compaction stats
572 // and picks the next compaction
573 std::unique_ptr<CompactionPicker> compaction_picker_;
574
575 ColumnFamilySet* column_family_set_;
576
577 std::unique_ptr<WriteControllerToken> write_controller_token_;
578
579 // If true --> this ColumnFamily is currently present in DBImpl::flush_queue_
11fdf7f2 580 bool queued_for_flush_;
7c673cae
FG
581
582 // If true --> this ColumnFamily is currently present in
583 // DBImpl::compaction_queue_
11fdf7f2 584 bool queued_for_compaction_;
7c673cae
FG
585
586 uint64_t prev_compaction_needed_bytes_;
587
588 // if the database was opened with 2pc enabled
589 bool allow_2pc_;
11fdf7f2
TL
590
591 // Memtable id to track flush.
592 std::atomic<uint64_t> last_memtable_id_;
593
594 // Directories corresponding to cf_paths.
f67539c2 595 std::vector<std::shared_ptr<Directory>> data_dirs_;
7c673cae
FG
596};
597
598// ColumnFamilySet has interesting thread-safety requirements
599// * CreateColumnFamily() or RemoveColumnFamily() -- need to be protected by DB
600// mutex AND executed in the write thread.
601// CreateColumnFamily() should ONLY be called from VersionSet::LogAndApply() AND
602// single-threaded write thread. It is also called during Recovery and in
603// DumpManifest().
604// RemoveColumnFamily() is only called from SetDropped(). DB mutex needs to be
605// held and it needs to be executed from the write thread. SetDropped() also
606// guarantees that it will be called only from single-threaded LogAndApply(),
607// but this condition is not that important.
608// * Iteration -- hold DB mutex, but you can release it in the body of
609// iteration. If you release DB mutex in body, reference the column
610// family before the mutex and unreference after you unlock, since the column
611// family might get dropped when the DB mutex is released
612// * GetDefault() -- thread safe
613// * GetColumnFamily() -- either inside of DB mutex or from a write thread
614// * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(),
615// NumberOfColumnFamilies -- inside of DB mutex
616class ColumnFamilySet {
617 public:
618 // ColumnFamilySet supports iteration
619 class iterator {
620 public:
621 explicit iterator(ColumnFamilyData* cfd)
622 : current_(cfd) {}
623 iterator& operator++() {
624 // dropped column families might still be included in this iteration
625 // (we're only removing them when client drops the last reference to the
626 // column family).
627 // dummy is never dead, so this will never be infinite
628 do {
629 current_ = current_->next_;
630 } while (current_->refs_.load(std::memory_order_relaxed) == 0);
631 return *this;
632 }
633 bool operator!=(const iterator& other) {
634 return this->current_ != other.current_;
635 }
636 ColumnFamilyData* operator*() { return current_; }
637
638 private:
639 ColumnFamilyData* current_;
640 };
641
642 ColumnFamilySet(const std::string& dbname,
643 const ImmutableDBOptions* db_options,
f67539c2 644 const FileOptions& file_options, Cache* table_cache,
7c673cae 645 WriteBufferManager* write_buffer_manager,
f67539c2
TL
646 WriteController* write_controller,
647 BlockCacheTracer* const block_cache_tracer);
7c673cae
FG
648 ~ColumnFamilySet();
649
650 ColumnFamilyData* GetDefault() const;
651 // GetColumnFamily() calls return nullptr if column family is not found
652 ColumnFamilyData* GetColumnFamily(uint32_t id) const;
653 ColumnFamilyData* GetColumnFamily(const std::string& name) const;
654 // this call will return the next available column family ID. it guarantees
655 // that there is no column family with id greater than or equal to the
656 // returned value in the current running instance or anytime in RocksDB
657 // instance history.
658 uint32_t GetNextColumnFamilyID();
659 uint32_t GetMaxColumnFamily();
660 void UpdateMaxColumnFamily(uint32_t new_max_column_family);
661 size_t NumberOfColumnFamilies() const;
662
663 ColumnFamilyData* CreateColumnFamily(const std::string& name, uint32_t id,
664 Version* dummy_version,
665 const ColumnFamilyOptions& options);
666
667 iterator begin() { return iterator(dummy_cfd_->next_); }
668 iterator end() { return iterator(dummy_cfd_); }
669
670 // REQUIRES: DB mutex held
671 // Don't call while iterating over ColumnFamilySet
672 void FreeDeadColumnFamilies();
673
674 Cache* get_table_cache() { return table_cache_; }
675
676 private:
677 friend class ColumnFamilyData;
678 // helper function that gets called from cfd destructor
679 // REQUIRES: DB mutex held
680 void RemoveColumnFamily(ColumnFamilyData* cfd);
681
682 // column_families_ and column_family_data_ need to be protected:
683 // * when mutating both conditions have to be satisfied:
684 // 1. DB mutex locked
685 // 2. thread currently in single-threaded write thread
686 // * when reading, at least one condition needs to be satisfied:
687 // 1. DB mutex locked
688 // 2. accessed from a single-threaded write thread
689 std::unordered_map<std::string, uint32_t> column_families_;
690 std::unordered_map<uint32_t, ColumnFamilyData*> column_family_data_;
691
692 uint32_t max_column_family_;
693 ColumnFamilyData* dummy_cfd_;
694 // We don't hold the refcount here, since default column family always exists
695 // We are also not responsible for cleaning up default_cfd_cache_. This is
696 // just a cache that makes common case (accessing default column family)
697 // faster
698 ColumnFamilyData* default_cfd_cache_;
699
700 const std::string db_name_;
701 const ImmutableDBOptions* const db_options_;
f67539c2 702 const FileOptions file_options_;
7c673cae
FG
703 Cache* table_cache_;
704 WriteBufferManager* write_buffer_manager_;
705 WriteController* write_controller_;
f67539c2 706 BlockCacheTracer* const block_cache_tracer_;
7c673cae
FG
707};
708
709// We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access
710// memtables of different column families (specified by ID in the write batch)
711class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables {
712 public:
713 explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set)
714 : column_family_set_(column_family_set), current_(nullptr) {}
715
716 // Constructs a ColumnFamilyMemTablesImpl equivalent to one constructed
717 // with the arguments used to construct *orig.
718 explicit ColumnFamilyMemTablesImpl(ColumnFamilyMemTablesImpl* orig)
719 : column_family_set_(orig->column_family_set_), current_(nullptr) {}
720
721 // sets current_ to ColumnFamilyData with column_family_id
722 // returns false if column family doesn't exist
723 // REQUIRES: use this function of DBImpl::column_family_memtables_ should be
724 // under a DB mutex OR from a write thread
725 bool Seek(uint32_t column_family_id) override;
726
727 // Returns log number of the selected column family
728 // REQUIRES: under a DB mutex OR from a write thread
729 uint64_t GetLogNumber() const override;
730
731 // REQUIRES: Seek() called first
732 // REQUIRES: use this function of DBImpl::column_family_memtables_ should be
733 // under a DB mutex OR from a write thread
734 virtual MemTable* GetMemTable() const override;
735
736 // Returns column family handle for the selected column family
737 // REQUIRES: use this function of DBImpl::column_family_memtables_ should be
738 // under a DB mutex OR from a write thread
739 virtual ColumnFamilyHandle* GetColumnFamilyHandle() override;
740
741 // Cannot be called while another thread is calling Seek().
742 // REQUIRES: use this function of DBImpl::column_family_memtables_ should be
743 // under a DB mutex OR from a write thread
744 virtual ColumnFamilyData* current() override { return current_; }
745
746 private:
747 ColumnFamilySet* column_family_set_;
748 ColumnFamilyData* current_;
749 ColumnFamilyHandleInternal handle_;
750};
751
752extern uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family);
753
754extern const Comparator* GetColumnFamilyUserComparator(
755 ColumnFamilyHandle* column_family);
756
f67539c2 757} // namespace ROCKSDB_NAMESPACE