]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/db/column_family.h
import quincy beta 17.1.0
[ceph.git] / ceph / src / rocksdb / db / column_family.h
CommitLineData
7c673cae 1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5//
6// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7// Use of this source code is governed by a BSD-style license that can be
8// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10#pragma once
11
12#include <unordered_map>
13#include <string>
14#include <vector>
15#include <atomic>
16
17#include "db/memtable_list.h"
18#include "db/table_cache.h"
19#include "db/table_properties_collector.h"
20#include "db/write_batch_internal.h"
21#include "db/write_controller.h"
22#include "options/cf_options.h"
23#include "rocksdb/compaction_job_stats.h"
24#include "rocksdb/db.h"
25#include "rocksdb/env.h"
26#include "rocksdb/options.h"
f67539c2 27#include "trace_replay/block_cache_tracer.h"
7c673cae
FG
28#include "util/thread_local.h"
29
f67539c2 30namespace ROCKSDB_NAMESPACE {
7c673cae
FG
31
32class Version;
33class VersionSet;
11fdf7f2 34class VersionStorageInfo;
7c673cae
FG
35class MemTable;
36class MemTableListVersion;
37class CompactionPicker;
38class Compaction;
39class InternalKey;
40class InternalStats;
41class ColumnFamilyData;
42class DBImpl;
43class LogBuffer;
44class InstrumentedMutex;
45class InstrumentedMutexLock;
11fdf7f2 46struct SuperVersionContext;
20effc67 47class BlobFileCache;
7c673cae
FG
48
49extern const double kIncSlowdownRatio;
f67539c2
TL
50// This file contains a list of data structures for managing column family
51// level metadata.
52//
53// The basic relationships among the classes declared here are illustrated as
54// follows:
55//
56// +----------------------+ +----------------------+ +--------+
57// +---+ ColumnFamilyHandle 1 | +--+ ColumnFamilyHandle 2 | | DBImpl |
58// | +----------------------+ | +----------------------+ +----+---+
59// | +--------------------------+ |
60// | | +-----------------------------+
61// | | |
62// | | +-----------------------------v-------------------------------+
63// | | | |
64// | | | ColumnFamilySet |
65// | | | |
66// | | +-------------+--------------------------+----------------+---+
67// | | | | |
68// | +-------------------------------------+ | |
69// | | | | v
70// | +-------------v-------------+ +-----v----v---------+
71// | | | | |
72// | | ColumnFamilyData 1 | | ColumnFamilyData 2 | ......
73// | | | | |
74// +---> | | |
75// | +---------+ | |
76// | | MemTable| | |
77// | | List | | |
78// +--------+---+--+-+----+----+ +--------------------++
79// | | | |
80// | | | |
81// | | | +-----------------------+
82// | | +-----------+ |
83// v +--------+ | |
84// +--------+--------+ | | |
85// | | | | +----------v----------+
86// +---> |SuperVersion 1.a +-----------------> |
87// | +------+ | | MemTableListVersion |
88// +---+-------------+ | | | | |
89// | | | | +----+------------+---+
90// | current | | | | |
91// | +-------------+ | |mem | |
92// | | | | | |
93// +-v---v-------+ +---v--v---+ +-----v----+ +----v-----+
94// | | | | | | | |
95// | Version 1.a | | memtable | | memtable | | memtable |
96// | | | 1.a | | 1.b | | 1.c |
97// +-------------+ | | | | | |
98// +----------+ +----------+ +----------+
99//
100// DBImpl keeps a ColumnFamilySet, which references all column families by
101// pointing to respective ColumnFamilyData object of each column family.
102// This is how DBImpl can list and operate on all the column families.
103// ColumnFamilyHandle also points to ColumnFamilyData directly, so that
104// when a user executes a query, it can directly find memtables and Version
105// as well as SuperVersion to the column family, without going through
106// ColumnFamilySet.
107//
108// ColumnFamilySet points to the latest view of the LSM-tree (list of memtables
109// and SST files) indirectly, while ongoing operations may hold references
110// to a current or an out-of-date SuperVersion, which in turn points to a
111// point-in-time view of the LSM-tree. This guarantees the memtables and SST
112// files being operated on will not go away until the SuperVersion's
113// reference count drops to 0 and it is destroyed.
114//
115// The following graph illustrates one possible set of referencing relationships:
116//
117// Column +--------------+ current +-----------+
118// Family +---->+ +------------------->+ |
119// Data | SuperVersion +----------+ | Version A |
120// | 3 | imm | | |
121// Iter2 +----->+ | +-------v------+ +-----------+
122// +-----+--------+ | MemtableList +----------------> Empty
123// | | Version r | +-----------+
124// | +--------------+ | |
125// +------------------+ current| Version B |
126// +--------------+ | +----->+ |
127// | | | | +-----+-----+
128// Compaction +>+ SuperVersion +-------------+ ^
129// Job | 2 +------+ | |current
130// | +----+ | | mem | +------------+
131// +--------------+ | | +---------------------> |
132// | +------------------------> MemTable a |
133// | mem | | |
134// +--------------+ | | +------------+
135// | +--------------------------+
136// Iter1 +-----> SuperVersion | | +------------+
137// | 1 +------------------------------>+ |
138// | +-+ | mem | MemTable b |
139// +--------------+ | | | |
140// | | +--------------+ +-----^------+
141// | |imm | MemtableList | |
142// | +--->+ Version s +------------+
143// | +--------------+
144// | +--------------+
145// | | MemtableList |
146// +------>+ Version t +--------> Empty
147// imm +--------------+
148//
149// In this example, even if the current LSM-tree consists of Version A and
150// memtable a, which is also referenced by SuperVersion, two older SuperVersions,
151// SuperVersion2 and SuperVersion1, still exist, and are referenced by a
152// compaction job and an old iterator Iter1, respectively. SuperVersion2
153// contains Version B, memtable a and memtable b; SuperVersion1 contains
154// Version B and memtable b (mutable). As a result, Version B and memtable b
155// are prevented from being destroyed or deleted.
7c673cae
FG
156
157// ColumnFamilyHandleImpl is the class that clients use to access different
158// column families. It has non-trivial destructor, which gets called when client
159// is done using the column family
160class ColumnFamilyHandleImpl : public ColumnFamilyHandle {
161 public:
162 // create while holding the mutex
163 ColumnFamilyHandleImpl(
164 ColumnFamilyData* cfd, DBImpl* db, InstrumentedMutex* mutex);
165 // destroy without mutex
166 virtual ~ColumnFamilyHandleImpl();
167 virtual ColumnFamilyData* cfd() const { return cfd_; }
168
169 virtual uint32_t GetID() const override;
170 virtual const std::string& GetName() const override;
171 virtual Status GetDescriptor(ColumnFamilyDescriptor* desc) override;
172 virtual const Comparator* GetComparator() const override;
173
174 private:
175 ColumnFamilyData* cfd_;
176 DBImpl* db_;
177 InstrumentedMutex* mutex_;
178};
179
180// Does not ref-count ColumnFamilyData
181// We use this dummy ColumnFamilyHandleImpl because sometimes MemTableInserter
182// calls DBImpl methods. When this happens, MemTableInserter need access to
183// ColumnFamilyHandle (same as the client would need). In that case, we feed
184// MemTableInserter dummy ColumnFamilyHandle and enable it to call DBImpl
185// methods
186class ColumnFamilyHandleInternal : public ColumnFamilyHandleImpl {
187 public:
188 ColumnFamilyHandleInternal()
11fdf7f2 189 : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), internal_cfd_(nullptr) {}
7c673cae
FG
190
191 void SetCFD(ColumnFamilyData* _cfd) { internal_cfd_ = _cfd; }
192 virtual ColumnFamilyData* cfd() const override { return internal_cfd_; }
193
194 private:
195 ColumnFamilyData* internal_cfd_;
196};
197
198// holds references to memtable, all immutable memtables and version
199struct SuperVersion {
200 // Accessing members of this class is not thread-safe and requires external
201 // synchronization (ie db mutex held or on write thread).
f67539c2 202 ColumnFamilyData* cfd;
7c673cae
FG
203 MemTable* mem;
204 MemTableListVersion* imm;
205 Version* current;
206 MutableCFOptions mutable_cf_options;
207 // Version number of the current SuperVersion
208 uint64_t version_number;
11fdf7f2 209 WriteStallCondition write_stall_condition;
7c673cae
FG
210
211 InstrumentedMutex* db_mutex;
212
213 // should be called outside the mutex
214 SuperVersion() = default;
215 ~SuperVersion();
216 SuperVersion* Ref();
217 // If Unref() returns true, Cleanup() should be called with mutex held
218 // before deleting this SuperVersion.
219 bool Unref();
220
221 // call these two methods with db mutex held
222 // Cleanup unrefs mem, imm and current. Also, it stores all memtables
223 // that needs to be deleted in to_delete vector. Unrefing those
224 // objects needs to be done in the mutex
225 void Cleanup();
f67539c2
TL
226 void Init(ColumnFamilyData* new_cfd, MemTable* new_mem,
227 MemTableListVersion* new_imm, Version* new_current);
7c673cae
FG
228
229 // The value of dummy is not actually used. kSVInUse takes its address as a
230 // mark in the thread local storage to indicate the SuperVersion is in use
231 // by thread. This way, the value of kSVInUse is guaranteed to have no
232 // conflict with SuperVersion object address and portable on different
233 // platform.
234 static int dummy;
235 static void* const kSVInUse;
236 static void* const kSVObsolete;
237
238 private:
239 std::atomic<uint32_t> refs;
240 // We need to_delete because during Cleanup(), imm->Unref() returns
241 // all memtables that we need to free through this vector. We then
242 // delete all those memtables outside of mutex, during destruction
243 autovector<MemTable*> to_delete;
244};
245
246extern Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options);
247
248extern Status CheckConcurrentWritesSupported(
249 const ColumnFamilyOptions& cf_options);
250
11fdf7f2
TL
251extern Status CheckCFPathsSupported(const DBOptions& db_options,
252 const ColumnFamilyOptions& cf_options);
253
7c673cae
FG
254extern ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
255 const ColumnFamilyOptions& src);
256// Wrap user defined table proproties collector factories `from cf_options`
257// into internal ones in int_tbl_prop_collector_factories. Add a system internal
258// one too.
259extern void GetIntTblPropCollectorFactory(
260 const ImmutableCFOptions& ioptions,
261 std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
262 int_tbl_prop_collector_factories);
263
264class ColumnFamilySet;
265
266// This class keeps all the data that a column family needs.
267// Most methods require DB mutex held, unless otherwise noted
268class ColumnFamilyData {
269 public:
270 ~ColumnFamilyData();
271
272 // thread-safe
273 uint32_t GetID() const { return id_; }
274 // thread-safe
275 const std::string& GetName() const { return name_; }
276
277 // Ref() can only be called from a context where the caller can guarantee
278 // that ColumnFamilyData is alive (while holding a non-zero ref already,
279 // holding a DB mutex, or as the leader in a write batch group).
f67539c2 280 void Ref() { refs_.fetch_add(1); }
7c673cae
FG
281
282 // Unref decreases the reference count, but does not handle deletion
283 // when the count goes to 0. If this method returns true then the
284 // caller should delete the instance immediately, or later, by calling
285 // FreeDeadColumnFamilies(). Unref() can only be called while holding
286 // a DB mutex, or during single-threaded recovery.
287 bool Unref() {
f67539c2 288 int old_refs = refs_.fetch_sub(1);
7c673cae
FG
289 assert(old_refs > 0);
290 return old_refs == 1;
291 }
292
f67539c2
TL
293 // UnrefAndTryDelete() decreases the reference count and do free if needed,
294 // return true if this is freed else false, UnrefAndTryDelete() can only
295 // be called while holding a DB mutex, or during single-threaded recovery.
296 bool UnrefAndTryDelete();
297
7c673cae
FG
298 // SetDropped() can only be called under following conditions:
299 // 1) Holding a DB mutex,
300 // 2) from single-threaded write thread, AND
301 // 3) from single-threaded VersionSet::LogAndApply()
302 // After dropping column family no other operation on that column family
303 // will be executed. All the files and memory will be, however, kept around
304 // until client drops the column family handle. That way, client can still
305 // access data from dropped column family.
306 // Column family can be dropped and still alive. In that state:
307 // *) Compaction and flush is not executed on the dropped column family.
308 // *) Client can continue reading from column family. Writes will fail unless
309 // WriteOptions::ignore_missing_column_families is true
310 // When the dropped column family is unreferenced, then we:
311 // *) Remove column family from the linked list maintained by ColumnFamilySet
312 // *) delete all memory associated with that column family
313 // *) delete all the files associated with that column family
314 void SetDropped();
11fdf7f2 315 bool IsDropped() const { return dropped_.load(std::memory_order_relaxed); }
7c673cae
FG
316
317 // thread-safe
318 int NumberLevels() const { return ioptions_.num_levels; }
319
320 void SetLogNumber(uint64_t log_number) { log_number_ = log_number; }
321 uint64_t GetLogNumber() const { return log_number_; }
322
11fdf7f2
TL
323 void SetFlushReason(FlushReason flush_reason) {
324 flush_reason_ = flush_reason;
325 }
326 FlushReason GetFlushReason() const { return flush_reason_; }
7c673cae 327 // thread-safe
f67539c2 328 const FileOptions* soptions() const;
7c673cae
FG
329 const ImmutableCFOptions* ioptions() const { return &ioptions_; }
330 // REQUIRES: DB mutex held
331 // This returns the MutableCFOptions used by current SuperVersion
332 // You should use this API to reference MutableCFOptions most of the time.
333 const MutableCFOptions* GetCurrentMutableCFOptions() const {
334 return &(super_version_->mutable_cf_options);
335 }
336 // REQUIRES: DB mutex held
337 // This returns the latest MutableCFOptions, which may be not in effect yet.
338 const MutableCFOptions* GetLatestMutableCFOptions() const {
339 return &mutable_cf_options_;
340 }
341
342 // REQUIRES: DB mutex held
343 // Build ColumnFamiliesOptions with immutable options and latest mutable
344 // options.
345 ColumnFamilyOptions GetLatestCFOptions() const;
346
347 bool is_delete_range_supported() { return is_delete_range_supported_; }
348
f67539c2
TL
349 // Validate CF options against DB options
350 static Status ValidateOptions(const DBOptions& db_options,
351 const ColumnFamilyOptions& cf_options);
7c673cae
FG
352#ifndef ROCKSDB_LITE
353 // REQUIRES: DB mutex held
354 Status SetOptions(
f67539c2 355 const DBOptions& db_options,
7c673cae
FG
356 const std::unordered_map<std::string, std::string>& options_map);
357#endif // ROCKSDB_LITE
358
359 InternalStats* internal_stats() { return internal_stats_.get(); }
360
361 MemTableList* imm() { return &imm_; }
362 MemTable* mem() { return mem_; }
363 Version* current() { return current_; }
364 Version* dummy_versions() { return dummy_versions_; }
365 void SetCurrent(Version* _current);
366 uint64_t GetNumLiveVersions() const; // REQUIRE: DB mutex held
367 uint64_t GetTotalSstFilesSize() const; // REQUIRE: DB mutex held
11fdf7f2
TL
368 uint64_t GetLiveSstFilesSize() const; // REQUIRE: DB mutex held
369 void SetMemtable(MemTable* new_mem) {
370 uint64_t memtable_id = last_memtable_id_.fetch_add(1) + 1;
371 new_mem->SetID(memtable_id);
372 mem_ = new_mem;
373 }
7c673cae
FG
374
375 // calculate the oldest log needed for the durability of this column family
376 uint64_t OldestLogToKeep();
377
378 // See Memtable constructor for explanation of earliest_seq param.
379 MemTable* ConstructNewMemtable(const MutableCFOptions& mutable_cf_options,
380 SequenceNumber earliest_seq);
381 void CreateNewMemtable(const MutableCFOptions& mutable_cf_options,
382 SequenceNumber earliest_seq);
383
384 TableCache* table_cache() const { return table_cache_.get(); }
20effc67 385 BlobFileCache* blob_file_cache() const { return blob_file_cache_.get(); }
7c673cae
FG
386
387 // See documentation in compaction_picker.h
388 // REQUIRES: DB mutex held
389 bool NeedsCompaction() const;
390 // REQUIRES: DB mutex held
391 Compaction* PickCompaction(const MutableCFOptions& mutable_options,
20effc67 392 const MutableDBOptions& mutable_db_options,
7c673cae
FG
393 LogBuffer* log_buffer);
394
395 // Check if the passed range overlap with any running compactions.
396 // REQUIRES: DB mutex held
397 bool RangeOverlapWithCompaction(const Slice& smallest_user_key,
398 const Slice& largest_user_key,
399 int level) const;
400
11fdf7f2
TL
401 // Check if the passed ranges overlap with any unflushed memtables
402 // (immutable or mutable).
403 //
404 // @param super_version A referenced SuperVersion that will be held for the
405 // duration of this function.
406 //
407 // Thread-safe
408 Status RangesOverlapWithMemtables(const autovector<Range>& ranges,
20effc67
TL
409 SuperVersion* super_version,
410 bool allow_data_in_errors, bool* overlap);
11fdf7f2 411
7c673cae 412 // A flag to tell a manual compaction is to compact all levels together
11fdf7f2 413 // instead of a specific level.
7c673cae
FG
414 static const int kCompactAllLevels;
415 // A flag to tell a manual compaction's output is base level.
416 static const int kCompactToBaseLevel;
417 // REQUIRES: DB mutex held
418 Compaction* CompactRange(const MutableCFOptions& mutable_cf_options,
20effc67 419 const MutableDBOptions& mutable_db_options,
7c673cae 420 int input_level, int output_level,
f67539c2 421 const CompactRangeOptions& compact_range_options,
11fdf7f2 422 const InternalKey* begin, const InternalKey* end,
f67539c2
TL
423 InternalKey** compaction_end, bool* manual_conflict,
424 uint64_t max_file_num_to_ignore);
7c673cae
FG
425
426 CompactionPicker* compaction_picker() { return compaction_picker_.get(); }
427 // thread-safe
428 const Comparator* user_comparator() const {
429 return internal_comparator_.user_comparator();
430 }
431 // thread-safe
432 const InternalKeyComparator& internal_comparator() const {
433 return internal_comparator_;
434 }
435
436 const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
437 int_tbl_prop_collector_factories() const {
438 return &int_tbl_prop_collector_factories_;
439 }
440
441 SuperVersion* GetSuperVersion() { return super_version_; }
442 // thread-safe
443 // Return a already referenced SuperVersion to be used safely.
f67539c2 444 SuperVersion* GetReferencedSuperVersion(DBImpl* db);
7c673cae
FG
445 // thread-safe
446 // Get SuperVersion stored in thread local storage. If it does not exist,
447 // get a reference from a current SuperVersion.
f67539c2 448 SuperVersion* GetThreadLocalSuperVersion(DBImpl* db);
7c673cae
FG
449 // Try to return SuperVersion back to thread local storage. Retrun true on
450 // success and false on failure. It fails when the thread local storage
451 // contains anything other than SuperVersion::kSVInUse flag.
452 bool ReturnThreadLocalSuperVersion(SuperVersion* sv);
453 // thread-safe
454 uint64_t GetSuperVersionNumber() const {
455 return super_version_number_.load();
456 }
457 // will return a pointer to SuperVersion* if previous SuperVersion
458 // if its reference count is zero and needs deletion or nullptr if not
459 // As argument takes a pointer to allocated SuperVersion to enable
460 // the clients to allocate SuperVersion outside of mutex.
461 // IMPORTANT: Only call this from DBImpl::InstallSuperVersion()
11fdf7f2
TL
462 void InstallSuperVersion(SuperVersionContext* sv_context,
463 InstrumentedMutex* db_mutex,
464 const MutableCFOptions& mutable_cf_options);
465 void InstallSuperVersion(SuperVersionContext* sv_context,
466 InstrumentedMutex* db_mutex);
7c673cae
FG
467
468 void ResetThreadLocalSuperVersions();
469
470 // Protected by DB mutex
11fdf7f2
TL
471 void set_queued_for_flush(bool value) { queued_for_flush_ = value; }
472 void set_queued_for_compaction(bool value) { queued_for_compaction_ = value; }
473 bool queued_for_flush() { return queued_for_flush_; }
474 bool queued_for_compaction() { return queued_for_compaction_; }
475
476 enum class WriteStallCause {
477 kNone,
478 kMemtableLimit,
479 kL0FileCountLimit,
480 kPendingCompactionBytes,
481 };
482 static std::pair<WriteStallCondition, WriteStallCause>
483 GetWriteStallConditionAndCause(int num_unflushed_memtables, int num_l0_files,
484 uint64_t num_compaction_needed_bytes,
485 const MutableCFOptions& mutable_cf_options);
7c673cae
FG
486
487 // Recalculate some small conditions, which are changed only during
488 // compaction, adding new memtable and/or
489 // recalculation of compaction score. These values are used in
490 // DBImpl::MakeRoomForWrite function to decide, if it need to make
491 // a write stall
11fdf7f2 492 WriteStallCondition RecalculateWriteStallConditions(
7c673cae
FG
493 const MutableCFOptions& mutable_cf_options);
494
11fdf7f2
TL
495 void set_initialized() { initialized_.store(true); }
496
497 bool initialized() const { return initialized_.load(); }
498
499 const ColumnFamilyOptions& initial_cf_options() {
500 return initial_cf_options_;
501 }
502
503 Env::WriteLifeTimeHint CalculateSSTWriteHint(int level);
504
f67539c2
TL
505 // created_dirs remembers directory created, so that we don't need to call
506 // the same data creation operation again.
507 Status AddDirectories(
20effc67 508 std::map<std::string, std::shared_ptr<FSDirectory>>* created_dirs);
11fdf7f2 509
20effc67 510 FSDirectory* GetDataDir(size_t path_id) const;
11fdf7f2 511
494da23a
TL
512 ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); }
513
7c673cae
FG
514 private:
515 friend class ColumnFamilySet;
20effc67 516 static const uint32_t kDummyColumnFamilyDataId;
7c673cae
FG
517 ColumnFamilyData(uint32_t id, const std::string& name,
518 Version* dummy_versions, Cache* table_cache,
519 WriteBufferManager* write_buffer_manager,
520 const ColumnFamilyOptions& options,
521 const ImmutableDBOptions& db_options,
f67539c2
TL
522 const FileOptions& file_options,
523 ColumnFamilySet* column_family_set,
20effc67
TL
524 BlockCacheTracer* const block_cache_tracer,
525 const std::shared_ptr<IOTracer>& io_tracer);
526
527 std::vector<std::string> GetDbPaths() const;
7c673cae
FG
528
529 uint32_t id_;
530 const std::string name_;
531 Version* dummy_versions_; // Head of circular doubly-linked list of versions.
532 Version* current_; // == dummy_versions->prev_
533
534 std::atomic<int> refs_; // outstanding references to ColumnFamilyData
11fdf7f2
TL
535 std::atomic<bool> initialized_;
536 std::atomic<bool> dropped_; // true if client dropped it
7c673cae
FG
537
538 const InternalKeyComparator internal_comparator_;
539 std::vector<std::unique_ptr<IntTblPropCollectorFactory>>
540 int_tbl_prop_collector_factories_;
541
542 const ColumnFamilyOptions initial_cf_options_;
543 const ImmutableCFOptions ioptions_;
544 MutableCFOptions mutable_cf_options_;
545
546 const bool is_delete_range_supported_;
547
548 std::unique_ptr<TableCache> table_cache_;
20effc67 549 std::unique_ptr<BlobFileCache> blob_file_cache_;
7c673cae
FG
550
551 std::unique_ptr<InternalStats> internal_stats_;
552
553 WriteBufferManager* write_buffer_manager_;
554
555 MemTable* mem_;
556 MemTableList imm_;
557 SuperVersion* super_version_;
558
559 // An ordinal representing the current SuperVersion. Updated by
560 // InstallSuperVersion(), i.e. incremented every time super_version_
561 // changes.
562 std::atomic<uint64_t> super_version_number_;
563
564 // Thread's local copy of SuperVersion pointer
565 // This needs to be destructed before mutex_
566 std::unique_ptr<ThreadLocalPtr> local_sv_;
567
568 // pointers for a circular linked list. we use it to support iterations over
569 // all column families that are alive (note: dropped column families can also
570 // be alive as long as client holds a reference)
571 ColumnFamilyData* next_;
572 ColumnFamilyData* prev_;
573
574 // This is the earliest log file number that contains data from this
575 // Column Family. All earlier log files must be ignored and not
576 // recovered from
577 uint64_t log_number_;
578
11fdf7f2
TL
579 std::atomic<FlushReason> flush_reason_;
580
7c673cae
FG
581 // An object that keeps all the compaction stats
582 // and picks the next compaction
583 std::unique_ptr<CompactionPicker> compaction_picker_;
584
585 ColumnFamilySet* column_family_set_;
586
587 std::unique_ptr<WriteControllerToken> write_controller_token_;
588
589 // If true --> this ColumnFamily is currently present in DBImpl::flush_queue_
11fdf7f2 590 bool queued_for_flush_;
7c673cae
FG
591
592 // If true --> this ColumnFamily is currently present in
593 // DBImpl::compaction_queue_
11fdf7f2 594 bool queued_for_compaction_;
7c673cae
FG
595
596 uint64_t prev_compaction_needed_bytes_;
597
598 // if the database was opened with 2pc enabled
599 bool allow_2pc_;
11fdf7f2
TL
600
601 // Memtable id to track flush.
602 std::atomic<uint64_t> last_memtable_id_;
603
604 // Directories corresponding to cf_paths.
20effc67
TL
605 std::vector<std::shared_ptr<FSDirectory>> data_dirs_;
606
607 bool db_paths_registered_;
7c673cae
FG
608};
609
610// ColumnFamilySet has interesting thread-safety requirements
611// * CreateColumnFamily() or RemoveColumnFamily() -- need to be protected by DB
612// mutex AND executed in the write thread.
613// CreateColumnFamily() should ONLY be called from VersionSet::LogAndApply() AND
614// single-threaded write thread. It is also called during Recovery and in
615// DumpManifest().
616// RemoveColumnFamily() is only called from SetDropped(). DB mutex needs to be
617// held and it needs to be executed from the write thread. SetDropped() also
618// guarantees that it will be called only from single-threaded LogAndApply(),
619// but this condition is not that important.
620// * Iteration -- hold DB mutex, but you can release it in the body of
621// iteration. If you release DB mutex in body, reference the column
622// family before the mutex and unreference after you unlock, since the column
623// family might get dropped when the DB mutex is released
624// * GetDefault() -- thread safe
625// * GetColumnFamily() -- either inside of DB mutex or from a write thread
626// * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(),
627// NumberOfColumnFamilies -- inside of DB mutex
628class ColumnFamilySet {
629 public:
630 // ColumnFamilySet supports iteration
631 class iterator {
632 public:
633 explicit iterator(ColumnFamilyData* cfd)
634 : current_(cfd) {}
635 iterator& operator++() {
636 // dropped column families might still be included in this iteration
637 // (we're only removing them when client drops the last reference to the
638 // column family).
639 // dummy is never dead, so this will never be infinite
640 do {
641 current_ = current_->next_;
642 } while (current_->refs_.load(std::memory_order_relaxed) == 0);
643 return *this;
644 }
645 bool operator!=(const iterator& other) {
646 return this->current_ != other.current_;
647 }
648 ColumnFamilyData* operator*() { return current_; }
649
650 private:
651 ColumnFamilyData* current_;
652 };
653
654 ColumnFamilySet(const std::string& dbname,
655 const ImmutableDBOptions* db_options,
f67539c2 656 const FileOptions& file_options, Cache* table_cache,
20effc67
TL
657 WriteBufferManager* _write_buffer_manager,
658 WriteController* _write_controller,
659 BlockCacheTracer* const block_cache_tracer,
660 const std::shared_ptr<IOTracer>& io_tracer);
7c673cae
FG
661 ~ColumnFamilySet();
662
663 ColumnFamilyData* GetDefault() const;
664 // GetColumnFamily() calls return nullptr if column family is not found
665 ColumnFamilyData* GetColumnFamily(uint32_t id) const;
666 ColumnFamilyData* GetColumnFamily(const std::string& name) const;
667 // this call will return the next available column family ID. it guarantees
668 // that there is no column family with id greater than or equal to the
669 // returned value in the current running instance or anytime in RocksDB
670 // instance history.
671 uint32_t GetNextColumnFamilyID();
672 uint32_t GetMaxColumnFamily();
673 void UpdateMaxColumnFamily(uint32_t new_max_column_family);
674 size_t NumberOfColumnFamilies() const;
675
676 ColumnFamilyData* CreateColumnFamily(const std::string& name, uint32_t id,
677 Version* dummy_version,
678 const ColumnFamilyOptions& options);
679
680 iterator begin() { return iterator(dummy_cfd_->next_); }
681 iterator end() { return iterator(dummy_cfd_); }
682
683 // REQUIRES: DB mutex held
684 // Don't call while iterating over ColumnFamilySet
685 void FreeDeadColumnFamilies();
686
687 Cache* get_table_cache() { return table_cache_; }
688
20effc67
TL
689 WriteBufferManager* write_buffer_manager() { return write_buffer_manager_; }
690
691 WriteController* write_controller() { return write_controller_; }
692
7c673cae
FG
693 private:
694 friend class ColumnFamilyData;
695 // helper function that gets called from cfd destructor
696 // REQUIRES: DB mutex held
697 void RemoveColumnFamily(ColumnFamilyData* cfd);
698
699 // column_families_ and column_family_data_ need to be protected:
700 // * when mutating both conditions have to be satisfied:
701 // 1. DB mutex locked
702 // 2. thread currently in single-threaded write thread
703 // * when reading, at least one condition needs to be satisfied:
704 // 1. DB mutex locked
705 // 2. accessed from a single-threaded write thread
706 std::unordered_map<std::string, uint32_t> column_families_;
707 std::unordered_map<uint32_t, ColumnFamilyData*> column_family_data_;
708
709 uint32_t max_column_family_;
710 ColumnFamilyData* dummy_cfd_;
711 // We don't hold the refcount here, since default column family always exists
712 // We are also not responsible for cleaning up default_cfd_cache_. This is
713 // just a cache that makes common case (accessing default column family)
714 // faster
715 ColumnFamilyData* default_cfd_cache_;
716
717 const std::string db_name_;
718 const ImmutableDBOptions* const db_options_;
f67539c2 719 const FileOptions file_options_;
7c673cae
FG
720 Cache* table_cache_;
721 WriteBufferManager* write_buffer_manager_;
722 WriteController* write_controller_;
f67539c2 723 BlockCacheTracer* const block_cache_tracer_;
20effc67 724 std::shared_ptr<IOTracer> io_tracer_;
7c673cae
FG
725};
726
727// We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access
728// memtables of different column families (specified by ID in the write batch)
729class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables {
730 public:
731 explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set)
732 : column_family_set_(column_family_set), current_(nullptr) {}
733
734 // Constructs a ColumnFamilyMemTablesImpl equivalent to one constructed
735 // with the arguments used to construct *orig.
736 explicit ColumnFamilyMemTablesImpl(ColumnFamilyMemTablesImpl* orig)
737 : column_family_set_(orig->column_family_set_), current_(nullptr) {}
738
739 // sets current_ to ColumnFamilyData with column_family_id
740 // returns false if column family doesn't exist
741 // REQUIRES: use this function of DBImpl::column_family_memtables_ should be
742 // under a DB mutex OR from a write thread
743 bool Seek(uint32_t column_family_id) override;
744
745 // Returns log number of the selected column family
746 // REQUIRES: under a DB mutex OR from a write thread
747 uint64_t GetLogNumber() const override;
748
749 // REQUIRES: Seek() called first
750 // REQUIRES: use this function of DBImpl::column_family_memtables_ should be
751 // under a DB mutex OR from a write thread
752 virtual MemTable* GetMemTable() const override;
753
754 // Returns column family handle for the selected column family
755 // REQUIRES: use this function of DBImpl::column_family_memtables_ should be
756 // under a DB mutex OR from a write thread
757 virtual ColumnFamilyHandle* GetColumnFamilyHandle() override;
758
759 // Cannot be called while another thread is calling Seek().
760 // REQUIRES: use this function of DBImpl::column_family_memtables_ should be
761 // under a DB mutex OR from a write thread
762 virtual ColumnFamilyData* current() override { return current_; }
763
764 private:
765 ColumnFamilySet* column_family_set_;
766 ColumnFamilyData* current_;
767 ColumnFamilyHandleInternal handle_;
768};
769
770extern uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family);
771
772extern const Comparator* GetColumnFamilyUserComparator(
773 ColumnFamilyHandle* column_family);
774
f67539c2 775} // namespace ROCKSDB_NAMESPACE