]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/db/column_family.h
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / rocksdb / db / column_family.h
CommitLineData
7c673cae 1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5//
6// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7// Use of this source code is governed by a BSD-style license that can be
8// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10#pragma once
11
1e59de90 12#include <atomic>
7c673cae 13#include <string>
1e59de90 14#include <unordered_map>
7c673cae 15#include <vector>
7c673cae 16
1e59de90 17#include "cache/cache_reservation_manager.h"
7c673cae
FG
18#include "db/memtable_list.h"
19#include "db/table_cache.h"
20#include "db/table_properties_collector.h"
21#include "db/write_batch_internal.h"
22#include "db/write_controller.h"
23#include "options/cf_options.h"
24#include "rocksdb/compaction_job_stats.h"
25#include "rocksdb/db.h"
26#include "rocksdb/env.h"
27#include "rocksdb/options.h"
f67539c2 28#include "trace_replay/block_cache_tracer.h"
1e59de90 29#include "util/hash_containers.h"
7c673cae
FG
30#include "util/thread_local.h"
31
f67539c2 32namespace ROCKSDB_NAMESPACE {
7c673cae
FG
33
// Forward declarations of types that this header only refers to by pointer
// or reference (alphabetical; classes first, then structs).
class BlobFileCache;
class BlobSource;
class ColumnFamilyData;
class Compaction;
class CompactionPicker;
class DBImpl;
class InstrumentedMutex;
class InstrumentedMutexLock;
class InternalKey;
class InternalStats;
class LogBuffer;
class MemTable;
class MemTableListVersion;
class Version;
class VersionSet;
class VersionStorageInfo;
struct SuperVersionContext;

// Declared here only; the definition (and value) lives outside this header.
extern const double kIncSlowdownRatio;
f67539c2
TL
53// This file contains a list of data structures for managing column family
54// level metadata.
55//
56// The basic relationships among classes declared here are illustrated as
57// following:
58//
59// +----------------------+ +----------------------+ +--------+
60// +---+ ColumnFamilyHandle 1 | +--+ ColumnFamilyHandle 2 | | DBImpl |
61// | +----------------------+ | +----------------------+ +----+---+
62// | +--------------------------+ |
63// | | +-----------------------------+
64// | | |
65// | | +-----------------------------v-------------------------------+
66// | | | |
67// | | | ColumnFamilySet |
68// | | | |
69// | | +-------------+--------------------------+----------------+---+
70// | | | | |
71// | +-------------------------------------+ | |
72// | | | | v
73// | +-------------v-------------+ +-----v----v---------+
74// | | | | |
75// | | ColumnFamilyData 1 | | ColumnFamilyData 2 | ......
76// | | | | |
77// +---> | | |
78// | +---------+ | |
79// | | MemTable| | |
80// | | List | | |
81// +--------+---+--+-+----+----+ +--------------------++
82// | | | |
83// | | | |
84// | | | +-----------------------+
85// | | +-----------+ |
86// v +--------+ | |
87// +--------+--------+ | | |
88// | | | | +----------v----------+
89// +---> |SuperVersion 1.a +-----------------> |
90// | +------+ | | MemTableListVersion |
91// +---+-------------+ | | | | |
92// | | | | +----+------------+---+
93// | current | | | | |
94// | +-------------+ | |mem | |
95// | | | | | |
96// +-v---v-------+ +---v--v---+ +-----v----+ +----v-----+
97// | | | | | | | |
98// | Version 1.a | | memtable | | memtable | | memtable |
99// | | | 1.a | | 1.b | | 1.c |
100// +-------------+ | | | | | |
101// +----------+ +----------+ +----------+
102//
103// DBImpl keeps a ColumnFamilySet, which references to all column families by
104// pointing to respective ColumnFamilyData object of each column family.
105// This is how DBImpl can list and operate on all the column families.
106// ColumnFamilyHandle also points to ColumnFamilyData directly, so that
107// when a user executes a query, it can directly find memtables and Version
108// as well as SuperVersion to the column family, without going through
109// ColumnFamilySet.
110//
111// ColumnFamilySet points to the latest view of the LSM-tree (list of memtables
112// and SST files) indirectly, while ongoing operations may hold references
113// to a current or an out-of-date SuperVersion, which in turn points to a
114// point-in-time view of the LSM-tree. This guarantees the memtables and SST
115// files being operated on will not go away, until the SuperVersion is
// unreferenced to 0 and destroyed.
117//
118// The following graph illustrates a possible referencing relationships:
119//
120// Column +--------------+ current +-----------+
121// Family +---->+ +------------------->+ |
122// Data | SuperVersion +----------+ | Version A |
123// | 3 | imm | | |
124// Iter2 +----->+ | +-------v------+ +-----------+
125// +-----+--------+ | MemtableList +----------------> Empty
126// | | Version r | +-----------+
127// | +--------------+ | |
128// +------------------+ current| Version B |
129// +--------------+ | +----->+ |
130// | | | | +-----+-----+
131// Compaction +>+ SuperVersion +-------------+ ^
132// Job | 2 +------+ | |current
133// | +----+ | | mem | +------------+
134// +--------------+ | | +---------------------> |
135// | +------------------------> MemTable a |
136// | mem | | |
137// +--------------+ | | +------------+
138// | +--------------------------+
139// Iter1 +-----> SuperVersion | | +------------+
140// | 1 +------------------------------>+ |
141// | +-+ | mem | MemTable b |
142// +--------------+ | | | |
143// | | +--------------+ +-----^------+
144// | |imm | MemtableList | |
145// | +--->+ Version s +------------+
146// | +--------------+
147// | +--------------+
148// | | MemtableList |
149// +------>+ Version t +--------> Empty
150// imm +--------------+
151//
// In this example, even though the current LSM-tree consists of Version A and
// memtable a, which are referenced by the current SuperVersion, two older
// SuperVersions, SuperVersion2 and SuperVersion1, still exist and are
// referenced by a compaction job and an old iterator Iter1, respectively.
// SuperVersion2 contains Version B, memtable a and memtable b; SuperVersion1
// contains Version B and memtable b (mutable). As a result, Version B and
// memtable b are prevented from being destroyed or deleted.
7c673cae
FG
159
160// ColumnFamilyHandleImpl is the class that clients use to access different
161// column families. It has non-trivial destructor, which gets called when client
162// is done using the column family
163class ColumnFamilyHandleImpl : public ColumnFamilyHandle {
164 public:
165 // create while holding the mutex
1e59de90
TL
166 ColumnFamilyHandleImpl(ColumnFamilyData* cfd, DBImpl* db,
167 InstrumentedMutex* mutex);
7c673cae
FG
168 // destroy without mutex
169 virtual ~ColumnFamilyHandleImpl();
170 virtual ColumnFamilyData* cfd() const { return cfd_; }
171
172 virtual uint32_t GetID() const override;
173 virtual const std::string& GetName() const override;
174 virtual Status GetDescriptor(ColumnFamilyDescriptor* desc) override;
175 virtual const Comparator* GetComparator() const override;
176
177 private:
178 ColumnFamilyData* cfd_;
179 DBImpl* db_;
180 InstrumentedMutex* mutex_;
181};
182
183// Does not ref-count ColumnFamilyData
184// We use this dummy ColumnFamilyHandleImpl because sometimes MemTableInserter
185// calls DBImpl methods. When this happens, MemTableInserter need access to
186// ColumnFamilyHandle (same as the client would need). In that case, we feed
187// MemTableInserter dummy ColumnFamilyHandle and enable it to call DBImpl
188// methods
189class ColumnFamilyHandleInternal : public ColumnFamilyHandleImpl {
190 public:
191 ColumnFamilyHandleInternal()
1e59de90
TL
192 : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr),
193 internal_cfd_(nullptr) {}
7c673cae
FG
194
195 void SetCFD(ColumnFamilyData* _cfd) { internal_cfd_ = _cfd; }
196 virtual ColumnFamilyData* cfd() const override { return internal_cfd_; }
197
198 private:
199 ColumnFamilyData* internal_cfd_;
200};
201
202// holds references to memtable, all immutable memtables and version
203struct SuperVersion {
204 // Accessing members of this class is not thread-safe and requires external
205 // synchronization (ie db mutex held or on write thread).
f67539c2 206 ColumnFamilyData* cfd;
7c673cae
FG
207 MemTable* mem;
208 MemTableListVersion* imm;
209 Version* current;
210 MutableCFOptions mutable_cf_options;
211 // Version number of the current SuperVersion
212 uint64_t version_number;
11fdf7f2 213 WriteStallCondition write_stall_condition;
7c673cae 214
7c673cae
FG
215 // should be called outside the mutex
216 SuperVersion() = default;
217 ~SuperVersion();
218 SuperVersion* Ref();
219 // If Unref() returns true, Cleanup() should be called with mutex held
220 // before deleting this SuperVersion.
221 bool Unref();
222
223 // call these two methods with db mutex held
224 // Cleanup unrefs mem, imm and current. Also, it stores all memtables
225 // that needs to be deleted in to_delete vector. Unrefing those
226 // objects needs to be done in the mutex
227 void Cleanup();
f67539c2
TL
228 void Init(ColumnFamilyData* new_cfd, MemTable* new_mem,
229 MemTableListVersion* new_imm, Version* new_current);
7c673cae
FG
230
231 // The value of dummy is not actually used. kSVInUse takes its address as a
232 // mark in the thread local storage to indicate the SuperVersion is in use
233 // by thread. This way, the value of kSVInUse is guaranteed to have no
234 // conflict with SuperVersion object address and portable on different
235 // platform.
236 static int dummy;
237 static void* const kSVInUse;
238 static void* const kSVObsolete;
239
240 private:
241 std::atomic<uint32_t> refs;
242 // We need to_delete because during Cleanup(), imm->Unref() returns
243 // all memtables that we need to free through this vector. We then
244 // delete all those memtables outside of mutex, during destruction
245 autovector<MemTable*> to_delete;
246};
247
248extern Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options);
249
250extern Status CheckConcurrentWritesSupported(
251 const ColumnFamilyOptions& cf_options);
252
11fdf7f2
TL
253extern Status CheckCFPathsSupported(const DBOptions& db_options,
254 const ColumnFamilyOptions& cf_options);
255
7c673cae
FG
256extern ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
257 const ColumnFamilyOptions& src);
1e59de90 258// Wrap user defined table properties collector factories `from cf_options`
7c673cae
FG
259// into internal ones in int_tbl_prop_collector_factories. Add a system internal
260// one too.
261extern void GetIntTblPropCollectorFactory(
262 const ImmutableCFOptions& ioptions,
1e59de90 263 IntTblPropCollectorFactories* int_tbl_prop_collector_factories);
7c673cae
FG
264
265class ColumnFamilySet;
266
267// This class keeps all the data that a column family needs.
268// Most methods require DB mutex held, unless otherwise noted
269class ColumnFamilyData {
270 public:
271 ~ColumnFamilyData();
272
273 // thread-safe
274 uint32_t GetID() const { return id_; }
275 // thread-safe
276 const std::string& GetName() const { return name_; }
277
278 // Ref() can only be called from a context where the caller can guarantee
279 // that ColumnFamilyData is alive (while holding a non-zero ref already,
280 // holding a DB mutex, or as the leader in a write batch group).
f67539c2 281 void Ref() { refs_.fetch_add(1); }
7c673cae 282
f67539c2
TL
283 // UnrefAndTryDelete() decreases the reference count and do free if needed,
284 // return true if this is freed else false, UnrefAndTryDelete() can only
285 // be called while holding a DB mutex, or during single-threaded recovery.
286 bool UnrefAndTryDelete();
287
7c673cae
FG
288 // SetDropped() can only be called under following conditions:
289 // 1) Holding a DB mutex,
290 // 2) from single-threaded write thread, AND
291 // 3) from single-threaded VersionSet::LogAndApply()
292 // After dropping column family no other operation on that column family
293 // will be executed. All the files and memory will be, however, kept around
294 // until client drops the column family handle. That way, client can still
295 // access data from dropped column family.
296 // Column family can be dropped and still alive. In that state:
297 // *) Compaction and flush is not executed on the dropped column family.
298 // *) Client can continue reading from column family. Writes will fail unless
299 // WriteOptions::ignore_missing_column_families is true
300 // When the dropped column family is unreferenced, then we:
301 // *) Remove column family from the linked list maintained by ColumnFamilySet
302 // *) delete all memory associated with that column family
303 // *) delete all the files associated with that column family
304 void SetDropped();
11fdf7f2 305 bool IsDropped() const { return dropped_.load(std::memory_order_relaxed); }
7c673cae
FG
306
307 // thread-safe
308 int NumberLevels() const { return ioptions_.num_levels; }
309
310 void SetLogNumber(uint64_t log_number) { log_number_ = log_number; }
311 uint64_t GetLogNumber() const { return log_number_; }
312
11fdf7f2
TL
313 void SetFlushReason(FlushReason flush_reason) {
314 flush_reason_ = flush_reason;
315 }
316 FlushReason GetFlushReason() const { return flush_reason_; }
7c673cae 317 // thread-safe
f67539c2 318 const FileOptions* soptions() const;
1e59de90 319 const ImmutableOptions* ioptions() const { return &ioptions_; }
7c673cae
FG
320 // REQUIRES: DB mutex held
321 // This returns the MutableCFOptions used by current SuperVersion
322 // You should use this API to reference MutableCFOptions most of the time.
323 const MutableCFOptions* GetCurrentMutableCFOptions() const {
324 return &(super_version_->mutable_cf_options);
325 }
326 // REQUIRES: DB mutex held
327 // This returns the latest MutableCFOptions, which may be not in effect yet.
328 const MutableCFOptions* GetLatestMutableCFOptions() const {
329 return &mutable_cf_options_;
330 }
331
332 // REQUIRES: DB mutex held
333 // Build ColumnFamiliesOptions with immutable options and latest mutable
334 // options.
335 ColumnFamilyOptions GetLatestCFOptions() const;
336
337 bool is_delete_range_supported() { return is_delete_range_supported_; }
338
f67539c2
TL
339 // Validate CF options against DB options
340 static Status ValidateOptions(const DBOptions& db_options,
341 const ColumnFamilyOptions& cf_options);
7c673cae
FG
342#ifndef ROCKSDB_LITE
343 // REQUIRES: DB mutex held
344 Status SetOptions(
f67539c2 345 const DBOptions& db_options,
7c673cae
FG
346 const std::unordered_map<std::string, std::string>& options_map);
347#endif // ROCKSDB_LITE
348
349 InternalStats* internal_stats() { return internal_stats_.get(); }
350
351 MemTableList* imm() { return &imm_; }
352 MemTable* mem() { return mem_; }
1e59de90
TL
353
354 bool IsEmpty() {
355 return mem()->GetFirstSequenceNumber() == 0 && imm()->NumNotFlushed() == 0;
356 }
357
7c673cae
FG
358 Version* current() { return current_; }
359 Version* dummy_versions() { return dummy_versions_; }
360 void SetCurrent(Version* _current);
1e59de90 361 uint64_t GetNumLiveVersions() const; // REQUIRE: DB mutex held
7c673cae 362 uint64_t GetTotalSstFilesSize() const; // REQUIRE: DB mutex held
11fdf7f2 363 uint64_t GetLiveSstFilesSize() const; // REQUIRE: DB mutex held
1e59de90 364 uint64_t GetTotalBlobFileSize() const; // REQUIRE: DB mutex held
11fdf7f2
TL
365 void SetMemtable(MemTable* new_mem) {
366 uint64_t memtable_id = last_memtable_id_.fetch_add(1) + 1;
367 new_mem->SetID(memtable_id);
368 mem_ = new_mem;
369 }
7c673cae
FG
370
371 // calculate the oldest log needed for the durability of this column family
372 uint64_t OldestLogToKeep();
373
374 // See Memtable constructor for explanation of earliest_seq param.
375 MemTable* ConstructNewMemtable(const MutableCFOptions& mutable_cf_options,
376 SequenceNumber earliest_seq);
377 void CreateNewMemtable(const MutableCFOptions& mutable_cf_options,
378 SequenceNumber earliest_seq);
379
380 TableCache* table_cache() const { return table_cache_.get(); }
1e59de90 381 BlobSource* blob_source() const { return blob_source_.get(); }
7c673cae
FG
382
383 // See documentation in compaction_picker.h
384 // REQUIRES: DB mutex held
385 bool NeedsCompaction() const;
386 // REQUIRES: DB mutex held
387 Compaction* PickCompaction(const MutableCFOptions& mutable_options,
20effc67 388 const MutableDBOptions& mutable_db_options,
7c673cae
FG
389 LogBuffer* log_buffer);
390
391 // Check if the passed range overlap with any running compactions.
392 // REQUIRES: DB mutex held
393 bool RangeOverlapWithCompaction(const Slice& smallest_user_key,
394 const Slice& largest_user_key,
395 int level) const;
396
11fdf7f2
TL
397 // Check if the passed ranges overlap with any unflushed memtables
398 // (immutable or mutable).
399 //
400 // @param super_version A referenced SuperVersion that will be held for the
401 // duration of this function.
402 //
403 // Thread-safe
404 Status RangesOverlapWithMemtables(const autovector<Range>& ranges,
20effc67
TL
405 SuperVersion* super_version,
406 bool allow_data_in_errors, bool* overlap);
11fdf7f2 407
7c673cae 408 // A flag to tell a manual compaction is to compact all levels together
11fdf7f2 409 // instead of a specific level.
7c673cae
FG
410 static const int kCompactAllLevels;
411 // A flag to tell a manual compaction's output is base level.
412 static const int kCompactToBaseLevel;
413 // REQUIRES: DB mutex held
414 Compaction* CompactRange(const MutableCFOptions& mutable_cf_options,
20effc67 415 const MutableDBOptions& mutable_db_options,
7c673cae 416 int input_level, int output_level,
f67539c2 417 const CompactRangeOptions& compact_range_options,
11fdf7f2 418 const InternalKey* begin, const InternalKey* end,
f67539c2 419 InternalKey** compaction_end, bool* manual_conflict,
1e59de90
TL
420 uint64_t max_file_num_to_ignore,
421 const std::string& trim_ts);
7c673cae
FG
422
423 CompactionPicker* compaction_picker() { return compaction_picker_.get(); }
424 // thread-safe
425 const Comparator* user_comparator() const {
426 return internal_comparator_.user_comparator();
427 }
428 // thread-safe
429 const InternalKeyComparator& internal_comparator() const {
430 return internal_comparator_;
431 }
432
1e59de90 433 const IntTblPropCollectorFactories* int_tbl_prop_collector_factories() const {
7c673cae
FG
434 return &int_tbl_prop_collector_factories_;
435 }
436
437 SuperVersion* GetSuperVersion() { return super_version_; }
438 // thread-safe
439 // Return a already referenced SuperVersion to be used safely.
f67539c2 440 SuperVersion* GetReferencedSuperVersion(DBImpl* db);
7c673cae
FG
441 // thread-safe
442 // Get SuperVersion stored in thread local storage. If it does not exist,
443 // get a reference from a current SuperVersion.
f67539c2 444 SuperVersion* GetThreadLocalSuperVersion(DBImpl* db);
1e59de90 445 // Try to return SuperVersion back to thread local storage. Return true on
7c673cae
FG
446 // success and false on failure. It fails when the thread local storage
447 // contains anything other than SuperVersion::kSVInUse flag.
448 bool ReturnThreadLocalSuperVersion(SuperVersion* sv);
449 // thread-safe
450 uint64_t GetSuperVersionNumber() const {
451 return super_version_number_.load();
452 }
453 // will return a pointer to SuperVersion* if previous SuperVersion
454 // if its reference count is zero and needs deletion or nullptr if not
455 // As argument takes a pointer to allocated SuperVersion to enable
456 // the clients to allocate SuperVersion outside of mutex.
457 // IMPORTANT: Only call this from DBImpl::InstallSuperVersion()
11fdf7f2 458 void InstallSuperVersion(SuperVersionContext* sv_context,
11fdf7f2
TL
459 const MutableCFOptions& mutable_cf_options);
460 void InstallSuperVersion(SuperVersionContext* sv_context,
461 InstrumentedMutex* db_mutex);
7c673cae
FG
462
463 void ResetThreadLocalSuperVersions();
464
465 // Protected by DB mutex
11fdf7f2
TL
466 void set_queued_for_flush(bool value) { queued_for_flush_ = value; }
467 void set_queued_for_compaction(bool value) { queued_for_compaction_ = value; }
468 bool queued_for_flush() { return queued_for_flush_; }
469 bool queued_for_compaction() { return queued_for_compaction_; }
470
471 enum class WriteStallCause {
472 kNone,
473 kMemtableLimit,
474 kL0FileCountLimit,
475 kPendingCompactionBytes,
476 };
477 static std::pair<WriteStallCondition, WriteStallCause>
1e59de90
TL
478 GetWriteStallConditionAndCause(
479 int num_unflushed_memtables, int num_l0_files,
480 uint64_t num_compaction_needed_bytes,
481 const MutableCFOptions& mutable_cf_options,
482 const ImmutableCFOptions& immutable_cf_options);
483
484 // Recalculate some stall conditions, which are changed only during
485 // compaction, adding new memtable and/or recalculation of compaction score.
11fdf7f2 486 WriteStallCondition RecalculateWriteStallConditions(
7c673cae
FG
487 const MutableCFOptions& mutable_cf_options);
488
11fdf7f2
TL
489 void set_initialized() { initialized_.store(true); }
490
491 bool initialized() const { return initialized_.load(); }
492
493 const ColumnFamilyOptions& initial_cf_options() {
494 return initial_cf_options_;
495 }
496
497 Env::WriteLifeTimeHint CalculateSSTWriteHint(int level);
498
f67539c2
TL
499 // created_dirs remembers directory created, so that we don't need to call
500 // the same data creation operation again.
501 Status AddDirectories(
20effc67 502 std::map<std::string, std::shared_ptr<FSDirectory>>* created_dirs);
11fdf7f2 503
20effc67 504 FSDirectory* GetDataDir(size_t path_id) const;
11fdf7f2 505
1e59de90
TL
506 // full_history_ts_low_ can only increase.
507 void SetFullHistoryTsLow(std::string ts_low) {
508 assert(!ts_low.empty());
509 const Comparator* ucmp = user_comparator();
510 assert(ucmp);
511 if (full_history_ts_low_.empty() ||
512 ucmp->CompareTimestamp(ts_low, full_history_ts_low_) > 0) {
513 full_history_ts_low_ = std::move(ts_low);
514 }
515 }
516
517 const std::string& GetFullHistoryTsLow() const {
518 return full_history_ts_low_;
519 }
520
494da23a 521 ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); }
1e59de90
TL
522 WriteBufferManager* write_buffer_mgr() { return write_buffer_manager_; }
523 std::shared_ptr<CacheReservationManager>
524 GetFileMetadataCacheReservationManager() {
525 return file_metadata_cache_res_mgr_;
526 }
527
528 SequenceNumber GetFirstMemtableSequenceNumber() const;
529
530 static const uint32_t kDummyColumnFamilyDataId;
531
532 // Keep track of whether the mempurge feature was ever used.
533 void SetMempurgeUsed() { mempurge_used_ = true; }
534 bool GetMempurgeUsed() { return mempurge_used_; }
494da23a 535
7c673cae
FG
536 private:
537 friend class ColumnFamilySet;
538 ColumnFamilyData(uint32_t id, const std::string& name,
539 Version* dummy_versions, Cache* table_cache,
540 WriteBufferManager* write_buffer_manager,
541 const ColumnFamilyOptions& options,
542 const ImmutableDBOptions& db_options,
1e59de90 543 const FileOptions* file_options,
f67539c2 544 ColumnFamilySet* column_family_set,
20effc67 545 BlockCacheTracer* const block_cache_tracer,
1e59de90
TL
546 const std::shared_ptr<IOTracer>& io_tracer,
547 const std::string& db_id, const std::string& db_session_id);
20effc67
TL
548
549 std::vector<std::string> GetDbPaths() const;
7c673cae
FG
550
551 uint32_t id_;
552 const std::string name_;
553 Version* dummy_versions_; // Head of circular doubly-linked list of versions.
554 Version* current_; // == dummy_versions->prev_
555
1e59de90 556 std::atomic<int> refs_; // outstanding references to ColumnFamilyData
11fdf7f2
TL
557 std::atomic<bool> initialized_;
558 std::atomic<bool> dropped_; // true if client dropped it
7c673cae
FG
559
560 const InternalKeyComparator internal_comparator_;
1e59de90 561 IntTblPropCollectorFactories int_tbl_prop_collector_factories_;
7c673cae
FG
562
563 const ColumnFamilyOptions initial_cf_options_;
1e59de90 564 const ImmutableOptions ioptions_;
7c673cae
FG
565 MutableCFOptions mutable_cf_options_;
566
567 const bool is_delete_range_supported_;
568
569 std::unique_ptr<TableCache> table_cache_;
20effc67 570 std::unique_ptr<BlobFileCache> blob_file_cache_;
1e59de90 571 std::unique_ptr<BlobSource> blob_source_;
7c673cae
FG
572
573 std::unique_ptr<InternalStats> internal_stats_;
574
575 WriteBufferManager* write_buffer_manager_;
576
577 MemTable* mem_;
578 MemTableList imm_;
579 SuperVersion* super_version_;
580
581 // An ordinal representing the current SuperVersion. Updated by
582 // InstallSuperVersion(), i.e. incremented every time super_version_
583 // changes.
584 std::atomic<uint64_t> super_version_number_;
585
586 // Thread's local copy of SuperVersion pointer
587 // This needs to be destructed before mutex_
588 std::unique_ptr<ThreadLocalPtr> local_sv_;
589
590 // pointers for a circular linked list. we use it to support iterations over
591 // all column families that are alive (note: dropped column families can also
592 // be alive as long as client holds a reference)
593 ColumnFamilyData* next_;
594 ColumnFamilyData* prev_;
595
596 // This is the earliest log file number that contains data from this
597 // Column Family. All earlier log files must be ignored and not
598 // recovered from
599 uint64_t log_number_;
600
11fdf7f2
TL
601 std::atomic<FlushReason> flush_reason_;
602
7c673cae
FG
603 // An object that keeps all the compaction stats
604 // and picks the next compaction
605 std::unique_ptr<CompactionPicker> compaction_picker_;
606
607 ColumnFamilySet* column_family_set_;
608
609 std::unique_ptr<WriteControllerToken> write_controller_token_;
610
611 // If true --> this ColumnFamily is currently present in DBImpl::flush_queue_
11fdf7f2 612 bool queued_for_flush_;
7c673cae
FG
613
614 // If true --> this ColumnFamily is currently present in
615 // DBImpl::compaction_queue_
11fdf7f2 616 bool queued_for_compaction_;
7c673cae
FG
617
618 uint64_t prev_compaction_needed_bytes_;
619
620 // if the database was opened with 2pc enabled
621 bool allow_2pc_;
11fdf7f2
TL
622
623 // Memtable id to track flush.
624 std::atomic<uint64_t> last_memtable_id_;
625
626 // Directories corresponding to cf_paths.
20effc67
TL
627 std::vector<std::shared_ptr<FSDirectory>> data_dirs_;
628
629 bool db_paths_registered_;
1e59de90
TL
630
631 std::string full_history_ts_low_;
632
633 // For charging memory usage of file metadata created for newly added files to
634 // a Version associated with this CFD
635 std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr_;
636 bool mempurge_used_;
7c673cae
FG
637};
638
639// ColumnFamilySet has interesting thread-safety requirements
640// * CreateColumnFamily() or RemoveColumnFamily() -- need to be protected by DB
641// mutex AND executed in the write thread.
642// CreateColumnFamily() should ONLY be called from VersionSet::LogAndApply() AND
643// single-threaded write thread. It is also called during Recovery and in
644// DumpManifest().
645// RemoveColumnFamily() is only called from SetDropped(). DB mutex needs to be
646// held and it needs to be executed from the write thread. SetDropped() also
647// guarantees that it will be called only from single-threaded LogAndApply(),
648// but this condition is not that important.
1e59de90
TL
649// * Iteration -- hold DB mutex. If you want to release the DB mutex in the
650// body of the iteration, wrap in a RefedColumnFamilySet.
7c673cae
FG
651// * GetDefault() -- thread safe
652// * GetColumnFamily() -- either inside of DB mutex or from a write thread
653// * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(),
654// NumberOfColumnFamilies -- inside of DB mutex
655class ColumnFamilySet {
656 public:
657 // ColumnFamilySet supports iteration
658 class iterator {
659 public:
1e59de90
TL
660 explicit iterator(ColumnFamilyData* cfd) : current_(cfd) {}
661 // NOTE: minimum operators for for-loop iteration
7c673cae 662 iterator& operator++() {
1e59de90 663 current_ = current_->next_;
7c673cae
FG
664 return *this;
665 }
1e59de90 666 bool operator!=(const iterator& other) const {
7c673cae
FG
667 return this->current_ != other.current_;
668 }
669 ColumnFamilyData* operator*() { return current_; }
670
671 private:
672 ColumnFamilyData* current_;
673 };
674
675 ColumnFamilySet(const std::string& dbname,
676 const ImmutableDBOptions* db_options,
f67539c2 677 const FileOptions& file_options, Cache* table_cache,
20effc67
TL
678 WriteBufferManager* _write_buffer_manager,
679 WriteController* _write_controller,
680 BlockCacheTracer* const block_cache_tracer,
1e59de90
TL
681 const std::shared_ptr<IOTracer>& io_tracer,
682 const std::string& db_id, const std::string& db_session_id);
7c673cae
FG
683 ~ColumnFamilySet();
684
685 ColumnFamilyData* GetDefault() const;
686 // GetColumnFamily() calls return nullptr if column family is not found
687 ColumnFamilyData* GetColumnFamily(uint32_t id) const;
688 ColumnFamilyData* GetColumnFamily(const std::string& name) const;
689 // this call will return the next available column family ID. it guarantees
690 // that there is no column family with id greater than or equal to the
691 // returned value in the current running instance or anytime in RocksDB
692 // instance history.
693 uint32_t GetNextColumnFamilyID();
694 uint32_t GetMaxColumnFamily();
695 void UpdateMaxColumnFamily(uint32_t new_max_column_family);
696 size_t NumberOfColumnFamilies() const;
697
698 ColumnFamilyData* CreateColumnFamily(const std::string& name, uint32_t id,
699 Version* dummy_version,
700 const ColumnFamilyOptions& options);
701
702 iterator begin() { return iterator(dummy_cfd_->next_); }
703 iterator end() { return iterator(dummy_cfd_); }
704
7c673cae
FG
705 Cache* get_table_cache() { return table_cache_; }
706
20effc67
TL
707 WriteBufferManager* write_buffer_manager() { return write_buffer_manager_; }
708
709 WriteController* write_controller() { return write_controller_; }
710
7c673cae
FG
711 private:
712 friend class ColumnFamilyData;
713 // helper function that gets called from cfd destructor
714 // REQUIRES: DB mutex held
715 void RemoveColumnFamily(ColumnFamilyData* cfd);
716
717 // column_families_ and column_family_data_ need to be protected:
718 // * when mutating both conditions have to be satisfied:
719 // 1. DB mutex locked
720 // 2. thread currently in single-threaded write thread
721 // * when reading, at least one condition needs to be satisfied:
722 // 1. DB mutex locked
723 // 2. accessed from a single-threaded write thread
1e59de90
TL
724 UnorderedMap<std::string, uint32_t> column_families_;
725 UnorderedMap<uint32_t, ColumnFamilyData*> column_family_data_;
7c673cae
FG
726
727 uint32_t max_column_family_;
1e59de90
TL
728 const FileOptions file_options_;
729
7c673cae
FG
730 ColumnFamilyData* dummy_cfd_;
731 // We don't hold the refcount here, since default column family always exists
732 // We are also not responsible for cleaning up default_cfd_cache_. This is
733 // just a cache that makes common case (accessing default column family)
734 // faster
735 ColumnFamilyData* default_cfd_cache_;
736
737 const std::string db_name_;
738 const ImmutableDBOptions* const db_options_;
7c673cae
FG
739 Cache* table_cache_;
740 WriteBufferManager* write_buffer_manager_;
741 WriteController* write_controller_;
f67539c2 742 BlockCacheTracer* const block_cache_tracer_;
20effc67 743 std::shared_ptr<IOTracer> io_tracer_;
1e59de90
TL
744 const std::string& db_id_;
745 std::string db_session_id_;
746};
747
748// A wrapper for ColumnFamilySet that supports releasing DB mutex during each
749// iteration over the iterator, because the cfd is Refed and Unrefed during
750// each iteration to prevent concurrent CF drop from destroying it (until
751// Unref).
752class RefedColumnFamilySet {
753 public:
754 explicit RefedColumnFamilySet(ColumnFamilySet* cfs) : wrapped_(cfs) {}
755
756 class iterator {
757 public:
758 explicit iterator(ColumnFamilySet::iterator wrapped) : wrapped_(wrapped) {
759 MaybeRef(*wrapped_);
760 }
761 ~iterator() { MaybeUnref(*wrapped_); }
762 inline void MaybeRef(ColumnFamilyData* cfd) {
763 if (cfd->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId) {
764 cfd->Ref();
765 }
766 }
767 inline void MaybeUnref(ColumnFamilyData* cfd) {
768 if (cfd->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId) {
769 cfd->UnrefAndTryDelete();
770 }
771 }
772 // NOTE: minimum operators for for-loop iteration
773 inline iterator& operator++() {
774 ColumnFamilyData* old = *wrapped_;
775 ++wrapped_;
776 // Can only unref & potentially free cfd after accessing its next_
777 MaybeUnref(old);
778 MaybeRef(*wrapped_);
779 return *this;
780 }
781 inline bool operator!=(const iterator& other) const {
782 return this->wrapped_ != other.wrapped_;
783 }
784 inline ColumnFamilyData* operator*() { return *wrapped_; }
785
786 private:
787 ColumnFamilySet::iterator wrapped_;
788 };
789
790 iterator begin() { return iterator(wrapped_->begin()); }
791 iterator end() { return iterator(wrapped_->end()); }
792
793 private:
794 ColumnFamilySet* wrapped_;
7c673cae
FG
795};
796
797// We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access
798// memtables of different column families (specified by ID in the write batch)
799class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables {
800 public:
801 explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set)
802 : column_family_set_(column_family_set), current_(nullptr) {}
803
804 // Constructs a ColumnFamilyMemTablesImpl equivalent to one constructed
805 // with the arguments used to construct *orig.
806 explicit ColumnFamilyMemTablesImpl(ColumnFamilyMemTablesImpl* orig)
807 : column_family_set_(orig->column_family_set_), current_(nullptr) {}
808
809 // sets current_ to ColumnFamilyData with column_family_id
810 // returns false if column family doesn't exist
811 // REQUIRES: use this function of DBImpl::column_family_memtables_ should be
812 // under a DB mutex OR from a write thread
813 bool Seek(uint32_t column_family_id) override;
814
815 // Returns log number of the selected column family
816 // REQUIRES: under a DB mutex OR from a write thread
817 uint64_t GetLogNumber() const override;
818
819 // REQUIRES: Seek() called first
820 // REQUIRES: use this function of DBImpl::column_family_memtables_ should be
821 // under a DB mutex OR from a write thread
822 virtual MemTable* GetMemTable() const override;
823
824 // Returns column family handle for the selected column family
825 // REQUIRES: use this function of DBImpl::column_family_memtables_ should be
826 // under a DB mutex OR from a write thread
827 virtual ColumnFamilyHandle* GetColumnFamilyHandle() override;
828
829 // Cannot be called while another thread is calling Seek().
830 // REQUIRES: use this function of DBImpl::column_family_memtables_ should be
831 // under a DB mutex OR from a write thread
832 virtual ColumnFamilyData* current() override { return current_; }
833
834 private:
835 ColumnFamilySet* column_family_set_;
836 ColumnFamilyData* current_;
837 ColumnFamilyHandleInternal handle_;
838};
839
840extern uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family);
841
842extern const Comparator* GetColumnFamilyUserComparator(
843 ColumnFamilyHandle* column_family);
844
f67539c2 845} // namespace ROCKSDB_NAMESPACE