]> git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/db/column_family.h
add subtree-ish sources for 12.0.3
[ceph.git] / ceph / src / rocksdb / db / column_family.h
1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under the BSD-style license found in the
3 // LICENSE file in the root directory of this source tree. An additional grant
4 // of patent rights can be found in the PATENTS file in the same directory.
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10 #pragma once
11
12 #include <unordered_map>
13 #include <string>
14 #include <vector>
15 #include <atomic>
16
17 #include "db/memtable_list.h"
18 #include "db/table_cache.h"
19 #include "db/table_properties_collector.h"
20 #include "db/write_batch_internal.h"
21 #include "db/write_controller.h"
22 #include "options/cf_options.h"
23 #include "rocksdb/compaction_job_stats.h"
24 #include "rocksdb/db.h"
25 #include "rocksdb/env.h"
26 #include "rocksdb/options.h"
27 #include "util/thread_local.h"
28
29 namespace rocksdb {
30
31 class Version;
32 class VersionSet;
33 class MemTable;
34 class MemTableListVersion;
35 class CompactionPicker;
36 class Compaction;
37 class InternalKey;
38 class InternalStats;
39 class ColumnFamilyData;
40 class DBImpl;
41 class LogBuffer;
42 class InstrumentedMutex;
43 class InstrumentedMutexLock;
44
45 extern const double kIncSlowdownRatio;
46
47 // ColumnFamilyHandleImpl is the class that clients use to access different
48 // column families. It has non-trivial destructor, which gets called when client
49 // is done using the column family
50 class ColumnFamilyHandleImpl : public ColumnFamilyHandle {
51 public:
52 // create while holding the mutex
53 ColumnFamilyHandleImpl(
54 ColumnFamilyData* cfd, DBImpl* db, InstrumentedMutex* mutex);
55 // destroy without mutex
56 virtual ~ColumnFamilyHandleImpl();
57 virtual ColumnFamilyData* cfd() const { return cfd_; }
58
59 virtual uint32_t GetID() const override;
60 virtual const std::string& GetName() const override;
61 virtual Status GetDescriptor(ColumnFamilyDescriptor* desc) override;
62 virtual const Comparator* GetComparator() const override;
63
64 private:
65 ColumnFamilyData* cfd_;
66 DBImpl* db_;
67 InstrumentedMutex* mutex_;
68 };
69
70 // Does not ref-count ColumnFamilyData
71 // We use this dummy ColumnFamilyHandleImpl because sometimes MemTableInserter
72 // calls DBImpl methods. When this happens, MemTableInserter need access to
73 // ColumnFamilyHandle (same as the client would need). In that case, we feed
74 // MemTableInserter dummy ColumnFamilyHandle and enable it to call DBImpl
75 // methods
76 class ColumnFamilyHandleInternal : public ColumnFamilyHandleImpl {
77 public:
78 ColumnFamilyHandleInternal()
79 : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr) {}
80
81 void SetCFD(ColumnFamilyData* _cfd) { internal_cfd_ = _cfd; }
82 virtual ColumnFamilyData* cfd() const override { return internal_cfd_; }
83
84 private:
85 ColumnFamilyData* internal_cfd_;
86 };
87
88 // holds references to memtable, all immutable memtables and version
89 struct SuperVersion {
90 // Accessing members of this class is not thread-safe and requires external
91 // synchronization (ie db mutex held or on write thread).
92 MemTable* mem;
93 MemTableListVersion* imm;
94 Version* current;
95 MutableCFOptions mutable_cf_options;
96 // Version number of the current SuperVersion
97 uint64_t version_number;
98
99 InstrumentedMutex* db_mutex;
100
101 // should be called outside the mutex
102 SuperVersion() = default;
103 ~SuperVersion();
104 SuperVersion* Ref();
105 // If Unref() returns true, Cleanup() should be called with mutex held
106 // before deleting this SuperVersion.
107 bool Unref();
108
109 // call these two methods with db mutex held
110 // Cleanup unrefs mem, imm and current. Also, it stores all memtables
111 // that needs to be deleted in to_delete vector. Unrefing those
112 // objects needs to be done in the mutex
113 void Cleanup();
114 void Init(MemTable* new_mem, MemTableListVersion* new_imm,
115 Version* new_current);
116
117 // The value of dummy is not actually used. kSVInUse takes its address as a
118 // mark in the thread local storage to indicate the SuperVersion is in use
119 // by thread. This way, the value of kSVInUse is guaranteed to have no
120 // conflict with SuperVersion object address and portable on different
121 // platform.
122 static int dummy;
123 static void* const kSVInUse;
124 static void* const kSVObsolete;
125
126 private:
127 std::atomic<uint32_t> refs;
128 // We need to_delete because during Cleanup(), imm->Unref() returns
129 // all memtables that we need to free through this vector. We then
130 // delete all those memtables outside of mutex, during destruction
131 autovector<MemTable*> to_delete;
132 };
133
134 extern Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options);
135
136 extern Status CheckConcurrentWritesSupported(
137 const ColumnFamilyOptions& cf_options);
138
139 extern ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
140 const ColumnFamilyOptions& src);
141 // Wrap user defined table proproties collector factories `from cf_options`
142 // into internal ones in int_tbl_prop_collector_factories. Add a system internal
143 // one too.
144 extern void GetIntTblPropCollectorFactory(
145 const ImmutableCFOptions& ioptions,
146 std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
147 int_tbl_prop_collector_factories);
148
149 class ColumnFamilySet;
150
151 // This class keeps all the data that a column family needs.
152 // Most methods require DB mutex held, unless otherwise noted
153 class ColumnFamilyData {
154 public:
155 ~ColumnFamilyData();
156
157 // thread-safe
158 uint32_t GetID() const { return id_; }
159 // thread-safe
160 const std::string& GetName() const { return name_; }
161
162 // Ref() can only be called from a context where the caller can guarantee
163 // that ColumnFamilyData is alive (while holding a non-zero ref already,
164 // holding a DB mutex, or as the leader in a write batch group).
165 void Ref() { refs_.fetch_add(1, std::memory_order_relaxed); }
166
167 // Unref decreases the reference count, but does not handle deletion
168 // when the count goes to 0. If this method returns true then the
169 // caller should delete the instance immediately, or later, by calling
170 // FreeDeadColumnFamilies(). Unref() can only be called while holding
171 // a DB mutex, or during single-threaded recovery.
172 bool Unref() {
173 int old_refs = refs_.fetch_sub(1, std::memory_order_relaxed);
174 assert(old_refs > 0);
175 return old_refs == 1;
176 }
177
178 // SetDropped() can only be called under following conditions:
179 // 1) Holding a DB mutex,
180 // 2) from single-threaded write thread, AND
181 // 3) from single-threaded VersionSet::LogAndApply()
182 // After dropping column family no other operation on that column family
183 // will be executed. All the files and memory will be, however, kept around
184 // until client drops the column family handle. That way, client can still
185 // access data from dropped column family.
186 // Column family can be dropped and still alive. In that state:
187 // *) Compaction and flush is not executed on the dropped column family.
188 // *) Client can continue reading from column family. Writes will fail unless
189 // WriteOptions::ignore_missing_column_families is true
190 // When the dropped column family is unreferenced, then we:
191 // *) Remove column family from the linked list maintained by ColumnFamilySet
192 // *) delete all memory associated with that column family
193 // *) delete all the files associated with that column family
194 void SetDropped();
195 bool IsDropped() const { return dropped_; }
196
197 // thread-safe
198 int NumberLevels() const { return ioptions_.num_levels; }
199
200 void SetLogNumber(uint64_t log_number) { log_number_ = log_number; }
201 uint64_t GetLogNumber() const { return log_number_; }
202
203 // thread-safe
204 const EnvOptions* soptions() const;
205 const ImmutableCFOptions* ioptions() const { return &ioptions_; }
206 // REQUIRES: DB mutex held
207 // This returns the MutableCFOptions used by current SuperVersion
208 // You should use this API to reference MutableCFOptions most of the time.
209 const MutableCFOptions* GetCurrentMutableCFOptions() const {
210 return &(super_version_->mutable_cf_options);
211 }
212 // REQUIRES: DB mutex held
213 // This returns the latest MutableCFOptions, which may be not in effect yet.
214 const MutableCFOptions* GetLatestMutableCFOptions() const {
215 return &mutable_cf_options_;
216 }
217
218 // REQUIRES: DB mutex held
219 // Build ColumnFamiliesOptions with immutable options and latest mutable
220 // options.
221 ColumnFamilyOptions GetLatestCFOptions() const;
222
223 bool is_delete_range_supported() { return is_delete_range_supported_; }
224
225 #ifndef ROCKSDB_LITE
226 // REQUIRES: DB mutex held
227 Status SetOptions(
228 const std::unordered_map<std::string, std::string>& options_map);
229 #endif // ROCKSDB_LITE
230
231 InternalStats* internal_stats() { return internal_stats_.get(); }
232
233 MemTableList* imm() { return &imm_; }
234 MemTable* mem() { return mem_; }
235 Version* current() { return current_; }
236 Version* dummy_versions() { return dummy_versions_; }
237 void SetCurrent(Version* _current);
238 uint64_t GetNumLiveVersions() const; // REQUIRE: DB mutex held
239 uint64_t GetTotalSstFilesSize() const; // REQUIRE: DB mutex held
240 void SetMemtable(MemTable* new_mem) { mem_ = new_mem; }
241
242 // calculate the oldest log needed for the durability of this column family
243 uint64_t OldestLogToKeep();
244
245 // See Memtable constructor for explanation of earliest_seq param.
246 MemTable* ConstructNewMemtable(const MutableCFOptions& mutable_cf_options,
247 SequenceNumber earliest_seq);
248 void CreateNewMemtable(const MutableCFOptions& mutable_cf_options,
249 SequenceNumber earliest_seq);
250
251 TableCache* table_cache() const { return table_cache_.get(); }
252
253 // See documentation in compaction_picker.h
254 // REQUIRES: DB mutex held
255 bool NeedsCompaction() const;
256 // REQUIRES: DB mutex held
257 Compaction* PickCompaction(const MutableCFOptions& mutable_options,
258 LogBuffer* log_buffer);
259
260 // Check if the passed range overlap with any running compactions.
261 // REQUIRES: DB mutex held
262 bool RangeOverlapWithCompaction(const Slice& smallest_user_key,
263 const Slice& largest_user_key,
264 int level) const;
265
266 // A flag to tell a manual compaction is to compact all levels together
267 // instad of for specific level.
268 static const int kCompactAllLevels;
269 // A flag to tell a manual compaction's output is base level.
270 static const int kCompactToBaseLevel;
271 // REQUIRES: DB mutex held
272 Compaction* CompactRange(const MutableCFOptions& mutable_cf_options,
273 int input_level, int output_level,
274 uint32_t output_path_id, const InternalKey* begin,
275 const InternalKey* end, InternalKey** compaction_end,
276 bool* manual_conflict);
277
278 CompactionPicker* compaction_picker() { return compaction_picker_.get(); }
279 // thread-safe
280 const Comparator* user_comparator() const {
281 return internal_comparator_.user_comparator();
282 }
283 // thread-safe
284 const InternalKeyComparator& internal_comparator() const {
285 return internal_comparator_;
286 }
287
288 const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
289 int_tbl_prop_collector_factories() const {
290 return &int_tbl_prop_collector_factories_;
291 }
292
293 SuperVersion* GetSuperVersion() { return super_version_; }
294 // thread-safe
295 // Return a already referenced SuperVersion to be used safely.
296 SuperVersion* GetReferencedSuperVersion(InstrumentedMutex* db_mutex);
297 // thread-safe
298 // Get SuperVersion stored in thread local storage. If it does not exist,
299 // get a reference from a current SuperVersion.
300 SuperVersion* GetThreadLocalSuperVersion(InstrumentedMutex* db_mutex);
301 // Try to return SuperVersion back to thread local storage. Retrun true on
302 // success and false on failure. It fails when the thread local storage
303 // contains anything other than SuperVersion::kSVInUse flag.
304 bool ReturnThreadLocalSuperVersion(SuperVersion* sv);
305 // thread-safe
306 uint64_t GetSuperVersionNumber() const {
307 return super_version_number_.load();
308 }
309 // will return a pointer to SuperVersion* if previous SuperVersion
310 // if its reference count is zero and needs deletion or nullptr if not
311 // As argument takes a pointer to allocated SuperVersion to enable
312 // the clients to allocate SuperVersion outside of mutex.
313 // IMPORTANT: Only call this from DBImpl::InstallSuperVersion()
314 SuperVersion* InstallSuperVersion(SuperVersion* new_superversion,
315 InstrumentedMutex* db_mutex,
316 const MutableCFOptions& mutable_cf_options);
317 SuperVersion* InstallSuperVersion(SuperVersion* new_superversion,
318 InstrumentedMutex* db_mutex);
319
320 void ResetThreadLocalSuperVersions();
321
322 // Protected by DB mutex
323 void set_pending_flush(bool value) { pending_flush_ = value; }
324 void set_pending_compaction(bool value) { pending_compaction_ = value; }
325 bool pending_flush() { return pending_flush_; }
326 bool pending_compaction() { return pending_compaction_; }
327
328 // Recalculate some small conditions, which are changed only during
329 // compaction, adding new memtable and/or
330 // recalculation of compaction score. These values are used in
331 // DBImpl::MakeRoomForWrite function to decide, if it need to make
332 // a write stall
333 void RecalculateWriteStallConditions(
334 const MutableCFOptions& mutable_cf_options);
335
336 private:
337 friend class ColumnFamilySet;
338 ColumnFamilyData(uint32_t id, const std::string& name,
339 Version* dummy_versions, Cache* table_cache,
340 WriteBufferManager* write_buffer_manager,
341 const ColumnFamilyOptions& options,
342 const ImmutableDBOptions& db_options,
343 const EnvOptions& env_options,
344 ColumnFamilySet* column_family_set);
345
346 uint32_t id_;
347 const std::string name_;
348 Version* dummy_versions_; // Head of circular doubly-linked list of versions.
349 Version* current_; // == dummy_versions->prev_
350
351 std::atomic<int> refs_; // outstanding references to ColumnFamilyData
352 bool dropped_; // true if client dropped it
353
354 const InternalKeyComparator internal_comparator_;
355 std::vector<std::unique_ptr<IntTblPropCollectorFactory>>
356 int_tbl_prop_collector_factories_;
357
358 const ColumnFamilyOptions initial_cf_options_;
359 const ImmutableCFOptions ioptions_;
360 MutableCFOptions mutable_cf_options_;
361
362 const bool is_delete_range_supported_;
363
364 std::unique_ptr<TableCache> table_cache_;
365
366 std::unique_ptr<InternalStats> internal_stats_;
367
368 WriteBufferManager* write_buffer_manager_;
369
370 MemTable* mem_;
371 MemTableList imm_;
372 SuperVersion* super_version_;
373
374 // An ordinal representing the current SuperVersion. Updated by
375 // InstallSuperVersion(), i.e. incremented every time super_version_
376 // changes.
377 std::atomic<uint64_t> super_version_number_;
378
379 // Thread's local copy of SuperVersion pointer
380 // This needs to be destructed before mutex_
381 std::unique_ptr<ThreadLocalPtr> local_sv_;
382
383 // pointers for a circular linked list. we use it to support iterations over
384 // all column families that are alive (note: dropped column families can also
385 // be alive as long as client holds a reference)
386 ColumnFamilyData* next_;
387 ColumnFamilyData* prev_;
388
389 // This is the earliest log file number that contains data from this
390 // Column Family. All earlier log files must be ignored and not
391 // recovered from
392 uint64_t log_number_;
393
394 // An object that keeps all the compaction stats
395 // and picks the next compaction
396 std::unique_ptr<CompactionPicker> compaction_picker_;
397
398 ColumnFamilySet* column_family_set_;
399
400 std::unique_ptr<WriteControllerToken> write_controller_token_;
401
402 // If true --> this ColumnFamily is currently present in DBImpl::flush_queue_
403 bool pending_flush_;
404
405 // If true --> this ColumnFamily is currently present in
406 // DBImpl::compaction_queue_
407 bool pending_compaction_;
408
409 uint64_t prev_compaction_needed_bytes_;
410
411 // if the database was opened with 2pc enabled
412 bool allow_2pc_;
413 };
414
415 // ColumnFamilySet has interesting thread-safety requirements
416 // * CreateColumnFamily() or RemoveColumnFamily() -- need to be protected by DB
417 // mutex AND executed in the write thread.
418 // CreateColumnFamily() should ONLY be called from VersionSet::LogAndApply() AND
419 // single-threaded write thread. It is also called during Recovery and in
420 // DumpManifest().
421 // RemoveColumnFamily() is only called from SetDropped(). DB mutex needs to be
422 // held and it needs to be executed from the write thread. SetDropped() also
423 // guarantees that it will be called only from single-threaded LogAndApply(),
424 // but this condition is not that important.
425 // * Iteration -- hold DB mutex, but you can release it in the body of
426 // iteration. If you release DB mutex in body, reference the column
427 // family before the mutex and unreference after you unlock, since the column
428 // family might get dropped when the DB mutex is released
429 // * GetDefault() -- thread safe
430 // * GetColumnFamily() -- either inside of DB mutex or from a write thread
431 // * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(),
432 // NumberOfColumnFamilies -- inside of DB mutex
433 class ColumnFamilySet {
434 public:
435 // ColumnFamilySet supports iteration
436 class iterator {
437 public:
438 explicit iterator(ColumnFamilyData* cfd)
439 : current_(cfd) {}
440 iterator& operator++() {
441 // dropped column families might still be included in this iteration
442 // (we're only removing them when client drops the last reference to the
443 // column family).
444 // dummy is never dead, so this will never be infinite
445 do {
446 current_ = current_->next_;
447 } while (current_->refs_.load(std::memory_order_relaxed) == 0);
448 return *this;
449 }
450 bool operator!=(const iterator& other) {
451 return this->current_ != other.current_;
452 }
453 ColumnFamilyData* operator*() { return current_; }
454
455 private:
456 ColumnFamilyData* current_;
457 };
458
459 ColumnFamilySet(const std::string& dbname,
460 const ImmutableDBOptions* db_options,
461 const EnvOptions& env_options, Cache* table_cache,
462 WriteBufferManager* write_buffer_manager,
463 WriteController* write_controller);
464 ~ColumnFamilySet();
465
466 ColumnFamilyData* GetDefault() const;
467 // GetColumnFamily() calls return nullptr if column family is not found
468 ColumnFamilyData* GetColumnFamily(uint32_t id) const;
469 ColumnFamilyData* GetColumnFamily(const std::string& name) const;
470 // this call will return the next available column family ID. it guarantees
471 // that there is no column family with id greater than or equal to the
472 // returned value in the current running instance or anytime in RocksDB
473 // instance history.
474 uint32_t GetNextColumnFamilyID();
475 uint32_t GetMaxColumnFamily();
476 void UpdateMaxColumnFamily(uint32_t new_max_column_family);
477 size_t NumberOfColumnFamilies() const;
478
479 ColumnFamilyData* CreateColumnFamily(const std::string& name, uint32_t id,
480 Version* dummy_version,
481 const ColumnFamilyOptions& options);
482
483 iterator begin() { return iterator(dummy_cfd_->next_); }
484 iterator end() { return iterator(dummy_cfd_); }
485
486 // REQUIRES: DB mutex held
487 // Don't call while iterating over ColumnFamilySet
488 void FreeDeadColumnFamilies();
489
490 Cache* get_table_cache() { return table_cache_; }
491
492 private:
493 friend class ColumnFamilyData;
494 // helper function that gets called from cfd destructor
495 // REQUIRES: DB mutex held
496 void RemoveColumnFamily(ColumnFamilyData* cfd);
497
498 // column_families_ and column_family_data_ need to be protected:
499 // * when mutating both conditions have to be satisfied:
500 // 1. DB mutex locked
501 // 2. thread currently in single-threaded write thread
502 // * when reading, at least one condition needs to be satisfied:
503 // 1. DB mutex locked
504 // 2. accessed from a single-threaded write thread
505 std::unordered_map<std::string, uint32_t> column_families_;
506 std::unordered_map<uint32_t, ColumnFamilyData*> column_family_data_;
507
508 uint32_t max_column_family_;
509 ColumnFamilyData* dummy_cfd_;
510 // We don't hold the refcount here, since default column family always exists
511 // We are also not responsible for cleaning up default_cfd_cache_. This is
512 // just a cache that makes common case (accessing default column family)
513 // faster
514 ColumnFamilyData* default_cfd_cache_;
515
516 const std::string db_name_;
517 const ImmutableDBOptions* const db_options_;
518 const EnvOptions env_options_;
519 Cache* table_cache_;
520 WriteBufferManager* write_buffer_manager_;
521 WriteController* write_controller_;
522 };
523
524 // We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access
525 // memtables of different column families (specified by ID in the write batch)
526 class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables {
527 public:
528 explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set)
529 : column_family_set_(column_family_set), current_(nullptr) {}
530
531 // Constructs a ColumnFamilyMemTablesImpl equivalent to one constructed
532 // with the arguments used to construct *orig.
533 explicit ColumnFamilyMemTablesImpl(ColumnFamilyMemTablesImpl* orig)
534 : column_family_set_(orig->column_family_set_), current_(nullptr) {}
535
536 // sets current_ to ColumnFamilyData with column_family_id
537 // returns false if column family doesn't exist
538 // REQUIRES: use this function of DBImpl::column_family_memtables_ should be
539 // under a DB mutex OR from a write thread
540 bool Seek(uint32_t column_family_id) override;
541
542 // Returns log number of the selected column family
543 // REQUIRES: under a DB mutex OR from a write thread
544 uint64_t GetLogNumber() const override;
545
546 // REQUIRES: Seek() called first
547 // REQUIRES: use this function of DBImpl::column_family_memtables_ should be
548 // under a DB mutex OR from a write thread
549 virtual MemTable* GetMemTable() const override;
550
551 // Returns column family handle for the selected column family
552 // REQUIRES: use this function of DBImpl::column_family_memtables_ should be
553 // under a DB mutex OR from a write thread
554 virtual ColumnFamilyHandle* GetColumnFamilyHandle() override;
555
556 // Cannot be called while another thread is calling Seek().
557 // REQUIRES: use this function of DBImpl::column_family_memtables_ should be
558 // under a DB mutex OR from a write thread
559 virtual ColumnFamilyData* current() override { return current_; }
560
561 private:
562 ColumnFamilySet* column_family_set_;
563 ColumnFamilyData* current_;
564 ColumnFamilyHandleInternal handle_;
565 };
566
567 extern uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family);
568
569 extern const Comparator* GetColumnFamilyUserComparator(
570 ColumnFamilyHandle* column_family);
571
572 } // namespace rocksdb