]>
Commit | Line | Data |
---|---|---|
7c673cae | 1 | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
11fdf7f2 TL |
2 | // This source code is licensed under both the GPLv2 (found in the |
3 | // COPYING file in the root directory) and Apache 2.0 License | |
4 | // (found in the LICENSE.Apache file in the root directory). | |
7c673cae FG |
5 | // |
6 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |
7 | // Use of this source code is governed by a BSD-style license that can be | |
8 | // found in the LICENSE file. See the AUTHORS file for names of contributors. | |
9 | ||
10 | #pragma once | |
11 | ||
12 | #include <unordered_map> | |
13 | #include <string> | |
14 | #include <vector> | |
15 | #include <atomic> | |
16 | ||
17 | #include "db/memtable_list.h" | |
18 | #include "db/table_cache.h" | |
19 | #include "db/table_properties_collector.h" | |
20 | #include "db/write_batch_internal.h" | |
21 | #include "db/write_controller.h" | |
22 | #include "options/cf_options.h" | |
23 | #include "rocksdb/compaction_job_stats.h" | |
24 | #include "rocksdb/db.h" | |
25 | #include "rocksdb/env.h" | |
26 | #include "rocksdb/options.h" | |
f67539c2 | 27 | #include "trace_replay/block_cache_tracer.h" |
7c673cae FG |
28 | #include "util/thread_local.h" |
29 | ||
f67539c2 | 30 | namespace ROCKSDB_NAMESPACE { |
7c673cae FG |
31 | |
32 | class Version; | |
33 | class VersionSet; | |
11fdf7f2 | 34 | class VersionStorageInfo; |
7c673cae FG |
35 | class MemTable; |
36 | class MemTableListVersion; | |
37 | class CompactionPicker; | |
38 | class Compaction; | |
39 | class InternalKey; | |
40 | class InternalStats; | |
41 | class ColumnFamilyData; | |
42 | class DBImpl; | |
43 | class LogBuffer; | |
44 | class InstrumentedMutex; | |
45 | class InstrumentedMutexLock; | |
11fdf7f2 | 46 | struct SuperVersionContext; |
20effc67 | 47 | class BlobFileCache; |
7c673cae FG |
48 | |
49 | extern const double kIncSlowdownRatio; | |
f67539c2 TL |
50 | // This file contains a list of data structures for managing column family |
51 | // level metadata. | |
52 | // | |
53 | // The basic relationships among classes declared here are illustrated as | |
54 | // following: | |
55 | // | |
56 | // +----------------------+ +----------------------+ +--------+ | |
57 | // +---+ ColumnFamilyHandle 1 | +--+ ColumnFamilyHandle 2 | | DBImpl | | |
58 | // | +----------------------+ | +----------------------+ +----+---+ | |
59 | // | +--------------------------+ | | |
60 | // | | +-----------------------------+ | |
61 | // | | | | |
62 | // | | +-----------------------------v-------------------------------+ | |
63 | // | | | | | |
64 | // | | | ColumnFamilySet | | |
65 | // | | | | | |
66 | // | | +-------------+--------------------------+----------------+---+ | |
67 | // | | | | | | |
68 | // | +-------------------------------------+ | | | |
69 | // | | | | v | |
70 | // | +-------------v-------------+ +-----v----v---------+ | |
71 | // | | | | | | |
72 | // | | ColumnFamilyData 1 | | ColumnFamilyData 2 | ...... | |
73 | // | | | | | | |
74 | // +---> | | | | |
75 | // | +---------+ | | | |
76 | // | | MemTable| | | | |
77 | // | | List | | | | |
78 | // +--------+---+--+-+----+----+ +--------------------++ | |
79 | // | | | | | |
80 | // | | | | | |
81 | // | | | +-----------------------+ | |
82 | // | | +-----------+ | | |
83 | // v +--------+ | | | |
84 | // +--------+--------+ | | | | |
85 | // | | | | +----------v----------+ | |
86 | // +---> |SuperVersion 1.a +-----------------> | | |
87 | // | +------+ | | MemTableListVersion | | |
88 | // +---+-------------+ | | | | | | |
89 | // | | | | +----+------------+---+ | |
90 | // | current | | | | | | |
91 | // | +-------------+ | |mem | | | |
92 | // | | | | | | | |
93 | // +-v---v-------+ +---v--v---+ +-----v----+ +----v-----+ | |
94 | // | | | | | | | | | |
95 | // | Version 1.a | | memtable | | memtable | | memtable | | |
96 | // | | | 1.a | | 1.b | | 1.c | | |
97 | // +-------------+ | | | | | | | |
98 | // +----------+ +----------+ +----------+ | |
99 | // | |
100 | // DBImpl keeps a ColumnFamilySet, which references all column families by | |
101 | // pointing to the respective ColumnFamilyData object of each column family. | |
102 | // This is how DBImpl can list and operate on all the column families. | |
103 | // ColumnFamilyHandle also points to ColumnFamilyData directly, so that | |
104 | // when a user executes a query, it can directly find memtables and Version | |
105 | // as well as SuperVersion of the column family, without going through | |
106 | // ColumnFamilySet. | |
107 | // | |
108 | // ColumnFamilySet points to the latest view of the LSM-tree (list of memtables | |
109 | // and SST files) indirectly, while ongoing operations may hold references | |
110 | // to a current or an out-of-date SuperVersion, which in turn points to a | |
111 | // point-in-time view of the LSM-tree. This guarantees the memtables and SST | |
112 | // files being operated on will not go away, until the SuperVersion is | |
113 | // unreferenced to 0 and destroyed. | |
114 | // | |
115 | // The following graph illustrates possible referencing relationships: | |
116 | // | |
117 | // Column +--------------+ current +-----------+ | |
118 | // Family +---->+ +------------------->+ | | |
119 | // Data | SuperVersion +----------+ | Version A | | |
120 | // | 3 | imm | | | | |
121 | // Iter2 +----->+ | +-------v------+ +-----------+ | |
122 | // +-----+--------+ | MemtableList +----------------> Empty | |
123 | // | | Version r | +-----------+ | |
124 | // | +--------------+ | | | |
125 | // +------------------+ current| Version B | | |
126 | // +--------------+ | +----->+ | | |
127 | // | | | | +-----+-----+ | |
128 | // Compaction +>+ SuperVersion +-------------+ ^ | |
129 | // Job | 2 +------+ | |current | |
130 | // | +----+ | | mem | +------------+ | |
131 | // +--------------+ | | +---------------------> | | |
132 | // | +------------------------> MemTable a | | |
133 | // | mem | | | | |
134 | // +--------------+ | | +------------+ | |
135 | // | +--------------------------+ | |
136 | // Iter1 +-----> SuperVersion | | +------------+ | |
137 | // | 1 +------------------------------>+ | | |
138 | // | +-+ | mem | MemTable b | | |
139 | // +--------------+ | | | | | |
140 | // | | +--------------+ +-----^------+ | |
141 | // | |imm | MemtableList | | | |
142 | // | +--->+ Version s +------------+ | |
143 | // | +--------------+ | |
144 | // | +--------------+ | |
145 | // | | MemtableList | | |
146 | // +------>+ Version t +--------> Empty | |
147 | // imm +--------------+ | |
148 | // | |
149 | // In this example, even if the current LSM-tree consists of Version A and | |
150 | // memtable a, which is also referenced by SuperVersion, two older SuperVersions | |
151 | // SuperVersion2 and SuperVersion1 still exist, and are referenced by a | |
152 | // compaction job and an old iterator Iter1, respectively. SuperVersion2 | |
153 | // contains Version B, memtable a and memtable b; SuperVersion1 contains | |
154 | // Version B and memtable b (mutable). As a result, Version B and memtable b | |
155 | // are prevented from being destroyed or deleted. | |
7c673cae FG |
156 | |
157 | // ColumnFamilyHandleImpl is the class that clients use to access different | |
158 | // column families. It has a non-trivial destructor, which gets called when | |
159 | // the client is done using the column family. | |
160 | class ColumnFamilyHandleImpl : public ColumnFamilyHandle { | |
161 | public: | |
162 | // create while holding the mutex | |
163 | ColumnFamilyHandleImpl( | |
164 | ColumnFamilyData* cfd, DBImpl* db, InstrumentedMutex* mutex); | |
165 | // destroy without mutex | |
166 | virtual ~ColumnFamilyHandleImpl(); | |
167 | virtual ColumnFamilyData* cfd() const { return cfd_; } | |
168 | ||
169 | virtual uint32_t GetID() const override; | |
170 | virtual const std::string& GetName() const override; | |
171 | virtual Status GetDescriptor(ColumnFamilyDescriptor* desc) override; | |
172 | virtual const Comparator* GetComparator() const override; | |
173 | ||
174 | private: | |
175 | ColumnFamilyData* cfd_; | |
176 | DBImpl* db_; | |
177 | InstrumentedMutex* mutex_; | |
178 | }; | |
179 | ||
180 | // Does not ref-count ColumnFamilyData | |
181 | // We use this dummy ColumnFamilyHandleImpl because sometimes MemTableInserter | |
182 | // calls DBImpl methods. When this happens, MemTableInserter needs access to | |
183 | // ColumnFamilyHandle (same as the client would need). In that case, we feed | |
184 | // MemTableInserter dummy ColumnFamilyHandle and enable it to call DBImpl | |
185 | // methods | |
186 | class ColumnFamilyHandleInternal : public ColumnFamilyHandleImpl { | |
187 | public: | |
188 | ColumnFamilyHandleInternal() | |
11fdf7f2 | 189 | : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), internal_cfd_(nullptr) {} |
7c673cae FG |
190 | |
191 | void SetCFD(ColumnFamilyData* _cfd) { internal_cfd_ = _cfd; } | |
192 | virtual ColumnFamilyData* cfd() const override { return internal_cfd_; } | |
193 | ||
194 | private: | |
195 | ColumnFamilyData* internal_cfd_; | |
196 | }; | |
197 | ||
198 | // holds references to memtable, all immutable memtables and version | |
199 | struct SuperVersion { | |
200 | // Accessing members of this class is not thread-safe and requires external | |
201 | // synchronization (i.e. db mutex held or on write thread). | |
f67539c2 | 202 | ColumnFamilyData* cfd; |
7c673cae FG |
203 | MemTable* mem; |
204 | MemTableListVersion* imm; | |
205 | Version* current; | |
206 | MutableCFOptions mutable_cf_options; | |
207 | // Version number of the current SuperVersion | |
208 | uint64_t version_number; | |
11fdf7f2 | 209 | WriteStallCondition write_stall_condition; |
7c673cae FG |
210 | |
211 | InstrumentedMutex* db_mutex; | |
212 | ||
213 | // should be called outside the mutex | |
214 | SuperVersion() = default; | |
215 | ~SuperVersion(); | |
216 | SuperVersion* Ref(); | |
217 | // If Unref() returns true, Cleanup() should be called with mutex held | |
218 | // before deleting this SuperVersion. | |
219 | bool Unref(); | |
220 | ||
221 | // call these two methods with db mutex held | |
222 | // Cleanup unrefs mem, imm and current. Also, it stores all memtables | |
223 | // that need to be deleted in the to_delete vector. Unrefing those | |
224 | // objects needs to be done in the mutex | |
225 | void Cleanup(); | |
f67539c2 TL |
226 | void Init(ColumnFamilyData* new_cfd, MemTable* new_mem, |
227 | MemTableListVersion* new_imm, Version* new_current); | |
7c673cae FG |
228 | |
229 | // The value of dummy is not actually used. kSVInUse takes its address as a | |
230 | // mark in the thread local storage to indicate the SuperVersion is in use | |
231 | // by a thread. This way, the value of kSVInUse is guaranteed to have no | |
232 | // conflict with any SuperVersion object address and is portable across | |
233 | // different platforms. | |
234 | static int dummy; | |
235 | static void* const kSVInUse; | |
236 | static void* const kSVObsolete; | |
237 | ||
238 | private: | |
239 | std::atomic<uint32_t> refs; | |
240 | // We need to_delete because during Cleanup(), imm->Unref() returns | |
241 | // all memtables that we need to free through this vector. We then | |
242 | // delete all those memtables outside of mutex, during destruction | |
243 | autovector<MemTable*> to_delete; | |
244 | }; | |
245 | ||
246 | extern Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options); | |
247 | ||
248 | extern Status CheckConcurrentWritesSupported( | |
249 | const ColumnFamilyOptions& cf_options); | |
250 | ||
11fdf7f2 TL |
251 | extern Status CheckCFPathsSupported(const DBOptions& db_options, |
252 | const ColumnFamilyOptions& cf_options); | |
253 | ||
7c673cae FG |
254 | extern ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, |
255 | const ColumnFamilyOptions& src); | |
256 | // Wrap user defined table properties collector factories `from cf_options` | |
257 | // into internal ones in int_tbl_prop_collector_factories. Add a system internal | |
258 | // one too. | |
259 | extern void GetIntTblPropCollectorFactory( | |
260 | const ImmutableCFOptions& ioptions, | |
261 | std::vector<std::unique_ptr<IntTblPropCollectorFactory>>* | |
262 | int_tbl_prop_collector_factories); | |
263 | ||
264 | class ColumnFamilySet; | |
265 | ||
266 | // This class keeps all the data that a column family needs. | |
267 | // Most methods require DB mutex held, unless otherwise noted | |
268 | class ColumnFamilyData { | |
269 | public: | |
270 | ~ColumnFamilyData(); | |
271 | ||
272 | // thread-safe | |
273 | uint32_t GetID() const { return id_; } | |
274 | // thread-safe | |
275 | const std::string& GetName() const { return name_; } | |
276 | ||
277 | // Ref() can only be called from a context where the caller can guarantee | |
278 | // that ColumnFamilyData is alive (while holding a non-zero ref already, | |
279 | // holding a DB mutex, or as the leader in a write batch group). | |
f67539c2 | 280 | void Ref() { refs_.fetch_add(1); } |
7c673cae FG |
281 | |
282 | // Unref decreases the reference count, but does not handle deletion | |
283 | // when the count goes to 0. If this method returns true then the | |
284 | // caller should delete the instance immediately, or later, by calling | |
285 | // FreeDeadColumnFamilies(). Unref() can only be called while holding | |
286 | // a DB mutex, or during single-threaded recovery. | |
287 | bool Unref() { | |
f67539c2 | 288 | int old_refs = refs_.fetch_sub(1); |
7c673cae FG |
289 | assert(old_refs > 0); |
290 | return old_refs == 1; | |
291 | } | |
292 | ||
f67539c2 TL |
293 | // UnrefAndTryDelete() decreases the reference count and frees if needed; it |
294 | // returns true if this is freed, else false. UnrefAndTryDelete() can only | |
295 | // be called while holding a DB mutex, or during single-threaded recovery. | |
296 | bool UnrefAndTryDelete(); | |
297 | ||
7c673cae FG |
298 | // SetDropped() can only be called under the following conditions: |
299 | // 1) Holding a DB mutex, | |
300 | // 2) from single-threaded write thread, AND | |
301 | // 3) from single-threaded VersionSet::LogAndApply() | |
302 | // After dropping column family no other operation on that column family | |
303 | // will be executed. All the files and memory will be, however, kept around | |
304 | // until client drops the column family handle. That way, client can still | |
305 | // access data from dropped column family. | |
306 | // A column family can be dropped and still be alive. In that state: | |
307 | // *) Compaction and flush are not executed on the dropped column family. | |
308 | // *) Client can continue reading from column family. Writes will fail unless | |
309 | // WriteOptions::ignore_missing_column_families is true | |
310 | // When the dropped column family is unreferenced, then we: | |
311 | // *) Remove column family from the linked list maintained by ColumnFamilySet | |
312 | // *) delete all memory associated with that column family | |
313 | // *) delete all the files associated with that column family | |
314 | void SetDropped(); | |
11fdf7f2 | 315 | bool IsDropped() const { return dropped_.load(std::memory_order_relaxed); } |
7c673cae FG |
316 | |
317 | // thread-safe | |
318 | int NumberLevels() const { return ioptions_.num_levels; } | |
319 | ||
320 | void SetLogNumber(uint64_t log_number) { log_number_ = log_number; } | |
321 | uint64_t GetLogNumber() const { return log_number_; } | |
322 | ||
11fdf7f2 TL |
323 | void SetFlushReason(FlushReason flush_reason) { |
324 | flush_reason_ = flush_reason; | |
325 | } | |
326 | FlushReason GetFlushReason() const { return flush_reason_; } | |
7c673cae | 327 | // thread-safe |
f67539c2 | 328 | const FileOptions* soptions() const; |
7c673cae FG |
329 | const ImmutableCFOptions* ioptions() const { return &ioptions_; } |
330 | // REQUIRES: DB mutex held | |
331 | // This returns the MutableCFOptions used by current SuperVersion | |
332 | // You should use this API to reference MutableCFOptions most of the time. | |
333 | const MutableCFOptions* GetCurrentMutableCFOptions() const { | |
334 | return &(super_version_->mutable_cf_options); | |
335 | } | |
336 | // REQUIRES: DB mutex held | |
337 | // This returns the latest MutableCFOptions, which may be not in effect yet. | |
338 | const MutableCFOptions* GetLatestMutableCFOptions() const { | |
339 | return &mutable_cf_options_; | |
340 | } | |
341 | ||
342 | // REQUIRES: DB mutex held | |
343 | // Build ColumnFamilyOptions with immutable options and latest mutable | |
344 | // options. | |
345 | ColumnFamilyOptions GetLatestCFOptions() const; | |
346 | ||
347 | bool is_delete_range_supported() { return is_delete_range_supported_; } | |
348 | ||
f67539c2 TL |
349 | // Validate CF options against DB options |
350 | static Status ValidateOptions(const DBOptions& db_options, | |
351 | const ColumnFamilyOptions& cf_options); | |
7c673cae FG |
352 | #ifndef ROCKSDB_LITE |
353 | // REQUIRES: DB mutex held | |
354 | Status SetOptions( | |
f67539c2 | 355 | const DBOptions& db_options, |
7c673cae FG |
356 | const std::unordered_map<std::string, std::string>& options_map); |
357 | #endif // ROCKSDB_LITE | |
358 | ||
359 | InternalStats* internal_stats() { return internal_stats_.get(); } | |
360 | ||
361 | MemTableList* imm() { return &imm_; } | |
362 | MemTable* mem() { return mem_; } | |
363 | Version* current() { return current_; } | |
364 | Version* dummy_versions() { return dummy_versions_; } | |
365 | void SetCurrent(Version* _current); | |
366 | uint64_t GetNumLiveVersions() const; // REQUIRE: DB mutex held | |
367 | uint64_t GetTotalSstFilesSize() const; // REQUIRE: DB mutex held | |
11fdf7f2 TL |
368 | uint64_t GetLiveSstFilesSize() const; // REQUIRE: DB mutex held |
369 | void SetMemtable(MemTable* new_mem) { | |
370 | uint64_t memtable_id = last_memtable_id_.fetch_add(1) + 1; | |
371 | new_mem->SetID(memtable_id); | |
372 | mem_ = new_mem; | |
373 | } | |
7c673cae FG |
374 | |
375 | // calculate the oldest log needed for the durability of this column family | |
376 | uint64_t OldestLogToKeep(); | |
377 | ||
378 | // See Memtable constructor for explanation of earliest_seq param. | |
379 | MemTable* ConstructNewMemtable(const MutableCFOptions& mutable_cf_options, | |
380 | SequenceNumber earliest_seq); | |
381 | void CreateNewMemtable(const MutableCFOptions& mutable_cf_options, | |
382 | SequenceNumber earliest_seq); | |
383 | ||
384 | TableCache* table_cache() const { return table_cache_.get(); } | |
20effc67 | 385 | BlobFileCache* blob_file_cache() const { return blob_file_cache_.get(); } |
7c673cae FG |
386 | |
387 | // See documentation in compaction_picker.h | |
388 | // REQUIRES: DB mutex held | |
389 | bool NeedsCompaction() const; | |
390 | // REQUIRES: DB mutex held | |
391 | Compaction* PickCompaction(const MutableCFOptions& mutable_options, | |
20effc67 | 392 | const MutableDBOptions& mutable_db_options, |
7c673cae FG |
393 | LogBuffer* log_buffer); |
394 | ||
395 | // Check if the passed range overlaps with any running compactions. | |
396 | // REQUIRES: DB mutex held | |
397 | bool RangeOverlapWithCompaction(const Slice& smallest_user_key, | |
398 | const Slice& largest_user_key, | |
399 | int level) const; | |
400 | ||
11fdf7f2 TL |
401 | // Check if the passed ranges overlap with any unflushed memtables |
402 | // (immutable or mutable). | |
403 | // | |
404 | // @param super_version A referenced SuperVersion that will be held for the | |
405 | // duration of this function. | |
406 | // | |
407 | // Thread-safe | |
408 | Status RangesOverlapWithMemtables(const autovector<Range>& ranges, | |
20effc67 TL |
409 | SuperVersion* super_version, |
410 | bool allow_data_in_errors, bool* overlap); | |
11fdf7f2 | 411 | |
7c673cae | 412 | // A flag to tell a manual compaction is to compact all levels together |
11fdf7f2 | 413 | // instead of a specific level. |
7c673cae FG |
414 | static const int kCompactAllLevels; |
415 | // A flag to tell a manual compaction's output is base level. | |
416 | static const int kCompactToBaseLevel; | |
417 | // REQUIRES: DB mutex held | |
418 | Compaction* CompactRange(const MutableCFOptions& mutable_cf_options, | |
20effc67 | 419 | const MutableDBOptions& mutable_db_options, |
7c673cae | 420 | int input_level, int output_level, |
f67539c2 | 421 | const CompactRangeOptions& compact_range_options, |
11fdf7f2 | 422 | const InternalKey* begin, const InternalKey* end, |
f67539c2 TL |
423 | InternalKey** compaction_end, bool* manual_conflict, |
424 | uint64_t max_file_num_to_ignore); | |
7c673cae FG |
425 | |
426 | CompactionPicker* compaction_picker() { return compaction_picker_.get(); } | |
427 | // thread-safe | |
428 | const Comparator* user_comparator() const { | |
429 | return internal_comparator_.user_comparator(); | |
430 | } | |
431 | // thread-safe | |
432 | const InternalKeyComparator& internal_comparator() const { | |
433 | return internal_comparator_; | |
434 | } | |
435 | ||
436 | const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>* | |
437 | int_tbl_prop_collector_factories() const { | |
438 | return &int_tbl_prop_collector_factories_; | |
439 | } | |
440 | ||
441 | SuperVersion* GetSuperVersion() { return super_version_; } | |
442 | // thread-safe | |
443 | // Return an already referenced SuperVersion to be used safely. | |
f67539c2 | 444 | SuperVersion* GetReferencedSuperVersion(DBImpl* db); |
7c673cae FG |
445 | // thread-safe |
446 | // Get SuperVersion stored in thread local storage. If it does not exist, | |
447 | // get a reference from a current SuperVersion. | |
f67539c2 | 448 | SuperVersion* GetThreadLocalSuperVersion(DBImpl* db); |
7c673cae FG |
449 | // Try to return SuperVersion back to thread local storage. Return true on |
450 | // success and false on failure. It fails when the thread local storage | |
451 | // contains anything other than SuperVersion::kSVInUse flag. | |
452 | bool ReturnThreadLocalSuperVersion(SuperVersion* sv); | |
453 | // thread-safe | |
454 | uint64_t GetSuperVersionNumber() const { | |
455 | return super_version_number_.load(); | |
456 | } | |
457 | // will return a pointer to SuperVersion* if previous SuperVersion | |
458 | // if its reference count is zero and needs deletion or nullptr if not | |
459 | // As argument takes a pointer to allocated SuperVersion to enable | |
460 | // the clients to allocate SuperVersion outside of mutex. | |
461 | // IMPORTANT: Only call this from DBImpl::InstallSuperVersion() | |
11fdf7f2 TL |
462 | void InstallSuperVersion(SuperVersionContext* sv_context, |
463 | InstrumentedMutex* db_mutex, | |
464 | const MutableCFOptions& mutable_cf_options); | |
465 | void InstallSuperVersion(SuperVersionContext* sv_context, | |
466 | InstrumentedMutex* db_mutex); | |
7c673cae FG |
467 | |
468 | void ResetThreadLocalSuperVersions(); | |
469 | ||
470 | // Protected by DB mutex | |
11fdf7f2 TL |
471 | void set_queued_for_flush(bool value) { queued_for_flush_ = value; } |
472 | void set_queued_for_compaction(bool value) { queued_for_compaction_ = value; } | |
473 | bool queued_for_flush() { return queued_for_flush_; } | |
474 | bool queued_for_compaction() { return queued_for_compaction_; } | |
475 | ||
476 | enum class WriteStallCause { | |
477 | kNone, | |
478 | kMemtableLimit, | |
479 | kL0FileCountLimit, | |
480 | kPendingCompactionBytes, | |
481 | }; | |
482 | static std::pair<WriteStallCondition, WriteStallCause> | |
483 | GetWriteStallConditionAndCause(int num_unflushed_memtables, int num_l0_files, | |
484 | uint64_t num_compaction_needed_bytes, | |
485 | const MutableCFOptions& mutable_cf_options); | |
7c673cae FG |
486 | |
487 | // Recalculate some small conditions, which are changed only during | |
488 | // compaction, adding new memtable and/or | |
489 | // recalculation of compaction score. These values are used in | |
490 | // DBImpl::MakeRoomForWrite function to decide if it needs to make | |
491 | // a write stall | |
11fdf7f2 | 492 | WriteStallCondition RecalculateWriteStallConditions( |
7c673cae FG |
493 | const MutableCFOptions& mutable_cf_options); |
494 | ||
11fdf7f2 TL |
495 | void set_initialized() { initialized_.store(true); } |
496 | ||
497 | bool initialized() const { return initialized_.load(); } | |
498 | ||
499 | const ColumnFamilyOptions& initial_cf_options() { | |
500 | return initial_cf_options_; | |
501 | } | |
502 | ||
503 | Env::WriteLifeTimeHint CalculateSSTWriteHint(int level); | |
504 | ||
f67539c2 TL |
505 | // created_dirs remembers the directories created, so that we don't need to call |
506 | // the same data creation operation again. | |
507 | Status AddDirectories( | |
20effc67 | 508 | std::map<std::string, std::shared_ptr<FSDirectory>>* created_dirs); |
11fdf7f2 | 509 | |
20effc67 | 510 | FSDirectory* GetDataDir(size_t path_id) const; |
11fdf7f2 | 511 | |
494da23a TL |
512 | ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); } |
513 | ||
7c673cae FG |
514 | private: |
515 | friend class ColumnFamilySet; | |
20effc67 | 516 | static const uint32_t kDummyColumnFamilyDataId; |
7c673cae FG |
517 | ColumnFamilyData(uint32_t id, const std::string& name, |
518 | Version* dummy_versions, Cache* table_cache, | |
519 | WriteBufferManager* write_buffer_manager, | |
520 | const ColumnFamilyOptions& options, | |
521 | const ImmutableDBOptions& db_options, | |
f67539c2 TL |
522 | const FileOptions& file_options, |
523 | ColumnFamilySet* column_family_set, | |
20effc67 TL |
524 | BlockCacheTracer* const block_cache_tracer, |
525 | const std::shared_ptr<IOTracer>& io_tracer); | |
526 | ||
527 | std::vector<std::string> GetDbPaths() const; | |
7c673cae FG |
528 | |
529 | uint32_t id_; | |
530 | const std::string name_; | |
531 | Version* dummy_versions_; // Head of circular doubly-linked list of versions. | |
532 | Version* current_; // == dummy_versions->prev_ | |
533 | ||
534 | std::atomic<int> refs_; // outstanding references to ColumnFamilyData | |
11fdf7f2 TL |
535 | std::atomic<bool> initialized_; |
536 | std::atomic<bool> dropped_; // true if client dropped it | |
7c673cae FG |
537 | |
538 | const InternalKeyComparator internal_comparator_; | |
539 | std::vector<std::unique_ptr<IntTblPropCollectorFactory>> | |
540 | int_tbl_prop_collector_factories_; | |
541 | ||
542 | const ColumnFamilyOptions initial_cf_options_; | |
543 | const ImmutableCFOptions ioptions_; | |
544 | MutableCFOptions mutable_cf_options_; | |
545 | ||
546 | const bool is_delete_range_supported_; | |
547 | ||
548 | std::unique_ptr<TableCache> table_cache_; | |
20effc67 | 549 | std::unique_ptr<BlobFileCache> blob_file_cache_; |
7c673cae FG |
550 | |
551 | std::unique_ptr<InternalStats> internal_stats_; | |
552 | ||
553 | WriteBufferManager* write_buffer_manager_; | |
554 | ||
555 | MemTable* mem_; | |
556 | MemTableList imm_; | |
557 | SuperVersion* super_version_; | |
558 | ||
559 | // An ordinal representing the current SuperVersion. Updated by | |
560 | // InstallSuperVersion(), i.e. incremented every time super_version_ | |
561 | // changes. | |
562 | std::atomic<uint64_t> super_version_number_; | |
563 | ||
564 | // Thread's local copy of SuperVersion pointer | |
565 | // This needs to be destructed before mutex_ | |
566 | std::unique_ptr<ThreadLocalPtr> local_sv_; | |
567 | ||
568 | // pointers for a circular linked list. we use it to support iterations over | |
569 | // all column families that are alive (note: dropped column families can also | |
570 | // be alive as long as client holds a reference) | |
571 | ColumnFamilyData* next_; | |
572 | ColumnFamilyData* prev_; | |
573 | ||
574 | // This is the earliest log file number that contains data from this | |
575 | // Column Family. All earlier log files must be ignored and not | |
576 | // recovered from | |
577 | uint64_t log_number_; | |
578 | ||
11fdf7f2 TL |
579 | std::atomic<FlushReason> flush_reason_; |
580 | ||
7c673cae FG |
581 | // An object that keeps all the compaction stats |
582 | // and picks the next compaction | |
583 | std::unique_ptr<CompactionPicker> compaction_picker_; | |
584 | ||
585 | ColumnFamilySet* column_family_set_; | |
586 | ||
587 | std::unique_ptr<WriteControllerToken> write_controller_token_; | |
588 | ||
589 | // If true --> this ColumnFamily is currently present in DBImpl::flush_queue_ | |
11fdf7f2 | 590 | bool queued_for_flush_; |
7c673cae FG |
591 | |
592 | // If true --> this ColumnFamily is currently present in | |
593 | // DBImpl::compaction_queue_ | |
11fdf7f2 | 594 | bool queued_for_compaction_; |
7c673cae FG |
595 | |
596 | uint64_t prev_compaction_needed_bytes_; | |
597 | ||
598 | // if the database was opened with 2pc enabled | |
599 | bool allow_2pc_; | |
11fdf7f2 TL |
600 | |
601 | // Memtable id to track flush. | |
602 | std::atomic<uint64_t> last_memtable_id_; | |
603 | ||
604 | // Directories corresponding to cf_paths. | |
20effc67 TL |
605 | std::vector<std::shared_ptr<FSDirectory>> data_dirs_; |
606 | ||
607 | bool db_paths_registered_; | |
7c673cae FG |
608 | }; |
609 | ||
610 | // ColumnFamilySet has interesting thread-safety requirements | |
611 | // * CreateColumnFamily() or RemoveColumnFamily() -- need to be protected by DB | |
612 | // mutex AND executed in the write thread. | |
613 | // CreateColumnFamily() should ONLY be called from VersionSet::LogAndApply() AND | |
614 | // single-threaded write thread. It is also called during Recovery and in | |
615 | // DumpManifest(). | |
616 | // RemoveColumnFamily() is only called from SetDropped(). DB mutex needs to be | |
617 | // held and it needs to be executed from the write thread. SetDropped() also | |
618 | // guarantees that it will be called only from single-threaded LogAndApply(), | |
619 | // but this condition is not that important. | |
620 | // * Iteration -- hold DB mutex, but you can release it in the body of | |
621 | // iteration. If you release DB mutex in body, reference the column | |
622 | // family before the mutex and unreference after you unlock, since the column | |
623 | // family might get dropped when the DB mutex is released | |
624 | // * GetDefault() -- thread safe | |
625 | // * GetColumnFamily() -- either inside of DB mutex or from a write thread | |
626 | // * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(), | |
627 | // NumberOfColumnFamilies -- inside of DB mutex | |
628 | class ColumnFamilySet { | |
629 | public: | |
630 | // ColumnFamilySet supports iteration | |
631 | class iterator { | |
632 | public: | |
633 | explicit iterator(ColumnFamilyData* cfd) | |
634 | : current_(cfd) {} | |
635 | iterator& operator++() { | |
636 | // dropped column families might still be included in this iteration | |
637 | // (we're only removing them when client drops the last reference to the | |
638 | // column family). | |
639 | // dummy is never dead, so this will never be infinite | |
640 | do { | |
641 | current_ = current_->next_; | |
642 | } while (current_->refs_.load(std::memory_order_relaxed) == 0); | |
643 | return *this; | |
644 | } | |
645 | bool operator!=(const iterator& other) { | |
646 | return this->current_ != other.current_; | |
647 | } | |
648 | ColumnFamilyData* operator*() { return current_; } | |
649 | ||
650 | private: | |
651 | ColumnFamilyData* current_; | |
652 | }; | |
653 | ||
654 | ColumnFamilySet(const std::string& dbname, | |
655 | const ImmutableDBOptions* db_options, | |
f67539c2 | 656 | const FileOptions& file_options, Cache* table_cache, |
20effc67 TL |
657 | WriteBufferManager* _write_buffer_manager, |
658 | WriteController* _write_controller, | |
659 | BlockCacheTracer* const block_cache_tracer, | |
660 | const std::shared_ptr<IOTracer>& io_tracer); | |
7c673cae FG |
661 | ~ColumnFamilySet(); |
662 | ||
663 | ColumnFamilyData* GetDefault() const; | |
664 | // GetColumnFamily() calls return nullptr if column family is not found | |
665 | ColumnFamilyData* GetColumnFamily(uint32_t id) const; | |
666 | ColumnFamilyData* GetColumnFamily(const std::string& name) const; | |
667 | // this call will return the next available column family ID. it guarantees | |
668 | // that there is no column family with id greater than or equal to the | |
669 | // returned value in the current running instance or anytime in RocksDB | |
670 | // instance history. | |
671 | uint32_t GetNextColumnFamilyID(); | |
672 | uint32_t GetMaxColumnFamily(); | |
673 | void UpdateMaxColumnFamily(uint32_t new_max_column_family); | |
674 | size_t NumberOfColumnFamilies() const; | |
675 | ||
676 | ColumnFamilyData* CreateColumnFamily(const std::string& name, uint32_t id, | |
677 | Version* dummy_version, | |
678 | const ColumnFamilyOptions& options); | |
679 | ||
680 | iterator begin() { return iterator(dummy_cfd_->next_); } | |
681 | iterator end() { return iterator(dummy_cfd_); } | |
682 | ||
683 | // REQUIRES: DB mutex held | |
684 | // Don't call while iterating over ColumnFamilySet | |
685 | void FreeDeadColumnFamilies(); | |
686 | ||
687 | Cache* get_table_cache() { return table_cache_; } | |
688 | ||
20effc67 TL |
689 | WriteBufferManager* write_buffer_manager() { return write_buffer_manager_; } |
690 | ||
691 | WriteController* write_controller() { return write_controller_; } | |
692 | ||
7c673cae FG |
693 | private: |
694 | friend class ColumnFamilyData; | |
695 | // helper function that gets called from cfd destructor | |
696 | // REQUIRES: DB mutex held | |
697 | void RemoveColumnFamily(ColumnFamilyData* cfd); | |
698 | ||
699 | // column_families_ and column_family_data_ need to be protected: | |
700 | // * when mutating both conditions have to be satisfied: | |
701 | // 1. DB mutex locked | |
702 | // 2. thread currently in single-threaded write thread | |
703 | // * when reading, at least one condition needs to be satisfied: | |
704 | // 1. DB mutex locked | |
705 | // 2. accessed from a single-threaded write thread | |
706 | std::unordered_map<std::string, uint32_t> column_families_; | |
707 | std::unordered_map<uint32_t, ColumnFamilyData*> column_family_data_; | |
708 | ||
709 | uint32_t max_column_family_; | |
710 | ColumnFamilyData* dummy_cfd_; | |
711 | // We don't hold the refcount here, since default column family always exists | |
712 | // We are also not responsible for cleaning up default_cfd_cache_. This is | |
713 | // just a cache that makes common case (accessing default column family) | |
714 | // faster | |
715 | ColumnFamilyData* default_cfd_cache_; | |
716 | ||
717 | const std::string db_name_; | |
718 | const ImmutableDBOptions* const db_options_; | |
f67539c2 | 719 | const FileOptions file_options_; |
7c673cae FG |
720 | Cache* table_cache_; |
721 | WriteBufferManager* write_buffer_manager_; | |
722 | WriteController* write_controller_; | |
f67539c2 | 723 | BlockCacheTracer* const block_cache_tracer_; |
20effc67 | 724 | std::shared_ptr<IOTracer> io_tracer_; |
7c673cae FG |
725 | }; |
726 | ||
727 | // We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access | |
728 | // memtables of different column families (specified by ID in the write batch) | |
729 | class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables { | |
730 | public: | |
731 | explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set) | |
732 | : column_family_set_(column_family_set), current_(nullptr) {} | |
733 | ||
734 | // Constructs a ColumnFamilyMemTablesImpl equivalent to one constructed | |
735 | // with the arguments used to construct *orig. | |
736 | explicit ColumnFamilyMemTablesImpl(ColumnFamilyMemTablesImpl* orig) | |
737 | : column_family_set_(orig->column_family_set_), current_(nullptr) {} | |
738 | ||
739 | // sets current_ to ColumnFamilyData with column_family_id | |
740 | // returns false if column family doesn't exist | |
741 | // REQUIRES: use this function of DBImpl::column_family_memtables_ should be | |
742 | // under a DB mutex OR from a write thread | |
743 | bool Seek(uint32_t column_family_id) override; | |
744 | ||
745 | // Returns log number of the selected column family | |
746 | // REQUIRES: under a DB mutex OR from a write thread | |
747 | uint64_t GetLogNumber() const override; | |
748 | ||
749 | // REQUIRES: Seek() called first | |
750 | // REQUIRES: use this function of DBImpl::column_family_memtables_ should be | |
751 | // under a DB mutex OR from a write thread | |
752 | virtual MemTable* GetMemTable() const override; | |
753 | ||
754 | // Returns column family handle for the selected column family | |
755 | // REQUIRES: use this function of DBImpl::column_family_memtables_ should be | |
756 | // under a DB mutex OR from a write thread | |
757 | virtual ColumnFamilyHandle* GetColumnFamilyHandle() override; | |
758 | ||
759 | // Cannot be called while another thread is calling Seek(). | |
760 | // REQUIRES: use this function of DBImpl::column_family_memtables_ should be | |
761 | // under a DB mutex OR from a write thread | |
762 | virtual ColumnFamilyData* current() override { return current_; } | |
763 | ||
764 | private: | |
765 | ColumnFamilySet* column_family_set_; | |
766 | ColumnFamilyData* current_; | |
767 | ColumnFamilyHandleInternal handle_; | |
768 | }; | |
769 | ||
770 | extern uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family); | |
771 | ||
772 | extern const Comparator* GetColumnFamilyUserComparator( | |
773 | ColumnFamilyHandle* column_family); | |
774 | ||
f67539c2 | 775 | } // namespace ROCKSDB_NAMESPACE |