]>
Commit | Line | Data |
---|---|---|
7c673cae | 1 | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
11fdf7f2 TL |
2 | // This source code is licensed under both the GPLv2 (found in the |
3 | // COPYING file in the root directory) and Apache 2.0 License | |
4 | // (found in the LICENSE.Apache file in the root directory). | |
7c673cae FG |
5 | // |
6 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |
7 | // Use of this source code is governed by a BSD-style license that can be | |
8 | // found in the LICENSE file. See the AUTHORS file for names of contributors. | |
9 | ||
10 | #pragma once | |
11 | ||
12 | #include <unordered_map> | |
13 | #include <string> | |
14 | #include <vector> | |
15 | #include <atomic> | |
16 | ||
17 | #include "db/memtable_list.h" | |
18 | #include "db/table_cache.h" | |
19 | #include "db/table_properties_collector.h" | |
20 | #include "db/write_batch_internal.h" | |
21 | #include "db/write_controller.h" | |
22 | #include "options/cf_options.h" | |
23 | #include "rocksdb/compaction_job_stats.h" | |
24 | #include "rocksdb/db.h" | |
25 | #include "rocksdb/env.h" | |
26 | #include "rocksdb/options.h" | |
f67539c2 | 27 | #include "trace_replay/block_cache_tracer.h" |
7c673cae FG |
28 | #include "util/thread_local.h" |
29 | ||
f67539c2 | 30 | namespace ROCKSDB_NAMESPACE { |
7c673cae FG |
31 | |
32 | class Version; | |
33 | class VersionSet; | |
11fdf7f2 | 34 | class VersionStorageInfo; |
7c673cae FG |
35 | class MemTable; |
36 | class MemTableListVersion; | |
37 | class CompactionPicker; | |
38 | class Compaction; | |
39 | class InternalKey; | |
40 | class InternalStats; | |
41 | class ColumnFamilyData; | |
42 | class DBImpl; | |
43 | class LogBuffer; | |
44 | class InstrumentedMutex; | |
45 | class InstrumentedMutexLock; | |
11fdf7f2 | 46 | struct SuperVersionContext; |
7c673cae FG |
47 | |
48 | extern const double kIncSlowdownRatio; | |
f67539c2 TL |
49 | // This file contains a list of data structures for managing column family |
50 | // level metadata. | |
51 | // | |
52 | // The basic relationships among the classes declared here are illustrated | |
53 | // below: | |
54 | // | |
55 | // +----------------------+ +----------------------+ +--------+ | |
56 | // +---+ ColumnFamilyHandle 1 | +--+ ColumnFamilyHandle 2 | | DBImpl | | |
57 | // | +----------------------+ | +----------------------+ +----+---+ | |
58 | // | +--------------------------+ | | |
59 | // | | +-----------------------------+ | |
60 | // | | | | |
61 | // | | +-----------------------------v-------------------------------+ | |
62 | // | | | | | |
63 | // | | | ColumnFamilySet | | |
64 | // | | | | | |
65 | // | | +-------------+--------------------------+----------------+---+ | |
66 | // | | | | | | |
67 | // | +-------------------------------------+ | | | |
68 | // | | | | v | |
69 | // | +-------------v-------------+ +-----v----v---------+ | |
70 | // | | | | | | |
71 | // | | ColumnFamilyData 1 | | ColumnFamilyData 2 | ...... | |
72 | // | | | | | | |
73 | // +---> | | | | |
74 | // | +---------+ | | | |
75 | // | | MemTable| | | | |
76 | // | | List | | | | |
77 | // +--------+---+--+-+----+----+ +--------------------++ | |
78 | // | | | | | |
79 | // | | | | | |
80 | // | | | +-----------------------+ | |
81 | // | | +-----------+ | | |
82 | // v +--------+ | | | |
83 | // +--------+--------+ | | | | |
84 | // | | | | +----------v----------+ | |
85 | // +---> |SuperVersion 1.a +-----------------> | | |
86 | // | +------+ | | MemTableListVersion | | |
87 | // +---+-------------+ | | | | | | |
88 | // | | | | +----+------------+---+ | |
89 | // | current | | | | | | |
90 | // | +-------------+ | |mem | | | |
91 | // | | | | | | | |
92 | // +-v---v-------+ +---v--v---+ +-----v----+ +----v-----+ | |
93 | // | | | | | | | | | |
94 | // | Version 1.a | | memtable | | memtable | | memtable | | |
95 | // | | | 1.a | | 1.b | | 1.c | | |
96 | // +-------------+ | | | | | | | |
97 | // +----------+ +----------+ +----------+ | |
98 | // | |
99 | // DBImpl keeps a ColumnFamilySet, which references all column families by | |
100 | // pointing to respective ColumnFamilyData object of each column family. | |
101 | // This is how DBImpl can list and operate on all the column families. | |
102 | // ColumnFamilyHandle also points to ColumnFamilyData directly, so that | |
103 | // when a user executes a query, it can directly find memtables and Version | |
104 | // as well as SuperVersion to the column family, without going through | |
105 | // ColumnFamilySet. | |
106 | // | |
107 | // ColumnFamilySet points to the latest view of the LSM-tree (list of memtables | |
108 | // and SST files) indirectly, while ongoing operations may hold references | |
109 | // to a current or an out-of-date SuperVersion, which in turn points to a | |
110 | // point-in-time view of the LSM-tree. This guarantees the memtables and SST | |
111 | // files being operated on will not go away, until the SuperVersion is | |
112 | // unreferenced to 0 and destroyed. | |
113 | // | |
114 | // The following graph illustrates a possible referencing relationships: | |
115 | // | |
116 | // Column +--------------+ current +-----------+ | |
117 | // Family +---->+ +------------------->+ | | |
118 | // Data | SuperVersion +----------+ | Version A | | |
119 | // | 3 | imm | | | | |
120 | // Iter2 +----->+ | +-------v------+ +-----------+ | |
121 | // +-----+--------+ | MemtableList +----------------> Empty | |
122 | // | | Version r | +-----------+ | |
123 | // | +--------------+ | | | |
124 | // +------------------+ current| Version B | | |
125 | // +--------------+ | +----->+ | | |
126 | // | | | | +-----+-----+ | |
127 | // Compaction +>+ SuperVersion +-------------+ ^ | |
128 | // Job | 2 +------+ | |current | |
129 | // | +----+ | | mem | +------------+ | |
130 | // +--------------+ | | +---------------------> | | |
131 | // | +------------------------> MemTable a | | |
132 | // | mem | | | | |
133 | // +--------------+ | | +------------+ | |
134 | // | +--------------------------+ | |
135 | // Iter1 +-----> SuperVersion | | +------------+ | |
136 | // | 1 +------------------------------>+ | | |
137 | // | +-+ | mem | MemTable b | | |
138 | // +--------------+ | | | | | |
139 | // | | +--------------+ +-----^------+ | |
140 | // | |imm | MemtableList | | | |
141 | // | +--->+ Version s +------------+ | |
142 | // | +--------------+ | |
143 | // | +--------------+ | |
144 | // | | MemtableList | | |
145 | // +------>+ Version t +--------> Empty | |
146 | // imm +--------------+ | |
147 | // | |
148 | // In this example, even if the current LSM-tree consists of Version A and | |
149 | // memtable a, which are referenced by SuperVersion3, two older SuperVersions, | |
150 | // SuperVersion2 and SuperVersion1, still exist, and are referenced by a | |
151 | // compaction job and an old iterator Iter1, respectively. SuperVersion2 | |
152 | // contains Version B, memtable a and memtable b; SuperVersion1 contains | |
153 | // Version B and memtable b (mutable). As a result, Version B and memtable b | |
154 | // are prevented from being destroyed or deleted. | |
7c673cae FG |
155 | |
156 | // ColumnFamilyHandleImpl is the class that clients use to access different | |
157 | // column families. It has non-trivial destructor, which gets called when client | |
158 | // is done using the column family | |
159 | class ColumnFamilyHandleImpl : public ColumnFamilyHandle { | |
160 | public: | |
161 | // create while holding the mutex | |
162 | ColumnFamilyHandleImpl( | |
163 | ColumnFamilyData* cfd, DBImpl* db, InstrumentedMutex* mutex); | |
164 | // destroy without mutex | |
165 | virtual ~ColumnFamilyHandleImpl(); | |
166 | virtual ColumnFamilyData* cfd() const { return cfd_; } | |
167 | ||
168 | virtual uint32_t GetID() const override; | |
169 | virtual const std::string& GetName() const override; | |
170 | virtual Status GetDescriptor(ColumnFamilyDescriptor* desc) override; | |
171 | virtual const Comparator* GetComparator() const override; | |
172 | ||
173 | private: | |
174 | ColumnFamilyData* cfd_; | |
175 | DBImpl* db_; | |
176 | InstrumentedMutex* mutex_; | |
177 | }; | |
178 | ||
179 | // Does not ref-count ColumnFamilyData | |
180 | // We use this dummy ColumnFamilyHandleImpl because sometimes MemTableInserter | |
181 | // calls DBImpl methods. When this happens, MemTableInserter need access to | |
182 | // ColumnFamilyHandle (same as the client would need). In that case, we feed | |
183 | // MemTableInserter dummy ColumnFamilyHandle and enable it to call DBImpl | |
184 | // methods | |
185 | class ColumnFamilyHandleInternal : public ColumnFamilyHandleImpl { | |
186 | public: | |
187 | ColumnFamilyHandleInternal() | |
11fdf7f2 | 188 | : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), internal_cfd_(nullptr) {} |
7c673cae FG |
189 | |
190 | void SetCFD(ColumnFamilyData* _cfd) { internal_cfd_ = _cfd; } | |
191 | virtual ColumnFamilyData* cfd() const override { return internal_cfd_; } | |
192 | ||
193 | private: | |
194 | ColumnFamilyData* internal_cfd_; | |
195 | }; | |
196 | ||
197 | // SuperVersion holds references to the memtable (mem), all immutable memtables (imm) and the version (current) | |
198 | struct SuperVersion { | |
199 | // Accessing members of this class is not thread-safe and requires external | |
200 | // synchronization (i.e. db mutex held or on write thread). | |
f67539c2 | 201 | ColumnFamilyData* cfd; |
7c673cae FG |
202 | MemTable* mem; |
203 | MemTableListVersion* imm; | |
204 | Version* current; | |
205 | MutableCFOptions mutable_cf_options; | |
206 | // Version number of the current SuperVersion | |
207 | uint64_t version_number; | |
11fdf7f2 | 208 | WriteStallCondition write_stall_condition; |
7c673cae FG |
209 | |
210 | InstrumentedMutex* db_mutex; | |
211 | ||
212 | // The constructor should be called outside the mutex. | |
213 | SuperVersion() = default; | |
214 | ~SuperVersion(); | |
215 | SuperVersion* Ref(); | |
216 | // If Unref() returns true, Cleanup() should be called with the mutex held | |
217 | // before deleting this SuperVersion. | |
218 | bool Unref(); | |
219 | ||
220 | // Call these two methods (Cleanup() and Init()) with the db mutex held. | |
221 | // Cleanup unrefs mem, imm and current. Also, it stores all memtables | |
222 | // that need to be deleted in the to_delete vector. Unrefing those | |
223 | // objects needs to be done in the mutex. | |
224 | void Cleanup(); | |
f67539c2 TL |
225 | void Init(ColumnFamilyData* new_cfd, MemTable* new_mem, |
226 | MemTableListVersion* new_imm, Version* new_current); | |
7c673cae FG |
227 | |
228 | // The value of dummy is not actually used. kSVInUse takes its address as a | |
229 | // marker in the thread local storage to indicate that the SuperVersion is in | |
230 | // use by a thread. This way, the value of kSVInUse is guaranteed not to | |
231 | // conflict with any SuperVersion object address and is portable across | |
232 | // platforms. | |
233 | static int dummy; | |
234 | static void* const kSVInUse; | |
235 | static void* const kSVObsolete; | |
236 | ||
237 | private: | |
238 | std::atomic<uint32_t> refs; | |
239 | // We need to_delete because during Cleanup(), imm->Unref() returns | |
240 | // all memtables that we need to free through this vector. We then | |
241 | // delete all those memtables outside of the mutex, during destruction. | |
242 | autovector<MemTable*> to_delete; | |
243 | }; | |
244 | ||
245 | extern Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options); | |
246 | ||
247 | extern Status CheckConcurrentWritesSupported( | |
248 | const ColumnFamilyOptions& cf_options); | |
249 | ||
11fdf7f2 TL |
250 | extern Status CheckCFPathsSupported(const DBOptions& db_options, |
251 | const ColumnFamilyOptions& cf_options); | |
252 | ||
7c673cae FG |
253 | extern ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, |
254 | const ColumnFamilyOptions& src); | |
255 | // Wrap user defined table properties collector factories from `cf_options` | |
256 | // into internal ones in int_tbl_prop_collector_factories. Add a system internal | |
257 | // one too. | |
258 | extern void GetIntTblPropCollectorFactory( | |
259 | const ImmutableCFOptions& ioptions, | |
260 | std::vector<std::unique_ptr<IntTblPropCollectorFactory>>* | |
261 | int_tbl_prop_collector_factories); | |
262 | ||
263 | class ColumnFamilySet; | |
264 | ||
265 | // ColumnFamilyData keeps all the data that a column family needs. | |
266 | // Most methods require the DB mutex to be held, unless otherwise noted. | |
267 | class ColumnFamilyData { | |
268 | public: | |
269 | ~ColumnFamilyData(); | |
270 | ||
271 | // thread-safe | |
272 | uint32_t GetID() const { return id_; } | |
273 | // thread-safe | |
274 | const std::string& GetName() const { return name_; } | |
275 | ||
276 | // Ref() can only be called from a context where the caller can guarantee | |
277 | // that ColumnFamilyData is alive (while holding a non-zero ref already, | |
278 | // holding a DB mutex, or as the leader in a write batch group). | |
f67539c2 | 279 | void Ref() { refs_.fetch_add(1); } |
7c673cae FG |
280 | |
281 | // Unref decreases the reference count, but does not handle deletion | |
282 | // when the count goes to 0. If this method returns true then the | |
283 | // caller should delete the instance immediately, or later, by calling | |
284 | // FreeDeadColumnFamilies(). Unref() can only be called while holding | |
285 | // a DB mutex, or during single-threaded recovery. | |
286 | bool Unref() { | |
f67539c2 | 287 | int old_refs = refs_.fetch_sub(1); |
7c673cae FG |
288 | assert(old_refs > 0); |
289 | return old_refs == 1; | |
290 | } | |
291 | ||
f67539c2 TL |
292 | // UnrefAndTryDelete() decreases the reference count and frees this object if |
293 | // needed. Returns true if this object was freed, false otherwise. It can only | |
294 | // be called while holding a DB mutex, or during single-threaded recovery. | |
295 | bool UnrefAndTryDelete(); | |
296 | ||
7c673cae FG |
297 | // SetDropped() can only be called under the following conditions: |
298 | // 1) Holding a DB mutex, | |
299 | // 2) from single-threaded write thread, AND | |
300 | // 3) from single-threaded VersionSet::LogAndApply() | |
301 | // After dropping column family no other operation on that column family | |
302 | // will be executed. All the files and memory will be, however, kept around | |
303 | // until client drops the column family handle. That way, client can still | |
304 | // access data from dropped column family. | |
305 | // Column family can be dropped and still alive. In that state: | |
306 | // *) Compaction and flush is not executed on the dropped column family. | |
307 | // *) Client can continue reading from column family. Writes will fail unless | |
308 | // WriteOptions::ignore_missing_column_families is true | |
309 | // When the dropped column family is unreferenced, then we: | |
310 | // *) Remove column family from the linked list maintained by ColumnFamilySet | |
311 | // *) delete all memory associated with that column family | |
312 | // *) delete all the files associated with that column family | |
313 | void SetDropped(); | |
11fdf7f2 | 314 | bool IsDropped() const { return dropped_.load(std::memory_order_relaxed); } |
7c673cae FG |
315 | |
316 | // thread-safe | |
317 | int NumberLevels() const { return ioptions_.num_levels; } | |
318 | ||
319 | void SetLogNumber(uint64_t log_number) { log_number_ = log_number; } | |
320 | uint64_t GetLogNumber() const { return log_number_; } | |
321 | ||
11fdf7f2 TL |
322 | void SetFlushReason(FlushReason flush_reason) { |
323 | flush_reason_ = flush_reason; | |
324 | } | |
325 | FlushReason GetFlushReason() const { return flush_reason_; } | |
7c673cae | 326 | // thread-safe |
f67539c2 | 327 | const FileOptions* soptions() const; |
7c673cae FG |
328 | const ImmutableCFOptions* ioptions() const { return &ioptions_; } |
329 | // REQUIRES: DB mutex held | |
330 | // This returns the MutableCFOptions used by the current SuperVersion. | |
331 | // You should use this API to reference MutableCFOptions most of the time. | |
332 | const MutableCFOptions* GetCurrentMutableCFOptions() const { | |
333 | return &(super_version_->mutable_cf_options); | |
334 | } | |
335 | // REQUIRES: DB mutex held | |
336 | // This returns the latest MutableCFOptions, which may not be in effect yet. | |
337 | const MutableCFOptions* GetLatestMutableCFOptions() const { | |
338 | return &mutable_cf_options_; | |
339 | } | |
340 | ||
341 | // REQUIRES: DB mutex held | |
342 | // Build ColumnFamilyOptions with immutable options and latest mutable | |
343 | // options. | |
344 | ColumnFamilyOptions GetLatestCFOptions() const; | |
345 | ||
346 | bool is_delete_range_supported() { return is_delete_range_supported_; } | |
347 | ||
f67539c2 TL |
348 | // Validate CF options against DB options |
349 | static Status ValidateOptions(const DBOptions& db_options, | |
350 | const ColumnFamilyOptions& cf_options); | |
7c673cae FG |
351 | #ifndef ROCKSDB_LITE |
352 | // REQUIRES: DB mutex held | |
353 | Status SetOptions( | |
f67539c2 | 354 | const DBOptions& db_options, |
7c673cae FG |
355 | const std::unordered_map<std::string, std::string>& options_map); |
356 | #endif // ROCKSDB_LITE | |
357 | ||
358 | InternalStats* internal_stats() { return internal_stats_.get(); } | |
359 | ||
360 | MemTableList* imm() { return &imm_; } | |
361 | MemTable* mem() { return mem_; } | |
362 | Version* current() { return current_; } | |
363 | Version* dummy_versions() { return dummy_versions_; } | |
364 | void SetCurrent(Version* _current); | |
365 | uint64_t GetNumLiveVersions() const; // REQUIRES: DB mutex held | |
366 | uint64_t GetTotalSstFilesSize() const; // REQUIRES: DB mutex held | |
11fdf7f2 TL |
367 | uint64_t GetLiveSstFilesSize() const; // REQUIRES: DB mutex held |
368 | void SetMemtable(MemTable* new_mem) { | |
369 | uint64_t memtable_id = last_memtable_id_.fetch_add(1) + 1; | |
370 | new_mem->SetID(memtable_id); | |
371 | mem_ = new_mem; | |
372 | } | |
7c673cae FG |
373 | |
374 | // Calculate the oldest log needed for the durability of this column family. | |
375 | uint64_t OldestLogToKeep(); | |
376 | ||
377 | // See MemTable constructor for explanation of earliest_seq param. | |
378 | MemTable* ConstructNewMemtable(const MutableCFOptions& mutable_cf_options, | |
379 | SequenceNumber earliest_seq); | |
380 | void CreateNewMemtable(const MutableCFOptions& mutable_cf_options, | |
381 | SequenceNumber earliest_seq); | |
382 | ||
383 | TableCache* table_cache() const { return table_cache_.get(); } | |
384 | ||
385 | // See documentation in compaction_picker.h | |
386 | // REQUIRES: DB mutex held | |
387 | bool NeedsCompaction() const; | |
388 | // REQUIRES: DB mutex held | |
389 | Compaction* PickCompaction(const MutableCFOptions& mutable_options, | |
390 | LogBuffer* log_buffer); | |
391 | ||
392 | // Check if the passed range overlaps with any running compactions. | |
393 | // REQUIRES: DB mutex held | |
394 | bool RangeOverlapWithCompaction(const Slice& smallest_user_key, | |
395 | const Slice& largest_user_key, | |
396 | int level) const; | |
397 | ||
11fdf7f2 TL |
398 | // Check if the passed ranges overlap with any unflushed memtables |
399 | // (immutable or mutable). | |
400 | // | |
401 | // @param super_version A referenced SuperVersion that will be held for the | |
402 | // duration of this function. | |
403 | // | |
404 | // Thread-safe | |
405 | Status RangesOverlapWithMemtables(const autovector<Range>& ranges, | |
406 | SuperVersion* super_version, bool* overlap); | |
407 | ||
7c673cae | 408 | // A flag to tell a manual compaction is to compact all levels together |
11fdf7f2 | 409 | // instead of a specific level. |
7c673cae FG |
410 | static const int kCompactAllLevels; |
411 | // A flag to tell a manual compaction's output is base level. | |
412 | static const int kCompactToBaseLevel; | |
413 | // REQUIRES: DB mutex held | |
414 | Compaction* CompactRange(const MutableCFOptions& mutable_cf_options, | |
415 | int input_level, int output_level, | |
f67539c2 | 416 | const CompactRangeOptions& compact_range_options, |
11fdf7f2 | 417 | const InternalKey* begin, const InternalKey* end, |
f67539c2 TL |
418 | InternalKey** compaction_end, bool* manual_conflict, |
419 | uint64_t max_file_num_to_ignore); | |
7c673cae FG |
420 | |
421 | CompactionPicker* compaction_picker() { return compaction_picker_.get(); } | |
422 | // thread-safe | |
423 | const Comparator* user_comparator() const { | |
424 | return internal_comparator_.user_comparator(); | |
425 | } | |
426 | // thread-safe | |
427 | const InternalKeyComparator& internal_comparator() const { | |
428 | return internal_comparator_; | |
429 | } | |
430 | ||
431 | const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>* | |
432 | int_tbl_prop_collector_factories() const { | |
433 | return &int_tbl_prop_collector_factories_; | |
434 | } | |
435 | ||
436 | SuperVersion* GetSuperVersion() { return super_version_; } | |
437 | // thread-safe | |
438 | // Return an already referenced SuperVersion to be used safely. | |
f67539c2 | 439 | SuperVersion* GetReferencedSuperVersion(DBImpl* db); |
7c673cae FG |
440 | // thread-safe |
441 | // Get SuperVersion stored in thread local storage. If it does not exist, | |
442 | // get a reference from a current SuperVersion. | |
f67539c2 | 443 | SuperVersion* GetThreadLocalSuperVersion(DBImpl* db); |
7c673cae FG |
444 | // Try to return SuperVersion back to thread local storage. Return true on |
445 | // success and false on failure. It fails when the thread local storage | |
446 | // contains anything other than SuperVersion::kSVInUse flag. | |
447 | bool ReturnThreadLocalSuperVersion(SuperVersion* sv); | |
448 | // thread-safe | |
449 | uint64_t GetSuperVersionNumber() const { | |
450 | return super_version_number_.load(); | |
451 | } | |
452 | // Original contract: returns a pointer to the previous SuperVersion if its | |
453 | // reference count is zero and it needs deletion, or nullptr if not. NOTE(review): both overloads below return void and take a SuperVersionContext*, so this wording looks stale -- confirm. | |
454 | // As argument takes a pointer to allocated SuperVersion to enable | |
455 | // the clients to allocate SuperVersion outside of mutex. | |
456 | // IMPORTANT: Only call this from DBImpl::InstallSuperVersion() | |
11fdf7f2 TL |
457 | void InstallSuperVersion(SuperVersionContext* sv_context, |
458 | InstrumentedMutex* db_mutex, | |
459 | const MutableCFOptions& mutable_cf_options); | |
460 | void InstallSuperVersion(SuperVersionContext* sv_context, | |
461 | InstrumentedMutex* db_mutex); | |
7c673cae FG |
462 | |
463 | void ResetThreadLocalSuperVersions(); | |
464 | ||
465 | // Protected by DB mutex | |
11fdf7f2 TL |
466 | void set_queued_for_flush(bool value) { queued_for_flush_ = value; } |
467 | void set_queued_for_compaction(bool value) { queued_for_compaction_ = value; } | |
468 | bool queued_for_flush() { return queued_for_flush_; } | |
469 | bool queued_for_compaction() { return queued_for_compaction_; } | |
470 | ||
471 | enum class WriteStallCause { | |
472 | kNone, | |
473 | kMemtableLimit, | |
474 | kL0FileCountLimit, | |
475 | kPendingCompactionBytes, | |
476 | }; | |
477 | static std::pair<WriteStallCondition, WriteStallCause> | |
478 | GetWriteStallConditionAndCause(int num_unflushed_memtables, int num_l0_files, | |
479 | uint64_t num_compaction_needed_bytes, | |
480 | const MutableCFOptions& mutable_cf_options); | |
7c673cae FG |
481 | |
482 | // Recalculate some small conditions, which are changed only during | |
483 | // compaction, when adding a new memtable and/or during | |
484 | // recalculation of the compaction score. These values are used in the | |
485 | // DBImpl::MakeRoomForWrite function to decide whether it needs to make | |
486 | // a write stall. | |
11fdf7f2 | 487 | WriteStallCondition RecalculateWriteStallConditions( |
7c673cae FG |
488 | const MutableCFOptions& mutable_cf_options); |
489 | ||
11fdf7f2 TL |
490 | void set_initialized() { initialized_.store(true); } |
491 | ||
492 | bool initialized() const { return initialized_.load(); } | |
493 | ||
494 | const ColumnFamilyOptions& initial_cf_options() { | |
495 | return initial_cf_options_; | |
496 | } | |
497 | ||
498 | Env::WriteLifeTimeHint CalculateSSTWriteHint(int level); | |
499 | ||
f67539c2 TL |
500 | // created_dirs remembers the directories already created, so that we don't |
501 | // need to perform the same directory creation operation again. | |
502 | Status AddDirectories( | |
503 | std::map<std::string, std::shared_ptr<Directory>>* created_dirs); | |
11fdf7f2 TL |
504 | |
505 | Directory* GetDataDir(size_t path_id) const; | |
506 | ||
494da23a TL |
507 | ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); } |
508 | ||
7c673cae FG |
509 | private: |
510 | friend class ColumnFamilySet; | |
511 | ColumnFamilyData(uint32_t id, const std::string& name, | |
512 | Version* dummy_versions, Cache* table_cache, | |
513 | WriteBufferManager* write_buffer_manager, | |
514 | const ColumnFamilyOptions& options, | |
515 | const ImmutableDBOptions& db_options, | |
f67539c2 TL |
516 | const FileOptions& file_options, |
517 | ColumnFamilySet* column_family_set, | |
518 | BlockCacheTracer* const block_cache_tracer); | |
7c673cae FG |
519 | |
520 | uint32_t id_; | |
521 | const std::string name_; | |
522 | Version* dummy_versions_; // Head of circular doubly-linked list of versions. | |
523 | Version* current_; // == dummy_versions->prev_ | |
524 | ||
525 | std::atomic<int> refs_; // outstanding references to ColumnFamilyData | |
11fdf7f2 TL |
526 | std::atomic<bool> initialized_; |
527 | std::atomic<bool> dropped_; // true if client dropped it | |
7c673cae FG |
528 | |
529 | const InternalKeyComparator internal_comparator_; | |
530 | std::vector<std::unique_ptr<IntTblPropCollectorFactory>> | |
531 | int_tbl_prop_collector_factories_; | |
532 | ||
533 | const ColumnFamilyOptions initial_cf_options_; | |
534 | const ImmutableCFOptions ioptions_; | |
535 | MutableCFOptions mutable_cf_options_; | |
536 | ||
537 | const bool is_delete_range_supported_; | |
538 | ||
539 | std::unique_ptr<TableCache> table_cache_; | |
540 | ||
541 | std::unique_ptr<InternalStats> internal_stats_; | |
542 | ||
543 | WriteBufferManager* write_buffer_manager_; | |
544 | ||
545 | MemTable* mem_; | |
546 | MemTableList imm_; | |
547 | SuperVersion* super_version_; | |
548 | ||
549 | // An ordinal representing the current SuperVersion. Updated by | |
550 | // InstallSuperVersion(), i.e. incremented every time super_version_ | |
551 | // changes. | |
552 | std::atomic<uint64_t> super_version_number_; | |
553 | ||
554 | // Thread's local copy of SuperVersion pointer | |
555 | // This needs to be destructed before mutex_ | |
556 | std::unique_ptr<ThreadLocalPtr> local_sv_; | |
557 | ||
558 | // Pointers for a circular linked list. We use it to support iteration over | |
559 | // all column families that are alive (note: dropped column families can also | |
560 | // be alive as long as client holds a reference) | |
561 | ColumnFamilyData* next_; | |
562 | ColumnFamilyData* prev_; | |
563 | ||
564 | // This is the earliest log file number that contains data from this | |
565 | // Column Family. All earlier log files must be ignored and not | |
566 | // recovered from. | |
567 | uint64_t log_number_; | |
568 | ||
11fdf7f2 TL |
569 | std::atomic<FlushReason> flush_reason_; |
570 | ||
7c673cae FG |
571 | // An object that keeps all the compaction stats |
572 | // and picks the next compaction | |
573 | std::unique_ptr<CompactionPicker> compaction_picker_; | |
574 | ||
575 | ColumnFamilySet* column_family_set_; | |
576 | ||
577 | std::unique_ptr<WriteControllerToken> write_controller_token_; | |
578 | ||
579 | // If true --> this ColumnFamily is currently present in DBImpl::flush_queue_ | |
11fdf7f2 | 580 | bool queued_for_flush_; |
7c673cae FG |
581 | |
582 | // If true --> this ColumnFamily is currently present in | |
583 | // DBImpl::compaction_queue_ | |
11fdf7f2 | 584 | bool queued_for_compaction_; |
7c673cae FG |
585 | |
586 | uint64_t prev_compaction_needed_bytes_; | |
587 | ||
588 | // True if the database was opened with 2pc enabled. | |
589 | bool allow_2pc_; | |
11fdf7f2 TL |
590 | |
591 | // Memtable id to track flush. | |
592 | std::atomic<uint64_t> last_memtable_id_; | |
593 | ||
594 | // Directories corresponding to cf_paths. | |
f67539c2 | 595 | std::vector<std::shared_ptr<Directory>> data_dirs_; |
7c673cae FG |
596 | }; |
597 | ||
598 | // ColumnFamilySet has interesting thread-safety requirements | |
599 | // * CreateColumnFamily() or RemoveColumnFamily() -- need to be protected by DB | |
600 | // mutex AND executed in the write thread. | |
601 | // CreateColumnFamily() should ONLY be called from VersionSet::LogAndApply() AND | |
602 | // single-threaded write thread. It is also called during Recovery and in | |
603 | // DumpManifest(). | |
604 | // RemoveColumnFamily() is only called from SetDropped(). DB mutex needs to be | |
605 | // held and it needs to be executed from the write thread. SetDropped() also | |
606 | // guarantees that it will be called only from single-threaded LogAndApply(), | |
607 | // but this condition is not that important. | |
608 | // * Iteration -- hold DB mutex, but you can release it in the body of | |
609 | // iteration. If you release DB mutex in body, reference the column | |
610 | // family before the mutex and unreference after you unlock, since the column | |
611 | // family might get dropped when the DB mutex is released | |
612 | // * GetDefault() -- thread safe | |
613 | // * GetColumnFamily() -- either inside of DB mutex or from a write thread | |
614 | // * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(), | |
615 | // NumberOfColumnFamilies -- inside of DB mutex | |
616 | class ColumnFamilySet { | |
617 | public: | |
618 | // ColumnFamilySet supports iteration | |
619 | class iterator { | |
620 | public: | |
621 | explicit iterator(ColumnFamilyData* cfd) | |
622 | : current_(cfd) {} | |
623 | iterator& operator++() { | |
624 | // dropped column families might still be included in this iteration | |
625 | // (we're only removing them when client drops the last reference to the | |
626 | // column family). | |
627 | // dummy is never dead, so this will never be infinite | |
628 | do { | |
629 | current_ = current_->next_; | |
630 | } while (current_->refs_.load(std::memory_order_relaxed) == 0); | |
631 | return *this; | |
632 | } | |
633 | bool operator!=(const iterator& other) { | |
634 | return this->current_ != other.current_; | |
635 | } | |
636 | ColumnFamilyData* operator*() { return current_; } | |
637 | ||
638 | private: | |
639 | ColumnFamilyData* current_; | |
640 | }; | |
641 | ||
642 | ColumnFamilySet(const std::string& dbname, | |
643 | const ImmutableDBOptions* db_options, | |
f67539c2 | 644 | const FileOptions& file_options, Cache* table_cache, |
7c673cae | 645 | WriteBufferManager* write_buffer_manager, |
f67539c2 TL |
646 | WriteController* write_controller, |
647 | BlockCacheTracer* const block_cache_tracer); | |
7c673cae FG |
648 | ~ColumnFamilySet(); |
649 | ||
650 | ColumnFamilyData* GetDefault() const; | |
651 | // GetColumnFamily() calls return nullptr if column family is not found | |
652 | ColumnFamilyData* GetColumnFamily(uint32_t id) const; | |
653 | ColumnFamilyData* GetColumnFamily(const std::string& name) const; | |
654 | // this call will return the next available column family ID. it guarantees | |
655 | // that there is no column family with id greater than or equal to the | |
656 | // returned value in the current running instance or anytime in RocksDB | |
657 | // instance history. | |
658 | uint32_t GetNextColumnFamilyID(); | |
659 | uint32_t GetMaxColumnFamily(); | |
660 | void UpdateMaxColumnFamily(uint32_t new_max_column_family); | |
661 | size_t NumberOfColumnFamilies() const; | |
662 | ||
663 | ColumnFamilyData* CreateColumnFamily(const std::string& name, uint32_t id, | |
664 | Version* dummy_version, | |
665 | const ColumnFamilyOptions& options); | |
666 | ||
667 | iterator begin() { return iterator(dummy_cfd_->next_); } | |
668 | iterator end() { return iterator(dummy_cfd_); } | |
669 | ||
// REQUIRES: the DB mutex is held.
// Must not be called while an iteration over this ColumnFamilySet is in
// progress.
void FreeDeadColumnFamilies();
673 | ||
674 | Cache* get_table_cache() { return table_cache_; } | |
675 | ||
676 | private: | |
677 | friend class ColumnFamilyData; | |
678 | // helper function that gets called from cfd destructor | |
679 | // REQUIRES: DB mutex held | |
680 | void RemoveColumnFamily(ColumnFamilyData* cfd); | |
681 | ||
682 | // column_families_ and column_family_data_ need to be protected: | |
683 | // * when mutating both conditions have to be satisfied: | |
684 | // 1. DB mutex locked | |
685 | // 2. thread currently in single-threaded write thread | |
686 | // * when reading, at least one condition needs to be satisfied: | |
687 | // 1. DB mutex locked | |
688 | // 2. accessed from a single-threaded write thread | |
689 | std::unordered_map<std::string, uint32_t> column_families_; | |
690 | std::unordered_map<uint32_t, ColumnFamilyData*> column_family_data_; | |
691 | ||
692 | uint32_t max_column_family_; | |
693 | ColumnFamilyData* dummy_cfd_; | |
694 | // We don't hold the refcount here, since default column family always exists | |
695 | // We are also not responsible for cleaning up default_cfd_cache_. This is | |
696 | // just a cache that makes common case (accessing default column family) | |
697 | // faster | |
698 | ColumnFamilyData* default_cfd_cache_; | |
699 | ||
700 | const std::string db_name_; | |
701 | const ImmutableDBOptions* const db_options_; | |
f67539c2 | 702 | const FileOptions file_options_; |
7c673cae FG |
703 | Cache* table_cache_; |
704 | WriteBufferManager* write_buffer_manager_; | |
705 | WriteController* write_controller_; | |
f67539c2 | 706 | BlockCacheTracer* const block_cache_tracer_; |
7c673cae FG |
707 | }; |
708 | ||
709 | // We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access | |
710 | // memtables of different column families (specified by ID in the write batch) | |
711 | class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables { | |
712 | public: | |
713 | explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set) | |
714 | : column_family_set_(column_family_set), current_(nullptr) {} | |
715 | ||
716 | // Constructs a ColumnFamilyMemTablesImpl equivalent to one constructed | |
717 | // with the arguments used to construct *orig. | |
718 | explicit ColumnFamilyMemTablesImpl(ColumnFamilyMemTablesImpl* orig) | |
719 | : column_family_set_(orig->column_family_set_), current_(nullptr) {} | |
720 | ||
721 | // sets current_ to ColumnFamilyData with column_family_id | |
722 | // returns false if column family doesn't exist | |
723 | // REQUIRES: use this function of DBImpl::column_family_memtables_ should be | |
724 | // under a DB mutex OR from a write thread | |
725 | bool Seek(uint32_t column_family_id) override; | |
726 | ||
727 | // Returns log number of the selected column family | |
728 | // REQUIRES: under a DB mutex OR from a write thread | |
729 | uint64_t GetLogNumber() const override; | |
730 | ||
731 | // REQUIRES: Seek() called first | |
732 | // REQUIRES: use this function of DBImpl::column_family_memtables_ should be | |
733 | // under a DB mutex OR from a write thread | |
734 | virtual MemTable* GetMemTable() const override; | |
735 | ||
736 | // Returns column family handle for the selected column family | |
737 | // REQUIRES: use this function of DBImpl::column_family_memtables_ should be | |
738 | // under a DB mutex OR from a write thread | |
739 | virtual ColumnFamilyHandle* GetColumnFamilyHandle() override; | |
740 | ||
741 | // Cannot be called while another thread is calling Seek(). | |
742 | // REQUIRES: use this function of DBImpl::column_family_memtables_ should be | |
743 | // under a DB mutex OR from a write thread | |
744 | virtual ColumnFamilyData* current() override { return current_; } | |
745 | ||
746 | private: | |
747 | ColumnFamilySet* column_family_set_; | |
748 | ColumnFamilyData* current_; | |
749 | ColumnFamilyHandleInternal handle_; | |
750 | }; | |
751 | ||
752 | extern uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family); | |
753 | ||
754 | extern const Comparator* GetColumnFamilyUserComparator( | |
755 | ColumnFamilyHandle* column_family); | |
756 | ||
f67539c2 | 757 | } // namespace ROCKSDB_NAMESPACE |