]>
Commit | Line | Data |
---|---|---|
7c673cae | 1 | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
11fdf7f2 TL |
2 | // This source code is licensed under both the GPLv2 (found in the |
3 | // COPYING file in the root directory) and Apache 2.0 License | |
4 | // (found in the LICENSE.Apache file in the root directory). | |
7c673cae FG |
5 | // |
6 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |
7 | // Use of this source code is governed by a BSD-style license that can be | |
8 | // found in the LICENSE file. See the AUTHORS file for names of contributors. | |
9 | ||
10 | #pragma once | |
11 | ||
1e59de90 | 12 | #include <atomic> |
7c673cae | 13 | #include <string> |
1e59de90 | 14 | #include <unordered_map> |
7c673cae | 15 | #include <vector> |
7c673cae | 16 | |
1e59de90 | 17 | #include "cache/cache_reservation_manager.h" |
7c673cae FG |
18 | #include "db/memtable_list.h" |
19 | #include "db/table_cache.h" | |
20 | #include "db/table_properties_collector.h" | |
21 | #include "db/write_batch_internal.h" | |
22 | #include "db/write_controller.h" | |
23 | #include "options/cf_options.h" | |
24 | #include "rocksdb/compaction_job_stats.h" | |
25 | #include "rocksdb/db.h" | |
26 | #include "rocksdb/env.h" | |
27 | #include "rocksdb/options.h" | |
f67539c2 | 28 | #include "trace_replay/block_cache_tracer.h" |
1e59de90 | 29 | #include "util/hash_containers.h" |
7c673cae FG |
30 | #include "util/thread_local.h" |
31 | ||
f67539c2 | 32 | namespace ROCKSDB_NAMESPACE { |
7c673cae FG |
33 | |
34 | class Version; | |
35 | class VersionSet; | |
11fdf7f2 | 36 | class VersionStorageInfo; |
7c673cae FG |
37 | class MemTable; |
38 | class MemTableListVersion; | |
39 | class CompactionPicker; | |
40 | class Compaction; | |
41 | class InternalKey; | |
42 | class InternalStats; | |
43 | class ColumnFamilyData; | |
44 | class DBImpl; | |
45 | class LogBuffer; | |
46 | class InstrumentedMutex; | |
47 | class InstrumentedMutexLock; | |
11fdf7f2 | 48 | struct SuperVersionContext; |
20effc67 | 49 | class BlobFileCache; |
1e59de90 | 50 | class BlobSource; |
7c673cae FG |
51 | |
52 | extern const double kIncSlowdownRatio; | |
f67539c2 TL |
53 | // This file contains a list of data structures for managing column family |
54 | // level metadata. | |
55 | // | |
56 | // The basic relationships among classes declared here are illustrated as | |
57 | // following: | |
58 | // | |
59 | // +----------------------+ +----------------------+ +--------+ | |
60 | // +---+ ColumnFamilyHandle 1 | +--+ ColumnFamilyHandle 2 | | DBImpl | | |
61 | // | +----------------------+ | +----------------------+ +----+---+ | |
62 | // | +--------------------------+ | | |
63 | // | | +-----------------------------+ | |
64 | // | | | | |
65 | // | | +-----------------------------v-------------------------------+ | |
66 | // | | | | | |
67 | // | | | ColumnFamilySet | | |
68 | // | | | | | |
69 | // | | +-------------+--------------------------+----------------+---+ | |
70 | // | | | | | | |
71 | // | +-------------------------------------+ | | | |
72 | // | | | | v | |
73 | // | +-------------v-------------+ +-----v----v---------+ | |
74 | // | | | | | | |
75 | // | | ColumnFamilyData 1 | | ColumnFamilyData 2 | ...... | |
76 | // | | | | | | |
77 | // +---> | | | | |
78 | // | +---------+ | | | |
79 | // | | MemTable| | | | |
80 | // | | List | | | | |
81 | // +--------+---+--+-+----+----+ +--------------------++ | |
82 | // | | | | | |
83 | // | | | | | |
84 | // | | | +-----------------------+ | |
85 | // | | +-----------+ | | |
86 | // v +--------+ | | | |
87 | // +--------+--------+ | | | | |
88 | // | | | | +----------v----------+ | |
89 | // +---> |SuperVersion 1.a +-----------------> | | |
90 | // | +------+ | | MemTableListVersion | | |
91 | // +---+-------------+ | | | | | | |
92 | // | | | | +----+------------+---+ | |
93 | // | current | | | | | | |
94 | // | +-------------+ | |mem | | | |
95 | // | | | | | | | |
96 | // +-v---v-------+ +---v--v---+ +-----v----+ +----v-----+ | |
97 | // | | | | | | | | | |
98 | // | Version 1.a | | memtable | | memtable | | memtable | | |
99 | // | | | 1.a | | 1.b | | 1.c | | |
100 | // +-------------+ | | | | | | | |
101 | // +----------+ +----------+ +----------+ | |
102 | // | |
103 | // DBImpl keeps a ColumnFamilySet, which references all column families by | |
104 | // pointing to respective ColumnFamilyData object of each column family. | |
105 | // This is how DBImpl can list and operate on all the column families. | |
106 | // ColumnFamilyHandle also points to ColumnFamilyData directly, so that | |
107 | // when a user executes a query, it can directly find memtables and Version | |
108 | // as well as SuperVersion to the column family, without going through | |
109 | // ColumnFamilySet. | |
110 | // | |
111 | // ColumnFamilySet points to the latest view of the LSM-tree (list of memtables | |
112 | // and SST files) indirectly, while ongoing operations may hold references | |
113 | // to a current or an out-of-date SuperVersion, which in turn points to a | |
114 | // point-in-time view of the LSM-tree. This guarantees the memtables and SST | |
115 | // files being operated on will not go away, until the SuperVersion is | |
116 | // unreferenced to 0 and destroyed. | |
117 | // | |
118 | // The following graph illustrates one possible set of referencing relationships: | |
119 | // | |
120 | // Column +--------------+ current +-----------+ | |
121 | // Family +---->+ +------------------->+ | | |
122 | // Data | SuperVersion +----------+ | Version A | | |
123 | // | 3 | imm | | | | |
124 | // Iter2 +----->+ | +-------v------+ +-----------+ | |
125 | // +-----+--------+ | MemtableList +----------------> Empty | |
126 | // | | Version r | +-----------+ | |
127 | // | +--------------+ | | | |
128 | // +------------------+ current| Version B | | |
129 | // +--------------+ | +----->+ | | |
130 | // | | | | +-----+-----+ | |
131 | // Compaction +>+ SuperVersion +-------------+ ^ | |
132 | // Job | 2 +------+ | |current | |
133 | // | +----+ | | mem | +------------+ | |
134 | // +--------------+ | | +---------------------> | | |
135 | // | +------------------------> MemTable a | | |
136 | // | mem | | | | |
137 | // +--------------+ | | +------------+ | |
138 | // | +--------------------------+ | |
139 | // Iter1 +-----> SuperVersion | | +------------+ | |
140 | // | 1 +------------------------------>+ | | |
141 | // | +-+ | mem | MemTable b | | |
142 | // +--------------+ | | | | | |
143 | // | | +--------------+ +-----^------+ | |
144 | // | |imm | MemtableList | | | |
145 | // | +--->+ Version s +------------+ | |
146 | // | +--------------+ | |
147 | // | +--------------+ | |
148 | // | | MemtableList | | |
149 | // +------>+ Version t +--------> Empty | |
150 | // imm +--------------+ | |
151 | // | |
152 | // In this example, even if the current LSM-tree consists of Version A and | |
153 | // memtable a, which is also referenced by SuperVersion, two older | |
154 | // SuperVersions (SuperVersion2 and SuperVersion1) still exist, referenced by a | |
155 | // compaction job and an old iterator Iter1, respectively. SuperVersion2 | |
156 | // contains Version B, memtable a and memtable b; SuperVersion1 contains | |
157 | // Version B and memtable b (mutable). As a result, Version B and memtable b | |
158 | // are prevented from being destroyed or deleted. | |
7c673cae FG |
159 | |
160 | // ColumnFamilyHandleImpl is the class that clients use to access different | |
161 | // column families. It has a non-trivial destructor, which gets called when the | |
162 | // client is done using the column family. | |
163 | class ColumnFamilyHandleImpl : public ColumnFamilyHandle { | |
164 | public: | |
165 | // Create while holding the mutex. | |
1e59de90 TL |
166 | ColumnFamilyHandleImpl(ColumnFamilyData* cfd, DBImpl* db, |
167 | InstrumentedMutex* mutex); | |
7c673cae FG |
168 | // Destroy without holding the mutex.
169 | virtual ~ColumnFamilyHandleImpl(); | |
170 | virtual ColumnFamilyData* cfd() const { return cfd_; } | |
171 | ||
172 | virtual uint32_t GetID() const override; | |
173 | virtual const std::string& GetName() const override; | |
174 | virtual Status GetDescriptor(ColumnFamilyDescriptor* desc) override; | |
175 | virtual const Comparator* GetComparator() const override; | |
176 | ||
177 | private: | |
178 | ColumnFamilyData* cfd_; | |
179 | DBImpl* db_; | |
180 | InstrumentedMutex* mutex_; | |
181 | }; | |
182 | ||
183 | // Does not ref-count ColumnFamilyData. | |
184 | // We use this dummy ColumnFamilyHandleImpl because sometimes MemTableInserter | |
185 | // calls DBImpl methods. When this happens, MemTableInserter needs access to a | |
186 | // ColumnFamilyHandle (same as the client would need). In that case, we feed | |
187 | // MemTableInserter a dummy ColumnFamilyHandle and enable it to call DBImpl | |
188 | // methods. | |
189 | class ColumnFamilyHandleInternal : public ColumnFamilyHandleImpl { | |
190 | public: | |
191 | ColumnFamilyHandleInternal() | |
1e59de90 TL |
192 | : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), |
193 | internal_cfd_(nullptr) {} | |
7c673cae FG |
194 | |
195 | void SetCFD(ColumnFamilyData* _cfd) { internal_cfd_ = _cfd; } | |
196 | virtual ColumnFamilyData* cfd() const override { return internal_cfd_; } | |
197 | ||
198 | private: | |
199 | ColumnFamilyData* internal_cfd_; | |
200 | }; | |
201 | ||
202 | // Holds references to the memtable, all immutable memtables and the version. | |
203 | struct SuperVersion { | |
204 | // Accessing members of this struct is not thread-safe and requires external | |
205 | // synchronization (i.e. db mutex held or on write thread). | |
f67539c2 | 206 | ColumnFamilyData* cfd; |
7c673cae FG |
207 | MemTable* mem; |
208 | MemTableListVersion* imm; | |
209 | Version* current; | |
210 | MutableCFOptions mutable_cf_options; | |
211 | // Version number of the current SuperVersion | |
212 | uint64_t version_number; | |
11fdf7f2 | 213 | WriteStallCondition write_stall_condition; |
7c673cae | 214 | |
7c673cae FG |
215 | // should be called outside the mutex
216 | SuperVersion() = default; | |
217 | ~SuperVersion(); | |
218 | SuperVersion* Ref(); | |
219 | // If Unref() returns true, Cleanup() should be called with mutex held | |
220 | // before deleting this SuperVersion. | |
221 | bool Unref(); | |
222 | ||
223 | // Call these two methods with the db mutex held. | |
224 | // Cleanup unrefs mem, imm and current. Also, it stores all memtables | |
225 | // that need to be deleted in the to_delete vector. Unreffing those | |
226 | // objects needs to be done under the mutex. | |
227 | void Cleanup(); | |
f67539c2 TL |
228 | void Init(ColumnFamilyData* new_cfd, MemTable* new_mem, |
229 | MemTableListVersion* new_imm, Version* new_current); | |
7c673cae FG |
230 | |
231 | // The value of dummy is not actually used. kSVInUse takes its address as a | |
232 | // mark in the thread local storage to indicate the SuperVersion is in use | |
233 | // by a thread. This way, the value of kSVInUse is guaranteed to have no | |
234 | // conflict with any SuperVersion object address and is portable across | |
235 | // platforms. | |
236 | static int dummy; | |
237 | static void* const kSVInUse; | |
238 | static void* const kSVObsolete; | |
239 | ||
240 | private: | |
241 | std::atomic<uint32_t> refs; | |
242 | // We need to_delete because during Cleanup(), imm->Unref() returns | |
243 | // all memtables that we need to free through this vector. We then | |
244 | // delete all those memtables outside of the mutex, during destruction. | |
245 | autovector<MemTable*> to_delete; | |
246 | }; | |
247 | ||
248 | extern Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options); | |
249 | ||
250 | extern Status CheckConcurrentWritesSupported( | |
251 | const ColumnFamilyOptions& cf_options); | |
252 | ||
11fdf7f2 TL |
253 | extern Status CheckCFPathsSupported(const DBOptions& db_options, |
254 | const ColumnFamilyOptions& cf_options); | |
255 | ||
7c673cae FG |
256 | extern ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, |
257 | const ColumnFamilyOptions& src); | |
1e59de90 | 258 | // Wrap user-defined table properties collector factories from `cf_options` |
7c673cae FG |
259 | // into internal ones in `int_tbl_prop_collector_factories`. Add a system internal
260 | // one too. | |
261 | extern void GetIntTblPropCollectorFactory( | |
262 | const ImmutableCFOptions& ioptions, | |
1e59de90 | 263 | IntTblPropCollectorFactories* int_tbl_prop_collector_factories); |
7c673cae FG |
264 | |
265 | class ColumnFamilySet; | |
266 | ||
267 | // This class keeps all the data that a column family needs. | |
268 | // Most methods require the DB mutex to be held, unless otherwise noted. | |
269 | class ColumnFamilyData { | |
270 | public: | |
271 | ~ColumnFamilyData(); | |
272 | ||
273 | // thread-safe | |
274 | uint32_t GetID() const { return id_; } | |
275 | // thread-safe | |
276 | const std::string& GetName() const { return name_; } | |
277 | ||
278 | // Ref() can only be called from a context where the caller can guarantee | |
279 | // that ColumnFamilyData is alive (while holding a non-zero ref already, | |
280 | // holding a DB mutex, or as the leader in a write batch group). | |
f67539c2 | 281 | void Ref() { refs_.fetch_add(1); } |
7c673cae | 282 | |
f67539c2 TL |
283 | // UnrefAndTryDelete() decrements the reference count and frees this object
284 | // if needed. Returns true if this was freed, false otherwise. It can only | |
285 | // be called while holding a DB mutex, or during single-threaded recovery. | |
286 | bool UnrefAndTryDelete(); | |
287 | ||
7c673cae FG |
288 | // SetDropped() can only be called under the following conditions:
289 | // 1) Holding a DB mutex, | |
290 | // 2) from single-threaded write thread, AND | |
291 | // 3) from single-threaded VersionSet::LogAndApply() | |
292 | // After dropping column family no other operation on that column family | |
293 | // will be executed. All the files and memory will be, however, kept around | |
294 | // until client drops the column family handle. That way, client can still | |
295 | // access data from dropped column family. | |
296 | // A column family can be dropped and still be alive. In that state: | |
297 | // *) Compaction and flush are not executed on the dropped column family. | |
298 | // *) Client can continue reading from column family. Writes will fail unless | |
299 | // WriteOptions::ignore_missing_column_families is true | |
300 | // When the dropped column family is unreferenced, then we: | |
301 | // *) Remove column family from the linked list maintained by ColumnFamilySet | |
302 | // *) delete all memory associated with that column family | |
303 | // *) delete all the files associated with that column family | |
304 | void SetDropped(); | |
11fdf7f2 | 305 | bool IsDropped() const { return dropped_.load(std::memory_order_relaxed); } |
7c673cae FG |
306 | |
307 | // thread-safe | |
308 | int NumberLevels() const { return ioptions_.num_levels; } | |
309 | ||
310 | void SetLogNumber(uint64_t log_number) { log_number_ = log_number; } | |
311 | uint64_t GetLogNumber() const { return log_number_; } | |
312 | ||
11fdf7f2 TL |
313 | void SetFlushReason(FlushReason flush_reason) { |
314 | flush_reason_ = flush_reason; | |
315 | } | |
316 | FlushReason GetFlushReason() const { return flush_reason_; } | |
7c673cae | 317 | // thread-safe |
f67539c2 | 318 | const FileOptions* soptions() const; |
1e59de90 | 319 | const ImmutableOptions* ioptions() const { return &ioptions_; } |
7c673cae FG |
320 | // REQUIRES: DB mutex held
321 | // This returns the MutableCFOptions used by current SuperVersion | |
322 | // You should use this API to reference MutableCFOptions most of the time. | |
323 | const MutableCFOptions* GetCurrentMutableCFOptions() const { | |
324 | return &(super_version_->mutable_cf_options); | |
325 | } | |
326 | // REQUIRES: DB mutex held | |
327 | // This returns the latest MutableCFOptions, which may not be in effect yet. | |
328 | const MutableCFOptions* GetLatestMutableCFOptions() const { | |
329 | return &mutable_cf_options_; | |
330 | } | |
331 | ||
332 | // REQUIRES: DB mutex held | |
333 | // Build ColumnFamilyOptions with immutable options and latest mutable | |
334 | // options. | |
335 | ColumnFamilyOptions GetLatestCFOptions() const; | |
336 | ||
337 | bool is_delete_range_supported() { return is_delete_range_supported_; } | |
338 | ||
f67539c2 TL |
339 | // Validate CF options against DB options
340 | static Status ValidateOptions(const DBOptions& db_options, | |
341 | const ColumnFamilyOptions& cf_options); | |
7c673cae FG |
342 | #ifndef ROCKSDB_LITE |
343 | // REQUIRES: DB mutex held | |
344 | Status SetOptions( | |
f67539c2 | 345 | const DBOptions& db_options, |
7c673cae FG |
346 | const std::unordered_map<std::string, std::string>& options_map); |
347 | #endif // ROCKSDB_LITE | |
348 | ||
349 | InternalStats* internal_stats() { return internal_stats_.get(); } | |
350 | ||
351 | MemTableList* imm() { return &imm_; } | |
352 | MemTable* mem() { return mem_; } | |
1e59de90 TL |
353 | |
354 | bool IsEmpty() { | |
355 | return mem()->GetFirstSequenceNumber() == 0 && imm()->NumNotFlushed() == 0; | |
356 | } | |
357 | ||
7c673cae FG |
358 | Version* current() { return current_; } |
359 | Version* dummy_versions() { return dummy_versions_; } | |
360 | void SetCurrent(Version* _current); | |
1e59de90 | 361 | uint64_t GetNumLiveVersions() const; // REQUIRES: DB mutex held |
7c673cae | 362 | uint64_t GetTotalSstFilesSize() const; // REQUIRES: DB mutex held |
11fdf7f2 | 363 | uint64_t GetLiveSstFilesSize() const; // REQUIRES: DB mutex held |
1e59de90 | 364 | uint64_t GetTotalBlobFileSize() const; // REQUIRES: DB mutex held |
11fdf7f2 TL |
365 | void SetMemtable(MemTable* new_mem) { |
366 | uint64_t memtable_id = last_memtable_id_.fetch_add(1) + 1; | |
367 | new_mem->SetID(memtable_id); | |
368 | mem_ = new_mem; | |
369 | } | |
7c673cae FG |
370 | |
371 | // Calculate the oldest log needed for the durability of this column family. | |
372 | uint64_t OldestLogToKeep(); | |
373 | ||
374 | // See Memtable constructor for explanation of earliest_seq param. | |
375 | MemTable* ConstructNewMemtable(const MutableCFOptions& mutable_cf_options, | |
376 | SequenceNumber earliest_seq); | |
377 | void CreateNewMemtable(const MutableCFOptions& mutable_cf_options, | |
378 | SequenceNumber earliest_seq); | |
379 | ||
380 | TableCache* table_cache() const { return table_cache_.get(); } | |
1e59de90 | 381 | BlobSource* blob_source() const { return blob_source_.get(); } |
7c673cae FG |
382 | |
383 | // See documentation in compaction_picker.h | |
384 | // REQUIRES: DB mutex held | |
385 | bool NeedsCompaction() const; | |
386 | // REQUIRES: DB mutex held | |
387 | Compaction* PickCompaction(const MutableCFOptions& mutable_options, | |
20effc67 | 388 | const MutableDBOptions& mutable_db_options, |
7c673cae FG |
389 | LogBuffer* log_buffer); |
390 | ||
391 | // Check if the passed range overlaps with any running compactions. | |
392 | // REQUIRES: DB mutex held | |
393 | bool RangeOverlapWithCompaction(const Slice& smallest_user_key, | |
394 | const Slice& largest_user_key, | |
395 | int level) const; | |
396 | ||
11fdf7f2 TL |
397 | // Check if the passed ranges overlap with any unflushed memtables
398 | // (immutable or mutable). | |
399 | // | |
400 | // @param super_version A referenced SuperVersion that will be held for the | |
401 | // duration of this function. | |
402 | // | |
403 | // Thread-safe | |
404 | Status RangesOverlapWithMemtables(const autovector<Range>& ranges, | |
20effc67 TL |
405 | SuperVersion* super_version, |
406 | bool allow_data_in_errors, bool* overlap); | |
11fdf7f2 | 407 | |
7c673cae | 408 | // A flag to tell a manual compaction is to compact all levels together |
11fdf7f2 | 409 | // instead of a specific level. |
7c673cae FG |
410 | static const int kCompactAllLevels; |
411 | // A flag to tell a manual compaction's output is base level. | |
412 | static const int kCompactToBaseLevel; | |
413 | // REQUIRES: DB mutex held | |
414 | Compaction* CompactRange(const MutableCFOptions& mutable_cf_options, | |
20effc67 | 415 | const MutableDBOptions& mutable_db_options, |
7c673cae | 416 | int input_level, int output_level, |
f67539c2 | 417 | const CompactRangeOptions& compact_range_options, |
11fdf7f2 | 418 | const InternalKey* begin, const InternalKey* end, |
f67539c2 | 419 | InternalKey** compaction_end, bool* manual_conflict, |
1e59de90 TL |
420 | uint64_t max_file_num_to_ignore, |
421 | const std::string& trim_ts); | |
7c673cae FG |
422 | |
423 | CompactionPicker* compaction_picker() { return compaction_picker_.get(); } | |
424 | // thread-safe | |
425 | const Comparator* user_comparator() const { | |
426 | return internal_comparator_.user_comparator(); | |
427 | } | |
428 | // thread-safe | |
429 | const InternalKeyComparator& internal_comparator() const { | |
430 | return internal_comparator_; | |
431 | } | |
432 | ||
1e59de90 | 433 | const IntTblPropCollectorFactories* int_tbl_prop_collector_factories() const { |
7c673cae FG |
434 | return &int_tbl_prop_collector_factories_; |
435 | } | |
436 | ||
437 | SuperVersion* GetSuperVersion() { return super_version_; } | |
438 | // thread-safe | |
439 | // Return an already referenced SuperVersion to be used safely. | |
f67539c2 | 440 | SuperVersion* GetReferencedSuperVersion(DBImpl* db); |
7c673cae FG |
441 | // thread-safe
442 | // Get SuperVersion stored in thread local storage. If it does not exist, | |
443 | // get a reference from a current SuperVersion. | |
f67539c2 | 444 | SuperVersion* GetThreadLocalSuperVersion(DBImpl* db); |
1e59de90 | 445 | // Try to return SuperVersion back to thread local storage. Return true on |
7c673cae FG |
446 | // success and false on failure. It fails when the thread local storage
447 | // contains anything other than SuperVersion::kSVInUse flag. | |
448 | bool ReturnThreadLocalSuperVersion(SuperVersion* sv); | |
449 | // thread-safe | |
450 | uint64_t GetSuperVersionNumber() const { | |
451 | return super_version_number_.load(); | |
452 | } | |
453 | // NOTE(review): both overloads below return void. The previous SuperVersion, | |
454 | // if it needs deletion, is presumably handed back through sv_context, which | |
455 | // presumably also carries a SuperVersion allocated by the client outside of | |
456 | // the mutex -- confirm against the implementation. | |
457 | // IMPORTANT: Only call this from DBImpl::InstallSuperVersion() | |
11fdf7f2 | 458 | void InstallSuperVersion(SuperVersionContext* sv_context, |
11fdf7f2 TL |
459 | const MutableCFOptions& mutable_cf_options); |
460 | void InstallSuperVersion(SuperVersionContext* sv_context, | |
461 | InstrumentedMutex* db_mutex); | |
7c673cae FG |
462 | |
463 | void ResetThreadLocalSuperVersions(); | |
464 | ||
465 | // Protected by DB mutex | |
11fdf7f2 TL |
466 | void set_queued_for_flush(bool value) { queued_for_flush_ = value; } |
467 | void set_queued_for_compaction(bool value) { queued_for_compaction_ = value; } | |
468 | bool queued_for_flush() { return queued_for_flush_; } | |
469 | bool queued_for_compaction() { return queued_for_compaction_; } | |
470 | ||
471 | enum class WriteStallCause { | |
472 | kNone, | |
473 | kMemtableLimit, | |
474 | kL0FileCountLimit, | |
475 | kPendingCompactionBytes, | |
476 | }; | |
477 | static std::pair<WriteStallCondition, WriteStallCause> | |
1e59de90 TL |
478 | GetWriteStallConditionAndCause( |
479 | int num_unflushed_memtables, int num_l0_files, | |
480 | uint64_t num_compaction_needed_bytes, | |
481 | const MutableCFOptions& mutable_cf_options, | |
482 | const ImmutableCFOptions& immutable_cf_options); | |
483 | ||
484 | // Recalculate some stall conditions, which are changed only during | |
485 | // compaction, adding new memtable and/or recalculation of compaction score. | |
11fdf7f2 | 486 | WriteStallCondition RecalculateWriteStallConditions( |
7c673cae FG |
487 | const MutableCFOptions& mutable_cf_options); |
488 | ||
11fdf7f2 TL |
489 | void set_initialized() { initialized_.store(true); } |
490 | ||
491 | bool initialized() const { return initialized_.load(); } | |
492 | ||
493 | const ColumnFamilyOptions& initial_cf_options() { | |
494 | return initial_cf_options_; | |
495 | } | |
496 | ||
497 | Env::WriteLifeTimeHint CalculateSSTWriteHint(int level); | |
498 | ||
f67539c2 TL |
499 | // created_dirs remembers the directories already created, so that we don't
500 | // need to call the same creation operation again. | |
501 | Status AddDirectories( | |
20effc67 | 502 | std::map<std::string, std::shared_ptr<FSDirectory>>* created_dirs); |
11fdf7f2 | 503 | |
20effc67 | 504 | FSDirectory* GetDataDir(size_t path_id) const; |
11fdf7f2 | 505 | |
1e59de90 TL |
506 | // full_history_ts_low_ can only increase.
507 | void SetFullHistoryTsLow(std::string ts_low) { | |
508 | assert(!ts_low.empty()); | |
509 | const Comparator* ucmp = user_comparator(); | |
510 | assert(ucmp); | |
511 | if (full_history_ts_low_.empty() || | |
512 | ucmp->CompareTimestamp(ts_low, full_history_ts_low_) > 0) { | |
513 | full_history_ts_low_ = std::move(ts_low); | |
514 | } | |
515 | } | |
516 | ||
517 | const std::string& GetFullHistoryTsLow() const { | |
518 | return full_history_ts_low_; | |
519 | } | |
520 | ||
494da23a | 521 | ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); } |
1e59de90 TL |
522 | WriteBufferManager* write_buffer_mgr() { return write_buffer_manager_; } |
523 | std::shared_ptr<CacheReservationManager> | |
524 | GetFileMetadataCacheReservationManager() { | |
525 | return file_metadata_cache_res_mgr_; | |
526 | } | |
527 | ||
528 | SequenceNumber GetFirstMemtableSequenceNumber() const; | |
529 | ||
530 | static const uint32_t kDummyColumnFamilyDataId; | |
531 | ||
532 | // Keep track of whether the mempurge feature was ever used. | |
533 | void SetMempurgeUsed() { mempurge_used_ = true; } | |
534 | bool GetMempurgeUsed() { return mempurge_used_; } | |
494da23a | 535 | |
7c673cae FG |
536 | private: |
537 | friend class ColumnFamilySet; | |
538 | ColumnFamilyData(uint32_t id, const std::string& name, | |
539 | Version* dummy_versions, Cache* table_cache, | |
540 | WriteBufferManager* write_buffer_manager, | |
541 | const ColumnFamilyOptions& options, | |
542 | const ImmutableDBOptions& db_options, | |
1e59de90 | 543 | const FileOptions* file_options, |
f67539c2 | 544 | ColumnFamilySet* column_family_set, |
20effc67 | 545 | BlockCacheTracer* const block_cache_tracer, |
1e59de90 TL |
546 | const std::shared_ptr<IOTracer>& io_tracer, |
547 | const std::string& db_id, const std::string& db_session_id); | |
20effc67 TL |
548 | |
549 | std::vector<std::string> GetDbPaths() const; | |
7c673cae FG |
550 | |
551 | uint32_t id_; | |
552 | const std::string name_; | |
553 | Version* dummy_versions_; // Head of circular doubly-linked list of versions. | |
554 | Version* current_; // == dummy_versions->prev_ | |
555 | ||
1e59de90 | 556 | std::atomic<int> refs_; // outstanding references to ColumnFamilyData |
11fdf7f2 TL |
557 | std::atomic<bool> initialized_; |
558 | std::atomic<bool> dropped_; // true if client dropped it | |
7c673cae FG |
559 | |
560 | const InternalKeyComparator internal_comparator_; | |
1e59de90 | 561 | IntTblPropCollectorFactories int_tbl_prop_collector_factories_; |
7c673cae FG |
562 | |
563 | const ColumnFamilyOptions initial_cf_options_; | |
1e59de90 | 564 | const ImmutableOptions ioptions_; |
7c673cae FG |
565 | MutableCFOptions mutable_cf_options_; |
566 | ||
567 | const bool is_delete_range_supported_; | |
568 | ||
569 | std::unique_ptr<TableCache> table_cache_; | |
20effc67 | 570 | std::unique_ptr<BlobFileCache> blob_file_cache_; |
1e59de90 | 571 | std::unique_ptr<BlobSource> blob_source_; |
7c673cae FG |
572 | |
573 | std::unique_ptr<InternalStats> internal_stats_; | |
574 | ||
575 | WriteBufferManager* write_buffer_manager_; | |
576 | ||
577 | MemTable* mem_; | |
578 | MemTableList imm_; | |
579 | SuperVersion* super_version_; | |
580 | ||
581 | // An ordinal representing the current SuperVersion. Updated by | |
582 | // InstallSuperVersion(), i.e. incremented every time super_version_ | |
583 | // changes. | |
584 | std::atomic<uint64_t> super_version_number_; | |
585 | ||
586 | // Thread's local copy of SuperVersion pointer | |
587 | // This needs to be destructed before mutex_ | |
588 | std::unique_ptr<ThreadLocalPtr> local_sv_; | |
589 | ||
590 | // Pointers for a circular linked list. We use it to support iterations over | |
591 | // all column families that are alive (note: dropped column families can also | |
592 | // be alive as long as client holds a reference) | |
593 | ColumnFamilyData* next_; | |
594 | ColumnFamilyData* prev_; | |
595 | ||
596 | // This is the earliest log file number that contains data from this | |
597 | // Column Family. All earlier log files must be ignored and not | |
598 | // recovered from | |
599 | uint64_t log_number_; | |
600 | ||
11fdf7f2 TL |
601 | std::atomic<FlushReason> flush_reason_; |
602 | ||
7c673cae FG |
603 | // An object that keeps all the compaction stats
604 | // and picks the next compaction | |
605 | std::unique_ptr<CompactionPicker> compaction_picker_; | |
606 | ||
607 | ColumnFamilySet* column_family_set_; | |
608 | ||
609 | std::unique_ptr<WriteControllerToken> write_controller_token_; | |
610 | ||
611 | // If true --> this ColumnFamily is currently present in DBImpl::flush_queue_ | |
11fdf7f2 | 612 | bool queued_for_flush_; |
7c673cae FG |
613 | |
614 | // If true --> this ColumnFamily is currently present in | |
615 | // DBImpl::compaction_queue_ | |
11fdf7f2 | 616 | bool queued_for_compaction_; |
7c673cae FG |
617 | |
618 | uint64_t prev_compaction_needed_bytes_; | |
619 | ||
620 | // if the database was opened with 2pc enabled | |
621 | bool allow_2pc_; | |
11fdf7f2 TL |
622 | |
623 | // Memtable id to track flush. | |
624 | std::atomic<uint64_t> last_memtable_id_; | |
625 | ||
626 | // Directories corresponding to cf_paths. | |
20effc67 TL |
627 | std::vector<std::shared_ptr<FSDirectory>> data_dirs_; |
628 | ||
629 | bool db_paths_registered_; | |
1e59de90 TL |
630 | |
631 | std::string full_history_ts_low_; | |
632 | ||
633 | // For charging memory usage of file metadata created for newly added files to | |
634 | // a Version associated with this CFD | |
635 | std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr_; | |
636 | bool mempurge_used_; | |
7c673cae FG |
637 | }; |
638 | ||
639 | // ColumnFamilySet has interesting thread-safety requirements | |
640 | // * CreateColumnFamily() or RemoveColumnFamily() -- need to be protected by DB | |
641 | // mutex AND executed in the write thread. | |
642 | // CreateColumnFamily() should ONLY be called from VersionSet::LogAndApply() AND | |
643 | // single-threaded write thread. It is also called during Recovery and in | |
644 | // DumpManifest(). | |
645 | // RemoveColumnFamily() is only called from SetDropped(). DB mutex needs to be | |
646 | // held and it needs to be executed from the write thread. SetDropped() also | |
647 | // guarantees that it will be called only from single-threaded LogAndApply(), | |
648 | // but this condition is not that important. | |
1e59de90 TL |
649 | // * Iteration -- hold DB mutex. If you want to release the DB mutex in the |
650 | // body of the iteration, wrap in a RefedColumnFamilySet. | |
7c673cae FG |
651 | // * GetDefault() -- thread safe |
652 | // * GetColumnFamily() -- either inside of DB mutex or from a write thread | |
653 | // * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(), | |
654 | // NumberOfColumnFamilies -- inside of DB mutex | |
655 | class ColumnFamilySet { | |
656 | public: | |
657 | // ColumnFamilySet supports iteration | |
658 | class iterator { | |
659 | public: | |
1e59de90 TL |
660 | explicit iterator(ColumnFamilyData* cfd) : current_(cfd) {} |
661 | // NOTE: minimum operators for for-loop iteration | |
7c673cae | 662 | iterator& operator++() { |
1e59de90 | 663 | current_ = current_->next_; |
7c673cae FG |
664 | return *this; |
665 | } | |
1e59de90 | 666 | bool operator!=(const iterator& other) const { |
7c673cae FG |
667 | return this->current_ != other.current_; |
668 | } | |
669 | ColumnFamilyData* operator*() { return current_; } | |
670 | ||
671 | private: | |
672 | ColumnFamilyData* current_; | |
673 | }; | |
674 | ||
675 | ColumnFamilySet(const std::string& dbname, | |
676 | const ImmutableDBOptions* db_options, | |
f67539c2 | 677 | const FileOptions& file_options, Cache* table_cache, |
20effc67 TL |
678 | WriteBufferManager* _write_buffer_manager, |
679 | WriteController* _write_controller, | |
680 | BlockCacheTracer* const block_cache_tracer, | |
1e59de90 TL |
681 | const std::shared_ptr<IOTracer>& io_tracer, |
682 | const std::string& db_id, const std::string& db_session_id); | |
7c673cae FG |
683 | ~ColumnFamilySet(); |
684 | ||
685 | ColumnFamilyData* GetDefault() const; | |
686 | // GetColumnFamily() calls return nullptr if column family is not found | |
687 | ColumnFamilyData* GetColumnFamily(uint32_t id) const; | |
688 | ColumnFamilyData* GetColumnFamily(const std::string& name) const; | |
689 | // this call will return the next available column family ID. it guarantees | |
690 | // that there is no column family with id greater than or equal to the | |
691 | // returned value in the current running instance or anytime in RocksDB | |
692 | // instance history. | |
693 | uint32_t GetNextColumnFamilyID(); | |
694 | uint32_t GetMaxColumnFamily(); | |
695 | void UpdateMaxColumnFamily(uint32_t new_max_column_family); | |
696 | size_t NumberOfColumnFamilies() const; | |
697 | ||
698 | ColumnFamilyData* CreateColumnFamily(const std::string& name, uint32_t id, | |
699 | Version* dummy_version, | |
700 | const ColumnFamilyOptions& options); | |
701 | ||
702 | iterator begin() { return iterator(dummy_cfd_->next_); } | |
703 | iterator end() { return iterator(dummy_cfd_); } | |
704 | ||
7c673cae FG |
705 | Cache* get_table_cache() { return table_cache_; } |
706 | ||
20effc67 TL |
707 | WriteBufferManager* write_buffer_manager() { return write_buffer_manager_; } |
708 | ||
709 | WriteController* write_controller() { return write_controller_; } | |
710 | ||
7c673cae FG |
711 | private: |
712 | friend class ColumnFamilyData; | |
713 | // helper function that gets called from cfd destructor | |
714 | // REQUIRES: DB mutex held | |
715 | void RemoveColumnFamily(ColumnFamilyData* cfd); | |
716 | ||
717 | // column_families_ and column_family_data_ need to be protected: | |
718 | // * when mutating both conditions have to be satisfied: | |
719 | // 1. DB mutex locked | |
720 | // 2. thread currently in single-threaded write thread | |
721 | // * when reading, at least one condition needs to be satisfied: | |
722 | // 1. DB mutex locked | |
723 | // 2. accessed from a single-threaded write thread | |
1e59de90 TL |
724 | UnorderedMap<std::string, uint32_t> column_families_; |
725 | UnorderedMap<uint32_t, ColumnFamilyData*> column_family_data_; | |
7c673cae FG |
726 | |
727 | uint32_t max_column_family_; | |
1e59de90 TL |
728 | const FileOptions file_options_; |
729 | ||
7c673cae FG |
730 | ColumnFamilyData* dummy_cfd_; |
731 | // We don't hold the refcount here, since default column family always exists | |
732 | // We are also not responsible for cleaning up default_cfd_cache_. This is | |
733 | // just a cache that makes common case (accessing default column family) | |
734 | // faster | |
735 | ColumnFamilyData* default_cfd_cache_; | |
736 | ||
737 | const std::string db_name_; | |
738 | const ImmutableDBOptions* const db_options_; | |
7c673cae FG |
739 | Cache* table_cache_; |
740 | WriteBufferManager* write_buffer_manager_; | |
741 | WriteController* write_controller_; | |
f67539c2 | 742 | BlockCacheTracer* const block_cache_tracer_; |
20effc67 | 743 | std::shared_ptr<IOTracer> io_tracer_; |
1e59de90 TL |
744 | const std::string& db_id_; |
745 | std::string db_session_id_; | |
746 | }; | |
747 | ||
748 | // A wrapper for ColumnFamilySet that supports releasing DB mutex during each | |
749 | // iteration over the iterator, because the cfd is Refed and Unrefed during | |
750 | // each iteration to prevent concurrent CF drop from destroying it (until | |
751 | // Unref). | |
752 | class RefedColumnFamilySet { | |
753 | public: | |
754 | explicit RefedColumnFamilySet(ColumnFamilySet* cfs) : wrapped_(cfs) {} | |
755 | ||
756 | class iterator { | |
757 | public: | |
758 | explicit iterator(ColumnFamilySet::iterator wrapped) : wrapped_(wrapped) { | |
759 | MaybeRef(*wrapped_); | |
760 | } | |
761 | ~iterator() { MaybeUnref(*wrapped_); } | |
762 | inline void MaybeRef(ColumnFamilyData* cfd) { | |
763 | if (cfd->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId) { | |
764 | cfd->Ref(); | |
765 | } | |
766 | } | |
767 | inline void MaybeUnref(ColumnFamilyData* cfd) { | |
768 | if (cfd->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId) { | |
769 | cfd->UnrefAndTryDelete(); | |
770 | } | |
771 | } | |
772 | // NOTE: minimum operators for for-loop iteration | |
773 | inline iterator& operator++() { | |
774 | ColumnFamilyData* old = *wrapped_; | |
775 | ++wrapped_; | |
776 | // Can only unref & potentially free cfd after accessing its next_ | |
777 | MaybeUnref(old); | |
778 | MaybeRef(*wrapped_); | |
779 | return *this; | |
780 | } | |
781 | inline bool operator!=(const iterator& other) const { | |
782 | return this->wrapped_ != other.wrapped_; | |
783 | } | |
784 | inline ColumnFamilyData* operator*() { return *wrapped_; } | |
785 | ||
786 | private: | |
787 | ColumnFamilySet::iterator wrapped_; | |
788 | }; | |
789 | ||
790 | iterator begin() { return iterator(wrapped_->begin()); } | |
791 | iterator end() { return iterator(wrapped_->end()); } | |
792 | ||
793 | private: | |
794 | ColumnFamilySet* wrapped_; | |
7c673cae FG |
795 | }; |
796 | ||
797 | // We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access | |
798 | // memtables of different column families (specified by ID in the write batch) | |
799 | class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables { | |
800 | public: | |
801 | explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set) | |
802 | : column_family_set_(column_family_set), current_(nullptr) {} | |
803 | ||
804 | // Constructs a ColumnFamilyMemTablesImpl equivalent to one constructed | |
805 | // with the arguments used to construct *orig. | |
806 | explicit ColumnFamilyMemTablesImpl(ColumnFamilyMemTablesImpl* orig) | |
807 | : column_family_set_(orig->column_family_set_), current_(nullptr) {} | |
808 | ||
809 | // sets current_ to ColumnFamilyData with column_family_id | |
810 | // returns false if column family doesn't exist | |
811 | // REQUIRES: use this function of DBImpl::column_family_memtables_ should be | |
812 | // under a DB mutex OR from a write thread | |
813 | bool Seek(uint32_t column_family_id) override; | |
814 | ||
815 | // Returns log number of the selected column family | |
816 | // REQUIRES: under a DB mutex OR from a write thread | |
817 | uint64_t GetLogNumber() const override; | |
818 | ||
819 | // REQUIRES: Seek() called first | |
820 | // REQUIRES: use this function of DBImpl::column_family_memtables_ should be | |
821 | // under a DB mutex OR from a write thread | |
822 | virtual MemTable* GetMemTable() const override; | |
823 | ||
824 | // Returns column family handle for the selected column family | |
825 | // REQUIRES: use this function of DBImpl::column_family_memtables_ should be | |
826 | // under a DB mutex OR from a write thread | |
827 | virtual ColumnFamilyHandle* GetColumnFamilyHandle() override; | |
828 | ||
829 | // Cannot be called while another thread is calling Seek(). | |
830 | // REQUIRES: use this function of DBImpl::column_family_memtables_ should be | |
831 | // under a DB mutex OR from a write thread | |
832 | virtual ColumnFamilyData* current() override { return current_; } | |
833 | ||
834 | private: | |
835 | ColumnFamilySet* column_family_set_; | |
836 | ColumnFamilyData* current_; | |
837 | ColumnFamilyHandleInternal handle_; | |
838 | }; | |
839 | ||
840 | extern uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family); | |
841 | ||
842 | extern const Comparator* GetColumnFamilyUserComparator( | |
843 | ColumnFamilyHandle* column_family); | |
844 | ||
f67539c2 | 845 | } // namespace ROCKSDB_NAMESPACE |