]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/include/rocksdb/utilities/transaction_db.h
bump version to 18.2.2-pve1
[ceph.git] / ceph / src / rocksdb / include / rocksdb / utilities / transaction_db.h
CommitLineData
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5
6#pragma once
7#ifndef ROCKSDB_LITE
8
#include <cstdint>
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "rocksdb/comparator.h"
#include "rocksdb/db.h"
#include "rocksdb/utilities/stackable_db.h"
#include "rocksdb/utilities/transaction.h"
17
// Database with Transaction support.
//
// See transaction.h and examples/transaction_example.cc
21
f67539c2 22namespace ROCKSDB_NAMESPACE {
7c673cae
FG
23
// Forward declaration; this header only refers to the factory through
// std::shared_ptr, so the full definition is not needed here.
class TransactionDBMutexFactory;
25
11fdf7f2
TL
// Policy that decides at which stage of a transaction its data is written
// into the DB. Kept as an unscoped enum so that the enumerators stay
// reachable both with and without the `TxnDBWritePolicy::` qualifier.
enum TxnDBWritePolicy {
  WRITE_COMMITTED = 0,  // write only the committed data
  WRITE_PREPARED,       // write data after the prepare phase of 2pc
  WRITE_UNPREPARED      // write data before the prepare phase of 2pc
};
31
1e59de90
TL
// Default value of TransactionDBOptions::max_num_deadlocks.
constexpr uint32_t kInitialMaxDeadlocks = 5;

// Forward declarations of types that this header only uses by pointer or as
// a template argument before their definition.
class LockManager;
struct RangeLockInfo;
36
37// A lock manager handle
38// The workflow is as follows:
39// * Use a factory method (like NewRangeLockManager()) to create a lock
40// manager and get its handle.
41// * A Handle for a particular kind of lock manager will have extra
42// methods and parameters to control the lock manager
43// * Pass the handle to RocksDB in TransactionDBOptions::lock_mgr_handle. It
44// will be used to perform locking.
45class LockManagerHandle {
46 public:
47 // PessimisticTransactionDB will call this to get the Lock Manager it's going
48 // to use.
49 virtual LockManager* getLockManager() = 0;
50
51 virtual ~LockManagerHandle() {}
52};
53
// Same as class Endpoint, but use std::string to manage the buffer allocation
struct EndpointWithString {
  std::string slice;
  // Defaulted so that a default-constructed endpoint never exposes an
  // indeterminate flag. The struct remains an aggregate (C++14 and later), so
  // `EndpointWithString{str, flag}` keeps working.
  bool inf_suffix = false;
};
59
60struct RangeDeadlockInfo {
61 TransactionID m_txn_id;
62 uint32_t m_cf_id;
63 bool m_exclusive;
64
65 EndpointWithString m_start;
66 EndpointWithString m_end;
67};
68
69struct RangeDeadlockPath {
70 std::vector<RangeDeadlockInfo> path;
71 bool limit_exceeded;
72 int64_t deadlock_time;
73
74 explicit RangeDeadlockPath(std::vector<RangeDeadlockInfo> path_entry,
75 const int64_t& dl_time)
76 : path(path_entry), limit_exceeded(false), deadlock_time(dl_time) {}
77
78 // empty path, limit exceeded constructor and default constructor
79 explicit RangeDeadlockPath(const int64_t& dl_time = 0, bool limit = false)
80 : path(0), limit_exceeded(limit), deadlock_time(dl_time) {}
81
82 bool empty() { return path.empty() && !limit_exceeded; }
83};
84
85// A handle to control RangeLockManager (Range-based lock manager) from outside
86// RocksDB
87class RangeLockManagerHandle : public LockManagerHandle {
88 public:
89 // Set total amount of lock memory to use.
90 //
91 // @return 0 Ok
92 // @return EDOM Failed to set because currently using more memory than
93 // specified
94 virtual int SetMaxLockMemory(size_t max_lock_memory) = 0;
95 virtual size_t GetMaxLockMemory() = 0;
96
97 using RangeLockStatus =
98 std::unordered_multimap<ColumnFamilyId, RangeLockInfo>;
99
100 // Lock Escalation barrier check function.
101 // It is called for a couple of endpoints A and B, such that A < B.
102 // If escalation_barrier_check_func(A, B)==true, then there's a lock
103 // escalation barrier between A and B, and lock escalation is not allowed
104 // to bridge the gap between A and B.
105 //
106 // The function may be called from any thread that acquires or releases
107 // locks. It should not throw exceptions. There is currently no way to return
108 // an error.
109 using EscalationBarrierFunc =
110 std::function<bool(const Endpoint& a, const Endpoint& b)>;
111
112 // Set the user-provided barrier check function
113 virtual void SetEscalationBarrierFunc(EscalationBarrierFunc func) = 0;
114
115 virtual RangeLockStatus GetRangeLockStatusData() = 0;
116
117 class Counters {
118 public:
119 // Number of times lock escalation was triggered (for all column families)
120 uint64_t escalation_count;
121
122 // Number of times lock acquisition had to wait for a conflicting lock
123 // to be released. This counts both successful waits (where the desired
124 // lock was acquired) and waits that timed out or got other error.
125 uint64_t lock_wait_count;
126
127 // How much memory is currently used for locks (total for all column
128 // families)
129 uint64_t current_lock_memory;
130 };
131
132 // Get the current counter values
133 virtual Counters GetStatus() = 0;
134
135 // Functions for range-based Deadlock reporting.
136 virtual std::vector<RangeDeadlockPath> GetRangeDeadlockInfoBuffer() = 0;
137 virtual void SetRangeDeadlockInfoBufferSize(uint32_t target_size) = 0;
138
139 virtual ~RangeLockManagerHandle() {}
140};
141
142// A factory function to create a Range Lock Manager. The created object should
143// be:
144// 1. Passed in TransactionDBOptions::lock_mgr_handle to open the database in
145// range-locking mode
146// 2. Used to control the lock manager when the DB is already open.
147RangeLockManagerHandle* NewRangeLockManager(
148 std::shared_ptr<TransactionDBMutexFactory> mutex_factory);
11fdf7f2 149
7c673cae
FG
150struct TransactionDBOptions {
151 // Specifies the maximum number of keys that can be locked at the same time
152 // per column family.
153 // If the number of locked keys is greater than max_num_locks, transaction
154 // writes (or GetForUpdate) will return an error.
155 // If this value is not positive, no limit will be enforced.
156 int64_t max_num_locks = -1;
157
11fdf7f2
TL
158 // Stores the number of latest deadlocks to track
159 uint32_t max_num_deadlocks = kInitialMaxDeadlocks;
160
7c673cae
FG
161 // Increasing this value will increase the concurrency by dividing the lock
162 // table (per column family) into more sub-tables, each with their own
1e59de90 163 // separate mutex.
7c673cae
FG
164 size_t num_stripes = 16;
165
166 // If positive, specifies the default wait timeout in milliseconds when
167 // a transaction attempts to lock a key if not specified by
168 // TransactionOptions::lock_timeout.
169 //
170 // If 0, no waiting is done if a lock cannot instantly be acquired.
171 // If negative, there is no timeout. Not using a timeout is not recommended
172 // as it can lead to deadlocks. Currently, there is no deadlock-detection to
1e59de90 173 // recover from a deadlock.
7c673cae
FG
174 int64_t transaction_lock_timeout = 1000; // 1 second
175
176 // If positive, specifies the wait timeout in milliseconds when writing a key
177 // OUTSIDE of a transaction (ie by calling DB::Put(),Merge(),Delete(),Write()
178 // directly).
179 // If 0, no waiting is done if a lock cannot instantly be acquired.
180 // If negative, there is no timeout and will block indefinitely when acquiring
181 // a lock.
182 //
183 // Not using a timeout can lead to deadlocks. Currently, there
184 // is no deadlock-detection to recover from a deadlock. While DB writes
185 // cannot deadlock with other DB writes, they can deadlock with a transaction.
186 // A negative timeout should only be used if all transactions have a small
187 // expiration set.
188 int64_t default_lock_timeout = 1000; // 1 second
189
11fdf7f2 190 // If set, the TransactionDB will use this implementation of a mutex and
7c673cae
FG
191 // condition variable for all transaction locking instead of the default
192 // mutex/condvar implementation.
193 std::shared_ptr<TransactionDBMutexFactory> custom_mutex_factory;
11fdf7f2
TL
194
195 // The policy for when to write the data into the DB. The default policy is to
196 // write only the committed data (WRITE_COMMITTED). The data could be written
197 // before the commit phase. The DB then needs to provide the mechanisms to
198 // tell apart committed from uncommitted data.
199 TxnDBWritePolicy write_policy = TxnDBWritePolicy::WRITE_COMMITTED;
200
201 // TODO(myabandeh): remove this option
202 // Note: this is a temporary option as a hot fix in rollback of writeprepared
203 // txns in myrocks. MyRocks uses merge operands for autoinc column id without
204 // however obtaining locks. This breaks the assumption behind the rollback
205 // logic in myrocks. This hack of simply not rolling back merge operands works
206 // for the special way that myrocks uses this operands.
207 bool rollback_merge_operands = false;
494da23a 208
1e59de90
TL
209 // nullptr means use default lock manager.
210 // Other value means the user provides a custom lock manager.
211 std::shared_ptr<LockManagerHandle> lock_mgr_handle;
212
f67539c2
TL
213 // If true, the TransactionDB implementation might skip concurrency control
214 // unless it is overridden by TransactionOptions or
1e59de90 215 // TransactionDBWriteOptimizations. This can be used in conjunction with
f67539c2
TL
216 // DBOptions::unordered_write when the TransactionDB is used solely for write
217 // ordering rather than concurrency control.
218 bool skip_concurrency_control = false;
219
220 // This option is only valid for write unprepared. If a write batch exceeds
221 // this threshold, then the transaction will implicitly flush the currently
222 // pending writes into the database. A value of 0 or less means no limit.
223 int64_t default_write_batch_flush_threshold = 0;
224
1e59de90
TL
225 // This option is valid only for write-prepared/write-unprepared. Transaction
226 // will rely on this callback to determine if a key should be rolled back
227 // with Delete or SingleDelete when necessary. If the callback returns true,
228 // then SingleDelete should be used. If the callback is not callable or the
229 // callback returns false, then a Delete is used.
230 // The application should ensure thread-safety of this callback.
231 // The callback should not throw because RocksDB is not exception-safe.
232 // The callback may be removed if we allow mixing Delete and SingleDelete in
233 // the future.
234 std::function<bool(TransactionDB* /*db*/,
235 ColumnFamilyHandle* /*column_family*/,
236 const Slice& /*key*/)>
237 rollback_deletion_type_callback;
238
494da23a
TL
239 private:
240 // 128 entries
1e59de90
TL
241 // Should the default value change, please also update wp_snapshot_cache_bits
242 // in db_stress_gflags.cc
494da23a
TL
243 size_t wp_snapshot_cache_bits = static_cast<size_t>(7);
244 // 8m entry, 64MB size
1e59de90
TL
245 // Should the default value change, please also update wp_commit_cache_bits
246 // in db_stress_gflags.cc
494da23a
TL
247 size_t wp_commit_cache_bits = static_cast<size_t>(23);
248
f67539c2
TL
249 // For testing, whether transaction name should be auto-generated or not. This
250 // is useful for write unprepared which requires named transactions.
251 bool autogenerate_name = false;
252
494da23a 253 friend class WritePreparedTxnDB;
f67539c2 254 friend class WriteUnpreparedTxn;
494da23a 255 friend class WritePreparedTransactionTestBase;
f67539c2 256 friend class TransactionTestBase;
494da23a 257 friend class MySQLStyleTransactionTest;
1e59de90 258 friend class StressTest;
7c673cae
FG
259};
260
// Per-transaction options. Negative values of lock_timeout and
// write_batch_flush_threshold defer to the corresponding
// TransactionDBOptions default, as described on each field.
struct TransactionOptions {
  // Setting set_snapshot=true is the same as calling
  // Transaction::SetSnapshot().
  bool set_snapshot = false;

  // Setting to true means that before acquiring locks, this transaction will
  // check if doing so will cause a deadlock. If so, it will return with
  // Status::Busy.  The user should retry their transaction.
  bool deadlock_detect = false;

  // If set, it states that the CommitTimeWriteBatch represents the latest state
  // of the application, has only one sub-batch, i.e., no duplicate keys,  and
  // meant to be used later during recovery. It enables an optimization to
  // postpone updating the memtable with CommitTimeWriteBatch to only
  // SwitchMemtable or recovery.
  // This option does not affect write-committed. Only
  // write-prepared/write-unprepared transactions will be affected.
  bool use_only_the_last_commit_time_batch_for_recovery = false;

  // TODO(agiardullo): TransactionDB does not yet support comparators that allow
  // two non-equal keys to be equivalent.  Ie, cmp->Compare(a,b) should only
  // return 0 if
  // a.compare(b) returns 0.

  // If positive, specifies the wait timeout in milliseconds when
  // a transaction attempts to lock a key.
  //
  // If 0, no waiting is done if a lock cannot instantly be acquired.
  // If negative, TransactionDBOptions::transaction_lock_timeout will be used.
  int64_t lock_timeout = -1;

  // Expiration duration in milliseconds.  If non-negative, transactions that
  // last longer than this many milliseconds will fail to commit.  If not set,
  // a forgotten transaction that is never committed, rolled back, or deleted
  // will never relinquish any locks it holds.  This could prevent keys from
  // being written by other writers.
  int64_t expiration = -1;

  // The number of traversals to make during deadlock detection.
  int64_t deadlock_detect_depth = 50;

  // The maximum number of bytes used for the write batch. 0 means no limit.
  size_t max_write_batch_size = 0;

  // Skip Concurrency Control. This could be as an optimization if the
  // application knows that the transaction would not have any conflict with
  // concurrent transactions. It could also be used during recovery if (i)
  // application guarantees no conflict between prepared transactions in the WAL
  // (ii) application guarantees that recovered transactions will be rolled
  // back/commit before new transactions start.
  // Default: false
  bool skip_concurrency_control = false;

  // In pessimistic transaction, if this is true, then you can skip Prepare
  // before Commit, otherwise, you must Prepare before Commit.
  bool skip_prepare = true;

  // See TransactionDBOptions::default_write_batch_flush_threshold for
  // description. If a negative value is specified, then the default value from
  // TransactionDBOptions is used.
  int64_t write_batch_flush_threshold = -1;
};
323
// The per-write optimizations that do not involve transactions. TransactionDB
// implementation might or might not make use of the specified optimizations.
struct TransactionDBWriteOptimizations {
  // If it is true it means that the application guarantees that the
  // key-set in the write batch do not conflict with any concurrent transaction
  // and hence the concurrency control mechanism could be skipped for this
  // write.
  bool skip_concurrency_control = false;

  // If true, the application guarantees that there is no duplicate <column
  // family, key> in the write batch and any employed mechanism to handle
  // duplicate keys could be skipped.
  bool skip_duplicate_key_check = false;
};
337
338struct KeyLockInfo {
339 std::string key;
340 std::vector<TransactionID> ids;
341 bool exclusive;
342};
343
20effc67 344struct RangeLockInfo {
1e59de90
TL
345 EndpointWithString start;
346 EndpointWithString end;
20effc67
TL
347 std::vector<TransactionID> ids;
348 bool exclusive;
349};
350
11fdf7f2
TL
351struct DeadlockInfo {
352 TransactionID m_txn_id;
353 uint32_t m_cf_id;
11fdf7f2 354 bool m_exclusive;
494da23a 355 std::string m_waiting_key;
11fdf7f2
TL
356};
357
358struct DeadlockPath {
359 std::vector<DeadlockInfo> path;
360 bool limit_exceeded;
361 int64_t deadlock_time;
362
363 explicit DeadlockPath(std::vector<DeadlockInfo> path_entry,
364 const int64_t& dl_time)
365 : path(path_entry), limit_exceeded(false), deadlock_time(dl_time) {}
366
367 // empty path, limit exceeded constructor and default constructor
368 explicit DeadlockPath(const int64_t& dl_time = 0, bool limit = false)
369 : path(0), limit_exceeded(limit), deadlock_time(dl_time) {}
370
371 bool empty() { return path.empty() && !limit_exceeded; }
372};
373
7c673cae
FG
374class TransactionDB : public StackableDB {
375 public:
11fdf7f2
TL
376 // Optimized version of ::Write that receives more optimization request such
377 // as skip_concurrency_control.
378 using StackableDB::Write;
379 virtual Status Write(const WriteOptions& opts,
380 const TransactionDBWriteOptimizations&,
381 WriteBatch* updates) {
382 // The default implementation ignores TransactionDBWriteOptimizations and
383 // falls back to the un-optimized version of ::Write
384 return Write(opts, updates);
385 }
20effc67
TL
386 // Transactional `DeleteRange()` is not yet supported.
387 // However, users who know their deleted range does not conflict with
388 // anything can still use it via the `Write()` API. In all cases, the
389 // `Write()` overload specifying `TransactionDBWriteOptimizations` must be
390 // used and `skip_concurrency_control` must be set. When using either
391 // WRITE_PREPARED or WRITE_UNPREPARED , `skip_duplicate_key_check` must
392 // additionally be set.
1e59de90 393 using StackableDB::DeleteRange;
20effc67
TL
394 virtual Status DeleteRange(const WriteOptions&, ColumnFamilyHandle*,
395 const Slice&, const Slice&) override {
396 return Status::NotSupported();
397 }
7c673cae
FG
398 // Open a TransactionDB similar to DB::Open().
399 // Internally call PrepareWrap() and WrapDB()
11fdf7f2 400 // If the return status is not ok, then dbptr is set to nullptr.
7c673cae
FG
401 static Status Open(const Options& options,
402 const TransactionDBOptions& txn_db_options,
403 const std::string& dbname, TransactionDB** dbptr);
404
405 static Status Open(const DBOptions& db_options,
406 const TransactionDBOptions& txn_db_options,
407 const std::string& dbname,
408 const std::vector<ColumnFamilyDescriptor>& column_families,
409 std::vector<ColumnFamilyHandle*>* handles,
410 TransactionDB** dbptr);
7c673cae
FG
411 // Note: PrepareWrap() may change parameters, make copies before the
412 // invocation if needed.
7c673cae
FG
413 static void PrepareWrap(DBOptions* db_options,
414 std::vector<ColumnFamilyDescriptor>* column_families,
415 std::vector<size_t>* compaction_enabled_cf_indices);
11fdf7f2
TL
416 // If the return status is not ok, then dbptr will bet set to nullptr. The
417 // input db parameter might or might not be deleted as a result of the
418 // failure. If it is properly deleted it will be set to nullptr. If the return
419 // status is ok, the ownership of db is transferred to dbptr.
7c673cae
FG
420 static Status WrapDB(DB* db, const TransactionDBOptions& txn_db_options,
421 const std::vector<size_t>& compaction_enabled_cf_indices,
422 const std::vector<ColumnFamilyHandle*>& handles,
423 TransactionDB** dbptr);
11fdf7f2
TL
424 // If the return status is not ok, then dbptr will bet set to nullptr. The
425 // input db parameter might or might not be deleted as a result of the
426 // failure. If it is properly deleted it will be set to nullptr. If the return
427 // status is ok, the ownership of db is transferred to dbptr.
7c673cae
FG
428 static Status WrapStackableDB(
429 StackableDB* db, const TransactionDBOptions& txn_db_options,
430 const std::vector<size_t>& compaction_enabled_cf_indices,
431 const std::vector<ColumnFamilyHandle*>& handles, TransactionDB** dbptr);
11fdf7f2
TL
432 // Since the destructor in StackableDB is virtual, this destructor is virtual
433 // too. The root db will be deleted by the base's destructor.
434 ~TransactionDB() override {}
7c673cae
FG
435
436 // Starts a new Transaction.
437 //
438 // Caller is responsible for deleting the returned transaction when no
439 // longer needed.
440 //
441 // If old_txn is not null, BeginTransaction will reuse this Transaction
442 // handle instead of allocating a new one. This is an optimization to avoid
443 // extra allocations when repeatedly creating transactions.
444 virtual Transaction* BeginTransaction(
445 const WriteOptions& write_options,
446 const TransactionOptions& txn_options = TransactionOptions(),
447 Transaction* old_txn = nullptr) = 0;
448
449 virtual Transaction* GetTransactionByName(const TransactionName& name) = 0;
450 virtual void GetAllPreparedTransactions(std::vector<Transaction*>* trans) = 0;
451
452 // Returns set of all locks held.
453 //
454 // The mapping is column family id -> KeyLockInfo
455 virtual std::unordered_multimap<uint32_t, KeyLockInfo>
456 GetLockStatusData() = 0;
20effc67 457
11fdf7f2
TL
458 virtual std::vector<DeadlockPath> GetDeadlockInfoBuffer() = 0;
459 virtual void SetDeadlockInfoBufferSize(uint32_t target_size) = 0;
7c673cae 460
1e59de90
TL
461 // Create a snapshot and assign ts to it. Return the snapshot to caller. The
462 // snapshot-timestamp mapping is also tracked by the database.
463 // Caller must ensure there are no active writes when this API is called.
464 virtual std::pair<Status, std::shared_ptr<const Snapshot>>
465 CreateTimestampedSnapshot(TxnTimestamp ts) = 0;
466
467 // Return the latest timestamped snapshot if present.
468 std::shared_ptr<const Snapshot> GetLatestTimestampedSnapshot() const {
469 return GetTimestampedSnapshot(kMaxTxnTimestamp);
470 }
471 // Return the snapshot correponding to given timestamp. If ts is
472 // kMaxTxnTimestamp, then we return the latest timestamped snapshot if
473 // present. Othersise, we return the snapshot whose timestamp is equal to
474 // `ts`. If no such snapshot exists, then we return null.
475 virtual std::shared_ptr<const Snapshot> GetTimestampedSnapshot(
476 TxnTimestamp ts) const = 0;
477 // Release timestamped snapshots whose timestamps are less than or equal to
478 // ts.
479 virtual void ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts) = 0;
480
481 // Get all timestamped snapshots which will be stored in
482 // timestamped_snapshots.
483 Status GetAllTimestampedSnapshots(
484 std::vector<std::shared_ptr<const Snapshot>>& timestamped_snapshots)
485 const {
486 return GetTimestampedSnapshots(/*ts_lb=*/0, /*ts_ub=*/kMaxTxnTimestamp,
487 timestamped_snapshots);
488 }
489
490 // Get all timestamped snapshots whose timestamps fall within [ts_lb, ts_ub).
491 // timestamped_snapshots will be cleared and contain returned snapshots.
492 virtual Status GetTimestampedSnapshots(
493 TxnTimestamp ts_lb, TxnTimestamp ts_ub,
494 std::vector<std::shared_ptr<const Snapshot>>& timestamped_snapshots)
495 const = 0;
496
7c673cae
FG
497 protected:
498 // To Create an TransactionDB, call Open()
11fdf7f2 499 // The ownership of db is transferred to the base StackableDB
7c673cae 500 explicit TransactionDB(DB* db) : StackableDB(db) {}
7c673cae 501 // No copying allowed
f67539c2
TL
502 TransactionDB(const TransactionDB&) = delete;
503 void operator=(const TransactionDB&) = delete;
7c673cae
FG
504};
505
f67539c2 506} // namespace ROCKSDB_NAMESPACE
7c673cae
FG
507
508#endif // ROCKSDB_LITE