]> git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/include/rocksdb/utilities/transaction_db.h
update sources to ceph Nautilus 14.2.1
[ceph.git] / ceph / src / rocksdb / include / rocksdb / utilities / transaction_db.h
1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5
6 #pragma once
7 #ifndef ROCKSDB_LITE
8
9 #include <string>
10 #include <utility>
11 #include <vector>
12
13 #include "rocksdb/comparator.h"
14 #include "rocksdb/db.h"
15 #include "rocksdb/utilities/stackable_db.h"
16 #include "rocksdb/utilities/transaction.h"
17
18 // Database with Transaction support.
19 //
20 // See transaction.h and examples/transaction_example.cc
21
22 namespace rocksdb {
23
24 class TransactionDBMutexFactory;
25
// Selects the point in a transaction's life cycle at which its data is
// written into the DB.
enum TxnDBWritePolicy {
  // Write only the committed data.
  WRITE_COMMITTED = 0,

  // Write data after the prepare phase of 2pc.
  // TODO(myabandeh): Not implemented yet
  WRITE_PREPARED = 1,

  // Write data before the prepare phase of 2pc.
  // TODO(myabandeh): Not implemented yet
  WRITE_UNPREPARED = 2
};
33
// Initial value of TransactionDBOptions::max_num_deadlocks.
constexpr uint32_t kInitialMaxDeadlocks = 5;
35
36 struct TransactionDBOptions {
37 // Specifies the maximum number of keys that can be locked at the same time
38 // per column family.
39 // If the number of locked keys is greater than max_num_locks, transaction
40 // writes (or GetForUpdate) will return an error.
41 // If this value is not positive, no limit will be enforced.
42 int64_t max_num_locks = -1;
43
44 // Stores the number of latest deadlocks to track
45 uint32_t max_num_deadlocks = kInitialMaxDeadlocks;
46
47 // Increasing this value will increase the concurrency by dividing the lock
48 // table (per column family) into more sub-tables, each with their own
49 // separate
50 // mutex.
51 size_t num_stripes = 16;
52
53 // If positive, specifies the default wait timeout in milliseconds when
54 // a transaction attempts to lock a key if not specified by
55 // TransactionOptions::lock_timeout.
56 //
57 // If 0, no waiting is done if a lock cannot instantly be acquired.
58 // If negative, there is no timeout. Not using a timeout is not recommended
59 // as it can lead to deadlocks. Currently, there is no deadlock-detection to
60 // recover
61 // from a deadlock.
62 int64_t transaction_lock_timeout = 1000; // 1 second
63
64 // If positive, specifies the wait timeout in milliseconds when writing a key
65 // OUTSIDE of a transaction (ie by calling DB::Put(),Merge(),Delete(),Write()
66 // directly).
67 // If 0, no waiting is done if a lock cannot instantly be acquired.
68 // If negative, there is no timeout and will block indefinitely when acquiring
69 // a lock.
70 //
71 // Not using a timeout can lead to deadlocks. Currently, there
72 // is no deadlock-detection to recover from a deadlock. While DB writes
73 // cannot deadlock with other DB writes, they can deadlock with a transaction.
74 // A negative timeout should only be used if all transactions have a small
75 // expiration set.
76 int64_t default_lock_timeout = 1000; // 1 second
77
78 // If set, the TransactionDB will use this implementation of a mutex and
79 // condition variable for all transaction locking instead of the default
80 // mutex/condvar implementation.
81 std::shared_ptr<TransactionDBMutexFactory> custom_mutex_factory;
82
83 // The policy for when to write the data into the DB. The default policy is to
84 // write only the committed data (WRITE_COMMITTED). The data could be written
85 // before the commit phase. The DB then needs to provide the mechanisms to
86 // tell apart committed from uncommitted data.
87 TxnDBWritePolicy write_policy = TxnDBWritePolicy::WRITE_COMMITTED;
88
89 // TODO(myabandeh): remove this option
90 // Note: this is a temporary option as a hot fix in rollback of writeprepared
91 // txns in myrocks. MyRocks uses merge operands for autoinc column id without
92 // however obtaining locks. This breaks the assumption behind the rollback
93 // logic in myrocks. This hack of simply not rolling back merge operands works
94 // for the special way that myrocks uses this operands.
95 bool rollback_merge_operands = false;
96 };
97
// Per-transaction options, passed to TransactionDB::BeginTransaction().
struct TransactionOptions {
  // set_snapshot=true has the same effect as calling
  // Transaction::SetSnapshot().
  bool set_snapshot{false};

  // When true, the transaction checks before acquiring locks whether doing so
  // will cause a deadlock. If it would, it returns with Status::Busy and the
  // user should retry their transaction.
  bool deadlock_detect{false};

  // When set, states that the CommitTimeWriteBatch represents the latest
  // state of the application, has only one sub-batch (i.e., no duplicate
  // keys), and is meant to be used later during recovery. This enables an
  // optimization that postpones updating the memtable with
  // CommitTimeWriteBatch to only SwitchMemtable or recovery.
  bool use_only_the_last_commit_time_batch_for_recovery{false};

  // TODO(agiardullo): TransactionDB does not yet support comparators that
  // allow two non-equal keys to be equivalent. Ie, cmp->Compare(a,b) should
  // only return 0 if a.compare(b) returns 0.

  // Wait timeout, in milliseconds, used when this transaction attempts to
  // lock a key.
  //
  //   > 0 : wait at most this many milliseconds.
  //   = 0 : do not wait if the lock cannot be acquired instantly.
  //   < 0 : TransactionDBOptions::transaction_lock_timeout will be used.
  int64_t lock_timeout{-1};

  // Expiration duration in milliseconds. If non-negative, a transaction that
  // lasts longer than this many milliseconds will fail to commit. If not set,
  // a forgotten transaction that is never committed, rolled back, or deleted
  // will never relinquish any locks it holds, which could prevent keys from
  // being written by other writers.
  int64_t expiration{-1};

  // Number of traversals to make during deadlock detection.
  int64_t deadlock_detect_depth{50};

  // Maximum number of bytes used for the write batch. 0 means no limit.
  size_t max_write_batch_size{0};

  // Skip Concurrency Control. Usable as an optimization when the application
  // knows that the transaction would not have any conflict with concurrent
  // transactions. It could also be used during recovery if (i) the
  // application guarantees no conflict between prepared transactions in the
  // WAL and (ii) the application guarantees that recovered transactions will
  // be rolled back/committed before new transactions start.
  // Default: false
  bool skip_concurrency_control{false};
};
150
// Per-write optimizations for writes that do not involve transactions. A
// TransactionDB implementation might or might not make use of the
// optimizations specified here.
struct TransactionDBWriteOptimizations {
  // True means the application guarantees that the key-set in the write batch
  // does not conflict with any concurrent transaction, and hence the
  // concurrency control mechanism could be skipped for this write.
  bool skip_concurrency_control{false};

  // True means the application guarantees that there is no duplicate
  // <column family, key> in the write batch, so any mechanism employed to
  // handle duplicate keys could be skipped.
  bool skip_duplicate_key_check{false};
};
164
165 struct KeyLockInfo {
166 std::string key;
167 std::vector<TransactionID> ids;
168 bool exclusive;
169 };
170
171 struct DeadlockInfo {
172 TransactionID m_txn_id;
173 uint32_t m_cf_id;
174 std::string m_waiting_key;
175 bool m_exclusive;
176 };
177
178 struct DeadlockPath {
179 std::vector<DeadlockInfo> path;
180 bool limit_exceeded;
181 int64_t deadlock_time;
182
183 explicit DeadlockPath(std::vector<DeadlockInfo> path_entry,
184 const int64_t& dl_time)
185 : path(path_entry), limit_exceeded(false), deadlock_time(dl_time) {}
186
187 // empty path, limit exceeded constructor and default constructor
188 explicit DeadlockPath(const int64_t& dl_time = 0, bool limit = false)
189 : path(0), limit_exceeded(limit), deadlock_time(dl_time) {}
190
191 bool empty() { return path.empty() && !limit_exceeded; }
192 };
193
194 class TransactionDB : public StackableDB {
195 public:
196 // Optimized version of ::Write that receives more optimization request such
197 // as skip_concurrency_control.
198 using StackableDB::Write;
199 virtual Status Write(const WriteOptions& opts,
200 const TransactionDBWriteOptimizations&,
201 WriteBatch* updates) {
202 // The default implementation ignores TransactionDBWriteOptimizations and
203 // falls back to the un-optimized version of ::Write
204 return Write(opts, updates);
205 }
206 // Open a TransactionDB similar to DB::Open().
207 // Internally call PrepareWrap() and WrapDB()
208 // If the return status is not ok, then dbptr is set to nullptr.
209 static Status Open(const Options& options,
210 const TransactionDBOptions& txn_db_options,
211 const std::string& dbname, TransactionDB** dbptr);
212
213 static Status Open(const DBOptions& db_options,
214 const TransactionDBOptions& txn_db_options,
215 const std::string& dbname,
216 const std::vector<ColumnFamilyDescriptor>& column_families,
217 std::vector<ColumnFamilyHandle*>* handles,
218 TransactionDB** dbptr);
219 // Note: PrepareWrap() may change parameters, make copies before the
220 // invocation if needed.
221 static void PrepareWrap(DBOptions* db_options,
222 std::vector<ColumnFamilyDescriptor>* column_families,
223 std::vector<size_t>* compaction_enabled_cf_indices);
224 // If the return status is not ok, then dbptr will bet set to nullptr. The
225 // input db parameter might or might not be deleted as a result of the
226 // failure. If it is properly deleted it will be set to nullptr. If the return
227 // status is ok, the ownership of db is transferred to dbptr.
228 static Status WrapDB(DB* db, const TransactionDBOptions& txn_db_options,
229 const std::vector<size_t>& compaction_enabled_cf_indices,
230 const std::vector<ColumnFamilyHandle*>& handles,
231 TransactionDB** dbptr);
232 // If the return status is not ok, then dbptr will bet set to nullptr. The
233 // input db parameter might or might not be deleted as a result of the
234 // failure. If it is properly deleted it will be set to nullptr. If the return
235 // status is ok, the ownership of db is transferred to dbptr.
236 static Status WrapStackableDB(
237 StackableDB* db, const TransactionDBOptions& txn_db_options,
238 const std::vector<size_t>& compaction_enabled_cf_indices,
239 const std::vector<ColumnFamilyHandle*>& handles, TransactionDB** dbptr);
240 // Since the destructor in StackableDB is virtual, this destructor is virtual
241 // too. The root db will be deleted by the base's destructor.
242 ~TransactionDB() override {}
243
244 // Starts a new Transaction.
245 //
246 // Caller is responsible for deleting the returned transaction when no
247 // longer needed.
248 //
249 // If old_txn is not null, BeginTransaction will reuse this Transaction
250 // handle instead of allocating a new one. This is an optimization to avoid
251 // extra allocations when repeatedly creating transactions.
252 virtual Transaction* BeginTransaction(
253 const WriteOptions& write_options,
254 const TransactionOptions& txn_options = TransactionOptions(),
255 Transaction* old_txn = nullptr) = 0;
256
257 virtual Transaction* GetTransactionByName(const TransactionName& name) = 0;
258 virtual void GetAllPreparedTransactions(std::vector<Transaction*>* trans) = 0;
259
260 // Returns set of all locks held.
261 //
262 // The mapping is column family id -> KeyLockInfo
263 virtual std::unordered_multimap<uint32_t, KeyLockInfo>
264 GetLockStatusData() = 0;
265 virtual std::vector<DeadlockPath> GetDeadlockInfoBuffer() = 0;
266 virtual void SetDeadlockInfoBufferSize(uint32_t target_size) = 0;
267
268 protected:
269 // To Create an TransactionDB, call Open()
270 // The ownership of db is transferred to the base StackableDB
271 explicit TransactionDB(DB* db) : StackableDB(db) {}
272
273 private:
274 // No copying allowed
275 TransactionDB(const TransactionDB&);
276 void operator=(const TransactionDB&);
277 };
278
279 } // namespace rocksdb
280
281 #endif // ROCKSDB_LITE