]>
Commit | Line | Data |
---|---|---|
7c673cae | 1 | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
11fdf7f2 TL |
2 | // This source code is licensed under both the GPLv2 (found in the |
3 | // COPYING file in the root directory) and Apache 2.0 License | |
4 | // (found in the LICENSE.Apache file in the root directory). | |
7c673cae FG |
5 | |
6 | #pragma once | |
7 | #ifndef ROCKSDB_LITE | |
8 | ||
9 | #include <string> | |
10 | #include <utility> | |
11 | #include <vector> | |
12 | ||
13 | #include "rocksdb/comparator.h" | |
14 | #include "rocksdb/db.h" | |
15 | #include "rocksdb/utilities/stackable_db.h" | |
16 | #include "rocksdb/utilities/transaction.h" | |
17 | ||
18 | // Database with Transaction support. | |
19 | // | |
20 | // See transaction.h and examples/transaction_example.cc | |
21 | ||
f67539c2 | 22 | namespace ROCKSDB_NAMESPACE { |
7c673cae FG |
23 | |
24 | class TransactionDBMutexFactory; | |
25 | ||
11fdf7f2 TL |
// Selects at which point of a transaction's lifetime its data is written into
// the DB. Enumerators are unscoped, so they are visible directly in the
// enclosing namespace.
enum TxnDBWritePolicy {
  WRITE_COMMITTED = 0,  // write only the committed data
  WRITE_PREPARED,       // write data after the prepare phase of 2pc
  WRITE_UNPREPARED      // write data before the prepare phase of 2pc
};
31 | ||
// Initial value of TransactionDBOptions::max_num_deadlocks.
constexpr uint32_t kInitialMaxDeadlocks = 5;
33 | ||
7c673cae FG |
34 | struct TransactionDBOptions { |
35 | // Specifies the maximum number of keys that can be locked at the same time | |
36 | // per column family. | |
37 | // If the number of locked keys is greater than max_num_locks, transaction | |
38 | // writes (or GetForUpdate) will return an error. | |
39 | // If this value is not positive, no limit will be enforced. | |
40 | int64_t max_num_locks = -1; | |
41 | ||
11fdf7f2 TL |
42 | // Stores the number of latest deadlocks to track |
43 | uint32_t max_num_deadlocks = kInitialMaxDeadlocks; | |
44 | ||
7c673cae FG |
45 | // Increasing this value will increase the concurrency by dividing the lock |
46 | // table (per column family) into more sub-tables, each with their own | |
47 | // separate | |
48 | // mutex. | |
49 | size_t num_stripes = 16; | |
50 | ||
51 | // If positive, specifies the default wait timeout in milliseconds when | |
52 | // a transaction attempts to lock a key if not specified by | |
53 | // TransactionOptions::lock_timeout. | |
54 | // | |
55 | // If 0, no waiting is done if a lock cannot instantly be acquired. | |
56 | // If negative, there is no timeout. Not using a timeout is not recommended | |
57 | // as it can lead to deadlocks. Currently, there is no deadlock-detection to | |
58 | // recover | |
59 | // from a deadlock. | |
60 | int64_t transaction_lock_timeout = 1000; // 1 second | |
61 | ||
62 | // If positive, specifies the wait timeout in milliseconds when writing a key | |
63 | // OUTSIDE of a transaction (ie by calling DB::Put(),Merge(),Delete(),Write() | |
64 | // directly). | |
65 | // If 0, no waiting is done if a lock cannot instantly be acquired. | |
66 | // If negative, there is no timeout and will block indefinitely when acquiring | |
67 | // a lock. | |
68 | // | |
69 | // Not using a timeout can lead to deadlocks. Currently, there | |
70 | // is no deadlock-detection to recover from a deadlock. While DB writes | |
71 | // cannot deadlock with other DB writes, they can deadlock with a transaction. | |
72 | // A negative timeout should only be used if all transactions have a small | |
73 | // expiration set. | |
74 | int64_t default_lock_timeout = 1000; // 1 second | |
75 | ||
11fdf7f2 | 76 | // If set, the TransactionDB will use this implementation of a mutex and |
7c673cae FG |
77 | // condition variable for all transaction locking instead of the default |
78 | // mutex/condvar implementation. | |
79 | std::shared_ptr<TransactionDBMutexFactory> custom_mutex_factory; | |
11fdf7f2 TL |
80 | |
81 | // The policy for when to write the data into the DB. The default policy is to | |
82 | // write only the committed data (WRITE_COMMITTED). The data could be written | |
83 | // before the commit phase. The DB then needs to provide the mechanisms to | |
84 | // tell apart committed from uncommitted data. | |
85 | TxnDBWritePolicy write_policy = TxnDBWritePolicy::WRITE_COMMITTED; | |
86 | ||
87 | // TODO(myabandeh): remove this option | |
88 | // Note: this is a temporary option as a hot fix in rollback of writeprepared | |
89 | // txns in myrocks. MyRocks uses merge operands for autoinc column id without | |
90 | // however obtaining locks. This breaks the assumption behind the rollback | |
91 | // logic in myrocks. This hack of simply not rolling back merge operands works | |
92 | // for the special way that myrocks uses this operands. | |
93 | bool rollback_merge_operands = false; | |
494da23a | 94 | |
f67539c2 TL |
95 | // If true, the TransactionDB implementation might skip concurrency control |
96 | // unless it is overridden by TransactionOptions or | |
97 | // TransactionDBWriteOptimizations. This can be used in conjuction with | |
98 | // DBOptions::unordered_write when the TransactionDB is used solely for write | |
99 | // ordering rather than concurrency control. | |
100 | bool skip_concurrency_control = false; | |
101 | ||
102 | // This option is only valid for write unprepared. If a write batch exceeds | |
103 | // this threshold, then the transaction will implicitly flush the currently | |
104 | // pending writes into the database. A value of 0 or less means no limit. | |
105 | int64_t default_write_batch_flush_threshold = 0; | |
106 | ||
494da23a TL |
107 | private: |
108 | // 128 entries | |
109 | size_t wp_snapshot_cache_bits = static_cast<size_t>(7); | |
110 | // 8m entry, 64MB size | |
111 | size_t wp_commit_cache_bits = static_cast<size_t>(23); | |
112 | ||
f67539c2 TL |
113 | // For testing, whether transaction name should be auto-generated or not. This |
114 | // is useful for write unprepared which requires named transactions. | |
115 | bool autogenerate_name = false; | |
116 | ||
494da23a | 117 | friend class WritePreparedTxnDB; |
f67539c2 | 118 | friend class WriteUnpreparedTxn; |
494da23a | 119 | friend class WritePreparedTransactionTestBase; |
f67539c2 | 120 | friend class TransactionTestBase; |
494da23a | 121 | friend class MySQLStyleTransactionTest; |
7c673cae FG |
122 | }; |
123 | ||
// Options that apply to a single transaction.
struct TransactionOptions {
  // Setting set_snapshot=true is the same as calling
  // Transaction::SetSnapshot().
  bool set_snapshot = false;

  // Setting to true means that before acquiring locks, this transaction will
  // check if doing so will cause a deadlock. If so, it will return with
  // Status::Busy. The user should retry their transaction.
  bool deadlock_detect = false;

  // If set, it states that the CommitTimeWriteBatch represents the latest state
  // of the application, has only one sub-batch, i.e., no duplicate keys, and
  // is meant to be used later during recovery. It enables an optimization to
  // postpone updating the memtable with CommitTimeWriteBatch to only
  // SwitchMemtable or recovery.
  bool use_only_the_last_commit_time_batch_for_recovery = false;

  // TODO(agiardullo): TransactionDB does not yet support comparators that allow
  // two non-equal keys to be equivalent. I.e., cmp->Compare(a,b) should only
  // return 0 if a.compare(b) returns 0.

  // If positive, specifies the wait timeout in milliseconds when
  // a transaction attempts to lock a key.
  //
  // If 0, no waiting is done if a lock cannot instantly be acquired.
  // If negative, TransactionDBOptions::transaction_lock_timeout will be used.
  int64_t lock_timeout = -1;

  // Expiration duration in milliseconds. If non-negative, transactions that
  // last longer than this many milliseconds will fail to commit. If not set,
  // a forgotten transaction that is never committed, rolled back, or deleted
  // will never relinquish any locks it holds. This could prevent keys from
  // being written by other writers.
  int64_t expiration = -1;

  // The number of traversals to make during deadlock detection.
  int64_t deadlock_detect_depth = 50;

  // The maximum number of bytes used for the write batch. 0 means no limit.
  size_t max_write_batch_size = 0;

  // Skip Concurrency Control. This could be used as an optimization if the
  // application knows that the transaction would not have any conflict with
  // concurrent transactions. It could also be used during recovery if (i)
  // application guarantees no conflict between prepared transactions in the WAL
  // (ii) application guarantees that recovered transactions will be rolled
  // back/committed before new transactions start.
  // Default: false
  bool skip_concurrency_control = false;

  // In pessimistic transaction, if this is true, then you can skip Prepare
  // before Commit, otherwise, you must Prepare before Commit.
  bool skip_prepare = true;

  // See TransactionDBOptions::default_write_batch_flush_threshold for
  // description. If a negative value is specified, then the default value from
  // TransactionDBOptions is used.
  int64_t write_batch_flush_threshold = -1;
};
184 | ||
// The per-write optimizations that do not involve transactions. TransactionDB
// implementation might or might not make use of the specified optimizations.
struct TransactionDBWriteOptimizations {
  // If true, the application guarantees that the key-set in the write batch
  // does not conflict with any concurrent transaction and hence the
  // concurrency control mechanism could be skipped for this write.
  bool skip_concurrency_control = false;
  // If true, the application guarantees that there is no duplicate <column
  // family, key> in the write batch and any employed mechanism to handle
  // duplicate keys could be skipped.
  bool skip_duplicate_key_check = false;
};
198 | ||
199 | struct KeyLockInfo { | |
200 | std::string key; | |
201 | std::vector<TransactionID> ids; | |
202 | bool exclusive; | |
203 | }; | |
204 | ||
20effc67 TL |
205 | struct RangeLockInfo { |
206 | Endpoint start; | |
207 | Endpoint end; | |
208 | std::vector<TransactionID> ids; | |
209 | bool exclusive; | |
210 | }; | |
211 | ||
11fdf7f2 TL |
212 | struct DeadlockInfo { |
213 | TransactionID m_txn_id; | |
214 | uint32_t m_cf_id; | |
11fdf7f2 | 215 | bool m_exclusive; |
494da23a | 216 | std::string m_waiting_key; |
11fdf7f2 TL |
217 | }; |
218 | ||
219 | struct DeadlockPath { | |
220 | std::vector<DeadlockInfo> path; | |
221 | bool limit_exceeded; | |
222 | int64_t deadlock_time; | |
223 | ||
224 | explicit DeadlockPath(std::vector<DeadlockInfo> path_entry, | |
225 | const int64_t& dl_time) | |
226 | : path(path_entry), limit_exceeded(false), deadlock_time(dl_time) {} | |
227 | ||
228 | // empty path, limit exceeded constructor and default constructor | |
229 | explicit DeadlockPath(const int64_t& dl_time = 0, bool limit = false) | |
230 | : path(0), limit_exceeded(limit), deadlock_time(dl_time) {} | |
231 | ||
232 | bool empty() { return path.empty() && !limit_exceeded; } | |
233 | }; | |
234 | ||
7c673cae FG |
235 | class TransactionDB : public StackableDB { |
236 | public: | |
11fdf7f2 TL |
237 | // Optimized version of ::Write that receives more optimization request such |
238 | // as skip_concurrency_control. | |
239 | using StackableDB::Write; | |
240 | virtual Status Write(const WriteOptions& opts, | |
241 | const TransactionDBWriteOptimizations&, | |
242 | WriteBatch* updates) { | |
243 | // The default implementation ignores TransactionDBWriteOptimizations and | |
244 | // falls back to the un-optimized version of ::Write | |
245 | return Write(opts, updates); | |
246 | } | |
20effc67 TL |
247 | // Transactional `DeleteRange()` is not yet supported. |
248 | // However, users who know their deleted range does not conflict with | |
249 | // anything can still use it via the `Write()` API. In all cases, the | |
250 | // `Write()` overload specifying `TransactionDBWriteOptimizations` must be | |
251 | // used and `skip_concurrency_control` must be set. When using either | |
252 | // WRITE_PREPARED or WRITE_UNPREPARED , `skip_duplicate_key_check` must | |
253 | // additionally be set. | |
254 | virtual Status DeleteRange(const WriteOptions&, ColumnFamilyHandle*, | |
255 | const Slice&, const Slice&) override { | |
256 | return Status::NotSupported(); | |
257 | } | |
7c673cae FG |
258 | // Open a TransactionDB similar to DB::Open(). |
259 | // Internally call PrepareWrap() and WrapDB() | |
11fdf7f2 | 260 | // If the return status is not ok, then dbptr is set to nullptr. |
7c673cae FG |
261 | static Status Open(const Options& options, |
262 | const TransactionDBOptions& txn_db_options, | |
263 | const std::string& dbname, TransactionDB** dbptr); | |
264 | ||
265 | static Status Open(const DBOptions& db_options, | |
266 | const TransactionDBOptions& txn_db_options, | |
267 | const std::string& dbname, | |
268 | const std::vector<ColumnFamilyDescriptor>& column_families, | |
269 | std::vector<ColumnFamilyHandle*>* handles, | |
270 | TransactionDB** dbptr); | |
7c673cae FG |
271 | // Note: PrepareWrap() may change parameters, make copies before the |
272 | // invocation if needed. | |
7c673cae FG |
273 | static void PrepareWrap(DBOptions* db_options, |
274 | std::vector<ColumnFamilyDescriptor>* column_families, | |
275 | std::vector<size_t>* compaction_enabled_cf_indices); | |
11fdf7f2 TL |
276 | // If the return status is not ok, then dbptr will bet set to nullptr. The |
277 | // input db parameter might or might not be deleted as a result of the | |
278 | // failure. If it is properly deleted it will be set to nullptr. If the return | |
279 | // status is ok, the ownership of db is transferred to dbptr. | |
7c673cae FG |
280 | static Status WrapDB(DB* db, const TransactionDBOptions& txn_db_options, |
281 | const std::vector<size_t>& compaction_enabled_cf_indices, | |
282 | const std::vector<ColumnFamilyHandle*>& handles, | |
283 | TransactionDB** dbptr); | |
11fdf7f2 TL |
284 | // If the return status is not ok, then dbptr will bet set to nullptr. The |
285 | // input db parameter might or might not be deleted as a result of the | |
286 | // failure. If it is properly deleted it will be set to nullptr. If the return | |
287 | // status is ok, the ownership of db is transferred to dbptr. | |
7c673cae FG |
288 | static Status WrapStackableDB( |
289 | StackableDB* db, const TransactionDBOptions& txn_db_options, | |
290 | const std::vector<size_t>& compaction_enabled_cf_indices, | |
291 | const std::vector<ColumnFamilyHandle*>& handles, TransactionDB** dbptr); | |
11fdf7f2 TL |
292 | // Since the destructor in StackableDB is virtual, this destructor is virtual |
293 | // too. The root db will be deleted by the base's destructor. | |
294 | ~TransactionDB() override {} | |
7c673cae FG |
295 | |
296 | // Starts a new Transaction. | |
297 | // | |
298 | // Caller is responsible for deleting the returned transaction when no | |
299 | // longer needed. | |
300 | // | |
301 | // If old_txn is not null, BeginTransaction will reuse this Transaction | |
302 | // handle instead of allocating a new one. This is an optimization to avoid | |
303 | // extra allocations when repeatedly creating transactions. | |
304 | virtual Transaction* BeginTransaction( | |
305 | const WriteOptions& write_options, | |
306 | const TransactionOptions& txn_options = TransactionOptions(), | |
307 | Transaction* old_txn = nullptr) = 0; | |
308 | ||
309 | virtual Transaction* GetTransactionByName(const TransactionName& name) = 0; | |
310 | virtual void GetAllPreparedTransactions(std::vector<Transaction*>* trans) = 0; | |
311 | ||
312 | // Returns set of all locks held. | |
313 | // | |
314 | // The mapping is column family id -> KeyLockInfo | |
315 | virtual std::unordered_multimap<uint32_t, KeyLockInfo> | |
316 | GetLockStatusData() = 0; | |
20effc67 | 317 | |
11fdf7f2 TL |
318 | virtual std::vector<DeadlockPath> GetDeadlockInfoBuffer() = 0; |
319 | virtual void SetDeadlockInfoBufferSize(uint32_t target_size) = 0; | |
7c673cae FG |
320 | |
321 | protected: | |
322 | // To Create an TransactionDB, call Open() | |
11fdf7f2 | 323 | // The ownership of db is transferred to the base StackableDB |
7c673cae | 324 | explicit TransactionDB(DB* db) : StackableDB(db) {} |
7c673cae | 325 | // No copying allowed |
f67539c2 TL |
326 | TransactionDB(const TransactionDB&) = delete; |
327 | void operator=(const TransactionDB&) = delete; | |
7c673cae FG |
328 | }; |
329 | ||
f67539c2 | 330 | } // namespace ROCKSDB_NAMESPACE |
7c673cae FG |
331 | |
332 | #endif // ROCKSDB_LITE |