]>
Commit | Line | Data |
---|---|---|
7c673cae | 1 | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
11fdf7f2 TL |
2 | // This source code is licensed under both the GPLv2 (found in the |
3 | // COPYING file in the root directory) and Apache 2.0 License | |
4 | // (found in the LICENSE.Apache file in the root directory). | |
7c673cae FG |
5 | |
6 | #pragma once | |
7 | #ifndef ROCKSDB_LITE | |
8 | ||
9 | #include <string> | |
10 | #include <utility> | |
11 | #include <vector> | |
12 | ||
13 | #include "rocksdb/comparator.h" | |
14 | #include "rocksdb/db.h" | |
15 | #include "rocksdb/utilities/stackable_db.h" | |
16 | #include "rocksdb/utilities/transaction.h" | |
17 | ||
18 | // Database with Transaction support. | |
19 | // | |
20 | // See transaction.h and examples/transaction_example.cc | |
21 | ||
22 | namespace rocksdb { | |
23 | ||
24 | class TransactionDBMutexFactory; | |
25 | ||
11fdf7f2 TL |
// Selects at which point of a transaction's lifetime its data is written into
// the DB.
enum TxnDBWritePolicy {
  // Write only the committed data.
  WRITE_COMMITTED = 0,
  // Write data after the prepare phase of 2pc.
  // TODO(myabandeh): Not implemented yet
  WRITE_PREPARED = 1,
  // Write data before the prepare phase of 2pc.
  // TODO(myabandeh): Not implemented yet
  WRITE_UNPREPARED = 2
};
33 | ||
// Initial value of TransactionDBOptions::max_num_deadlocks.
constexpr uint32_t kInitialMaxDeadlocks = 5;
35 | ||
7c673cae FG |
36 | struct TransactionDBOptions { |
37 | // Specifies the maximum number of keys that can be locked at the same time | |
38 | // per column family. | |
39 | // If the number of locked keys is greater than max_num_locks, transaction | |
40 | // writes (or GetForUpdate) will return an error. | |
41 | // If this value is not positive, no limit will be enforced. | |
42 | int64_t max_num_locks = -1; | |
43 | ||
11fdf7f2 TL |
44 | // Stores the number of latest deadlocks to track |
45 | uint32_t max_num_deadlocks = kInitialMaxDeadlocks; | |
46 | ||
7c673cae FG |
47 | // Increasing this value will increase the concurrency by dividing the lock |
48 | // table (per column family) into more sub-tables, each with their own | |
49 | // separate | |
50 | // mutex. | |
51 | size_t num_stripes = 16; | |
52 | ||
53 | // If positive, specifies the default wait timeout in milliseconds when | |
54 | // a transaction attempts to lock a key if not specified by | |
55 | // TransactionOptions::lock_timeout. | |
56 | // | |
57 | // If 0, no waiting is done if a lock cannot instantly be acquired. | |
58 | // If negative, there is no timeout. Not using a timeout is not recommended | |
59 | // as it can lead to deadlocks. Currently, there is no deadlock-detection to | |
60 | // recover | |
61 | // from a deadlock. | |
62 | int64_t transaction_lock_timeout = 1000; // 1 second | |
63 | ||
64 | // If positive, specifies the wait timeout in milliseconds when writing a key | |
65 | // OUTSIDE of a transaction (ie by calling DB::Put(),Merge(),Delete(),Write() | |
66 | // directly). | |
67 | // If 0, no waiting is done if a lock cannot instantly be acquired. | |
68 | // If negative, there is no timeout and will block indefinitely when acquiring | |
69 | // a lock. | |
70 | // | |
71 | // Not using a timeout can lead to deadlocks. Currently, there | |
72 | // is no deadlock-detection to recover from a deadlock. While DB writes | |
73 | // cannot deadlock with other DB writes, they can deadlock with a transaction. | |
74 | // A negative timeout should only be used if all transactions have a small | |
75 | // expiration set. | |
76 | int64_t default_lock_timeout = 1000; // 1 second | |
77 | ||
11fdf7f2 | 78 | // If set, the TransactionDB will use this implementation of a mutex and |
7c673cae FG |
79 | // condition variable for all transaction locking instead of the default |
80 | // mutex/condvar implementation. | |
81 | std::shared_ptr<TransactionDBMutexFactory> custom_mutex_factory; | |
11fdf7f2 TL |
82 | |
83 | // The policy for when to write the data into the DB. The default policy is to | |
84 | // write only the committed data (WRITE_COMMITTED). The data could be written | |
85 | // before the commit phase. The DB then needs to provide the mechanisms to | |
86 | // tell apart committed from uncommitted data. | |
87 | TxnDBWritePolicy write_policy = TxnDBWritePolicy::WRITE_COMMITTED; | |
88 | ||
89 | // TODO(myabandeh): remove this option | |
90 | // Note: this is a temporary option as a hot fix in rollback of writeprepared | |
91 | // txns in myrocks. MyRocks uses merge operands for autoinc column id without | |
92 | // however obtaining locks. This breaks the assumption behind the rollback | |
93 | // logic in myrocks. This hack of simply not rolling back merge operands works | |
94 | // for the special way that myrocks uses this operands. | |
95 | bool rollback_merge_operands = false; | |
494da23a TL |
96 | |
97 | private: | |
98 | // 128 entries | |
99 | size_t wp_snapshot_cache_bits = static_cast<size_t>(7); | |
100 | // 8m entry, 64MB size | |
101 | size_t wp_commit_cache_bits = static_cast<size_t>(23); | |
102 | ||
103 | friend class WritePreparedTxnDB; | |
104 | friend class WritePreparedTransactionTestBase; | |
105 | friend class MySQLStyleTransactionTest; | |
7c673cae FG |
106 | }; |
107 | ||
// Options that apply to a single transaction.
struct TransactionOptions {
  // set_snapshot=true has the same effect as calling
  // Transaction::SetSnapshot().
  bool set_snapshot = false;

  // When true, the transaction checks, before acquiring locks, whether doing
  // so would cause a deadlock. If it would, the operation returns with
  // Status::Busy and the user should retry their transaction.
  bool deadlock_detect = false;

  // When set, states that the CommitTimeWriteBatch represents the latest
  // state of the application, has only one sub-batch (i.e., no duplicate
  // keys), and is meant to be used later during recovery. This enables an
  // optimization that postpones updating the memtable with
  // CommitTimeWriteBatch to only SwitchMemtable or recovery.
  bool use_only_the_last_commit_time_batch_for_recovery = false;

  // TODO(agiardullo): TransactionDB does not yet support comparators that
  // allow two non-equal keys to be equivalent. Ie, cmp->Compare(a,b) should
  // only return 0 if a.compare(b) returns 0.

  // Wait timeout, in milliseconds, used when this transaction attempts to
  // lock a key.
  //
  //   positive: wait for at most this many milliseconds.
  //   zero:     do not wait if the lock cannot instantly be acquired.
  //   negative: fall back to TransactionDBOptions::transaction_lock_timeout.
  int64_t lock_timeout = -1;

  // Expiration duration in milliseconds. When non-negative, a transaction
  // that lasts longer than this many milliseconds will fail to commit. When
  // not set, a forgotten transaction that is never committed, rolled back, or
  // deleted never relinquishes the locks it holds, which could prevent keys
  // from being written by other writers.
  int64_t expiration = -1;

  // Number of traversals to make during deadlock detection.
  int64_t deadlock_detect_depth = 50;

  // Maximum number of bytes used for the write batch. 0 means no limit.
  size_t max_write_batch_size = 0;

  // Skip Concurrency Control. Usable as an optimization when the application
  // knows that the transaction would not have any conflict with concurrent
  // transactions. It could also be used during recovery if (i) the
  // application guarantees no conflict between prepared transactions in the
  // WAL and (ii) the application guarantees that recovered transactions will
  // be rolled back/commit before new transactions start.
  // Default: false
  bool skip_concurrency_control = false;
};
159 | ||
// Per-write optimization hints for writes that do not involve transactions.
// A TransactionDB implementation might or might not make use of the specified
// optimizations.
struct TransactionDBWriteOptimizations {
  // True means the application guarantees that the key-set in the write batch
  // does not conflict with any concurrent transaction, and hence the
  // concurrency control mechanism could be skipped for this write.
  bool skip_concurrency_control = false;

  // True means the application guarantees that there is no duplicate
  // <column family, key> in the write batch, so any employed mechanism to
  // handle duplicate keys could be skipped.
  bool skip_duplicate_key_check = false;
};
173 | ||
174 | struct KeyLockInfo { | |
175 | std::string key; | |
176 | std::vector<TransactionID> ids; | |
177 | bool exclusive; | |
178 | }; | |
179 | ||
11fdf7f2 TL |
180 | struct DeadlockInfo { |
181 | TransactionID m_txn_id; | |
182 | uint32_t m_cf_id; | |
11fdf7f2 | 183 | bool m_exclusive; |
494da23a | 184 | std::string m_waiting_key; |
11fdf7f2 TL |
185 | }; |
186 | ||
187 | struct DeadlockPath { | |
188 | std::vector<DeadlockInfo> path; | |
189 | bool limit_exceeded; | |
190 | int64_t deadlock_time; | |
191 | ||
192 | explicit DeadlockPath(std::vector<DeadlockInfo> path_entry, | |
193 | const int64_t& dl_time) | |
194 | : path(path_entry), limit_exceeded(false), deadlock_time(dl_time) {} | |
195 | ||
196 | // empty path, limit exceeded constructor and default constructor | |
197 | explicit DeadlockPath(const int64_t& dl_time = 0, bool limit = false) | |
198 | : path(0), limit_exceeded(limit), deadlock_time(dl_time) {} | |
199 | ||
200 | bool empty() { return path.empty() && !limit_exceeded; } | |
201 | }; | |
202 | ||
7c673cae FG |
203 | class TransactionDB : public StackableDB { |
204 | public: | |
11fdf7f2 TL |
205 | // Optimized version of ::Write that receives more optimization request such |
206 | // as skip_concurrency_control. | |
207 | using StackableDB::Write; | |
208 | virtual Status Write(const WriteOptions& opts, | |
209 | const TransactionDBWriteOptimizations&, | |
210 | WriteBatch* updates) { | |
211 | // The default implementation ignores TransactionDBWriteOptimizations and | |
212 | // falls back to the un-optimized version of ::Write | |
213 | return Write(opts, updates); | |
214 | } | |
7c673cae FG |
215 | // Open a TransactionDB similar to DB::Open(). |
216 | // Internally call PrepareWrap() and WrapDB() | |
11fdf7f2 | 217 | // If the return status is not ok, then dbptr is set to nullptr. |
7c673cae FG |
218 | static Status Open(const Options& options, |
219 | const TransactionDBOptions& txn_db_options, | |
220 | const std::string& dbname, TransactionDB** dbptr); | |
221 | ||
222 | static Status Open(const DBOptions& db_options, | |
223 | const TransactionDBOptions& txn_db_options, | |
224 | const std::string& dbname, | |
225 | const std::vector<ColumnFamilyDescriptor>& column_families, | |
226 | std::vector<ColumnFamilyHandle*>* handles, | |
227 | TransactionDB** dbptr); | |
7c673cae FG |
228 | // Note: PrepareWrap() may change parameters, make copies before the |
229 | // invocation if needed. | |
7c673cae FG |
230 | static void PrepareWrap(DBOptions* db_options, |
231 | std::vector<ColumnFamilyDescriptor>* column_families, | |
232 | std::vector<size_t>* compaction_enabled_cf_indices); | |
11fdf7f2 TL |
233 | // If the return status is not ok, then dbptr will bet set to nullptr. The |
234 | // input db parameter might or might not be deleted as a result of the | |
235 | // failure. If it is properly deleted it will be set to nullptr. If the return | |
236 | // status is ok, the ownership of db is transferred to dbptr. | |
7c673cae FG |
237 | static Status WrapDB(DB* db, const TransactionDBOptions& txn_db_options, |
238 | const std::vector<size_t>& compaction_enabled_cf_indices, | |
239 | const std::vector<ColumnFamilyHandle*>& handles, | |
240 | TransactionDB** dbptr); | |
11fdf7f2 TL |
241 | // If the return status is not ok, then dbptr will bet set to nullptr. The |
242 | // input db parameter might or might not be deleted as a result of the | |
243 | // failure. If it is properly deleted it will be set to nullptr. If the return | |
244 | // status is ok, the ownership of db is transferred to dbptr. | |
7c673cae FG |
245 | static Status WrapStackableDB( |
246 | StackableDB* db, const TransactionDBOptions& txn_db_options, | |
247 | const std::vector<size_t>& compaction_enabled_cf_indices, | |
248 | const std::vector<ColumnFamilyHandle*>& handles, TransactionDB** dbptr); | |
11fdf7f2 TL |
249 | // Since the destructor in StackableDB is virtual, this destructor is virtual |
250 | // too. The root db will be deleted by the base's destructor. | |
251 | ~TransactionDB() override {} | |
7c673cae FG |
252 | |
253 | // Starts a new Transaction. | |
254 | // | |
255 | // Caller is responsible for deleting the returned transaction when no | |
256 | // longer needed. | |
257 | // | |
258 | // If old_txn is not null, BeginTransaction will reuse this Transaction | |
259 | // handle instead of allocating a new one. This is an optimization to avoid | |
260 | // extra allocations when repeatedly creating transactions. | |
261 | virtual Transaction* BeginTransaction( | |
262 | const WriteOptions& write_options, | |
263 | const TransactionOptions& txn_options = TransactionOptions(), | |
264 | Transaction* old_txn = nullptr) = 0; | |
265 | ||
266 | virtual Transaction* GetTransactionByName(const TransactionName& name) = 0; | |
267 | virtual void GetAllPreparedTransactions(std::vector<Transaction*>* trans) = 0; | |
268 | ||
269 | // Returns set of all locks held. | |
270 | // | |
271 | // The mapping is column family id -> KeyLockInfo | |
272 | virtual std::unordered_multimap<uint32_t, KeyLockInfo> | |
273 | GetLockStatusData() = 0; | |
11fdf7f2 TL |
274 | virtual std::vector<DeadlockPath> GetDeadlockInfoBuffer() = 0; |
275 | virtual void SetDeadlockInfoBufferSize(uint32_t target_size) = 0; | |
7c673cae FG |
276 | |
277 | protected: | |
278 | // To Create an TransactionDB, call Open() | |
11fdf7f2 | 279 | // The ownership of db is transferred to the base StackableDB |
7c673cae FG |
280 | explicit TransactionDB(DB* db) : StackableDB(db) {} |
281 | ||
282 | private: | |
283 | // No copying allowed | |
284 | TransactionDB(const TransactionDB&); | |
285 | void operator=(const TransactionDB&); | |
286 | }; | |
287 | ||
288 | } // namespace rocksdb | |
289 | ||
290 | #endif // ROCKSDB_LITE |