1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
13 #include "rocksdb/comparator.h"
14 #include "rocksdb/db.h"
15 #include "rocksdb/utilities/stackable_db.h"
16 #include "rocksdb/utilities/transaction.h"
18 // Database with Transaction support.
20 // See transaction.h and examples/transaction_example.cc
// Forward declaration. This header only refers to the type through a
// std::shared_ptr (TransactionDBOptions::custom_mutex_factory), so the full
// definition is not required here.
class TransactionDBMutexFactory;
// Selects at which stage of a transaction its data is written to the DB.
// Kept as an unscoped enum so that both WRITE_COMMITTED and
// TxnDBWritePolicy::WRITE_COMMITTED remain valid spellings for callers.
enum TxnDBWritePolicy {
  WRITE_COMMITTED = 0,  // write only the committed data
  // TODO(myabandeh): Not implemented yet
  WRITE_PREPARED,  // write data after the prepare phase of 2pc
  // TODO(myabandeh): Not implemented yet
  WRITE_UNPREPARED  // write data before the prepare phase of 2pc
};
// Default value of TransactionDBOptions::max_num_deadlocks.
const uint32_t kInitialMaxDeadlocks = 5;
36 struct TransactionDBOptions
{
37 // Specifies the maximum number of keys that can be locked at the same time
39 // If the number of locked keys is greater than max_num_locks, transaction
40 // writes (or GetForUpdate) will return an error.
41 // If this value is not positive, no limit will be enforced.
42 int64_t max_num_locks
= -1;
44 // Stores the number of latest deadlocks to track
45 uint32_t max_num_deadlocks
= kInitialMaxDeadlocks
;
47 // Increasing this value will increase the concurrency by dividing the lock
48 // table (per column family) into more sub-tables, each with their own
51 size_t num_stripes
= 16;
53 // If positive, specifies the default wait timeout in milliseconds when
54 // a transaction attempts to lock a key if not specified by
55 // TransactionOptions::lock_timeout.
57 // If 0, no waiting is done if a lock cannot instantly be acquired.
58 // If negative, there is no timeout. Not using a timeout is not recommended
59 // as it can lead to deadlocks. Currently, there is no deadlock-detection to
62 int64_t transaction_lock_timeout
= 1000; // 1 second
64 // If positive, specifies the wait timeout in milliseconds when writing a key
65 // OUTSIDE of a transaction (ie by calling DB::Put(),Merge(),Delete(),Write()
67 // If 0, no waiting is done if a lock cannot instantly be acquired.
68 // If negative, there is no timeout and will block indefinitely when acquiring
71 // Not using a timeout can lead to deadlocks. Currently, there
72 // is no deadlock-detection to recover from a deadlock. While DB writes
73 // cannot deadlock with other DB writes, they can deadlock with a transaction.
74 // A negative timeout should only be used if all transactions have a small
76 int64_t default_lock_timeout
= 1000; // 1 second
78 // If set, the TransactionDB will use this implementation of a mutex and
79 // condition variable for all transaction locking instead of the default
80 // mutex/condvar implementation.
81 std::shared_ptr
<TransactionDBMutexFactory
> custom_mutex_factory
;
83 // The policy for when to write the data into the DB. The default policy is to
84 // write only the committed data (WRITE_COMMITTED). The data could be written
85 // before the commit phase. The DB then needs to provide the mechanisms to
86 // tell apart committed from uncommitted data.
87 TxnDBWritePolicy write_policy
= TxnDBWritePolicy::WRITE_COMMITTED
;
89 // TODO(myabandeh): remove this option
90 // Note: this is a temporary option as a hot fix in rollback of writeprepared
91 // txns in myrocks. MyRocks uses merge operands for autoinc column id without
92 // however obtaining locks. This breaks the assumption behind the rollback
93 // logic in myrocks. This hack of simply not rolling back merge operands works
94 // for the special way that myrocks uses this operands.
95 bool rollback_merge_operands
= false;
99 size_t wp_snapshot_cache_bits
= static_cast<size_t>(7);
100 // 8m entry, 64MB size
101 size_t wp_commit_cache_bits
= static_cast<size_t>(23);
103 friend class WritePreparedTxnDB
;
104 friend class WritePreparedTransactionTestBase
;
105 friend class MySQLStyleTransactionTest
;
// Settings for a single transaction; passed to
// TransactionDB::BeginTransaction().
struct TransactionOptions {
  // Setting set_snapshot=true is the same as calling
  // Transaction::SetSnapshot().
  bool set_snapshot = false;

  // Setting to true means that before acquiring locks, this transaction will
  // check if doing so will cause a deadlock. If so, it will return with
  // Status::Busy. The user should retry their transaction.
  bool deadlock_detect = false;

  // If set, it states that the CommitTimeWriteBatch represents the latest state
  // of the application, has only one sub-batch, i.e., no duplicate keys, and
  // is meant to be used later during recovery. It enables an optimization to
  // postpone updating the memtable with CommitTimeWriteBatch to only
  // SwitchMemtable or recovery.
  bool use_only_the_last_commit_time_batch_for_recovery = false;

  // TODO(agiardullo): TransactionDB does not yet support comparators that allow
  // two non-equal keys to be equivalent. I.e., cmp->Compare(a,b) should only
  // return 0 if a.compare(b) returns 0.

  // If positive, specifies the wait timeout in milliseconds when
  // a transaction attempts to lock a key.
  //
  // If 0, no waiting is done if a lock cannot instantly be acquired.
  // If negative, TransactionDBOptions::transaction_lock_timeout will be used.
  int64_t lock_timeout = -1;

  // Expiration duration in milliseconds. If non-negative, transactions that
  // last longer than this many milliseconds will fail to commit. If not set,
  // a forgotten transaction that is never committed, rolled back, or deleted
  // will never relinquish any locks it holds. This could prevent keys from
  // being written by other writers.
  int64_t expiration = -1;

  // The number of traversals to make during deadlock detection.
  int64_t deadlock_detect_depth = 50;

  // The maximum number of bytes used for the write batch. 0 means no limit.
  size_t max_write_batch_size = 0;

  // Skip Concurrency Control. This could be used as an optimization if the
  // application knows that the transaction would not have any conflict with
  // concurrent transactions. It could also be used during recovery if (i)
  // application guarantees no conflict between prepared transactions in the WAL
  // (ii) application guarantees that recovered transactions will be rolled
  // back/commit before new transactions start.
  bool skip_concurrency_control = false;
};
// The per-write optimizations that do not involve transactions. TransactionDB
// implementation might or might not make use of the specified optimizations.
struct TransactionDBWriteOptimizations {
  // If true, the application guarantees that the key-set in the write batch
  // does not conflict with any concurrent transaction, and hence the
  // concurrency control mechanism could be skipped for this write.
  bool skip_concurrency_control = false;
  // If true, the application guarantees that there is no duplicate <column
  // family, key> in the write batch and any employed mechanism to handle
  // duplicate keys could be skipped.
  bool skip_duplicate_key_check = false;
};
176 std::vector
<TransactionID
> ids
;
180 struct DeadlockInfo
{
181 TransactionID m_txn_id
;
184 std::string m_waiting_key
;
187 struct DeadlockPath
{
188 std::vector
<DeadlockInfo
> path
;
190 int64_t deadlock_time
;
192 explicit DeadlockPath(std::vector
<DeadlockInfo
> path_entry
,
193 const int64_t& dl_time
)
194 : path(path_entry
), limit_exceeded(false), deadlock_time(dl_time
) {}
196 // empty path, limit exceeded constructor and default constructor
197 explicit DeadlockPath(const int64_t& dl_time
= 0, bool limit
= false)
198 : path(0), limit_exceeded(limit
), deadlock_time(dl_time
) {}
200 bool empty() { return path
.empty() && !limit_exceeded
; }
203 class TransactionDB
: public StackableDB
{
205 // Optimized version of ::Write that receives more optimization request such
206 // as skip_concurrency_control.
207 using StackableDB::Write
;
208 virtual Status
Write(const WriteOptions
& opts
,
209 const TransactionDBWriteOptimizations
&,
210 WriteBatch
* updates
) {
211 // The default implementation ignores TransactionDBWriteOptimizations and
212 // falls back to the un-optimized version of ::Write
213 return Write(opts
, updates
);
215 // Open a TransactionDB similar to DB::Open().
216 // Internally call PrepareWrap() and WrapDB()
217 // If the return status is not ok, then dbptr is set to nullptr.
218 static Status
Open(const Options
& options
,
219 const TransactionDBOptions
& txn_db_options
,
220 const std::string
& dbname
, TransactionDB
** dbptr
);
222 static Status
Open(const DBOptions
& db_options
,
223 const TransactionDBOptions
& txn_db_options
,
224 const std::string
& dbname
,
225 const std::vector
<ColumnFamilyDescriptor
>& column_families
,
226 std::vector
<ColumnFamilyHandle
*>* handles
,
227 TransactionDB
** dbptr
);
228 // Note: PrepareWrap() may change parameters, make copies before the
229 // invocation if needed.
230 static void PrepareWrap(DBOptions
* db_options
,
231 std::vector
<ColumnFamilyDescriptor
>* column_families
,
232 std::vector
<size_t>* compaction_enabled_cf_indices
);
233 // If the return status is not ok, then dbptr will bet set to nullptr. The
234 // input db parameter might or might not be deleted as a result of the
235 // failure. If it is properly deleted it will be set to nullptr. If the return
236 // status is ok, the ownership of db is transferred to dbptr.
237 static Status
WrapDB(DB
* db
, const TransactionDBOptions
& txn_db_options
,
238 const std::vector
<size_t>& compaction_enabled_cf_indices
,
239 const std::vector
<ColumnFamilyHandle
*>& handles
,
240 TransactionDB
** dbptr
);
241 // If the return status is not ok, then dbptr will bet set to nullptr. The
242 // input db parameter might or might not be deleted as a result of the
243 // failure. If it is properly deleted it will be set to nullptr. If the return
244 // status is ok, the ownership of db is transferred to dbptr.
245 static Status
WrapStackableDB(
246 StackableDB
* db
, const TransactionDBOptions
& txn_db_options
,
247 const std::vector
<size_t>& compaction_enabled_cf_indices
,
248 const std::vector
<ColumnFamilyHandle
*>& handles
, TransactionDB
** dbptr
);
249 // Since the destructor in StackableDB is virtual, this destructor is virtual
250 // too. The root db will be deleted by the base's destructor.
251 ~TransactionDB() override
{}
253 // Starts a new Transaction.
255 // Caller is responsible for deleting the returned transaction when no
258 // If old_txn is not null, BeginTransaction will reuse this Transaction
259 // handle instead of allocating a new one. This is an optimization to avoid
260 // extra allocations when repeatedly creating transactions.
261 virtual Transaction
* BeginTransaction(
262 const WriteOptions
& write_options
,
263 const TransactionOptions
& txn_options
= TransactionOptions(),
264 Transaction
* old_txn
= nullptr) = 0;
266 virtual Transaction
* GetTransactionByName(const TransactionName
& name
) = 0;
267 virtual void GetAllPreparedTransactions(std::vector
<Transaction
*>* trans
) = 0;
269 // Returns set of all locks held.
271 // The mapping is column family id -> KeyLockInfo
272 virtual std::unordered_multimap
<uint32_t, KeyLockInfo
>
273 GetLockStatusData() = 0;
274 virtual std::vector
<DeadlockPath
> GetDeadlockInfoBuffer() = 0;
275 virtual void SetDeadlockInfoBufferSize(uint32_t target_size
) = 0;
278 // To Create an TransactionDB, call Open()
279 // The ownership of db is transferred to the base StackableDB
280 explicit TransactionDB(DB
* db
) : StackableDB(db
) {}
283 // No copying allowed
284 TransactionDB(const TransactionDB
&);
285 void operator=(const TransactionDB
&);
288 } // namespace rocksdb
290 #endif // ROCKSDB_LITE