]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/include/rocksdb/utilities/transaction_db.h
bump version to 15.2.11-pve1
[ceph.git] / ceph / src / rocksdb / include / rocksdb / utilities / transaction_db.h
CommitLineData
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
5
6#pragma once
7#ifndef ROCKSDB_LITE
8
9#include <string>
10#include <utility>
11#include <vector>
12
13#include "rocksdb/comparator.h"
14#include "rocksdb/db.h"
15#include "rocksdb/utilities/stackable_db.h"
16#include "rocksdb/utilities/transaction.h"
17
// Database with Transaction support.
//
// See transaction.h and examples/transaction_example.cc
21
22namespace rocksdb {
23
24class TransactionDBMutexFactory;
25
11fdf7f2
TL
// Selects at which point of a transaction's lifetime its data is written
// into the DB. Deliberately a plain (unscoped) enum so that both
// `WRITE_COMMITTED` and `TxnDBWritePolicy::WRITE_COMMITTED` are valid.
enum TxnDBWritePolicy {
  WRITE_COMMITTED = 0,  // write only the committed data
  // TODO(myabandeh): Not implemented yet
  WRITE_PREPARED,  // write data after the prepare phase of 2pc
  // TODO(myabandeh): Not implemented yet
  WRITE_UNPREPARED  // write data before the prepare phase of 2pc
};
33
// Default value of TransactionDBOptions::max_num_deadlocks.
const uint32_t kInitialMaxDeadlocks = 5;
35
7c673cae
FG
36struct TransactionDBOptions {
37 // Specifies the maximum number of keys that can be locked at the same time
38 // per column family.
39 // If the number of locked keys is greater than max_num_locks, transaction
40 // writes (or GetForUpdate) will return an error.
41 // If this value is not positive, no limit will be enforced.
42 int64_t max_num_locks = -1;
43
11fdf7f2
TL
44 // Stores the number of latest deadlocks to track
45 uint32_t max_num_deadlocks = kInitialMaxDeadlocks;
46
7c673cae
FG
47 // Increasing this value will increase the concurrency by dividing the lock
48 // table (per column family) into more sub-tables, each with their own
49 // separate
50 // mutex.
51 size_t num_stripes = 16;
52
53 // If positive, specifies the default wait timeout in milliseconds when
54 // a transaction attempts to lock a key if not specified by
55 // TransactionOptions::lock_timeout.
56 //
57 // If 0, no waiting is done if a lock cannot instantly be acquired.
58 // If negative, there is no timeout. Not using a timeout is not recommended
59 // as it can lead to deadlocks. Currently, there is no deadlock-detection to
60 // recover
61 // from a deadlock.
62 int64_t transaction_lock_timeout = 1000; // 1 second
63
64 // If positive, specifies the wait timeout in milliseconds when writing a key
65 // OUTSIDE of a transaction (ie by calling DB::Put(),Merge(),Delete(),Write()
66 // directly).
67 // If 0, no waiting is done if a lock cannot instantly be acquired.
68 // If negative, there is no timeout and will block indefinitely when acquiring
69 // a lock.
70 //
71 // Not using a timeout can lead to deadlocks. Currently, there
72 // is no deadlock-detection to recover from a deadlock. While DB writes
73 // cannot deadlock with other DB writes, they can deadlock with a transaction.
74 // A negative timeout should only be used if all transactions have a small
75 // expiration set.
76 int64_t default_lock_timeout = 1000; // 1 second
77
11fdf7f2 78 // If set, the TransactionDB will use this implementation of a mutex and
7c673cae
FG
79 // condition variable for all transaction locking instead of the default
80 // mutex/condvar implementation.
81 std::shared_ptr<TransactionDBMutexFactory> custom_mutex_factory;
11fdf7f2
TL
82
83 // The policy for when to write the data into the DB. The default policy is to
84 // write only the committed data (WRITE_COMMITTED). The data could be written
85 // before the commit phase. The DB then needs to provide the mechanisms to
86 // tell apart committed from uncommitted data.
87 TxnDBWritePolicy write_policy = TxnDBWritePolicy::WRITE_COMMITTED;
88
89 // TODO(myabandeh): remove this option
90 // Note: this is a temporary option as a hot fix in rollback of writeprepared
91 // txns in myrocks. MyRocks uses merge operands for autoinc column id without
92 // however obtaining locks. This breaks the assumption behind the rollback
93 // logic in myrocks. This hack of simply not rolling back merge operands works
94 // for the special way that myrocks uses this operands.
95 bool rollback_merge_operands = false;
494da23a
TL
96
97 private:
98 // 128 entries
99 size_t wp_snapshot_cache_bits = static_cast<size_t>(7);
100 // 8m entry, 64MB size
101 size_t wp_commit_cache_bits = static_cast<size_t>(23);
102
103 friend class WritePreparedTxnDB;
104 friend class WritePreparedTransactionTestBase;
105 friend class MySQLStyleTransactionTest;
7c673cae
FG
106};
107
// Per-transaction options, passed to TransactionDB::BeginTransaction().
struct TransactionOptions {
  // Setting set_snapshot=true is the same as calling
  // Transaction::SetSnapshot().
  bool set_snapshot = false;

  // Setting to true means that before acquiring locks, this transaction will
  // check if doing so will cause a deadlock. If so, it will return with
  // Status::Busy. The user should retry their transaction.
  bool deadlock_detect = false;

  // If set, it states that the CommitTimeWriteBatch represents the latest state
  // of the application, has only one sub-batch, i.e., no duplicate keys, and
  // is meant to be used later during recovery. It enables an optimization to
  // postpone updating the memtable with CommitTimeWriteBatch to only
  // SwitchMemtable or recovery.
  bool use_only_the_last_commit_time_batch_for_recovery = false;

  // TODO(agiardullo): TransactionDB does not yet support comparators that allow
  // two non-equal keys to be equivalent. Ie, cmp->Compare(a,b) should only
  // return 0 if a.compare(b) returns 0.

  // If positive, specifies the wait timeout in milliseconds when
  // a transaction attempts to lock a key.
  //
  // If 0, no waiting is done if a lock cannot instantly be acquired.
  // If negative, TransactionDBOptions::transaction_lock_timeout will be used.
  int64_t lock_timeout = -1;

  // Expiration duration in milliseconds. If non-negative, transactions that
  // last longer than this many milliseconds will fail to commit. If not set,
  // a forgotten transaction that is never committed, rolled back, or deleted
  // will never relinquish any locks it holds. This could prevent keys from
  // being written by other writers.
  int64_t expiration = -1;

  // The number of traversals to make during deadlock detection.
  int64_t deadlock_detect_depth = 50;

  // The maximum number of bytes used for the write batch. 0 means no limit.
  size_t max_write_batch_size = 0;

  // Skip Concurrency Control. This could be used as an optimization if the
  // application knows that the transaction would not have any conflict with
  // concurrent transactions. It could also be used during recovery if (i)
  // application guarantees no conflict between prepared transactions in the WAL
  // (ii) application guarantees that recovered transactions will be rolled
  // back/committed before new transactions start.
  // Default: false
  bool skip_concurrency_control = false;
};
159
// The per-write optimizations that do not involve transactions. TransactionDB
// implementation might or might not make use of the specified optimizations.
struct TransactionDBWriteOptimizations {
  // If it is true it means that the application guarantees that the
  // key-set in the write batch does not conflict with any concurrent
  // transaction and hence the concurrency control mechanism could be skipped
  // for this write.
  bool skip_concurrency_control = false;
  // If true, the application guarantees that there is no duplicate <column
  // family, key> in the write batch and any employed mechanism to handle
  // duplicate keys could be skipped.
  bool skip_duplicate_key_check = false;
};
173
174struct KeyLockInfo {
175 std::string key;
176 std::vector<TransactionID> ids;
177 bool exclusive;
178};
179
11fdf7f2
TL
180struct DeadlockInfo {
181 TransactionID m_txn_id;
182 uint32_t m_cf_id;
11fdf7f2 183 bool m_exclusive;
494da23a 184 std::string m_waiting_key;
11fdf7f2
TL
185};
186
187struct DeadlockPath {
188 std::vector<DeadlockInfo> path;
189 bool limit_exceeded;
190 int64_t deadlock_time;
191
192 explicit DeadlockPath(std::vector<DeadlockInfo> path_entry,
193 const int64_t& dl_time)
194 : path(path_entry), limit_exceeded(false), deadlock_time(dl_time) {}
195
196 // empty path, limit exceeded constructor and default constructor
197 explicit DeadlockPath(const int64_t& dl_time = 0, bool limit = false)
198 : path(0), limit_exceeded(limit), deadlock_time(dl_time) {}
199
200 bool empty() { return path.empty() && !limit_exceeded; }
201};
202
7c673cae
FG
203class TransactionDB : public StackableDB {
204 public:
11fdf7f2
TL
205 // Optimized version of ::Write that receives more optimization request such
206 // as skip_concurrency_control.
207 using StackableDB::Write;
208 virtual Status Write(const WriteOptions& opts,
209 const TransactionDBWriteOptimizations&,
210 WriteBatch* updates) {
211 // The default implementation ignores TransactionDBWriteOptimizations and
212 // falls back to the un-optimized version of ::Write
213 return Write(opts, updates);
214 }
7c673cae
FG
215 // Open a TransactionDB similar to DB::Open().
216 // Internally call PrepareWrap() and WrapDB()
11fdf7f2 217 // If the return status is not ok, then dbptr is set to nullptr.
7c673cae
FG
218 static Status Open(const Options& options,
219 const TransactionDBOptions& txn_db_options,
220 const std::string& dbname, TransactionDB** dbptr);
221
222 static Status Open(const DBOptions& db_options,
223 const TransactionDBOptions& txn_db_options,
224 const std::string& dbname,
225 const std::vector<ColumnFamilyDescriptor>& column_families,
226 std::vector<ColumnFamilyHandle*>* handles,
227 TransactionDB** dbptr);
7c673cae
FG
228 // Note: PrepareWrap() may change parameters, make copies before the
229 // invocation if needed.
7c673cae
FG
230 static void PrepareWrap(DBOptions* db_options,
231 std::vector<ColumnFamilyDescriptor>* column_families,
232 std::vector<size_t>* compaction_enabled_cf_indices);
11fdf7f2
TL
233 // If the return status is not ok, then dbptr will bet set to nullptr. The
234 // input db parameter might or might not be deleted as a result of the
235 // failure. If it is properly deleted it will be set to nullptr. If the return
236 // status is ok, the ownership of db is transferred to dbptr.
7c673cae
FG
237 static Status WrapDB(DB* db, const TransactionDBOptions& txn_db_options,
238 const std::vector<size_t>& compaction_enabled_cf_indices,
239 const std::vector<ColumnFamilyHandle*>& handles,
240 TransactionDB** dbptr);
11fdf7f2
TL
241 // If the return status is not ok, then dbptr will bet set to nullptr. The
242 // input db parameter might or might not be deleted as a result of the
243 // failure. If it is properly deleted it will be set to nullptr. If the return
244 // status is ok, the ownership of db is transferred to dbptr.
7c673cae
FG
245 static Status WrapStackableDB(
246 StackableDB* db, const TransactionDBOptions& txn_db_options,
247 const std::vector<size_t>& compaction_enabled_cf_indices,
248 const std::vector<ColumnFamilyHandle*>& handles, TransactionDB** dbptr);
11fdf7f2
TL
249 // Since the destructor in StackableDB is virtual, this destructor is virtual
250 // too. The root db will be deleted by the base's destructor.
251 ~TransactionDB() override {}
7c673cae
FG
252
253 // Starts a new Transaction.
254 //
255 // Caller is responsible for deleting the returned transaction when no
256 // longer needed.
257 //
258 // If old_txn is not null, BeginTransaction will reuse this Transaction
259 // handle instead of allocating a new one. This is an optimization to avoid
260 // extra allocations when repeatedly creating transactions.
261 virtual Transaction* BeginTransaction(
262 const WriteOptions& write_options,
263 const TransactionOptions& txn_options = TransactionOptions(),
264 Transaction* old_txn = nullptr) = 0;
265
266 virtual Transaction* GetTransactionByName(const TransactionName& name) = 0;
267 virtual void GetAllPreparedTransactions(std::vector<Transaction*>* trans) = 0;
268
269 // Returns set of all locks held.
270 //
271 // The mapping is column family id -> KeyLockInfo
272 virtual std::unordered_multimap<uint32_t, KeyLockInfo>
273 GetLockStatusData() = 0;
11fdf7f2
TL
274 virtual std::vector<DeadlockPath> GetDeadlockInfoBuffer() = 0;
275 virtual void SetDeadlockInfoBufferSize(uint32_t target_size) = 0;
7c673cae
FG
276
277 protected:
278 // To Create an TransactionDB, call Open()
11fdf7f2 279 // The ownership of db is transferred to the base StackableDB
7c673cae
FG
280 explicit TransactionDB(DB* db) : StackableDB(db) {}
281
282 private:
283 // No copying allowed
284 TransactionDB(const TransactionDB&);
285 void operator=(const TransactionDB&);
286};
287
288} // namespace rocksdb
289
290#endif // ROCKSDB_LITE