]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/include/rocksdb/utilities/transaction_db.h
import quincy beta 17.1.0
[ceph.git] / ceph / src / rocksdb / include / rocksdb / utilities / transaction_db.h
CommitLineData
// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
5
6#pragma once
7#ifndef ROCKSDB_LITE
8
9#include <string>
10#include <utility>
11#include <vector>
12
13#include "rocksdb/comparator.h"
14#include "rocksdb/db.h"
15#include "rocksdb/utilities/stackable_db.h"
16#include "rocksdb/utilities/transaction.h"
17
// Database with Transaction support.
//
// See transaction.h and examples/transaction_example.cc
21
f67539c2 22namespace ROCKSDB_NAMESPACE {
7c673cae
FG
23
// Forward declaration; only used by reference through
// TransactionDBOptions::custom_mutex_factory in this header.
class TransactionDBMutexFactory;
25
11fdf7f2
TL
// Selects when a transaction's data is written into the DB relative to the
// phases of two-phase commit (2pc).
enum TxnDBWritePolicy {
  WRITE_COMMITTED = 0,  // write only the committed data
  WRITE_PREPARED,       // write data after the prepare phase of 2pc
  WRITE_UNPREPARED      // write data before the prepare phase of 2pc
};
31
// Initial value of TransactionDBOptions::max_num_deadlocks, i.e. how many of
// the latest deadlocks are tracked unless the option is overridden.
const uint32_t kInitialMaxDeadlocks = 5;
33
7c673cae
FG
34struct TransactionDBOptions {
35 // Specifies the maximum number of keys that can be locked at the same time
36 // per column family.
37 // If the number of locked keys is greater than max_num_locks, transaction
38 // writes (or GetForUpdate) will return an error.
39 // If this value is not positive, no limit will be enforced.
40 int64_t max_num_locks = -1;
41
11fdf7f2
TL
42 // Stores the number of latest deadlocks to track
43 uint32_t max_num_deadlocks = kInitialMaxDeadlocks;
44
7c673cae
FG
45 // Increasing this value will increase the concurrency by dividing the lock
46 // table (per column family) into more sub-tables, each with their own
47 // separate
48 // mutex.
49 size_t num_stripes = 16;
50
51 // If positive, specifies the default wait timeout in milliseconds when
52 // a transaction attempts to lock a key if not specified by
53 // TransactionOptions::lock_timeout.
54 //
55 // If 0, no waiting is done if a lock cannot instantly be acquired.
56 // If negative, there is no timeout. Not using a timeout is not recommended
57 // as it can lead to deadlocks. Currently, there is no deadlock-detection to
58 // recover
59 // from a deadlock.
60 int64_t transaction_lock_timeout = 1000; // 1 second
61
62 // If positive, specifies the wait timeout in milliseconds when writing a key
63 // OUTSIDE of a transaction (ie by calling DB::Put(),Merge(),Delete(),Write()
64 // directly).
65 // If 0, no waiting is done if a lock cannot instantly be acquired.
66 // If negative, there is no timeout and will block indefinitely when acquiring
67 // a lock.
68 //
69 // Not using a timeout can lead to deadlocks. Currently, there
70 // is no deadlock-detection to recover from a deadlock. While DB writes
71 // cannot deadlock with other DB writes, they can deadlock with a transaction.
72 // A negative timeout should only be used if all transactions have a small
73 // expiration set.
74 int64_t default_lock_timeout = 1000; // 1 second
75
11fdf7f2 76 // If set, the TransactionDB will use this implementation of a mutex and
7c673cae
FG
77 // condition variable for all transaction locking instead of the default
78 // mutex/condvar implementation.
79 std::shared_ptr<TransactionDBMutexFactory> custom_mutex_factory;
11fdf7f2
TL
80
81 // The policy for when to write the data into the DB. The default policy is to
82 // write only the committed data (WRITE_COMMITTED). The data could be written
83 // before the commit phase. The DB then needs to provide the mechanisms to
84 // tell apart committed from uncommitted data.
85 TxnDBWritePolicy write_policy = TxnDBWritePolicy::WRITE_COMMITTED;
86
87 // TODO(myabandeh): remove this option
88 // Note: this is a temporary option as a hot fix in rollback of writeprepared
89 // txns in myrocks. MyRocks uses merge operands for autoinc column id without
90 // however obtaining locks. This breaks the assumption behind the rollback
91 // logic in myrocks. This hack of simply not rolling back merge operands works
92 // for the special way that myrocks uses this operands.
93 bool rollback_merge_operands = false;
494da23a 94
f67539c2
TL
95 // If true, the TransactionDB implementation might skip concurrency control
96 // unless it is overridden by TransactionOptions or
97 // TransactionDBWriteOptimizations. This can be used in conjuction with
98 // DBOptions::unordered_write when the TransactionDB is used solely for write
99 // ordering rather than concurrency control.
100 bool skip_concurrency_control = false;
101
102 // This option is only valid for write unprepared. If a write batch exceeds
103 // this threshold, then the transaction will implicitly flush the currently
104 // pending writes into the database. A value of 0 or less means no limit.
105 int64_t default_write_batch_flush_threshold = 0;
106
494da23a
TL
107 private:
108 // 128 entries
109 size_t wp_snapshot_cache_bits = static_cast<size_t>(7);
110 // 8m entry, 64MB size
111 size_t wp_commit_cache_bits = static_cast<size_t>(23);
112
f67539c2
TL
113 // For testing, whether transaction name should be auto-generated or not. This
114 // is useful for write unprepared which requires named transactions.
115 bool autogenerate_name = false;
116
494da23a 117 friend class WritePreparedTxnDB;
f67539c2 118 friend class WriteUnpreparedTxn;
494da23a 119 friend class WritePreparedTransactionTestBase;
f67539c2 120 friend class TransactionTestBase;
494da23a 121 friend class MySQLStyleTransactionTest;
7c673cae
FG
122};
123
// Per-transaction options, passed to TransactionDB::BeginTransaction().
struct TransactionOptions {
  // Setting set_snapshot=true is the same as calling
  // Transaction::SetSnapshot().
  bool set_snapshot = false;

  // Setting to true means that before acquiring locks, this transaction will
  // check if doing so will cause a deadlock. If so, it will return with
  // Status::Busy.  The user should retry their transaction.
  bool deadlock_detect = false;

  // If set, it states that the CommitTimeWriteBatch represents the latest state
  // of the application, has only one sub-batch, i.e., no duplicate keys, and
  // meant to be used later during recovery. It enables an optimization to
  // postpone updating the memtable with CommitTimeWriteBatch to only
  // SwitchMemtable or recovery.
  bool use_only_the_last_commit_time_batch_for_recovery = false;

  // TODO(agiardullo): TransactionDB does not yet support comparators that allow
  // two non-equal keys to be equivalent.  Ie, cmp->Compare(a,b) should only
  // return 0 if a.compare(b) returns 0.

  // If positive, specifies the wait timeout in milliseconds when
  // a transaction attempts to lock a key.
  //
  // If 0, no waiting is done if a lock cannot instantly be acquired.
  // If negative, TransactionDBOptions::transaction_lock_timeout will be used.
  int64_t lock_timeout = -1;

  // Expiration duration in milliseconds.  If non-negative, transactions that
  // last longer than this many milliseconds will fail to commit.  If not set,
  // a forgotten transaction that is never committed, rolled back, or deleted
  // will never relinquish any locks it holds.  This could prevent keys from
  // being written by other writers.
  int64_t expiration = -1;

  // The number of traversals to make during deadlock detection.
  int64_t deadlock_detect_depth = 50;

  // The maximum number of bytes used for the write batch. 0 means no limit.
  size_t max_write_batch_size = 0;

  // Skip Concurrency Control. This could be as an optimization if the
  // application knows that the transaction would not have any conflict with
  // concurrent transactions. It could also be used during recovery if (i)
  // application guarantees no conflict between prepared transactions in the WAL
  // (ii) application guarantees that recovered transactions will be rolled
  // back/commit before new transactions start.
  // Default: false
  bool skip_concurrency_control = false;

  // In pessimistic transaction, if this is true, then you can skip Prepare
  // before Commit, otherwise, you must Prepare before Commit.
  bool skip_prepare = true;

  // See TransactionDBOptions::default_write_batch_flush_threshold for
  // description. If a negative value is specified, then the default value from
  // TransactionDBOptions is used.
  int64_t write_batch_flush_threshold = -1;
};
184
// The per-write optimizations that do not involve transactions. TransactionDB
// implementation might or might not make use of the specified optimizations.
struct TransactionDBWriteOptimizations {
  // If it is true it means that the application guarantees that the
  // key-set in the write batch do not conflict with any concurrent transaction
  // and hence the concurrency control mechanism could be skipped for this
  // write.
  bool skip_concurrency_control = false;

  // If true, the application guarantees that there is no duplicate <column
  // family, key> in the write batch and any employed mechanism to handle
  // duplicate keys could be skipped.
  bool skip_duplicate_key_check = false;
};
198
199struct KeyLockInfo {
200 std::string key;
201 std::vector<TransactionID> ids;
202 bool exclusive;
203};
204
20effc67
TL
205struct RangeLockInfo {
206 Endpoint start;
207 Endpoint end;
208 std::vector<TransactionID> ids;
209 bool exclusive;
210};
211
11fdf7f2
TL
212struct DeadlockInfo {
213 TransactionID m_txn_id;
214 uint32_t m_cf_id;
11fdf7f2 215 bool m_exclusive;
494da23a 216 std::string m_waiting_key;
11fdf7f2
TL
217};
218
219struct DeadlockPath {
220 std::vector<DeadlockInfo> path;
221 bool limit_exceeded;
222 int64_t deadlock_time;
223
224 explicit DeadlockPath(std::vector<DeadlockInfo> path_entry,
225 const int64_t& dl_time)
226 : path(path_entry), limit_exceeded(false), deadlock_time(dl_time) {}
227
228 // empty path, limit exceeded constructor and default constructor
229 explicit DeadlockPath(const int64_t& dl_time = 0, bool limit = false)
230 : path(0), limit_exceeded(limit), deadlock_time(dl_time) {}
231
232 bool empty() { return path.empty() && !limit_exceeded; }
233};
234
7c673cae
FG
235class TransactionDB : public StackableDB {
236 public:
11fdf7f2
TL
237 // Optimized version of ::Write that receives more optimization request such
238 // as skip_concurrency_control.
239 using StackableDB::Write;
240 virtual Status Write(const WriteOptions& opts,
241 const TransactionDBWriteOptimizations&,
242 WriteBatch* updates) {
243 // The default implementation ignores TransactionDBWriteOptimizations and
244 // falls back to the un-optimized version of ::Write
245 return Write(opts, updates);
246 }
20effc67
TL
247 // Transactional `DeleteRange()` is not yet supported.
248 // However, users who know their deleted range does not conflict with
249 // anything can still use it via the `Write()` API. In all cases, the
250 // `Write()` overload specifying `TransactionDBWriteOptimizations` must be
251 // used and `skip_concurrency_control` must be set. When using either
252 // WRITE_PREPARED or WRITE_UNPREPARED , `skip_duplicate_key_check` must
253 // additionally be set.
254 virtual Status DeleteRange(const WriteOptions&, ColumnFamilyHandle*,
255 const Slice&, const Slice&) override {
256 return Status::NotSupported();
257 }
7c673cae
FG
258 // Open a TransactionDB similar to DB::Open().
259 // Internally call PrepareWrap() and WrapDB()
11fdf7f2 260 // If the return status is not ok, then dbptr is set to nullptr.
7c673cae
FG
261 static Status Open(const Options& options,
262 const TransactionDBOptions& txn_db_options,
263 const std::string& dbname, TransactionDB** dbptr);
264
265 static Status Open(const DBOptions& db_options,
266 const TransactionDBOptions& txn_db_options,
267 const std::string& dbname,
268 const std::vector<ColumnFamilyDescriptor>& column_families,
269 std::vector<ColumnFamilyHandle*>* handles,
270 TransactionDB** dbptr);
7c673cae
FG
271 // Note: PrepareWrap() may change parameters, make copies before the
272 // invocation if needed.
7c673cae
FG
273 static void PrepareWrap(DBOptions* db_options,
274 std::vector<ColumnFamilyDescriptor>* column_families,
275 std::vector<size_t>* compaction_enabled_cf_indices);
11fdf7f2
TL
276 // If the return status is not ok, then dbptr will bet set to nullptr. The
277 // input db parameter might or might not be deleted as a result of the
278 // failure. If it is properly deleted it will be set to nullptr. If the return
279 // status is ok, the ownership of db is transferred to dbptr.
7c673cae
FG
280 static Status WrapDB(DB* db, const TransactionDBOptions& txn_db_options,
281 const std::vector<size_t>& compaction_enabled_cf_indices,
282 const std::vector<ColumnFamilyHandle*>& handles,
283 TransactionDB** dbptr);
11fdf7f2
TL
284 // If the return status is not ok, then dbptr will bet set to nullptr. The
285 // input db parameter might or might not be deleted as a result of the
286 // failure. If it is properly deleted it will be set to nullptr. If the return
287 // status is ok, the ownership of db is transferred to dbptr.
7c673cae
FG
288 static Status WrapStackableDB(
289 StackableDB* db, const TransactionDBOptions& txn_db_options,
290 const std::vector<size_t>& compaction_enabled_cf_indices,
291 const std::vector<ColumnFamilyHandle*>& handles, TransactionDB** dbptr);
11fdf7f2
TL
292 // Since the destructor in StackableDB is virtual, this destructor is virtual
293 // too. The root db will be deleted by the base's destructor.
294 ~TransactionDB() override {}
7c673cae
FG
295
296 // Starts a new Transaction.
297 //
298 // Caller is responsible for deleting the returned transaction when no
299 // longer needed.
300 //
301 // If old_txn is not null, BeginTransaction will reuse this Transaction
302 // handle instead of allocating a new one. This is an optimization to avoid
303 // extra allocations when repeatedly creating transactions.
304 virtual Transaction* BeginTransaction(
305 const WriteOptions& write_options,
306 const TransactionOptions& txn_options = TransactionOptions(),
307 Transaction* old_txn = nullptr) = 0;
308
309 virtual Transaction* GetTransactionByName(const TransactionName& name) = 0;
310 virtual void GetAllPreparedTransactions(std::vector<Transaction*>* trans) = 0;
311
312 // Returns set of all locks held.
313 //
314 // The mapping is column family id -> KeyLockInfo
315 virtual std::unordered_multimap<uint32_t, KeyLockInfo>
316 GetLockStatusData() = 0;
20effc67 317
11fdf7f2
TL
318 virtual std::vector<DeadlockPath> GetDeadlockInfoBuffer() = 0;
319 virtual void SetDeadlockInfoBufferSize(uint32_t target_size) = 0;
7c673cae
FG
320
321 protected:
322 // To Create an TransactionDB, call Open()
11fdf7f2 323 // The ownership of db is transferred to the base StackableDB
7c673cae 324 explicit TransactionDB(DB* db) : StackableDB(db) {}
7c673cae 325 // No copying allowed
f67539c2
TL
326 TransactionDB(const TransactionDB&) = delete;
327 void operator=(const TransactionDB&) = delete;
7c673cae
FG
328};
329
f67539c2 330} // namespace ROCKSDB_NAMESPACE
7c673cae
FG
331
332#endif // ROCKSDB_LITE