]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/include/rocksdb/utilities/transaction.h
update sources to ceph Nautilus 14.2.1
[ceph.git] / ceph / src / rocksdb / include / rocksdb / utilities / transaction.h
CommitLineData
7c673cae 1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5
6#pragma once
7
8#ifndef ROCKSDB_LITE
9
10#include <string>
11#include <vector>
12
13#include "rocksdb/comparator.h"
14#include "rocksdb/db.h"
15#include "rocksdb/status.h"
16
17namespace rocksdb {
18
19class Iterator;
20class TransactionDB;
21class WriteBatchWithIndex;
22
23using TransactionName = std::string;
24
25using TransactionID = uint64_t;
26
27// Provides notification to the caller of SetSnapshotOnNextOperation when
28// the actual snapshot gets created
29class TransactionNotifier {
30 public:
31 virtual ~TransactionNotifier() {}
32
33 // Implement this method to receive notification when a snapshot is
34 // requested via SetSnapshotOnNextOperation.
35 virtual void SnapshotCreated(const Snapshot* newSnapshot) = 0;
36};
37
38// Provides BEGIN/COMMIT/ROLLBACK transactions.
39//
40// To use transactions, you must first create either an OptimisticTransactionDB
41// or a TransactionDB. See examples/[optimistic_]transaction_example.cc for
42// more information.
43//
44// To create a transaction, use [Optimistic]TransactionDB::BeginTransaction().
45//
46// It is up to the caller to synchronize access to this object.
47//
48// See examples/transaction_example.cc for some simple examples.
49//
50// TODO(agiardullo): Not yet implemented
51// -PerfContext statistics
52// -Support for using Transactions with DBWithTTL
53class Transaction {
54 public:
55 virtual ~Transaction() {}
56
57 // If a transaction has a snapshot set, the transaction will ensure that
58 // any keys successfully written (or fetched via GetForUpdate()) have not
59 // been modified outside of this transaction since the time the snapshot was
60 // set.
61 // If a snapshot has not been set, the transaction guarantees that keys have
62 // not been modified since the time each key was first written (or fetched via
63 // GetForUpdate()).
64 //
65 // Using SetSnapshot() will provide stricter isolation guarantees at the
66 // expense of potentially more transaction failures due to conflicts with
67 // other writes.
68 //
69 // Calling SetSnapshot() has no effect on keys written before this function
70 // has been called.
71 //
72 // SetSnapshot() may be called multiple times if you would like to change
73 // the snapshot used for different operations in this transaction.
74 //
75 // Calling SetSnapshot will not affect the version of Data returned by Get()
76 // methods. See Transaction::Get() for more details.
77 virtual void SetSnapshot() = 0;
78
79 // Similar to SetSnapshot(), but will not change the current snapshot
80 // until Put/Merge/Delete/GetForUpdate/MultiGetForUpdate is called.
81 // By calling this function, the transaction will essentially call
82 // SetSnapshot() for you right before performing the next write/GetForUpdate.
83 //
84 // Calling SetSnapshotOnNextOperation() will not affect what snapshot is
85 // returned by GetSnapshot() until the next write/GetForUpdate is executed.
86 //
87 // When the snapshot is created the notifier's SnapshotCreated method will
88 // be called so that the caller can get access to the snapshot.
89 //
90 // This is an optimization to reduce the likelihood of conflicts that
91 // could occur in between the time SetSnapshot() is called and the first
92 // write/GetForUpdate operation. Eg, this prevents the following
93 // race-condition:
94 //
95 // txn1->SetSnapshot();
96 // txn2->Put("A", ...);
97 // txn2->Commit();
98 // txn1->GetForUpdate(opts, "A", ...); // FAIL!
99 virtual void SetSnapshotOnNextOperation(
100 std::shared_ptr<TransactionNotifier> notifier = nullptr) = 0;
101
102 // Returns the Snapshot created by the last call to SetSnapshot().
103 //
104 // REQUIRED: The returned Snapshot is only valid up until the next time
105 // SetSnapshot()/SetSnapshotOnNextOperation() is called, ClearSnapshot()
106 // is called, or the Transaction is deleted.
107 virtual const Snapshot* GetSnapshot() const = 0;
108
109 // Clears the current snapshot (i.e. no snapshot will be 'set')
110 //
111 // This removes any snapshot that currently exists or is set to be created
112 // on the next update operation (SetSnapshotOnNextOperation).
113 //
114 // Calling ClearSnapshot() has no effect on keys written before this function
115 // has been called.
116 //
117 // If a reference to a snapshot was retrieved via GetSnapshot(), it will no
118 // longer be valid and should be discarded after a call to ClearSnapshot().
119 virtual void ClearSnapshot() = 0;
120
11fdf7f2 121 // Prepare the current transaction for 2PC
7c673cae
FG
122 virtual Status Prepare() = 0;
123
124 // Write all batched keys to the db atomically.
125 //
126 // Returns OK on success.
127 //
128 // May return any error status that could be returned by DB::Write().
129 //
130 // If this transaction was created by an OptimisticTransactionDB(),
131 // Status::Busy() may be returned if the transaction could not guarantee
132 // that there are no write conflicts. Status::TryAgain() may be returned
133 // if the memtable history size is not large enough
134 // (See max_write_buffer_number_to_maintain).
135 //
136 // If this transaction was created by a TransactionDB(), Status::Expired()
137 // may be returned if this transaction has lived for longer than
138 // TransactionOptions.expiration.
139 virtual Status Commit() = 0;
140
141 // Discard all batched writes in this transaction.
142 virtual Status Rollback() = 0;
143
144 // Records the state of the transaction for future calls to
145 // RollbackToSavePoint(). May be called multiple times to set multiple save
146 // points.
147 virtual void SetSavePoint() = 0;
148
149 // Undo all operations in this transaction (Put, Merge, Delete, PutLogData)
150 // since the most recent call to SetSavePoint() and removes the most recent
151 // SetSavePoint().
152 // If there is no previous call to SetSavePoint(), returns Status::NotFound()
153 virtual Status RollbackToSavePoint() = 0;
154
11fdf7f2
TL
155 // Pop the most recent save point.
156 // If there is no previous call to SetSavePoint(), Status::NotFound()
157 // will be returned.
158 // Otherwise returns Status::OK().
159 virtual Status PopSavePoint() = 0;
160
7c673cae
FG
161 // This function is similar to DB::Get() except it will also read pending
162 // changes in this transaction. Currently, this function will return
163 // Status::MergeInProgress if the most recent write to the queried key in
164 // this batch is a Merge.
165 //
166 // If read_options.snapshot is not set, the current version of the key will
167 // be read. Calling SetSnapshot() does not affect the version of the data
168 // returned.
169 //
170 // Note that setting read_options.snapshot will affect what is read from the
171 // DB but will NOT change which keys are read from this transaction (the keys
172 // in this transaction do not yet belong to any snapshot and will be fetched
173 // regardless).
174 virtual Status Get(const ReadOptions& options,
175 ColumnFamilyHandle* column_family, const Slice& key,
176 std::string* value) = 0;
177
11fdf7f2
TL
178 // An overload of the above method that receives a PinnableSlice
179 // For backward compatibility a default implementation is provided
180 virtual Status Get(const ReadOptions& options,
181 ColumnFamilyHandle* column_family, const Slice& key,
182 PinnableSlice* pinnable_val) {
183 assert(pinnable_val != nullptr);
184 auto s = Get(options, column_family, key, pinnable_val->GetSelf());
185 pinnable_val->PinSelf();
186 return s;
187 }
188
7c673cae
FG
189 virtual Status Get(const ReadOptions& options, const Slice& key,
190 std::string* value) = 0;
11fdf7f2
TL
191 virtual Status Get(const ReadOptions& options, const Slice& key,
192 PinnableSlice* pinnable_val) {
193 assert(pinnable_val != nullptr);
194 auto s = Get(options, key, pinnable_val->GetSelf());
195 pinnable_val->PinSelf();
196 return s;
197 }
7c673cae
FG
198
199 virtual std::vector<Status> MultiGet(
200 const ReadOptions& options,
201 const std::vector<ColumnFamilyHandle*>& column_family,
202 const std::vector<Slice>& keys, std::vector<std::string>* values) = 0;
203
204 virtual std::vector<Status> MultiGet(const ReadOptions& options,
205 const std::vector<Slice>& keys,
206 std::vector<std::string>* values) = 0;
207
208 // Read this key and ensure that this transaction will only
209 // be able to be committed if this key is not written outside this
210 // transaction after it has first been read (or after the snapshot if a
211 // snapshot is set in this transaction). The transaction behavior is the
212 // same regardless of whether the key exists or not.
213 //
214 // Note: Currently, this function will return Status::MergeInProgress
215 // if the most recent write to the queried key in this batch is a Merge.
216 //
217 // The values returned by this function are similar to Transaction::Get().
218 // If value==nullptr, then this function will not read any data, but will
219 // still ensure that this key cannot be written to by outside of this
220 // transaction.
221 //
222 // If this transaction was created by an OptimisticTransactionDB,
223 // GetForUpdate() could cause Commit() to fail. Otherwise, it could return
224 // any error that could be returned by DB::Get().
225 //
226 // If this transaction was created by a TransactionDB, it can return
227 // Status::OK() on success,
228 // Status::Busy() if there is a write conflict,
229 // Status::TimedOut() if a lock could not be acquired,
230 // Status::TryAgain() if the memtable history size is not large enough
231 // (See max_write_buffer_number_to_maintain)
232 // Status::MergeInProgress() if merge operations cannot be resolved.
233 // or other errors if this key could not be read.
234 virtual Status GetForUpdate(const ReadOptions& options,
235 ColumnFamilyHandle* column_family,
236 const Slice& key, std::string* value,
237 bool exclusive = true) = 0;
238
11fdf7f2
TL
239 // An overload of the above method that receives a PinnableSlice
240 // For backward compatibility a default implementation is provided
241 virtual Status GetForUpdate(const ReadOptions& options,
242 ColumnFamilyHandle* /*column_family*/,
243 const Slice& key, PinnableSlice* pinnable_val,
244 bool /*exclusive*/ = true) {
245 if (pinnable_val == nullptr) {
246 std::string* null_str = nullptr;
247 return GetForUpdate(options, key, null_str);
248 } else {
249 auto s = GetForUpdate(options, key, pinnable_val->GetSelf());
250 pinnable_val->PinSelf();
251 return s;
252 }
253 }
254
7c673cae
FG
255 virtual Status GetForUpdate(const ReadOptions& options, const Slice& key,
256 std::string* value, bool exclusive = true) = 0;
257
258 virtual std::vector<Status> MultiGetForUpdate(
259 const ReadOptions& options,
260 const std::vector<ColumnFamilyHandle*>& column_family,
261 const std::vector<Slice>& keys, std::vector<std::string>* values) = 0;
262
263 virtual std::vector<Status> MultiGetForUpdate(
264 const ReadOptions& options, const std::vector<Slice>& keys,
265 std::vector<std::string>* values) = 0;
266
267 // Returns an iterator that will iterate on all keys in the default
268 // column family including both keys in the DB and uncommitted keys in this
269 // transaction.
270 //
271 // Setting read_options.snapshot will affect what is read from the
272 // DB but will NOT change which keys are read from this transaction (the keys
273 // in this transaction do not yet belong to any snapshot and will be fetched
274 // regardless).
275 //
276 // Caller is responsible for deleting the returned Iterator.
277 //
278 // The returned iterator is only valid until Commit(), Rollback(), or
279 // RollbackToSavePoint() is called.
280 virtual Iterator* GetIterator(const ReadOptions& read_options) = 0;
281
282 virtual Iterator* GetIterator(const ReadOptions& read_options,
283 ColumnFamilyHandle* column_family) = 0;
284
285 // Put, Merge, Delete, and SingleDelete behave similarly to the corresponding
286 // functions in WriteBatch, but will also do conflict checking on the
287 // keys being written.
288 //
289 // If this Transaction was created on an OptimisticTransactionDB, these
290 // functions should always return Status::OK().
291 //
292 // If this Transaction was created on a TransactionDB, the status returned
293 // can be:
294 // Status::OK() on success,
295 // Status::Busy() if there is a write conflict,
296 // Status::TimedOut() if a lock could not be acquired,
297 // Status::TryAgain() if the memtable history size is not large enough
298 // (See max_write_buffer_number_to_maintain)
299 // or other errors on unexpected failures.
300 virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key,
301 const Slice& value) = 0;
302 virtual Status Put(const Slice& key, const Slice& value) = 0;
303 virtual Status Put(ColumnFamilyHandle* column_family, const SliceParts& key,
304 const SliceParts& value) = 0;
305 virtual Status Put(const SliceParts& key, const SliceParts& value) = 0;
306
307 virtual Status Merge(ColumnFamilyHandle* column_family, const Slice& key,
308 const Slice& value) = 0;
309 virtual Status Merge(const Slice& key, const Slice& value) = 0;
310
311 virtual Status Delete(ColumnFamilyHandle* column_family,
312 const Slice& key) = 0;
313 virtual Status Delete(const Slice& key) = 0;
314 virtual Status Delete(ColumnFamilyHandle* column_family,
315 const SliceParts& key) = 0;
316 virtual Status Delete(const SliceParts& key) = 0;
317
318 virtual Status SingleDelete(ColumnFamilyHandle* column_family,
319 const Slice& key) = 0;
320 virtual Status SingleDelete(const Slice& key) = 0;
321 virtual Status SingleDelete(ColumnFamilyHandle* column_family,
322 const SliceParts& key) = 0;
323 virtual Status SingleDelete(const SliceParts& key) = 0;
324
325 // PutUntracked() will write a Put to the batch of operations to be committed
326 // in this transaction. This write will only happen if this transaction
327 // gets committed successfully. But unlike Transaction::Put(),
328 // no conflict checking will be done for this key.
329 //
11fdf7f2
TL
330 // If this Transaction was created on a PessimisticTransactionDB, this
331 // function will still acquire locks necessary to make sure this write doesn't
332 // cause conflicts in other transactions and may return Status::Busy().
7c673cae
FG
333 virtual Status PutUntracked(ColumnFamilyHandle* column_family,
334 const Slice& key, const Slice& value) = 0;
335 virtual Status PutUntracked(const Slice& key, const Slice& value) = 0;
336 virtual Status PutUntracked(ColumnFamilyHandle* column_family,
337 const SliceParts& key,
338 const SliceParts& value) = 0;
339 virtual Status PutUntracked(const SliceParts& key,
340 const SliceParts& value) = 0;
341
342 virtual Status MergeUntracked(ColumnFamilyHandle* column_family,
343 const Slice& key, const Slice& value) = 0;
344 virtual Status MergeUntracked(const Slice& key, const Slice& value) = 0;
345
346 virtual Status DeleteUntracked(ColumnFamilyHandle* column_family,
347 const Slice& key) = 0;
348
349 virtual Status DeleteUntracked(const Slice& key) = 0;
350 virtual Status DeleteUntracked(ColumnFamilyHandle* column_family,
351 const SliceParts& key) = 0;
352 virtual Status DeleteUntracked(const SliceParts& key) = 0;
11fdf7f2
TL
353 virtual Status SingleDeleteUntracked(ColumnFamilyHandle* column_family,
354 const Slice& key) = 0;
355
356 virtual Status SingleDeleteUntracked(const Slice& key) = 0;
7c673cae
FG
357
358 // Similar to WriteBatch::PutLogData
359 virtual void PutLogData(const Slice& blob) = 0;
360
361 // By default, all Put/Merge/Delete operations will be indexed in the
362 // transaction so that Get/GetForUpdate/GetIterator can search for these
363 // keys.
364 //
365 // If the caller does not want to fetch the keys about to be written,
366 // they may want to avoid indexing as a performance optimization.
367 // Calling DisableIndexing() will turn off indexing for all future
368 // Put/Merge/Delete operations until EnableIndexing() is called.
369 //
370 // If a key is Put/Merge/Deleted after DisableIndexing is called and then
371 // is fetched via Get/GetForUpdate/GetIterator, the result of the fetch is
372 // undefined.
373 virtual void DisableIndexing() = 0;
374 virtual void EnableIndexing() = 0;
375
376 // Returns the number of distinct Keys being tracked by this transaction.
11fdf7f2 377 // If this transaction was created by a TransactionDB, this is the number of
7c673cae
FG
378 // keys that are currently locked by this transaction.
379 // If this transaction was created by an OptimisticTransactionDB, this is the
380 // number of keys that need to be checked for conflicts at commit time.
381 virtual uint64_t GetNumKeys() const = 0;
382
383 // Returns the number of Puts/Deletes/Merges that have been applied to this
384 // transaction so far.
385 virtual uint64_t GetNumPuts() const = 0;
386 virtual uint64_t GetNumDeletes() const = 0;
387 virtual uint64_t GetNumMerges() const = 0;
388
389 // Returns the elapsed time in milliseconds since this Transaction began.
390 virtual uint64_t GetElapsedTime() const = 0;
391
392 // Fetch the underlying write batch that contains all pending changes to be
393 // committed.
394 //
395 // Note: You should not write or delete anything from the batch directly and
396 // should only use the functions in the Transaction class to
397 // write to this transaction.
398 virtual WriteBatchWithIndex* GetWriteBatch() = 0;
399
400 // Change the value of TransactionOptions.lock_timeout (in milliseconds) for
401 // this transaction.
402 // Has no effect on OptimisticTransactions.
403 virtual void SetLockTimeout(int64_t timeout) = 0;
404
405 // Return the WriteOptions that will be used during Commit()
406 virtual WriteOptions* GetWriteOptions() = 0;
407
408 // Reset the WriteOptions that will be used during Commit().
409 virtual void SetWriteOptions(const WriteOptions& write_options) = 0;
410
411 // If this key was previously fetched in this transaction using
412 // GetForUpdate/MultiGetForUpdate(), calling UndoGetForUpdate will tell
413 // the transaction that it no longer needs to do any conflict checking
414 // for this key.
415 //
416 // If a key has been fetched N times via GetForUpdate/MultiGetForUpdate(),
417 // then UndoGetForUpdate will only have an effect if it is also called N
418 // times. If this key has been written to in this transaction,
419 // UndoGetForUpdate() will have no effect.
420 //
421 // If SetSavePoint() has been called after the GetForUpdate(),
422 // UndoGetForUpdate() will not have any effect.
423 //
424 // If this Transaction was created by an OptimisticTransactionDB,
425 // calling UndoGetForUpdate can affect whether this key is conflict checked
426 // at commit time.
427 // If this Transaction was created by a TransactionDB,
428 // calling UndoGetForUpdate may release any held locks for this key.
429 virtual void UndoGetForUpdate(ColumnFamilyHandle* column_family,
430 const Slice& key) = 0;
431 virtual void UndoGetForUpdate(const Slice& key) = 0;
432
433 virtual Status RebuildFromWriteBatch(WriteBatch* src_batch) = 0;
434
435 virtual WriteBatch* GetCommitTimeWriteBatch() = 0;
436
// Stores `log` in log_number_.
437 virtual void SetLogNumber(uint64_t log) { log_number_ = log; }
438
// Returns the value of log_number_.
439 virtual uint64_t GetLogNumber() const { return log_number_; }
440
// Sets the transaction's name; must be implemented by subclasses.
// Returns a Status describing the outcome.
441 virtual Status SetName(const TransactionName& name) = 0;
442
// Returns a copy of name_.
443 virtual TransactionName GetName() const { return name_; }
444
// Base implementation always returns 0.
// NOTE(review): presumably overridden by implementations that assign
// transaction ids -- confirm against subclasses.
445 virtual TransactionID GetID() const { return 0; }
446
// Base implementation always returns false.
447 virtual bool IsDeadlockDetect() const { return false; }
448
11fdf7f2
TL
449 virtual std::vector<TransactionID> GetWaitingTxns(
450 uint32_t* /*column_family_id*/, std::string* /*key*/) const {
7c673cae
FG
451 assert(false);
452 return std::vector<TransactionID>();
453 }
454
// Execution state of a transaction. The numeric values are spelled out
// explicitly and must not be renumbered.
enum TransactionState {
  STARTED = 0,
  AWAITING_PREPARE = 1,
  PREPARED = 2,
  AWAITING_COMMIT = 3,
  COMMITTED = 4,
  // Original misspelled name, kept as an alias so existing callers compile.
  COMMITED = COMMITTED,
  AWAITING_ROLLBACK = 5,
  ROLLEDBACK = 6,
  LOCKS_STOLEN = 7,
};
465
466 TransactionState GetState() const { return txn_state_; }
467 void SetState(TransactionState state) { txn_state_ = state; }
468
11fdf7f2
TL
469 // NOTE: Experimental feature
470 // The globally unique id with which the transaction is identified. This id
471 // might or might not be set depending on the implementation. Similarly the
472 // implementation decides the point in lifetime of a transaction at which it
473 // assigns the id. Although currently it is the case, the id is not guaranteed
474 // to remain the same across restarts.
475 uint64_t GetId() { return id_; }
476
7c673cae 477 protected:
11fdf7f2
TL
478 explicit Transaction(const TransactionDB* /*db*/) {}
479 Transaction() : log_number_(0), txn_state_(STARTED) {}
7c673cae
FG
480
481 // the log in which the prepared section for this txn resides
482 // (for two phase commit)
483 uint64_t log_number_;
484 TransactionName name_;
485
486 // Execution status of the transaction.
487 std::atomic<TransactionState> txn_state_;
488
11fdf7f2
TL
// Id returned by GetId(); 0 until SetId() assigns one.
489 uint64_t id_ = 0;
// Assigns the transaction id. Asserts that no id has been assigned yet
// (id_ is still 0), i.e. the id is expected to be set only once.
490 virtual void SetId(uint64_t id) {
491 assert(id_ == 0);
492 id_ = id;
493 }
494
7c673cae 495 private:
11fdf7f2
TL
496 friend class PessimisticTransactionDB;
497 friend class WriteUnpreparedTxnDB;
7c673cae
FG
498 // No copying allowed
499 Transaction(const Transaction&);
500 void operator=(const Transaction&);
501};
502
503} // namespace rocksdb
504
505#endif // ROCKSDB_LITE